├── .gitmodules ├── README.md ├── omnireduce-DPDK ├── README.md ├── build_all.sh ├── daiet │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── example │ │ ├── CMakeLists.txt │ │ ├── daiet.cfg │ │ └── main.cpp │ ├── experiments │ │ ├── exp1 │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── common.h │ │ │ └── main.cc │ │ └── exp2 │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── common.h │ │ │ ├── float16.cc │ │ │ ├── float32.cc │ │ │ ├── int32.cc │ │ │ └── switchml_dense.cc │ ├── ps │ │ ├── Makefile │ │ ├── ps.cfg │ │ └── src │ │ │ ├── common.cpp │ │ │ ├── common.hpp │ │ │ ├── dpdk.h │ │ │ ├── main.cpp │ │ │ ├── msgs.h │ │ │ ├── params.cpp │ │ │ ├── params.hpp │ │ │ ├── ps.cpp │ │ │ ├── ps.hpp │ │ │ ├── stats.cpp │ │ │ ├── stats.hpp │ │ │ ├── utils.cpp │ │ │ └── utils.hpp │ ├── scripts │ │ └── dpdk-config.sh │ └── src │ │ ├── DaietContext.cpp │ │ ├── DaietContext.hpp │ │ ├── common.cpp │ │ ├── common.hpp │ │ ├── daiet.cpp │ │ ├── daiet.hpp │ │ ├── dpdk.h │ │ ├── msgs.h │ │ ├── params.cpp │ │ ├── params.hpp │ │ ├── ps.cpp │ │ ├── ps.hpp │ │ ├── stats.cpp │ │ ├── stats.hpp │ │ ├── utils.cpp │ │ ├── utils.hpp │ │ ├── worker.cpp │ │ └── worker.hpp ├── docker │ ├── Dockerfile │ └── aggregator_Dockerfile ├── environment.yml ├── get_cuda_arch_code.sh ├── gloo.patch ├── prepare.sh └── pytorch.patch └── omnireduce-RDMA ├── Makefile ├── README.md ├── docker ├── Dockerfile └── README.md ├── docs └── tutorial.md ├── example ├── Makefile ├── README.md ├── aggregator_test.cpp ├── cuda_worker_test.cpp ├── omnireduce.cfg └── worker_test.cpp ├── frameworks_integration ├── horovod_patch │ ├── README.md │ ├── omnireduce-horovod.patch │ ├── test_hvd_tensorflow.py │ └── test_hvd_torch.py └── pytorch_patch │ ├── README.md │ └── omnireduce-pytorch.patch └── omnireduce ├── aggcontext.cpp ├── aggcontext.hpp ├── aggregator.cpp ├── aggregator.hpp ├── common.cpp ├── common.hpp ├── context.cpp ├── context.hpp ├── cuda_utils.cu ├── cuda_utils.hpp ├── omnireduce.cpp ├── omnireduce.hpp 
├── params.cpp ├── params.hpp ├── worker.cpp └── worker.hpp /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "omnireduce-DPDK/gloo"] 2 | path = omnireduce-DPDK/gloo 3 | url = https://github.com/facebookincubator/gloo.git 4 | [submodule "omnireduce-DPDK/pytorch"] 5 | path = omnireduce-DPDK/pytorch 6 | url = https://github.com/pytorch/pytorch.git 7 | [submodule "omnireduce-DPDK/daiet/lib/dpdk"] 8 | path = omnireduce-DPDK/daiet/lib/dpdk 9 | url = https://github.com/sands-lab/dpdk.git 10 | [submodule "omnireduce-DPDK/daiet/lib/vcl"] 11 | path = omnireduce-DPDK/daiet/lib/vcl 12 | url = https://github.com/vcoda/vectorclass.git 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OmniReduce 2 | OmniReduce is an efficient sparse collective communication library. It maximizes effective bandwidth use by exploiting the sparsity of data. 3 | 4 | For clusters without RDMA support, OmniReduce uses Intel DPDK for kernel bypass. GPUDirect can also be used where available. 5 | 6 | ## Contents 7 | - omnireduce-DPDK: source code of DPDK-based OmniReduce 8 | - omnireduce-RDMA: source code of RDMA-based OmniReduce 9 | - [experiments](https://github.com/sands-lab/omnireduce-experiments): micro-benchmark and end-to-end scripts 10 | 11 | ## Publications 12 | 13 | [OmniReduce](https://sands.kaust.edu.sa/project/omnireduce/) accepted at SIGCOMM’21. 
14 | -------------------------------------------------------------------------------- /omnireduce-DPDK/README.md: -------------------------------------------------------------------------------- 1 | # OmniReduce-DPDK 2 | 3 | ## prepare submodules 4 | ```bash 5 | ./prepare.sh [--depth=10] # optional --depth shallow copys submodules 6 | ``` 7 | 8 | ## create conda environment 9 | ```bash 10 | conda env create --prefix ../env --file environment.yml 11 | ``` 12 | 13 | ## build 14 | ```bash 15 | conda activate ../env 16 | ./build_all.sh MLX5 CONDA INSTALL NOSCALING PYTORCH HOROVOD 17 | ``` 18 | 19 | ## offload bitmap (only supports PyTorch) 20 | ```bash 21 | conda activate ../env 22 | ./build_all.sh MLX5 CONDA INSTALL OFFLOAD_BITMAP NOSCALING PYTORCH 23 | ``` 24 | -------------------------------------------------------------------------------- /omnireduce-DPDK/build_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # FLAGS: INSTALL MLX5 COLOCATED LATENCIES TIMESTAMPS TIMERS DEBUG CONDA OFFLOAD_BITMAP NOSCALING PYTORCH ALGO2 COUNTERS NO_FILL_STORE 4 | set -e 5 | set -x 6 | 7 | CWD="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 8 | DPDK_ARGS='-fPIC ' 9 | DAIET_ARGS='' 10 | EXP_ARGS='' 11 | PS_ARGS='' 12 | GLOO_CMAKE_ARGS='' 13 | 14 | if [[ $@ == *'CONDA'* ]]; then 15 | echo "will install libraries to ${CONDA_PREFIX:-'/'}" 16 | THIS_TIME=`date` 17 | echo "build_all.sh invoked at ${THIS_TIME} with $@" > ${CONDA_PREFIX}/build-info.txt 18 | fi 19 | 20 | if [[ $@ == *'MLX5'* ]]; then 21 | echo 'MLX5 SUPPORT' 22 | EXP_ARGS+='-DUSE_MLX5=1 ' 23 | fi 24 | if [[ $@ == *'MLX4'* ]]; then 25 | echo 'MLX4 SUPPORT' 26 | EXP_ARGS+='-DUSE_MLX4=1 ' 27 | fi 28 | if [[ $@ == *'COLOCATED'* ]]; then 29 | echo 'COLOCATED SET' 30 | DAIET_ARGS+='COLOCATED=ON ' 31 | fi 32 | if [[ $@ == *'LATENCIES'* ]]; then 33 | echo 'LATENCIES SET' 34 | DAIET_ARGS+='LATENCIES=ON ' 35 | fi 36 | if [[ $@ == *'TIMESTAMPS'* ]]; then 37 | echo 
'TIMESTAMPS SET' 38 | DAIET_ARGS+='TIMESTAMPS=ON ' 39 | fi 40 | if [[ $@ == *'COUNTERS'* ]]; then 41 | echo 'COUNTERS SET' 42 | DAIET_ARGS+='COUNTERS=ON ' 43 | fi 44 | if [[ $@ == *'ALGO2'* ]]; then 45 | echo 'ALGO2 SET' 46 | DAIET_ARGS+='ALGO2=ON ' 47 | PS_ARGS+='ALGO2=ON ' 48 | fi 49 | if [[ $@ == *'TIMERS'* ]]; then 50 | echo 'TIMERS SET' 51 | DAIET_ARGS+='TIMERS=ON ' 52 | PS_ARGS+='TIMERS=ON ' 53 | fi 54 | if [[ $@ == *'NO_FILL_STORE'* ]]; then 55 | echo 'NO_FILL_STORE SET' 56 | DAIET_ARGS+='NO_FILL_STORE=ON ' 57 | fi 58 | if [[ $@ == *'DEBUG'* ]]; then 59 | echo 'DEBUG SET' 60 | DAIET_ARGS+='DEBUG=ON COUNTERS=ON ' 61 | DPDK_ARGS+='-g -O0 ' 62 | PS_ARGS+='DEBUG=ON ' 63 | EXP_ARGS+='-DDEBUG=1 ' 64 | fi 65 | if [[ $@ == *'CONDA'* ]]; then 66 | GLOO_CMAKE_ARGS+='-DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" ' 67 | GLOO_CMAKE_ARGS+="-DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} " 68 | EXP_ARGS+='-DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1"' 69 | DAIET_EXTRA_CXX_FLAGS+="-I${CONDA_PREFIX}/include -L${CONDA_PREFIX}/lib " 70 | fi 71 | if [[ $@ == *'OFFLOAD_BITMAP'* ]]; then 72 | echo 'OFFLOAD_BITMAP SET' 73 | DAIET_ARGS+='OFFLOAD_BITMAP=ON ' 74 | OFFLOAD_BITMAP=1 75 | else 76 | OFFLOAD_BITMAP=0 77 | fi 78 | if [[ $@ == *'NOSCALING'* ]]; then 79 | echo 'NOSCALING SET' 80 | DAIET_ARGS+='NOSCALING=ON ' 81 | PS_ARGS+='NOSCALING=ON ' 82 | fi 83 | 84 | # Build DPDK 85 | cd $CWD/daiet/lib/dpdk/ 86 | 87 | if [[ $@ != *'SKIP_DPDK'* ]]; then 88 | rm -rf build 89 | 90 | if [[ $@ == *'MLX5'* ]]; then 91 | sed -i 's/CONFIG_RTE_LIBRTE_MLX5_PMD=n/CONFIG_RTE_LIBRTE_MLX5_PMD=y/' config/common_base 92 | else 93 | sed -i 's/CONFIG_RTE_LIBRTE_MLX5_PMD=y/CONFIG_RTE_LIBRTE_MLX5_PMD=n/' config/common_base 94 | fi 95 | if [[ $@ == *'MLX4'* ]]; then 96 | sed -i 's/CONFIG_RTE_LIBRTE_MLX4_PMD=n/CONFIG_RTE_LIBRTE_MLX4_PMD=y/' config/common_base 97 | else 98 | sed -i 's/CONFIG_RTE_LIBRTE_MLX4_PMD=y/CONFIG_RTE_LIBRTE_MLX4_PMD=n/' config/common_base 99 | fi 100 | 101 | make defconfig 
T=x86_64-native-linuxapp-gcc 102 | make EXTRA_CFLAGS="${DPDK_ARGS}" -j 103 | 104 | if [[ $@ == *'INSTALL'* ]]; then 105 | if [[ $@ == *'CONDA'* ]]; then 106 | make install-sdk install-runtime prefix=${CONDA_PREFIX} 107 | else 108 | make install 109 | fi 110 | fi 111 | fi 112 | 113 | 114 | if [[ $@ != *'SKIP_DAIET'* ]]; then 115 | cd $CWD/daiet 116 | # Build DAIET 117 | make clean 118 | rm -rf build 119 | EXTRA_CXX_FLAGS=${DAIET_EXTRA_CXX_FLAGS} make ${DAIET_ARGS} -j 120 | if [[ $@ == *'INSTALL'* ]]; then 121 | if [[ $@ == *'CONDA'* ]]; then 122 | make libinstall PREFIX=${CONDA_PREFIX} 123 | else 124 | make libinstall 125 | fi 126 | fi 127 | fi 128 | 129 | if [[ $@ != *'SKIP_GLOO'* ]]; then 130 | cd $CWD/gloo 131 | # Build Gloo 132 | rm -rf build 133 | mkdir build 134 | cd build 135 | 136 | if [[ $@ == *'DEBUG'* ]]; then 137 | CXXFLAGS='-g -O0' cmake -DUSE_DAIET=1 -DUSE_REDIS=1 -DUSE_AVX=1 -DUSE_MPI=1 $GLOO_CMAKE_ARGS .. 138 | else 139 | cmake -DBUILD_TEST=OFF -DBUILD_BENCHMARK=OFF -DUSE_DAIET=1 -DUSE_REDIS=1 -DUSE_AVX=1 -DUSE_MPI=1 $GLOO_CMAKE_ARGS .. 140 | fi 141 | 142 | make -j 143 | if [[ $@ == *'INSTALL'* ]]; then 144 | cd $CWD/gloo/build 145 | if [[ $@ == *'CONDA'* ]]; then 146 | cmake -DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} .. 147 | fi 148 | make install 149 | fi 150 | fi 151 | 152 | 153 | # Build experiments 154 | if [[ $@ != *'SKIP_EXPS'* ]]; then 155 | cd $CWD/daiet/experiments/exp1/ 156 | mkdir -p build 157 | cd build 158 | find . ! -name 'daiet.cfg' ! -name '.' ! -name '..' -exec rm -rf {} + 159 | cmake ${EXP_ARGS} .. 160 | make -j 161 | fi 162 | 163 | if [[ $@ != *'SKIP_EXPS'* ]]; then 164 | cd $CWD/daiet/experiments/exp2/ 165 | mkdir -p build 166 | cd build 167 | find . ! -name 'daiet.cfg' ! -name '.' ! -name '..' -exec rm -rf {} + 168 | cmake ${EXP_ARGS} .. 169 | make -j 170 | fi 171 | 172 | # Build example 173 | if [[ $@ != *'SKIP_EXAMPLE'* ]]; then 174 | cd $CWD/daiet/example 175 | mkdir -p build 176 | cd build 177 | find . ! -name 'daiet.cfg' ! 
-name '.' ! -name '..' -exec rm -rf {} + 178 | cmake ${EXP_ARGS} .. 179 | make -j 180 | fi 181 | 182 | # Build dedicated PS 183 | if [[ $@ != *'SKIP_PS'* ]]; then 184 | cd $CWD/daiet/ps 185 | make clean 186 | make ${PS_ARGS} -j 187 | fi 188 | 189 | # Build PyTorch 190 | if [[ $@ == *'PYTORCH'* ]]; then 191 | cd $CWD/pytorch 192 | OFFLOAD_BITMAP=$OFFLOAD_BITMAP BUILD_TEST=0 BUILD_CAFFE2=0 USE_SYSTEM_NCCL=1 NCCL_INCLUDE_DIR=${CONDA_PREFIX}/include NCCL_LIB_DIR=${CONDA_PREFIX}/lib ${CONDA_PREFIX}/bin/python setup.py install --prefix=${CONDA_PREFIX} --record=`basename ${CONDA_PREFIX}`_files.txt 193 | fi 194 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | /build 3 | /lib/dpdk/build 4 | /example/build 5 | /ps/build 6 | /experiments/exp1/build 7 | /experiments/exp2/build 8 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/Makefile: -------------------------------------------------------------------------------- 1 | # DAIET project 2 | # author: amedeo.sapio@kaust.edu.sa 3 | 4 | ifeq ($(DAIET_PATH),) 5 | DAIET_PATH = $(shell pwd) 6 | export DAIET_PATH 7 | endif 8 | 9 | RTE_SDK = ${DAIET_PATH}/lib/dpdk 10 | RTE_TARGET = build 11 | 12 | include $(RTE_SDK)/mk/rte.vars.mk 13 | 14 | # App name 15 | APPNAME = daiet 16 | 17 | # binary name 18 | LIB = libdaiet.a 19 | 20 | # install directory 21 | PREFIX = /usr/local 22 | 23 | # all source are stored in SRCS-y 24 | SRCS-y := $(shell find ${DAIET_PATH}/src -maxdepth 1 -name "*.cpp") 25 | HDRS := $(shell find ${DAIET_PATH}/src -maxdepth 1 -name "*.hpp" -o -name "*.h") 26 | 27 | #SIMDFLAGS = -msse2 -mssse3 -msse4.1 -msse4.2 -mavx -fabi-version=0 -mfma -mavx2 -mavx512f -mavx512dq -mavx512cd -mavx512bw -mavx512vl 28 | CXXFLAGS += -Wall -Wextra -std=c++11 -fPIC -I ${DAIET_PATH}/../gloo/ -I ${DAIET_PATH}/lib/ 29 | LDFLAGS 
+= -lstdc++ -l boost_program_options 30 | 31 | ifeq ($(COLOCATED),ON) 32 | $(info "COLOCATED ON") 33 | CXXFLAGS += -DCOLOCATED 34 | endif 35 | 36 | ifeq ($(MLX),ON) 37 | $(info "MLX ON") 38 | CXXFLAGS += -DMLX 39 | endif 40 | 41 | ifeq ($(LATENCIES),ON) 42 | $(info "LATENCIES ON") 43 | CXXFLAGS += -DLATENCIES 44 | endif 45 | 46 | ifeq ($(TIMERS),ON) 47 | $(info "TIMERS ON") 48 | CXXFLAGS += -DTIMERS 49 | endif 50 | 51 | ifeq ($(NOSCALING),ON) 52 | $(info "NOSCALING ON") 53 | CXXFLAGS += -DNOSCALING 54 | endif 55 | 56 | ifeq ($(ALGO2),ON) 57 | $(info "ALGO2 ON") 58 | CXXFLAGS += -DALGO2 59 | endif 60 | 61 | ifeq ($(COUNTERS),ON) 62 | $(info "COUNTERS ON") 63 | CXXFLAGS += -DCOUNTERS 64 | endif 65 | 66 | ifeq ($(NO_FILL_STORE),ON) 67 | $(info "NO_FILL_STORE ON") 68 | CXXFLAGS += -DNO_FILL_STORE 69 | endif 70 | 71 | ifeq ($(DEBUG),ON) 72 | $(info "DEBUG ON") 73 | CXXFLAGS += -DDEBUG -g -O0 74 | else 75 | CXXFLAGS += -O3 76 | endif 77 | 78 | ifeq ($(OFFLOAD_BITMAP),ON) 79 | $(info "OFFLOAD BITMAP ON") 80 | CXXFLAGS += -DOFFLOAD_BITMAP 81 | endif 82 | 83 | .PHONY: local_install 84 | local_install: _postbuild 85 | $(Q)$(MAKE) clean 86 | $(RM) build/_postbuild 87 | $(RM) _postclean 88 | mkdir -p build/include/$(APPNAME) 89 | cp $(HDRS) build/include/$(APPNAME) 90 | 91 | include $(RTE_SDK)/mk/rte.extlib.mk 92 | 93 | distclean: clean 94 | $(RM) -r build 95 | 96 | .PHONY: libinstall 97 | libinstall: 98 | mkdir -p $(DESTDIR)$(PREFIX)/lib 99 | mkdir -p $(DESTDIR)$(PREFIX)/include/$(APPNAME) 100 | cp $(RTE_OUTPUT)/$(LIB) $(DESTDIR)$(PREFIX)/lib/$(LIB) 101 | cp $(HDRS) $(DESTDIR)$(PREFIX)/include/$(APPNAME) 102 | 103 | .PHONY: libuninstall 104 | libuninstall: 105 | $(RM) $(DESTDIR)$(PREFIX)/lib/$(LIB) 106 | $(RM) -r $(DESTDIR)$(PREFIX)/include/$(APPNAME) 107 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/README.md: -------------------------------------------------------------------------------- 1 | # DAIET 2 | ### 
Install dependencies 3 | - Dependencies: 4 | `make, coreutils, gcc, libc headers, kernel headers, libnuma-dev, Python version 2.7+ or 3.2+, kmod, pciutils, build-essential, boost-program-options, cmake, libhiredis-dev` 5 | 6 | In Debian/Ubuntu: 7 | ```sh 8 | apt install -f make \ 9 | coreutils \ 10 | gcc \ 11 | libc6-dev \ 12 | linux-headers-$(uname -r) \ 13 | libnuma-dev \ 14 | python \ 15 | kmod \ 16 | pciutils \ 17 | build-essential \ 18 | libboost-program-options-dev \ 19 | libboost-all-dev \ 20 | cmake \ 21 | libhiredis-dev 22 | ``` 23 | 24 | In Fedora: 25 | ```sh 26 | dnf install make \ 27 | automake \ 28 | coreutils \ 29 | gcc \ 30 | gcc-c++ \ 31 | glibc-devel \ 32 | kernel-devel \ 33 | kernel-headers \ 34 | numactl-devel \ 35 | python \ 36 | kmod \ 37 | pciutils \ 38 | boost-program-options 39 | ``` 40 | 41 | ### DPDK Setup 42 | - See [here](https://doc.dpdk.org/guides/linux_gsg/sys_reqs.html) 43 | 44 | ### Compile DPDK 45 | - Run: 46 | ```sh 47 | cd lib/dpdk 48 | make defconfig T=x86_64-native-linuxapp-gcc 49 | make -j 50 | sudo make install 51 | cd ../.. 52 | ``` 53 | 54 | ### Bind the interfaces 55 | - Set the name of the interface to bind with DPDK in `dpdk-config.sh` 56 | - Run: `. 
./dpdk-config.sh` 57 | 58 | ### Compile the DAIET library 59 | - Run: 60 | ```sh 61 | make -j 62 | sudo make libinstall 63 | ``` 64 | 65 | ### Configuration 66 | - Configuration parameters (e.g., number of workers, IPs and ports) in `daiet.cfg` 67 | 68 | ## Utils 69 | 70 | - Get the hugepage size: 71 | ```sh 72 | awk '/Hugepagesize/ {print $2}' /proc/meminfo 73 | ``` 74 | 75 | - Get the total huge page numbers: 76 | ```sh 77 | awk '/HugePages_Total/ {print $2} ' /proc/meminfo 78 | ``` 79 | 80 | - Unmount the hugepages: 81 | ```sh 82 | umount `awk '/hugetlbfs/ {print $2}' /proc/mounts` 83 | ``` 84 | 85 | - Mount hugepage folder: 86 | ```sh 87 | mkdir -p /mnt/huge 88 | mount -t hugetlbfs nodev /mnt/huge 89 | ``` 90 | 91 | - Check the CPU layout using using the DPDK cpu\_layout utility: 92 | ```sh 93 | cd lib/dpdk 94 | usertools/cpu_layout.py 95 | ``` 96 | 97 | - Check your NIC id and related socket id: 98 | ```sh 99 | # List all the NICs with PCI address and device IDs. 100 | lspci -nn | grep Eth 101 | ``` 102 | - Check the PCI device related numa node id: 103 | ```sh 104 | cat /sys/bus/pci/devices/0000\:xx\:00.x/numa_node 105 | ``` 106 | Usually 0x:00.x is on socket 0 and 8x:00.x is on socket 1. 107 | Note: To get the best performance, ensure that the core and NICs are in the same socket. In the example above 85:00.0 is on socket 1 and should be used by cores on socket 1 for the best performance. 
108 | 109 | > Note: 110 | > to support C++ applications, DPDK is patched with: 111 | > ```sh 112 | > cd lib/dpdk 113 | > patch -p1 < cpp_support.patch 114 | > ``` 115 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(switchml) 2 | cmake_minimum_required(VERSION 3.5) 3 | 4 | set(USE_MLX5_DEFAULT OFF) 5 | set(USE_MLX4_DEFAULT OFF) 6 | set(DEBUG_DEFAULT OFF) 7 | 8 | # Options 9 | option(USE_MLX5 "Use MLX5 and ibverbs" ${USE_MLX5_DEFAULT}) 10 | option(USE_MLX4 "Use MLX4 and ibverbs" ${USE_MLX4_DEFAULT}) 11 | option(DEBUG "Compile in debug mode" ${DEBUG_DEFAULT}) 12 | 13 | if(DEBUG) 14 | message(WARNING "Compile in debug mode") 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -g -O0") 16 | else() 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -O3 -DNDEBUG") 18 | endif() 19 | 20 | add_definitions("-DHIREDIS_NESTED_INCLUDE") 21 | 22 | include_directories(${CMAKE_SOURCE_DIR}/../../gloo) 23 | include_directories(${CMAKE_SOURCE_DIR}/../build/include) 24 | link_directories(${CMAKE_SOURCE_DIR}/../build) 25 | link_directories(${CMAKE_SOURCE_DIR}/../lib/dpdk/build/lib) 26 | 27 | add_executable(example main.cpp) 28 | 29 | target_link_libraries(example -Wl,--whole-archive daiet dpdk -Wl,--no-whole-archive dl numa boost_chrono boost_system boost_thread boost_program_options pthread) 30 | 31 | if(USE_MLX5) 32 | target_link_libraries(example ibverbs mlx5 mnl) 33 | endif() 34 | if(USE_MLX4) 35 | target_link_libraries(example ibverbs mlx4 mnl) 36 | endif() 37 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/example/daiet.cfg: -------------------------------------------------------------------------------- 1 | # 2 | # DAIET project 3 | # author: amedeo.sapio@kaust.edu.sa 4 | # 5 | 6 | [daiet] 7 | # Number of workers 8 | num_workers = 
8 9 | # Weights per packet 10 | num_updates = 35 11 | # Maximum number of pending messages 12 | max_num_pending_messages = 256 13 | # Worker UDP port 14 | worker_port = 4000 15 | # Parameter Server UDP port 16 | ps_port = 48864 17 | # Worker IP 18 | worker_ip = 10.0.0.1 19 | # Parameter Server IP and MACS 20 | ps_ips = 10.0.0.101, 10.0.0.102, 10.0.0.103, 10.0.0.104, 10.0.0.105, 10.0.0.106, 10.0.0.107, 10.0.0.108 21 | ps_macs = 0c:c4:7a:63:76:ab, 0c:c4:7a:63:76:ed, 0c:c4:7a:63:75:51, 0c:c4:7a:63:78:33, 0c:c4:7a:63:76:eb, 0c:c4:7a:63:76:dd, 0c:c4:7a:63:76:e3, 0c:c4:7a:63:78:31 22 | 23 | [dpdk] 24 | # Number of cores 25 | cores = 0-3 26 | # Process prefix 27 | prefix = daiet 28 | # Extra EAL options 29 | extra_eal_options = 30 | # Port id 31 | port_id = 0 32 | # Pool and pool cache sizes 33 | pool_size = 262144 34 | pool_cache_size = 512 35 | # Number of packets in a burst 36 | burst_rx = 64 37 | burst_tx = 64 38 | # Bulk drain timer (microseconds) 39 | bulk_drain_tx_us = 10 40 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/example/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace daiet; 9 | using namespace std; 10 | 11 | void signal_handler(int signum) { 12 | if (signum == SIGINT || signum == SIGTERM) { 13 | cout << " Signal " << signum << " received, preparing to exit..."; 14 | exit(EXIT_SUCCESS); 15 | } 16 | } 17 | 18 | int main() { 19 | 20 | DaietContext& ctx = DaietContext::getInstance(); 21 | 22 | long int count = 1024 * 1024 * 200; 23 | int num_rounds = 10; 24 | int num_workers = 2; 25 | float base_value = 1.2; // must be [1,2[ 26 | double accuracy=0.0001; 27 | int min_exp = -126 + log2(num_workers * count * num_rounds); 28 | int max_exp = 127 - log2(num_workers * count * num_rounds); 29 | int exp = min_exp; 30 | 31 | int faulty = 0, neg = 1, base_value_int = 
(base_value-1)*10; 32 | double avg_err = 0, err = 0; 33 | 34 | int32_t* p = new int32_t[count]; 35 | int32_t expected_int; 36 | 37 | float* fp = new float[count]; 38 | float expected_float; 39 | 40 | /* Set signal handler */ 41 | signal(SIGINT, signal_handler); 42 | signal(SIGTERM, signal_handler); 43 | 44 | for (int jj = 1; jj <= num_rounds; jj++) { 45 | 46 | std::cout << "INT round " << jj << std::endl; 47 | 48 | faulty = 0; 49 | neg = 1; 50 | 51 | for (int i = 0; i < count; i++) { 52 | p[i] = neg * base_value_int * jj * i; 53 | 54 | neg = -neg; 55 | } 56 | 57 | auto begin = std::chrono::high_resolution_clock::now(); 58 | if (!ctx.try_daiet(p, count,1)){ 59 | cout << "Daiet failed"; 60 | exit(EXIT_FAILURE); 61 | } 62 | auto end = std::chrono::high_resolution_clock::now(); 63 | 64 | neg = 1; 65 | for (int i = 0; i < count; i++) { 66 | 67 | expected_int = neg * base_value_int * jj * i * num_workers; 68 | 69 | if (p[i] != expected_int) { 70 | faulty++; 71 | std::cerr << "Index: " << i 72 | << " Received: " << p[i] 73 | << " Expected: " << expected_int << std::endl; 74 | } 75 | 76 | neg = -neg; 77 | } 78 | 79 | std::cout << "Done INT round " << jj 80 | << ": Faulty: " << faulty 81 | << " Time: " << std::chrono::duration_cast(end - begin).count() 82 | << " ms" << std::endl; 83 | } 84 | 85 | for (int jj = 1; jj <= num_rounds; jj++) { 86 | 87 | std::cout << "FLOAT round " << jj << std::endl; 88 | 89 | faulty = 0; 90 | neg = 1; 91 | 92 | for (int i = 0; i < count; i++) { 93 | 94 | fp[i] = neg * ldexpf(base_value,exp) * jj * i; 95 | 96 | neg = -neg; 97 | exp++; 98 | 99 | if (exp > max_exp) 100 | exp=min_exp; 101 | } 102 | 103 | auto begin = std::chrono::high_resolution_clock::now(); 104 | if (!ctx.try_daiet(fp, count,1)){ 105 | cout << "Daiet failed"; 106 | exit(EXIT_FAILURE); 107 | } 108 | 109 | auto end = std::chrono::high_resolution_clock::now(); 110 | 111 | neg = 1; 112 | 113 | for (int i = 0; i < count; i++) { 114 | 115 | expected_float = neg * ldexpf(base_value,exp) 
* jj * i * num_workers; 116 | 117 | err = abs(expected_float - fp[i]) / abs(expected_float); 118 | 119 | if (err > accuracy){ 120 | 121 | faulty++; 122 | avg_err += err; 123 | 124 | std::cerr << "Index: " << i 125 | << " Received: " << fp[i] 126 | << " Expected: " << expected_float 127 | << " Error: " << err*100<<"%"< max_exp) 134 | exp=min_exp; 135 | } 136 | 137 | avg_err = avg_err * 100 / count; 138 | 139 | std::cout << "Done FLOAT round " << jj 140 | << ": Faulty: " << faulty 141 | << " AVG err: "<< avg_err 142 | <<"% Time: " << std::chrono::duration_cast(end - begin).count() 143 | << " ms" << std::endl; 144 | } 145 | 146 | exit(EXIT_SUCCESS); 147 | } 148 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(switchml) 2 | cmake_minimum_required(VERSION 3.5) 3 | 4 | set(USE_VANILLA_DEFAULT OFF) 5 | set(USE_MLX5_DEFAULT OFF) 6 | set(USE_MLX4_DEFAULT OFF) 7 | set(DEBUG_DEFAULT OFF) 8 | 9 | # Options 10 | option(USE_VANILLA "Use vanilla version of gloo" ${USE_VANILLA_DEFAULT}) 11 | option(USE_MLX5 "Use MLX5 and ibverbs" ${USE_MLX5_DEFAULT}) 12 | option(USE_MLX4 "Use MLX4 and ibverbs" ${USE_MLX4_DEFAULT}) 13 | option(DEBUG "Compile in debug mode" ${DEBUG_DEFAULT}) 14 | 15 | if(DEBUG) 16 | message(WARNING "Compile in debug mode") 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -g -O0") 18 | else() 19 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -O3 -DNDEBUG") 20 | endif() 21 | 22 | add_definitions("-DHIREDIS_NESTED_INCLUDE") 23 | 24 | link_directories(${CMAKE_SOURCE_DIR}/../../../gloo/build/gloo) 25 | include_directories(${CMAKE_SOURCE_DIR}/../../../gloo/build) 26 | include_directories(${CMAKE_SOURCE_DIR}/../../../gloo) 27 | 28 | if(NOT USE_VANILLA) 29 | include_directories(${CMAKE_SOURCE_DIR}/../../build/include) 30 | 
link_directories(${CMAKE_SOURCE_DIR}/../../build) 31 | link_directories(${CMAKE_SOURCE_DIR}/../../lib/dpdk/build/lib) 32 | else() 33 | message(WARNING "Compiling with vanilla gloo") 34 | endif() 35 | 36 | add_executable(exp1 main.cc) 37 | 38 | if(NOT USE_VANILLA) 39 | target_link_libraries(exp1 -Wl,--whole-archive daiet dpdk -Wl,--no-whole-archive dl numa boost_chrono boost_system boost_thread boost_program_options) 40 | endif() 41 | 42 | if(USE_MLX5) 43 | target_link_libraries(exp1 ibverbs mlx5 mnl) 44 | endif() 45 | 46 | if(USE_MLX4) 47 | target_link_libraries(exp1 ibverbs mlx4 mnl) 48 | endif() 49 | 50 | target_link_libraries(exp1 gloo hiredis pthread) 51 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp1/README.md: -------------------------------------------------------------------------------- 1 | # DEPENDENDENCIES 2 | apt install libboost-chrono-dev libboost-system-dev libboost-thread-dev 3 | 4 | # COMPILE 5 | Build gloo with the "-DUSE\_REDIS=ON" option 6 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp1/common.h: -------------------------------------------------------------------------------- 1 | #include "malloc.h" 2 | // Align buffers to 32 bytes to support vectorized code 3 | const size_t kBufferAlignment = 32; 4 | 5 | template 6 | class aligned_allocator { 7 | public: 8 | using value_type = T; 9 | using pointer = value_type*; 10 | using const_pointer = const value_type*; 11 | using reference = value_type&; 12 | using const_reference = const value_type&; 13 | using size_type = std::size_t; 14 | using difference_type = std::ptrdiff_t; 15 | 16 | template 17 | struct rebind { 18 | using other = aligned_allocator; 19 | }; 20 | 21 | inline explicit aligned_allocator() = default; 22 | inline ~aligned_allocator() = default; 23 | inline explicit aligned_allocator(const aligned_allocator& a) = default; 24 | 25 | inline 
pointer address(reference r) { 26 | return &r; 27 | } 28 | inline const_pointer address(const_reference r) { 29 | return &r; 30 | } 31 | 32 | inline pointer allocate( 33 | size_type sz, 34 | typename std::allocator::const_pointer = 0) { 35 | auto x = memalign(ALIGNMENT, sizeof(T) * sz); 36 | return reinterpret_cast(x); 37 | } 38 | 39 | void deallocate(pointer p, size_type /*sz*/) { 40 | free(p); 41 | } 42 | }; 43 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp1/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "gloo/barrier_all_to_one.h" 10 | #include "gloo/allreduce_halving_doubling.h" 11 | #include "gloo/rendezvous/context.h" 12 | #include "gloo/rendezvous/redis_store.h" 13 | #include "gloo/rendezvous/prefix_store.h" 14 | #include "gloo/transport/tcp/device.h" 15 | #if GLOO_USE_IBVERBS 16 | #include "gloo/transport/ibverbs/device.h" 17 | #endif 18 | 19 | #include "common.h" 20 | 21 | using namespace std; 22 | 23 | shared_ptr context; 24 | 25 | void signal_handler(int signum) { 26 | 27 | if (signum == SIGINT || signum == SIGTERM) { 28 | 29 | cerr << " Signal " << signum << " received!"; 30 | 31 | #ifdef DAIET 32 | context->daietContext.StopMaster(); 33 | #endif 34 | exit(1); 35 | } 36 | } 37 | 38 | int main(int argc, char* argv[]) { 39 | 40 | if (argc != 8) { 41 | #if GLOO_USE_IBVERBS 42 | cout << " Usage: " << argv[0] << " [rdma:|tcp:]INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 43 | #else 44 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 45 | #endif 46 | return 0; 47 | } 48 | 49 | /* Set signal handler */ 50 | signal(SIGINT, signal_handler); 51 | signal(SIGTERM, signal_handler); 52 | 53 | vector> data; 54 | int roundnum = 0; 55 | 56 | 
// GLOO transport 57 | std::shared_ptr dev; 58 | 59 | #if GLOO_USE_IBVERBS 60 | if (strncmp("rdma:", argv[1], 5) == 0) { 61 | string name(argv[1] + 5); 62 | gloo::transport::ibverbs::attr attr = { 63 | .name = name, 64 | .port = 1, 65 | .index = 0, 66 | }; 67 | dev = gloo::transport::ibverbs::CreateDevice(attr); 68 | } else { 69 | if (strncmp("tcp:", argv[1], 4) == 0) { 70 | argv[1] += 4; 71 | } 72 | string iface(argv[1]); 73 | gloo::transport::tcp::attr attr; 74 | attr.iface = iface; 75 | dev = gloo::transport::tcp::CreateDevice(attr); 76 | } 77 | #else 78 | gloo::transport::tcp::attr attr; 79 | string iface(argv[1]); 80 | attr.iface = iface; 81 | dev = gloo::transport::tcp::CreateDevice(attr); 82 | #endif 83 | 84 | // Rendezvous 85 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 86 | string prefix = argv[3]; 87 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 88 | 89 | const int size = atoi(argv[4]); 90 | const int rank = atoi(argv[5]); 91 | const int tensor_size = atoi(argv[6]); 92 | const int num_rounds = atoi(argv[7]); 93 | 94 | // Init data 95 | data.reserve(tensor_size); 96 | cout << "-- Tensor initialization" << endl; 97 | for (int i = 0; i < tensor_size; i++) { 98 | data.insert(data.begin()+i, 1); 99 | } 100 | cout << "---- Ended" << endl; 101 | 102 | vector ptrs; 103 | ptrs.push_back(&data[0]); 104 | 105 | int count = data.size(); 106 | 107 | // Context 108 | context = make_shared(rank, size); 109 | context->connectFullMesh(prefixStore, dev); 110 | 111 | auto barrier = make_shared(context); 112 | 113 | barrier->run(); 114 | 115 | //Warm up rounds 116 | for (int i=0; i<10; i++){ 117 | auto allreduce = make_shared>(context, ptrs, count); 118 | allreduce->run(); 119 | } 120 | 121 | // Start rounds 122 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 123 | // Instantiate the collective algorithm 124 | auto allreduce = make_shared>(context, ptrs, count); 125 | 126 | cout << "-- Allreduce Round " << roundnum << endl; 
# omnireduce-DPDK/daiet/experiments/exp2/CMakeLists.txt
#
# fix: cmake_minimum_required() must be called before project();
# fix: the four per-target add_executable/target_link_libraries stanzas were
#      copy-pasted — deduplicated with a foreach() loop (same link order:
#      daiet/dpdk group first, then ibverbs, then gloo/hiredis/pthread).
cmake_minimum_required(VERSION 3.5)
project(switchml)
find_package(MPI)

set(USE_VANILLA_DEFAULT OFF)
set(USE_MLX5_DEFAULT OFF)
set(USE_MLX4_DEFAULT OFF)
set(DEBUG_DEFAULT OFF)

# Options
option(USE_VANILLA "Use vanilla version of gloo" ${USE_VANILLA_DEFAULT})
option(USE_MLX5 "Use MLX5 and ibverbs" ${USE_MLX5_DEFAULT})
option(USE_MLX4 "Use MLX4 and ibverbs" ${USE_MLX4_DEFAULT})
option(DEBUG "Compile in debug mode" ${DEBUG_DEFAULT})

if(DEBUG)
    message(WARNING "Compile in debug mode")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -g -O0")
else()
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -O3 -DNDEBUG")
endif()

add_definitions("-DHIREDIS_NESTED_INCLUDE")

link_directories(${CMAKE_SOURCE_DIR}/../../../gloo/build/gloo)
include_directories(${CMAKE_SOURCE_DIR}/../../../gloo/build)
include_directories(${CMAKE_SOURCE_DIR}/../../../gloo)
include_directories(SYSTEM ${MPI_INCLUDE_PATH})

if(NOT USE_VANILLA)
    include_directories(${CMAKE_SOURCE_DIR}/../../build/include)
    link_directories(${CMAKE_SOURCE_DIR}/../../build)
    link_directories(${CMAKE_SOURCE_DIR}/../../lib/dpdk/build/lib)
else()
    message(WARNING "Compiling with vanilla gloo")
endif()

set(EXP_TARGETS float16 float32 int32 switchml_dense)

foreach(target IN LISTS EXP_TARGETS)
    add_executable(${target} ${target}.cc)

    if(NOT USE_VANILLA)
        target_link_libraries(${target} -Wl,--whole-archive daiet dpdk -Wl,--no-whole-archive dl numa boost_chrono boost_system boost_thread boost_program_options)
    endif()
    if(USE_MLX5)
        target_link_libraries(${target} ibverbs mlx5 mnl)
    endif()
    if(USE_MLX4)
        target_link_libraries(${target} ibverbs mlx4 mnl)
    endif()

    target_link_libraries(${target} gloo hiredis pthread)
endforeach()

# Only the MPI-driven benchmark links against MPI.
target_link_libraries(switchml_dense ${MPI_C_LIBRARIES})
/omnireduce-DPDK/daiet/experiments/exp2/README.md: -------------------------------------------------------------------------------- 1 | # DEPENDENDENCIES 2 | apt install libboost-chrono-dev libboost-system-dev libboost-thread-dev 3 | 4 | # COMPILE 5 | Build gloo with the "-DUSE\_REDIS=ON" option 6 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/common.h: -------------------------------------------------------------------------------- 1 | #include "malloc.h" 2 | // Align buffers to 32 bytes to support vectorized code 3 | const size_t kBufferAlignment = 32; 4 | 5 | template 6 | class aligned_allocator { 7 | static_assert( 8 | !(ALIGNMENT & (ALIGNMENT - 1)), 9 | "alignment must be a power of 2"); 10 | 11 | public: 12 | using value_type = T; 13 | using pointer = value_type*; 14 | using const_pointer = const value_type*; 15 | using reference = value_type&; 16 | using const_reference = const value_type&; 17 | using size_type = std::size_t; 18 | using difference_type = std::ptrdiff_t; 19 | 20 | template 21 | struct rebind { 22 | using other = aligned_allocator; 23 | }; 24 | 25 | inline explicit aligned_allocator() = default; 26 | inline ~aligned_allocator() = default; 27 | inline explicit aligned_allocator(const aligned_allocator& a) = default; 28 | 29 | inline pointer address(reference r) { 30 | return &r; 31 | } 32 | 33 | inline const_pointer address(const_reference r) { 34 | return &r; 35 | } 36 | 37 | inline pointer allocate( 38 | size_type sz, 39 | typename std::allocator::const_pointer = 0) { 40 | pointer p; 41 | if (posix_memalign( 42 | reinterpret_cast(&p), ALIGNMENT, sizeof(T) * sz)) { 43 | abort(); 44 | } 45 | return p; 46 | } 47 | 48 | void deallocate(pointer p, size_type /*sz*/) { 49 | free(p); 50 | } 51 | }; 52 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/float16.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "gloo/allreduce_halving_doubling.h" 7 | #include "gloo/rendezvous/context.h" 8 | #include "gloo/rendezvous/redis_store.h" 9 | #include "gloo/rendezvous/prefix_store.h" 10 | #include "gloo/transport/tcp/device.h" 11 | #include "gloo/barrier_all_to_one.h" 12 | #include "gloo/types.h" 13 | 14 | #include 15 | 16 | #include "common.h" 17 | 18 | using namespace std; 19 | 20 | shared_ptr context; 21 | 22 | void signal_handler(int signum) { 23 | 24 | if (signum == SIGINT || signum == SIGTERM) { 25 | 26 | cerr << " Signal " << signum << " received!"; 27 | 28 | #ifdef DAIET 29 | context->daietContext.StopMaster(); 30 | #endif 31 | exit(1); 32 | 33 | } 34 | } 35 | 36 | int main(int argc, char* argv[]) { 37 | 38 | if (argc != 8) { 39 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 40 | return 0; 41 | } 42 | 43 | /* Set signal handler */ 44 | signal(SIGINT, signal_handler); 45 | signal(SIGTERM, signal_handler); 46 | 47 | vector> base_data; 48 | vector> data; 49 | int roundnum = 0; 50 | 51 | gloo::float16 elem = gloo::cpu_float2half_rn(0.01), expected; 52 | 53 | // GLOO transport 54 | gloo::transport::tcp::attr attr; 55 | attr.iface = argv[1]; 56 | auto dev = gloo::transport::tcp::CreateDevice(attr); 57 | 58 | // Rendezvous 59 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 60 | string prefix = argv[3]; 61 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 62 | 63 | const int size = atoi(argv[4]); 64 | const int rank = atoi(argv[5]); 65 | const int tensor_size = atoi(argv[6]); 66 | const int num_rounds = atoi(argv[7]); 67 | int num_last_rounds = 0; 68 | 69 | // Init data 70 | base_data.reserve(tensor_size); 71 | data.resize(tensor_size); 72 | cout << "-- Tensor initialization" << endl; 73 | for (int i = 0; i < tensor_size; i++) { 
74 | base_data.insert(base_data.begin() + i, gloo::cpu_float2half_rn(i%100)*elem); 75 | } 76 | copy(base_data.begin(), base_data.end(), data.begin()); 77 | cout << "---- Ended" << endl; 78 | 79 | vector ptrs; 80 | ptrs.push_back(&data[0]); 81 | 82 | int count = data.size(); 83 | 84 | // Context 85 | context = make_shared(rank, size); 86 | context->connectFullMesh(prefixStore, dev); 87 | 88 | auto barrier = make_shared(context); 89 | 90 | barrier->run(); 91 | 92 | //Warm up rounds 93 | for (int i = 0; i < 10; i++) { 94 | auto allreduce = make_shared>(context, ptrs, count); 95 | allreduce->run(); 96 | } 97 | copy(base_data.begin(), base_data.end(), data.begin()); 98 | 99 | // Start rounds 100 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 101 | 102 | if (roundnum % 10 == 0) { 103 | copy(base_data.begin(), base_data.end(), data.begin()); 104 | num_last_rounds = 0; 105 | } 106 | 107 | // Instantiate the collective algorithm 108 | auto allreduce = make_shared>(context, ptrs, count); 109 | 110 | cout << "-- Allreduce Round " << roundnum << endl; 111 | 112 | auto begin = chrono::high_resolution_clock::now(); 113 | // Run the algorithm 114 | allreduce->run(); 115 | 116 | auto end = chrono::high_resolution_clock::now(); 117 | 118 | cout << "---- Ended" << endl << "#ms " << chrono::duration_cast(end - begin).count() << endl; 119 | num_last_rounds++; 120 | 121 | } 122 | 123 | cout << "-- Final check" << endl; 124 | for (int i = 0; i < tensor_size; i++) { 125 | expected = (i%100) * gloo::cpu_half2float(elem) * powf(size, num_last_rounds); 126 | if (data[i] != expected) { 127 | cout << "---- Failed: index: " << i << " -> received " << data[i] << " instead of " << expected << endl; 128 | break; 129 | } 130 | } 131 | cout << "---- Ended" << endl; 132 | 133 | return 0; 134 | } 135 | 136 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/float32.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "gloo/allreduce_halving_doubling.h" 7 | #include "gloo/rendezvous/context.h" 8 | #include "gloo/rendezvous/redis_store.h" 9 | #include "gloo/rendezvous/prefix_store.h" 10 | #include "gloo/transport/tcp/device.h" 11 | #include "gloo/barrier_all_to_one.h" 12 | #include "gloo/allreduce.h" 13 | 14 | #include 15 | 16 | #include "common.h" 17 | 18 | using namespace std; 19 | 20 | shared_ptr context; 21 | 22 | void signal_handler(int signum) { 23 | 24 | if (signum == SIGINT || signum == SIGTERM) { 25 | 26 | cerr << " Signal " << signum << " received!"; 27 | 28 | #ifdef DAIET 29 | context->daietContext.StopMaster(); 30 | #endif 31 | exit(1); 32 | } 33 | } 34 | 35 | int main(int argc, char* argv[]) { 36 | 37 | if (argc != 8) { 38 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 39 | return 0; 40 | } 41 | 42 | /* Set signal handler */ 43 | signal(SIGINT, signal_handler); 44 | signal(SIGTERM, signal_handler); 45 | 46 | vector> base_data; 47 | vector> data; 48 | int roundnum = 0; 49 | 50 | float elem = 0.01, expected = 0; 51 | 52 | // GLOO transport 53 | gloo::transport::tcp::attr attr; 54 | attr.iface = argv[1]; 55 | auto dev = gloo::transport::tcp::CreateDevice(attr); 56 | 57 | // Rendezvous 58 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 59 | string prefix = argv[3]; 60 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 61 | 62 | const int size = atoi(argv[4]); 63 | const int rank = atoi(argv[5]); 64 | const int tensor_size = atoi(argv[6]); 65 | const int num_rounds = atoi(argv[7]); 66 | int num_last_rounds = 0; 67 | 68 | // Init data 69 | base_data.reserve(tensor_size); 70 | data.resize(tensor_size); 71 | cout << "-- Tensor initialization" << endl; 72 | for (int i = 0; i < tensor_size; i++) { 73 | 
base_data.insert(base_data.begin() + i, (i%100)*elem); 74 | } 75 | copy(base_data.begin(), base_data.end(), data.begin()); 76 | cout << "---- Ended" << endl; 77 | 78 | vector ptrs; 79 | ptrs.push_back(&data[0]); 80 | 81 | int count = data.size(); 82 | 83 | // Context 84 | context = make_shared(rank, size); 85 | context->connectFullMesh(prefixStore, dev); 86 | 87 | auto barrier = make_shared(context); 88 | 89 | barrier->run(); 90 | 91 | //Warm up rounds 92 | for (int i = 0; i < 10; i++) { 93 | gloo::AllreduceOptions opts(context); 94 | opts.setOutputs(ptrs, count); 95 | opts.setReduceFunction( 96 | static_cast( 97 | &gloo::sum)); 98 | gloo::allreduce(opts); 99 | //auto allreduce = make_shared>(context, ptrs, count); 100 | //allreduce->run(); 101 | } 102 | copy(base_data.begin(), base_data.end(), data.begin()); 103 | 104 | // Start rounds 105 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 106 | 107 | if (roundnum % 10 == 0) { 108 | copy(base_data.begin(), base_data.end(), data.begin()); 109 | num_last_rounds = 0; 110 | } 111 | 112 | // Instantiate the collective algorithm 113 | //auto allreduce = make_shared>(context, ptrs, count); 114 | 115 | cout << "-- Allreduce Round " << roundnum << endl; 116 | 117 | auto begin = chrono::high_resolution_clock::now(); 118 | // Run the algorithm 119 | //allreduce->run(); 120 | 121 | gloo::AllreduceOptions opts(context); 122 | opts.setOutputs(ptrs, count); 123 | opts.setReduceFunction( 124 | static_cast( 125 | &gloo::sum)); 126 | gloo::allreduce(opts); 127 | auto end = chrono::high_resolution_clock::now(); 128 | 129 | cout << "---- Ended" << endl << "#ms " << chrono::duration_cast(end - begin).count() << endl; 130 | num_last_rounds++; 131 | 132 | } 133 | 134 | cout << "-- Final check" << endl; 135 | for (int i = 0; i < tensor_size; i++) { 136 | expected = (i%100) * elem * powf(size, num_last_rounds); 137 | if (data[i] != expected) { 138 | cout << "---- Failed: index: " << i << " -> received " << data[i] << " instead of " 
<< expected << endl; 139 | break; 140 | } 141 | } 142 | cout << "---- Ended" << endl; 143 | 144 | return 0; 145 | } 146 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/int32.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "gloo/allreduce_halving_doubling.h" 7 | #include "gloo/rendezvous/context.h" 8 | #include "gloo/rendezvous/redis_store.h" 9 | #include "gloo/rendezvous/prefix_store.h" 10 | #include "gloo/transport/tcp/device.h" 11 | #include "gloo/barrier_all_to_one.h" 12 | 13 | #include 14 | 15 | #include "common.h" 16 | 17 | using namespace std; 18 | 19 | shared_ptr context; 20 | 21 | void signal_handler(int signum) { 22 | 23 | if (signum == SIGINT || signum == SIGTERM) { 24 | 25 | cerr << " Signal " << signum << " received!"; 26 | 27 | #ifdef DAIET 28 | context->daietContext.StopMaster(); 29 | #endif 30 | exit(1); 31 | } 32 | } 33 | 34 | int main(int argc, char* argv[]) { 35 | 36 | if (argc != 8) { 37 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS" << endl; 38 | return 0; 39 | } 40 | 41 | /* Set signal handler */ 42 | signal(SIGINT, signal_handler); 43 | signal(SIGTERM, signal_handler); 44 | 45 | vector> base_data; 46 | vector> data; 47 | int roundnum = 0; 48 | 49 | int32_t elem = 1, expected = 0; 50 | 51 | // GLOO transport 52 | gloo::transport::tcp::attr attr; 53 | attr.iface = argv[1]; 54 | auto dev = gloo::transport::tcp::CreateDevice(attr); 55 | 56 | // Rendezvous 57 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 58 | string prefix = argv[3]; 59 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 60 | 61 | const int size = atoi(argv[4]); 62 | const int rank = atoi(argv[5]); 63 | const int tensor_size = atoi(argv[6]); 64 | const int num_rounds = atoi(argv[7]); 65 | int num_last_rounds 
= 0; 66 | 67 | // Init data 68 | base_data.reserve(tensor_size); 69 | data.resize(tensor_size); 70 | cout << "-- Tensor initialization" << endl; 71 | for (int i = 0; i < tensor_size; i++) { 72 | base_data.insert(base_data.begin() + i, (i%100)*elem); 73 | } 74 | copy(base_data.begin(), base_data.end(), data.begin()); 75 | cout << "---- Ended" << endl; 76 | 77 | vector ptrs; 78 | ptrs.push_back(&data[0]); 79 | 80 | int count = data.size(); 81 | 82 | // Context 83 | context = make_shared(rank, size); 84 | context->connectFullMesh(prefixStore, dev); 85 | 86 | auto barrier = make_shared(context); 87 | 88 | barrier->run(); 89 | 90 | //Warm up rounds 91 | for (int i = 0; i < 10; i++) { 92 | auto allreduce = make_shared>(context, ptrs, count); 93 | allreduce->run(); 94 | } 95 | copy(base_data.begin(), base_data.end(), data.begin()); 96 | 97 | // Start rounds 98 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 99 | 100 | if (roundnum % 10 == 0) { 101 | copy(base_data.begin(), base_data.end(), data.begin()); 102 | num_last_rounds = 0; 103 | } 104 | 105 | // Instantiate the collective algorithm 106 | auto allreduce = make_shared>(context, ptrs, count); 107 | 108 | cout << "-- Allreduce Round " << roundnum << endl; 109 | 110 | auto begin = chrono::high_resolution_clock::now(); 111 | // Run the algorithm 112 | allreduce->run(); 113 | 114 | auto end = chrono::high_resolution_clock::now(); 115 | 116 | cout << "---- Ended" << endl << "#ms " << chrono::duration_cast(end - begin).count() << endl; 117 | num_last_rounds++; 118 | 119 | } 120 | 121 | cout << "-- Final check" << endl; 122 | for (int i = 0; i < tensor_size; i++) { 123 | expected = (i%100) * elem * powf(size, num_last_rounds); 124 | if (data[i] != expected) { 125 | cout << "---- Failed: index: " << i << " -> received " << data[i] << " instead of " << expected << endl; 126 | break; 127 | } 128 | } 129 | cout << "---- Ended" << endl; 130 | 131 | return 0; 132 | } 133 | 
-------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/experiments/exp2/switchml_dense.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "gloo/allreduce_halving_doubling.h" 9 | #include "gloo/rendezvous/context.h" 10 | #include "gloo/rendezvous/redis_store.h" 11 | #include "gloo/rendezvous/prefix_store.h" 12 | #include "gloo/transport/tcp/device.h" 13 | #include "gloo/barrier_all_to_one.h" 14 | 15 | #include 16 | #include 17 | 18 | #include "mpi.h" 19 | #include "common.h" 20 | 21 | using namespace std; 22 | 23 | //#define SAVE_RESULT 24 | #define OUTPUT_RANK 0 25 | #define INTTYPE 26 | 27 | #ifdef INTTYPE 28 | typedef int ValType; 29 | #else 30 | typedef float ValType; 31 | #endif 32 | 33 | void set_seed(unsigned int seed) { 34 | srand(seed); 35 | srand48(seed); 36 | } 37 | 38 | void set_seed_random(int id) { 39 | set_seed(clock() + (id * 147)); 40 | } 41 | 42 | // Between 0 (included) and max (excluded) 43 | unsigned int get_random_int(unsigned int max) { 44 | return rand()%max; 45 | } 46 | 47 | // Between 0 (included) and max(excluded) 48 | float get_random_float(unsigned int max) { 49 | return drand48()*max; 50 | } 51 | 52 | ValType get_random_value() { 53 | #ifdef FLOATTYPE 54 | return get_random_float(100) - 50; 55 | #elif defined(INTTYPE) 56 | return get_random_int(200) - 100; // Change to int if this changes 57 | #endif 58 | } 59 | 60 | void create_sparse(const unsigned dim, const float density, ValType* v, const int blocksize) { 61 | // Create indices from 0 to dim 62 | 63 | int block_num = (int)(dim/blocksize); 64 | int count = (int)(density*block_num); 65 | std::vector indices(block_num); 66 | std::iota (indices.begin(), indices.end(), 0); 67 | 68 | // Random suffel indices 69 | std::random_shuffle ( indices.begin(), indices.end() ); 70 | // Sort first count items 71 | 
std::sort( indices.begin(), indices.begin() + count); 72 | 73 | size_t idx = 0; 74 | for(std::vector::const_iterator index = indices.begin(); index != indices.end() && index < indices.begin() + count; ++index) { 75 | for(int i=(*index)*blocksize;i<(*index+1)*blocksize; i++){ 76 | ValType val = get_random_value(); 77 | v[i]= val; 78 | } 79 | } 80 | return; 81 | } 82 | 83 | shared_ptr context; 84 | 85 | void signal_handler(int signum) { 86 | 87 | if (signum == SIGINT || signum == SIGTERM) { 88 | 89 | cerr << " Signal " << signum << " received!"; 90 | 91 | #ifdef DAIET 92 | context->daietContext.StopMaster(); 93 | #endif 94 | exit(1); 95 | } 96 | } 97 | 98 | int main(int argc, char* argv[]) { 99 | 100 | 101 | MPI_Init(&argc, &argv); 102 | if (argc != 9) { 103 | cout << " Usage: " << argv[0] << " INTERFACE REDIS_SERVER_IP PREFIX NUM_WORKERS RANK TENSOR_SIZE NUM_ROUNDS DENSITY" << endl; 104 | return 0; 105 | } 106 | 107 | int myrank, worldsize; 108 | MPI_Comm_size(MPI_COMM_WORLD, &worldsize); 109 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank); 110 | 111 | /* Set signal handler */ 112 | signal(SIGINT, signal_handler); 113 | signal(SIGTERM, signal_handler); 114 | 115 | vector> base_data; 116 | vector> data; 117 | vector> results; 118 | 119 | 120 | int roundnum = 0; 121 | 122 | float elem = 0.01, expected = 0; 123 | 124 | // GLOO transport 125 | gloo::transport::tcp::attr attr; 126 | attr.iface = argv[1]; 127 | auto dev = gloo::transport::tcp::CreateDevice(attr); 128 | 129 | // Rendezvous 130 | auto redisStore = gloo::rendezvous::RedisStore(argv[2]); 131 | time_t t = time(0); 132 | char ch[64]; 133 | strftime(ch, sizeof(ch), "%Y-%m-%d-%H-%M-%S", localtime(&t)); 134 | string prefix = ch; 135 | auto prefixStore = gloo::rendezvous::PrefixStore(prefix, redisStore); 136 | 137 | const int size = worldsize; 138 | const int rank = myrank; 139 | const int tensor_size = atoi(argv[6]); 140 | const int num_rounds = atoi(argv[7]); 141 | const float density = atof(argv[8]); 142 | const int 
blocksize = 256; 143 | int num_last_rounds = 0; 144 | int* timecost = (int*)malloc(sizeof(int)*num_rounds); 145 | 146 | // Init data 147 | set_seed_random(rank); 148 | srand(time(NULL)); 149 | int cnt = (int)(density*tensor_size); 150 | base_data.resize(tensor_size); 151 | data.resize(tensor_size); 152 | results.resize(tensor_size); 153 | cout << "-- Tensor initialization" << endl; 154 | create_sparse(tensor_size, density, &base_data[0], blocksize); 155 | copy(base_data.begin(), base_data.end(), data.begin()); 156 | copy(base_data.begin(), base_data.end(), results.begin()); 157 | cout << "---- Ended" << endl; 158 | 159 | vector ptrs; 160 | ptrs.push_back(&data[0]); 161 | 162 | ValType* result_ptr = &results[0]; 163 | 164 | 165 | int count = data.size(); 166 | 167 | // Context 168 | context = make_shared(rank, size); 169 | context->connectFullMesh(prefixStore, dev); 170 | 171 | auto barrier = make_shared(context); 172 | 173 | barrier->run(); 174 | 175 | //Warm up rounds 176 | for (int i = 0; i < 10; i++) { 177 | auto allreduce = make_shared>(context, ptrs, count); 178 | allreduce->run(); 179 | } 180 | copy(base_data.begin(), base_data.end(), data.begin()); 181 | copy(base_data.begin(), base_data.end(), results.begin()); 182 | 183 | // Start rounds 184 | for (roundnum = 0; roundnum < num_rounds; roundnum++) { 185 | MPI_Barrier(MPI_COMM_WORLD); 186 | double t_mpi, maxT; 187 | if (roundnum % 5 == 0) { 188 | copy(base_data.begin(), base_data.end(), data.begin()); 189 | copy(base_data.begin(), base_data.end(), results.begin()); 190 | num_last_rounds = 0; 191 | } 192 | 193 | MPI_Allreduce(MPI_IN_PLACE, result_ptr, count, MPI_INT, MPI_SUM, MPI_COMM_WORLD); 194 | 195 | // Instantiate the collective algorithm 196 | 197 | auto allreduce = make_shared>(context, ptrs, count); 198 | 199 | //cout << "-- Allreduce Round " << roundnum << endl; 200 | 201 | auto begin = chrono::high_resolution_clock::now(); 202 | // Run the algorithm 203 | t_mpi = -MPI_Wtime(); 204 | 
allreduce->run(); 205 | t_mpi += MPI_Wtime(); 206 | MPI_Reduce(&t_mpi, &maxT, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); 207 | if (myrank==OUTPUT_RANK) { 208 | printf("switchml Dense Allreduce: %f secs\n", maxT); 209 | } 210 | 211 | auto end = chrono::high_resolution_clock::now(); 212 | 213 | //cout << "---- Ended" << endl << "#ms " << chrono::duration_cast(end - begin).count() << endl; 214 | num_last_rounds++; 215 | timecost[roundnum] = (int)(maxT*1000000); 216 | MPI_Barrier(MPI_COMM_WORLD); 217 | } 218 | 219 | cout << "-- Final check" << endl; 220 | for (int i = 0; i < tensor_size; i++) { 221 | //if (i<100) cout< received " << data[i] << " instead of " << expected << endl; 225 | break; 226 | } 227 | } 228 | cout << "---- Ended" << endl; 229 | #ifdef SAVE_RESULT 230 | if(rank==OUTPUT_RANK){ 231 | FILE *fp = NULL; 232 | char* filename = (char*)malloc(200*sizeof(char)); 233 | strcpy(filename, "result/switchML_Dense"); 234 | strcat(filename, "-"); 235 | char temp[20]; 236 | sprintf(temp, "%d", tensor_size); 237 | strcat(filename, temp); 238 | strcat(filename, "-"); 239 | char temp2[20]; 240 | sprintf(temp2, "%f", density); 241 | strcat(filename, temp2); 242 | strcat(filename, ".txt"); 243 | fp = fopen(filename, "w+"); 244 | fprintf(fp, "%s", "switchML_Dense\n"); 245 | for(int j=0; j 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "dpdk.h" 29 | #include "msgs.h" 30 | 31 | namespace daiet { 32 | 33 | extern volatile bool force_quit; 34 | } 35 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/dpdk.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | 
#include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/msgs.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | namespace daiet { 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | /** 15 | * DAIET Header 16 | */ 17 | struct daiet_hdr { 18 | uint32_t tsi; /**< tensor start index */ 19 | uint16_t pool_index; /**< pool index */ 20 | uint32_t next_tsi; /**< next tensor start index */ 21 | #ifdef NOSCALING 22 | uint8_t data_type; 23 | #endif 24 | }__attribute__((__packed__)); 25 | 26 | struct entry_hdr { 27 | int32_t upd; /**< vector entry */ 28 | }__attribute__((__packed__)); 29 | #ifndef NOSCALING 30 | struct exp_hdr { 31 | int16_t exp; /**< exponent */ 32 | }__attribute__((__packed__)); 33 | #endif 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | 38 | } // End namespace -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/params.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "params.hpp" 7 | #include 8 | 9 | namespace po = boost::program_options; 10 | 11 | namespace daiet { 12 | 13 | struct dpdk_data dpdk_data; 14 | struct dpdk_params dpdk_par; 15 | daiet_params daiet_par; 16 | 17 | void parse_parameters() { 18 | 19 | string config_file; 20 | ifstream ifs; 21 | uint16_t ps_port; 22 | uint32_t 
num_updates; 23 | 24 | po::options_description dpdk_options("DPDK options"); 25 | po::options_description daiet_options("DAIET options"); 26 | po::options_description config_file_options; 27 | 28 | dpdk_options.add_options() 29 | ("dpdk.cores", po::value(&dpdk_par.corestr)->default_value("0-2"), "List of cores") 30 | ("dpdk.prefix", po::value(&dpdk_par.prefix)->default_value("daiet"), "Process prefix") 31 | ("dpdk.extra_eal_options", po::value(&dpdk_par.eal_options)->default_value(""), "Extra EAL options") 32 | ("dpdk.port_id", po::value(&dpdk_par.portid)->default_value(0), "Port ID") 33 | ("dpdk.pool_size", po::value(&dpdk_par.pool_size)->default_value(8192 * 32), "Pool size") 34 | ("dpdk.pool_cache_size", po::value(&dpdk_par.pool_cache_size)->default_value(256 * 2), "Pool cache size") 35 | ("dpdk.burst_rx", po::value(&dpdk_par.burst_rx)->default_value(64), "RX burst size") 36 | ("dpdk.burst_tx", po::value(&dpdk_par.burst_tx)->default_value(64), "TX burst size") 37 | ("dpdk.bulk_drain_tx_us", po::value(&dpdk_par.bulk_drain_tx_us)->default_value(100), "TX bulk drain timer (us)"); 38 | 39 | daiet_options.add_options() 40 | ("daiet.ps_port", po::value(&ps_port)->default_value(48879), "PS UDP port") 41 | ("daiet.max_num_pending_messages", po::value(&(daiet_par.getMaxNumPendingMessages()))->default_value(256), "Max number of pending, unaggregated messages") 42 | ("daiet.num_updates", po::value(&num_updates)->default_value(32), "Number of updates per packet") 43 | ("daiet.num_workers", po::value(&(daiet_par.getNumWorkers()))->default_value(0), "Number of workers"); 44 | 45 | config_file_options.add(daiet_options).add(dpdk_options); 46 | 47 | config_file = "/etc/ps.cfg"; 48 | ifs.open(config_file.c_str()); 49 | if(!ifs.good()){ 50 | ifs.close(); 51 | 52 | char hostname[500]; 53 | if (gethostname(hostname,sizeof(hostname))!=0) 54 | LOG_FATAL("gethostname failed: "+ string(strerror(errno))); 55 | 56 | config_file = "ps-"+string(hostname)+".cfg"; 57 | 
ifs.open(config_file.c_str()); 58 | if(!ifs.good()){ 59 | ifs.close(); 60 | 61 | config_file = "ps.cfg"; 62 | ifs.open(config_file.c_str()); 63 | if(!ifs.good()){ 64 | ifs.close(); 65 | LOG_FATAL("No config file found! (/etc/ps.cfg, ps-"+string(hostname)+".cfg, ps.cfg)"); 66 | } 67 | } 68 | } 69 | LOG_INFO("Configuration file "+config_file); 70 | 71 | po::variables_map vm; 72 | po::store(po::parse_config_file(ifs, config_file_options), vm); 73 | po::notify(vm); 74 | 75 | daiet_par.setBasePsPort(ps_port); 76 | daiet_par.setNumUpdates(num_updates); 77 | 78 | if (daiet_par.getNumWorkers()<=0) 79 | LOG_FATAL("Number of workers must be greater than 0."); 80 | } 81 | 82 | void print_dpdk_params() { 83 | 84 | LOG_INFO("** DPDK parameters **"); 85 | LOG_INFO("Cores: " + dpdk_par.corestr); 86 | LOG_INFO("Port ID: " + to_string(dpdk_par.portid)); 87 | LOG_INFO("Port RX ring size: " + to_string(dpdk_par.port_rx_ring_size)); 88 | LOG_INFO("Port TX ring size: " + to_string(dpdk_par.port_tx_ring_size)); 89 | LOG_INFO("Pool size: " + to_string(dpdk_par.pool_size)); 90 | LOG_INFO("Pool cache size: " + to_string(dpdk_par.pool_cache_size)); 91 | LOG_INFO("Burst size RX: " + to_string(dpdk_par.burst_rx)); 92 | LOG_INFO("Burst size TX: " + to_string(dpdk_par.burst_tx)); 93 | LOG_INFO("Burst drain TX us: " + to_string(dpdk_par.bulk_drain_tx_us)); 94 | LOG_INFO("Prefix: " + dpdk_par.prefix); 95 | LOG_INFO("Extra EAL options: " + dpdk_par.eal_options); 96 | } 97 | 98 | daiet_params::daiet_params() { 99 | 100 | // Defaults 101 | num_updates = 32; 102 | 103 | max_num_pending_messages = 256; 104 | 105 | tx_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4 | PKT_TX_UDP_CKSUM; 106 | 107 | ps_port = 5000; 108 | 109 | num_workers = 0; 110 | } 111 | 112 | daiet_params::~daiet_params() { 113 | } 114 | 115 | void daiet_params::print_params() { 116 | 117 | LOG_INFO("** DAIET parameters **"); 118 | LOG_INFO("Num updates: " + to_string(num_updates)); 119 | LOG_INFO("Max num pending messages: " + 
to_string(max_num_pending_messages)); 120 | LOG_INFO("PS port: " + to_string(ps_port)); 121 | LOG_INFO("Num workers: " + to_string(num_workers)); 122 | } 123 | 124 | uint16_t& daiet_params::getNumWorkers() { 125 | return num_workers; 126 | } 127 | 128 | void daiet_params::setNumUpdates(uint32_t numUpdates) { 129 | num_updates = numUpdates; 130 | } 131 | 132 | void daiet_params::setBasePsPort(uint16_t psPort) { 133 | ps_port = psPort; 134 | 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/params.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "common.hpp" 9 | #include "utils.hpp" 10 | 11 | using namespace std; 12 | 13 | namespace daiet { 14 | 15 | void print_dpdk_params(); 16 | void parse_parameters(); 17 | 18 | struct dpdk_data { 19 | 20 | // Buffer pool size 21 | uint32_t pool_buffer_size; 22 | uint16_t core_to_thread_id[RTE_MAX_LCORE]; 23 | 24 | dpdk_data() { 25 | // Defaults 26 | 27 | pool_buffer_size = RTE_MBUF_DEFAULT_BUF_SIZE; 28 | } 29 | }__rte_cache_aligned; 30 | 31 | extern struct dpdk_data dpdk_data; 32 | 33 | struct dpdk_params { 34 | 35 | // Ports 36 | uint16_t portid; 37 | uint16_t port_rx_ring_size; 38 | uint16_t port_tx_ring_size; 39 | 40 | // Buffer pool 41 | uint32_t pool_size; 42 | uint32_t pool_cache_size; 43 | 44 | // Burst sizes 45 | uint32_t burst_rx; 46 | uint32_t burst_tx; 47 | uint32_t bulk_drain_tx_us; 48 | 49 | // Extra EAL options 50 | string eal_options; 51 | 52 | // Process prefix 53 | string prefix; 54 | 55 | // Cores string 56 | string corestr; 57 | 58 | dpdk_params() { 59 | // Defaults 60 | 61 | portid = 0; 62 | port_rx_ring_size = 1024; 63 | port_tx_ring_size = 1024; 64 | 65 | pool_size = 8192 * 32; 66 | pool_cache_size = 256 * 2; 67 | 68 | burst_rx = 64; 69 | burst_tx = 64; 70 | bulk_drain_tx_us = 
10; 71 | 72 | prefix = "daiet"; 73 | eal_options = ""; 74 | 75 | corestr = ""; 76 | } 77 | }__rte_cache_aligned; 78 | 79 | extern struct dpdk_params dpdk_par; 80 | 81 | class daiet_params { 82 | private: 83 | 84 | uint32_t num_updates; 85 | 86 | uint32_t max_num_pending_messages; 87 | 88 | uint64_t tx_flags; 89 | 90 | uint16_t ps_port; 91 | 92 | uint16_t num_workers; 93 | 94 | public: 95 | daiet_params(); 96 | ~daiet_params(); 97 | 98 | void print_params(); 99 | 100 | uint16_t& getNumWorkers(); 101 | 102 | __rte_always_inline uint32_t getNumUpdates() const { 103 | return num_updates; 104 | } 105 | 106 | __rte_always_inline uint32_t& getMaxNumPendingMessages() { 107 | return max_num_pending_messages; 108 | } 109 | 110 | void setNumUpdates(uint32_t); 111 | 112 | __rte_always_inline int64_t getTxFlags() const { 113 | return tx_flags; 114 | } 115 | 116 | __rte_always_inline uint16_t getBasePsPort() const { 117 | return ps_port; 118 | } 119 | 120 | void setBasePsPort(uint16_t); 121 | }; 122 | 123 | extern daiet_params daiet_par; 124 | } 125 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/ps.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | namespace daiet { 9 | 10 | void ps_setup(); 11 | void ps_cleanup(); 12 | int ps(void*); 13 | } 14 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/stats.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "stats.hpp" 7 | #include "utils.hpp" 8 | 9 | namespace daiet { 10 | 11 | pkt_statistics pkt_stats; 12 | 13 | pkt_statistics::pkt_statistics() : total_ps_tx(0), total_ps_rx(0) { 14 | } 15 | 16 | void pkt_statistics::init(uint32_t nb_ps) { 17 | 18 
| total_ps_tx = 0; 19 | total_ps_rx = 0; 20 | 21 | ps_tx.resize(nb_ps); 22 | ps_rx.resize(nb_ps); 23 | } 24 | 25 | void pkt_statistics::set_ps(uint32_t psid, uint64_t tx, uint64_t rx) { 26 | 27 | boost::unique_lock lock(ps_mutex); 28 | 29 | ps_tx[psid] = tx; 30 | ps_rx[psid] = rx; 31 | 32 | total_ps_tx += tx; 33 | total_ps_rx += rx; 34 | } 35 | 36 | void pkt_statistics::dump(){ 37 | 38 | LOG_INFO("PS TX " + to_string(total_ps_tx)); 39 | LOG_INFO("PS RX " + to_string(total_ps_rx)); 40 | 41 | for (uint32_t i = 0; i < ps_tx.size(); i++) { 42 | 43 | LOG_INFO("## PS" + to_string(i)); 44 | LOG_INFO("TX " + to_string(ps_tx[i])); 45 | LOG_INFO("RX " + to_string(ps_rx[i])); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/stats.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | 11 | #include "common.hpp" 12 | 13 | using namespace std; 14 | 15 | namespace daiet { 16 | 17 | class pkt_statistics { 18 | 19 | public: 20 | pkt_statistics(); 21 | void dump(); 22 | 23 | void init(uint32_t); 24 | void set_ps(uint32_t, uint64_t, uint64_t); 25 | 26 | private: 27 | 28 | boost::mutex ps_mutex; 29 | uint64_t total_ps_tx; 30 | uint64_t total_ps_rx; 31 | vector ps_tx; 32 | vector ps_rx; 33 | }; 34 | 35 | extern pkt_statistics pkt_stats; 36 | } 37 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/ps/src/utils.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "common.hpp" 18 | 19 | using namespace std; 20 | 21 | namespace daiet { 22 | 
23 | extern std::ofstream daiet_log; 24 | 25 | template 26 | void LOG_FATAL(T) __attribute__((used)); 27 | template 28 | void LOG_ERROR(T) __attribute__((used)); 29 | template 30 | void LOG_INFO(T) __attribute__((used)); 31 | 32 | #ifdef DEBUG 33 | template 34 | void LOG_DEBUG(T) __attribute__((used)); 35 | #else 36 | #define LOG_DEBUG(T) 37 | #endif 38 | 39 | template 40 | string to_hex(T); 41 | 42 | vector split(const string &); 43 | vector split(const string &, const string &); 44 | 45 | string mac_to_str(const rte_ether_addr); 46 | string mac_to_str(const uint64_t, bool = true); 47 | int64_t str_to_mac(string const&, bool = true); 48 | string ip_to_str(uint32_t); 49 | 50 | void swap_eth_addr(rte_ether_hdr *); 51 | void deep_copy_single_segment_pkt(rte_mbuf*, const rte_mbuf*); 52 | void check_port_link_status(uint16_t); 53 | void print_packet(struct rte_ether_hdr *, uint16_t); 54 | void print_dev_info(struct rte_eth_dev_info&); 55 | void print_dev_stats(uint16_t); 56 | void print_dev_xstats(uint16_t); 57 | 58 | } // End namespace 59 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/scripts/dpdk-config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | IFACE="eno1" 6 | 7 | cwd=$(pwd) 8 | 9 | RTE_SDK=$cwd/../lib/dpdk 10 | RTE_TARGET=build 11 | 12 | cd $RTE_SDK/$RTE_TARGET 13 | 14 | modprobe uio 15 | insmod kmod/igb_uio.ko 16 | 17 | cd ../usertools 18 | 19 | ./dpdk-devbind.py --bind=igb_uio ${IFACE} 20 | 21 | cd $cwd 22 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/DaietContext.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "DaietContext.hpp" 7 | #include "daiet.hpp" 8 | #include "utils.hpp" 9 | #include "gloo/common/error.h" 10 | 11 | namespace 
daiet { 12 | 13 | void* DaietMaster(void *ctx) { 14 | 15 | DaietContext* d_ctx_ptr = (DaietContext *) ctx; 16 | 17 | d_ctx_ptr->ret = master(d_ctx_ptr); 18 | 19 | return NULL; 20 | } 21 | 22 | DaietContext::DaietContext() : 23 | num_worker_threads (1), master_ready(0), data_ready(0), results(0), tensor_update_ptr(NULL), result_id(0), one_msec(1) { 24 | 25 | tid_counter.store(0); 26 | StartMaster(); 27 | } 28 | 29 | DaietContext::~DaietContext() { 30 | 31 | StopMaster(); 32 | } 33 | 34 | void DaietContext::set_num_worker_threads(uint32_t nt){ 35 | num_worker_threads = nt; 36 | } 37 | 38 | void DaietContext::wait_master_ready() { 39 | boost::unique_lock lock(master_ready_mutex); 40 | 41 | while (master_ready!=num_worker_threads) 42 | master_ready_event.wait(lock); 43 | } 44 | 45 | void DaietContext::set_master_ready() { 46 | 47 | boost::unique_lock lock(master_ready_mutex); 48 | 49 | if ((++master_ready) == num_worker_threads) 50 | master_ready_event.notify_one(); 51 | } 52 | 53 | void DaietContext::send_tensor(TensorUpdate* tuptr) { 54 | boost::unique_lock lock(data_ready_mutex); 55 | 56 | while (data_ready!=0) 57 | data_pop_event.wait(lock); 58 | 59 | tensor_update_ptr = tuptr; 60 | data_ready = num_worker_threads; 61 | data_push_event.notify_all(); 62 | } 63 | 64 | bool DaietContext::receive_tensor(TensorUpdate& tu, uint16_t worker_id) { 65 | boost::unique_lock lock(data_ready_mutex); 66 | 67 | while (data_ready!=(uint32_t)(worker_id+1)) { 68 | if (data_push_event.wait_for(lock, one_msec) == boost::cv_status::timeout) 69 | return false; 70 | } 71 | 72 | tu = *tensor_update_ptr; // Copy 73 | 74 | if (data_ready != 1){ 75 | #ifdef OFFLOAD_BITMAP 76 | tu.block_count /= num_worker_threads; 77 | if (tu.block_count%num_worker_threads>worker_id) 78 | tu.block_count += 1; 79 | tu.count = tu.block_count * block_size; 80 | #else 81 | tu.count /= num_worker_threads; 82 | #endif 83 | } else { 84 | tu.count -= tu.start_idx; 85 | } 86 | 87 | tensor_update_ptr->start_idx += 
tu.count; 88 | 89 | if ((--data_ready) == 0) 90 | data_pop_event.notify_one(); 91 | 92 | return true; 93 | } 94 | 95 | bool DaietContext::send_result(const int32_t rid) { 96 | boost::unique_lock lock(result_mutex); 97 | 98 | while (results == num_worker_threads) { 99 | if (result_pop_event.wait_for(lock, one_msec) == boost::cv_status::timeout) 100 | return false; 101 | } 102 | 103 | if ((++results)==num_worker_threads) { 104 | result_id = rid; 105 | result_push_event.notify_all(); 106 | } 107 | 108 | return true; 109 | } 110 | 111 | void DaietContext::receive_result(const int32_t rid) { 112 | boost::unique_lock lock(result_mutex); 113 | 114 | while (results != num_worker_threads && result_id != rid) 115 | result_push_event.wait(lock); 116 | 117 | results = 0; 118 | result_id = 0; 119 | 120 | result_pop_event.notify_all(); 121 | } 122 | 123 | void DaietContext::StartMaster() { 124 | 125 | /* Launch dpdk master thread */ 126 | if (pthread_create(&masterThread, NULL, DaietMaster, this)) 127 | GLOO_THROW("Error starting master dpdk thread"); 128 | 129 | //Wait for EAL setup 130 | wait_master_ready(); 131 | } 132 | 133 | void DaietContext::StopMaster() { 134 | 135 | force_quit = true; 136 | 137 | int join_ret = pthread_join(masterThread, NULL); 138 | if (join_ret) 139 | GLOO_THROW("Error joining master dpdk thread: returned ", join_ret); 140 | 141 | if (this->ret < 0) 142 | GLOO_THROW("Master dpdk thread returned ", this->ret); 143 | 144 | } 145 | 146 | #ifdef OFFLOAD_BITMAP 147 | void DaietContext::AllReduce(gloo::float16* ptr, int count, uint8_t* bitmap_ptr, int block_count) { 148 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 149 | TensorUpdate tu; 150 | tu.ptr = ptr; 151 | tu.count = count; 152 | tu.start_idx = 0; 153 | tu.id = tensor_id; 154 | tu.type = FLOAT16; 155 | tu.bitmap_ptr = bitmap_ptr; 156 | tu.block_count = block_count; 157 | send_tensor(&tu); 158 | receive_result(tensor_id); 159 | } 160 | #endif 161 | 162 | void DaietContext::AllReduce(gloo::float16* 
ptr, int count) { 163 | 164 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 165 | TensorUpdate tu; 166 | tu.ptr = ptr; 167 | tu.count = count; 168 | tu.start_idx = 0; 169 | tu.id = tensor_id; 170 | tu.type = FLOAT16; 171 | 172 | send_tensor(&tu); 173 | receive_result(tensor_id); 174 | } 175 | 176 | #ifdef OFFLOAD_BITMAP 177 | void DaietContext::AllReduce(float* ptr, int count, uint8_t* bitmap_ptr, int block_count) { 178 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 179 | TensorUpdate tu; 180 | tu.ptr = ptr; 181 | tu.count = count; 182 | tu.start_idx = 0; 183 | tu.id = tensor_id; 184 | tu.type = FLOAT32; 185 | tu.bitmap_ptr = bitmap_ptr; 186 | tu.block_count = block_count; 187 | send_tensor(&tu); 188 | receive_result(tensor_id); 189 | } 190 | #endif 191 | 192 | void DaietContext::AllReduce(float* ptr, int count) { 193 | 194 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 195 | TensorUpdate tu; 196 | tu.ptr = ptr; 197 | tu.count = count; 198 | tu.start_idx = 0; 199 | tu.id = tensor_id; 200 | tu.type = FLOAT32; 201 | 202 | send_tensor(&tu); 203 | receive_result(tensor_id); 204 | } 205 | 206 | #ifdef OFFLOAD_BITMAP 207 | void DaietContext::AllReduce(int32_t* ptr, int count, uint8_t* bitmap_ptr, int block_count) { 208 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 209 | TensorUpdate tu; 210 | tu.ptr = ptr; 211 | tu.count = count; 212 | tu.start_idx = 0; 213 | tu.id = tensor_id; 214 | tu.type = INT32; 215 | tu.bitmap_ptr = bitmap_ptr; 216 | tu.block_count = block_count; 217 | send_tensor(&tu); 218 | receive_result(tensor_id); 219 | } 220 | #endif 221 | 222 | void DaietContext::AllReduce(int32_t* ptr, int count) { 223 | 224 | int32_t tensor_id = tid_counter.fetch_add(1)+1; 225 | TensorUpdate tu; 226 | tu.ptr = ptr; 227 | tu.count = count; 228 | tu.start_idx = 0; 229 | tu.id = tensor_id; 230 | tu.type = INT32; 231 | 232 | send_tensor(&tu); 233 | receive_result(tensor_id); 234 | } 235 | 236 | bool DaietContext::try_daiet(gloo::float16* ptr, int count, int fn_) { 
237 | if (fn_ == 1) { //sum 238 | 239 | AllReduce(ptr, count); 240 | 241 | return true; 242 | } 243 | 244 | return false; 245 | } 246 | 247 | bool DaietContext::try_daiet(float* ptr, int count, int fn_) { 248 | if (fn_ == 1) { //sum 249 | 250 | AllReduce(ptr, count); 251 | 252 | return true; 253 | } 254 | 255 | return false; 256 | } 257 | 258 | bool DaietContext::try_daiet(int32_t* ptr, int count, int fn_) { 259 | if (fn_ == 1) { //sum 260 | 261 | AllReduce(ptr, count); 262 | 263 | return true; 264 | } 265 | 266 | return false; 267 | } 268 | 269 | bool DaietContext::try_daiet(__attribute__((unused)) void* ptr, __attribute__((unused)) int count, __attribute__((unused)) int fn_) { 270 | 271 | return false; 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/DaietContext.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #define DAIET 9 | 10 | #include 11 | #include 12 | #include "gloo/types.h" 13 | 14 | namespace daiet { 15 | 16 | void *DaietMaster(void *ctx); 17 | 18 | enum TensorUpdateType { 19 | NONE = 0, INT32 = 1, FLOAT32 = 2, FLOAT16 = 3 20 | }; 21 | 22 | struct TensorUpdate { 23 | void* ptr; 24 | int count; 25 | int start_idx; 26 | int32_t id; 27 | TensorUpdateType type; 28 | #ifdef OFFLOAD_BITMAP 29 | uint8_t* bitmap_ptr; 30 | int block_count; 31 | #endif 32 | }; 33 | 34 | /* Singleton class*/ 35 | class DaietContext { 36 | public: 37 | 38 | static DaietContext& getInstance() { 39 | // Guaranteed to be destroyed and instantiated on first use. 
40 | static DaietContext instance; 41 | return instance; 42 | } 43 | 44 | DaietContext(DaietContext const&) = delete; 45 | void operator=(DaietContext const&) = delete; 46 | 47 | void wait_master_ready(); 48 | void set_master_ready(); 49 | void set_num_worker_threads(uint32_t); 50 | 51 | void receive_result(const int32_t); 52 | bool send_result(const int32_t); 53 | bool receive_tensor(TensorUpdate&, uint16_t); 54 | void send_tensor(TensorUpdate*); 55 | 56 | void StartMaster(); 57 | void StopMaster(); 58 | 59 | #ifdef OFFLOAD_BITMAP 60 | void AllReduce(gloo::float16*, int, uint8_t*, int); 61 | void AllReduce(float*, int, uint8_t*, int); 62 | void AllReduce(int32_t*, int, uint8_t*, int); 63 | static const uint32_t block_size = 256; 64 | #endif 65 | void AllReduce(gloo::float16*, int); 66 | void AllReduce(float*, int); 67 | void AllReduce(int32_t*, int); 68 | 69 | bool try_daiet(gloo::float16*, int, int); 70 | bool try_daiet(float*, int, int); 71 | bool try_daiet(int32_t*, int, int); 72 | bool try_daiet(void*, int, int); 73 | 74 | friend void *DaietMaster(void*); 75 | 76 | private: 77 | 78 | DaietContext(); 79 | virtual ~DaietContext(); 80 | 81 | pthread_t masterThread; 82 | int ret; 83 | 84 | std::atomic_uint_fast32_t tid_counter; 85 | boost::mutex master_ready_mutex, data_ready_mutex, result_mutex; 86 | boost::condition_variable master_ready_event, data_push_event, data_pop_event, result_push_event, result_pop_event; 87 | uint32_t num_worker_threads; 88 | 89 | // Shared 90 | uint32_t master_ready; 91 | uint32_t data_ready; 92 | uint32_t results; 93 | TensorUpdate* tensor_update_ptr; 94 | int32_t result_id; 95 | // *** 96 | 97 | boost::chrono::milliseconds one_msec; 98 | }; 99 | } 100 | 101 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/common.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 
6 | #include "common.hpp" 7 | 8 | namespace daiet { 9 | 10 | volatile bool force_quit; 11 | } 12 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/common.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include "dpdk.h" 29 | #include "msgs.h" 30 | 31 | namespace daiet { 32 | 33 | extern volatile bool force_quit; 34 | } 35 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/daiet.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "DaietContext.hpp" 9 | 10 | namespace daiet { 11 | 12 | int master(DaietContext* dctx); 13 | } 14 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/dpdk.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | -------------------------------------------------------------------------------- 
/omnireduce-DPDK/daiet/src/msgs.h: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | namespace daiet { 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | /** 15 | * DAIET Header 16 | */ 17 | struct daiet_hdr { 18 | uint32_t tsi; /**< tensor start index */ 19 | uint16_t pool_index; /**< pool index */ 20 | uint32_t next_tsi; /**< next tensor start index */ 21 | #ifdef NOSCALING 22 | uint8_t data_type; 23 | #endif 24 | }__attribute__((__packed__)); 25 | 26 | struct entry_hdr { 27 | int32_t upd; /**< vector entry */ 28 | }__attribute__((__packed__)); 29 | #ifndef NOSCALING 30 | struct exp_hdr { 31 | int16_t exp; /**< exponent */ 32 | }__attribute__((__packed__)); 33 | #endif 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | 38 | } // End namespace -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/params.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "params.hpp" 7 | #include 8 | 9 | namespace po = boost::program_options; 10 | 11 | namespace daiet { 12 | 13 | struct dpdk_data dpdk_data; 14 | struct dpdk_params dpdk_par; 15 | daiet_params daiet_par; 16 | 17 | void parse_parameters() { 18 | 19 | string config_file; 20 | ifstream ifs; 21 | uint16_t worker_port, ps_port; 22 | uint32_t num_updates; 23 | string worker_ip_str, ps_ips_str, ps_macs_str; 24 | 25 | po::options_description dpdk_options("DPDK options"); 26 | po::options_description daiet_options("DAIET options"); 27 | po::options_description config_file_options; 28 | 29 | dpdk_options.add_options() 30 | ("dpdk.cores", po::value(&dpdk_par.corestr)->default_value("0-2"), "List of cores") 31 | ("dpdk.prefix", po::value(&dpdk_par.prefix)->default_value("daiet"), "Process prefix") 32 | 
("dpdk.extra_eal_options", po::value(&dpdk_par.eal_options)->default_value(""), "Extra EAL options") 33 | ("dpdk.port_id", po::value(&dpdk_par.portid)->default_value(0), "Port ID") 34 | ("dpdk.pool_size", po::value(&dpdk_par.pool_size)->default_value(8192 * 32), "Pool size") 35 | ("dpdk.pool_cache_size", po::value(&dpdk_par.pool_cache_size)->default_value(256 * 2), "Pool cache size") 36 | ("dpdk.burst_rx", po::value(&dpdk_par.burst_rx)->default_value(64), "RX burst size") 37 | ("dpdk.burst_tx", po::value(&dpdk_par.burst_tx)->default_value(64), "TX burst size") 38 | ("dpdk.bulk_drain_tx_us", po::value(&dpdk_par.bulk_drain_tx_us)->default_value(100), "TX bulk drain timer (us)"); 39 | 40 | daiet_options.add_options() 41 | ("daiet.worker_ip", po::value(&worker_ip_str)->default_value("10.0.0.1"), "IP address of this worker") 42 | ("daiet.worker_port", po::value(&worker_port)->default_value(4000), "Worker UDP port") 43 | ("daiet.ps_port", po::value(&ps_port)->default_value(48879), "PS UDP port") 44 | ("daiet.ps_ips", po::value(&ps_ips_str)->required(), "Comma-separated list of PS IP addresses") 45 | ("daiet.ps_macs", po::value(&ps_macs_str)->required(), "Comma-separated list of PS MAC addresses") 46 | ("daiet.max_num_pending_messages", po::value(&(daiet_par.getMaxNumPendingMessages()))->default_value(256), "Max number of pending, unaggregated messages") 47 | ("daiet.num_updates", po::value(&num_updates)->default_value(32), "Number of updates per packet") 48 | ("daiet.num_workers", po::value(&(daiet_par.getNumWorkers()))->default_value(0), "Number of workers") 49 | ("daiet.sync_blocks", po::value(&(daiet_par.getSyncBlocks()))->default_value(10), "Synchronization Blocks ") 50 | #ifdef TIMERS 51 | ("daiet.timeout", po::value(&(daiet_par.getTimeout()))->default_value(1), "Timeout in millisecond") 52 | #endif 53 | ; 54 | 55 | config_file_options.add(daiet_options).add(dpdk_options); 56 | 57 | config_file = "/etc/daiet.cfg"; 58 | ifs.open(config_file.c_str()); 59 | 
if(!ifs.good()){ 60 | ifs.close(); 61 | 62 | char hostname[500]; 63 | if (gethostname(hostname,sizeof(hostname))!=0) 64 | LOG_FATAL("gethostname failed: "+ string(strerror(errno))); 65 | 66 | config_file = "daiet-"+string(hostname)+".cfg"; 67 | ifs.open(config_file.c_str()); 68 | if(!ifs.good()){ 69 | ifs.close(); 70 | 71 | config_file = "daiet.cfg"; 72 | ifs.open(config_file.c_str()); 73 | if(!ifs.good()){ 74 | ifs.close(); 75 | LOG_FATAL("No config file found! (/etc/daiet.cfg, daiet-"+string(hostname)+".cfg, daiet.cfg)"); 76 | } 77 | } 78 | } 79 | LOG_INFO("Configuration file "+config_file); 80 | 81 | po::variables_map vm; 82 | po::store(po::parse_config_file(ifs, config_file_options), vm); 83 | po::notify(vm); 84 | 85 | if (!daiet_par.setWorkerIp(worker_ip_str)) 86 | LOG_FATAL("Invalid worker IP: " + worker_ip_str); 87 | 88 | daiet_par.setBaseWorkerPort(worker_port); 89 | daiet_par.setBasePsPort(ps_port); 90 | 91 | if (!daiet_par.setPs(ps_ips_str, ps_macs_str)) 92 | LOG_FATAL("Invalid PS address: \n" + ps_ips_str + "\n" + ps_macs_str); 93 | 94 | daiet_par.setNumUpdates(num_updates); 95 | 96 | if (daiet_par.getNumWorkers()<=0) 97 | LOG_FATAL("Number of workers must be positive."); 98 | daiet_par.print_params(); 99 | } 100 | 101 | void print_dpdk_params() { 102 | 103 | LOG_INFO("** DPDK parameters **"); 104 | LOG_INFO("Cores: " + dpdk_par.corestr); 105 | LOG_INFO("Port ID: " + to_string(dpdk_par.portid)); 106 | LOG_INFO("Port RX ring size: " + to_string(dpdk_par.port_rx_ring_size)); 107 | LOG_INFO("Port TX ring size: " + to_string(dpdk_par.port_tx_ring_size)); 108 | LOG_INFO("Pool size: " + to_string(dpdk_par.pool_size)); 109 | LOG_INFO("Pool cache size: " + to_string(dpdk_par.pool_cache_size)); 110 | LOG_INFO("Burst size RX: " + to_string(dpdk_par.burst_rx)); 111 | LOG_INFO("Burst size TX: " + to_string(dpdk_par.burst_tx)); 112 | LOG_INFO("Burst drain TX us: " + to_string(dpdk_par.bulk_drain_tx_us)); 113 | LOG_INFO("Prefix: " + dpdk_par.prefix); 114 | 
LOG_INFO("Extra EAL options: " + dpdk_par.eal_options); 115 | } 116 | 117 | daiet_params::daiet_params() { 118 | 119 | // Defaults 120 | num_updates = 32; 121 | 122 | max_num_pending_messages = 256; 123 | 124 | tx_flags = PKT_TX_IP_CKSUM | PKT_TX_IPV4 | PKT_TX_UDP_CKSUM; 125 | 126 | worker_port = 4000; 127 | ps_port = 48879; 128 | worker_ip_be = rte_cpu_to_be_32(0x0a000001); 129 | 130 | ps_ips_be = NULL; 131 | 132 | ps_macs_be = NULL; 133 | 134 | num_ps = 0; 135 | 136 | num_workers = 0; 137 | } 138 | 139 | daiet_params::~daiet_params() { 140 | if (ps_ips_be != NULL) 141 | delete[] ps_ips_be; 142 | if (ps_macs_be != NULL) 143 | delete[] ps_macs_be; 144 | } 145 | 146 | void daiet_params::print_params() { 147 | 148 | LOG_INFO("** DAIET parameters **"); 149 | LOG_INFO("Num updates: " + to_string(num_updates)); 150 | LOG_INFO("Max num pending messages: " + to_string(max_num_pending_messages)); 151 | LOG_INFO("Worker port: " + to_string(worker_port)); 152 | LOG_INFO("PS port: " + to_string(ps_port)); 153 | 154 | LOG_INFO("Worker IP: " + ip_to_str(worker_ip_be)); 155 | 156 | for (uint32_t i = 0; i < num_ps; i++) { 157 | 158 | LOG_INFO("PS" + to_string(i) + ": " + mac_to_str(ps_macs_be[i]) + " " + ip_to_str(ps_ips_be[i])); 159 | } 160 | 161 | LOG_INFO("Num workers: " + to_string(num_workers)); 162 | } 163 | 164 | uint16_t& daiet_params::getNumWorkers() { 165 | return num_workers; 166 | } 167 | 168 | uint32_t& daiet_params::getSyncBlocks() { 169 | return sync_blocks; 170 | } 171 | 172 | void daiet_params::setNumUpdates(uint32_t numUpdates) { 173 | num_updates = numUpdates; 174 | } 175 | 176 | void daiet_params::setBaseWorkerPort(uint16_t workerPort) { 177 | worker_port = workerPort; 178 | } 179 | 180 | void daiet_params::setBasePsPort(uint16_t psPort) { 181 | ps_port = psPort; 182 | 183 | } 184 | 185 | /* 186 | * Returns false if the IP is invalid 187 | */ 188 | bool daiet_params::setWorkerIp(string workerIp) { 189 | 190 | struct in_addr addr; 191 | 192 | if 
(inet_aton(workerIp.c_str(), &addr) == 0) 193 | return false; 194 | 195 | worker_ip_be = addr.s_addr; 196 | return true; 197 | } 198 | 199 | bool daiet_params::setPs(string psIps, string psMacs) { 200 | 201 | int64_t rc; 202 | 203 | vector ips = split(psIps, ", "); 204 | vector macs = split(psMacs, ", "); 205 | 206 | num_ps = ips.size() < macs.size() ? ips.size() : macs.size(); 207 | 208 | if (ps_ips_be != NULL) 209 | delete[] ps_ips_be; 210 | if (ps_macs_be != NULL) 211 | delete[] ps_macs_be; 212 | 213 | ps_ips_be = new uint32_t[num_ps]; 214 | ps_macs_be = new uint64_t[num_ps]; 215 | 216 | struct in_addr addr; 217 | 218 | for (uint32_t i = 0; i < num_ps; i++) { 219 | 220 | if (inet_aton(ips[i].c_str(), &addr) == 0) 221 | return false; 222 | 223 | ps_ips_be[i] = addr.s_addr; 224 | 225 | rc = str_to_mac(macs[i]); 226 | if (rc < 0) 227 | return false; 228 | 229 | ps_macs_be[i] = rc; 230 | } 231 | 232 | return true; 233 | } 234 | } -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/params.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "common.hpp" 9 | #include "utils.hpp" 10 | 11 | using namespace std; 12 | 13 | namespace daiet { 14 | 15 | void print_dpdk_params(); 16 | void parse_parameters(); 17 | 18 | struct dpdk_data { 19 | 20 | // Buffer pool size 21 | uint32_t pool_buffer_size; 22 | uint16_t core_to_thread_id[RTE_MAX_LCORE]; 23 | 24 | dpdk_data() { 25 | // Defaults 26 | 27 | pool_buffer_size = RTE_MBUF_DEFAULT_BUF_SIZE; 28 | } 29 | }__rte_cache_aligned; 30 | 31 | extern struct dpdk_data dpdk_data; 32 | 33 | struct dpdk_params { 34 | 35 | // Ports 36 | uint16_t portid; 37 | uint16_t port_rx_ring_size; 38 | uint16_t port_tx_ring_size; 39 | 40 | // Buffer pool 41 | uint32_t pool_size; 42 | uint32_t pool_cache_size; 43 | 44 | // Burst sizes 45 | uint32_t 
burst_rx; 46 | uint32_t burst_tx; 47 | uint32_t bulk_drain_tx_us; 48 | 49 | // Extra EAL options 50 | string eal_options; 51 | 52 | // Process prefix 53 | string prefix; 54 | 55 | // Cores string 56 | string corestr; 57 | 58 | dpdk_params() { 59 | // Defaults 60 | 61 | portid = 0; 62 | port_rx_ring_size = 1024; 63 | port_tx_ring_size = 1024; 64 | 65 | pool_size = 8192 * 32; 66 | pool_cache_size = 256 * 2; 67 | 68 | burst_rx = 64; 69 | burst_tx = 64; 70 | bulk_drain_tx_us = 10; 71 | 72 | prefix = "daiet"; 73 | eal_options = ""; 74 | 75 | corestr = ""; 76 | } 77 | }__rte_cache_aligned; 78 | 79 | extern struct dpdk_params dpdk_par; 80 | 81 | class daiet_params { 82 | private: 83 | 84 | uint32_t num_updates; 85 | 86 | uint32_t max_num_pending_messages; 87 | 88 | uint64_t tx_flags; 89 | 90 | uint16_t worker_port; 91 | uint16_t ps_port; 92 | uint32_t worker_ip_be; 93 | 94 | uint32_t* ps_ips_be; 95 | 96 | uint64_t* ps_macs_be; 97 | 98 | uint32_t num_ps; 99 | 100 | #ifdef TIMERS 101 | double timeout; 102 | #endif 103 | 104 | uint16_t num_workers; 105 | 106 | uint32_t sync_blocks; 107 | 108 | public: 109 | daiet_params(); 110 | ~daiet_params(); 111 | 112 | void print_params(); 113 | 114 | uint16_t& getNumWorkers(); 115 | 116 | uint32_t& getSyncBlocks(); 117 | 118 | __rte_always_inline uint32_t getNumUpdates() const { 119 | return num_updates; 120 | } 121 | 122 | __rte_always_inline uint32_t& getMaxNumPendingMessages() { 123 | return max_num_pending_messages; 124 | } 125 | 126 | void setNumUpdates(uint32_t); 127 | 128 | __rte_always_inline int64_t getTxFlags() const { 129 | return tx_flags; 130 | } 131 | 132 | __rte_always_inline uint16_t getBaseWorkerPort() const { 133 | return worker_port; 134 | } 135 | 136 | void setBaseWorkerPort(uint16_t workerPort); 137 | 138 | __rte_always_inline uint16_t getBasePsPort() const { 139 | return ps_port; 140 | } 141 | 142 | void setBasePsPort(uint16_t); 143 | 144 | /* 145 | * Returns false if the IP is invalid 146 | */ 147 | bool 
setWorkerIp(string); 148 | 149 | __rte_always_inline uint32_t getWorkerIpBe() { 150 | return worker_ip_be; 151 | } 152 | 153 | __rte_always_inline const uint32_t* getPsIpsBe() { 154 | return ps_ips_be; 155 | } 156 | 157 | __rte_always_inline const uint64_t* getPsMacsBe() { 158 | return ps_macs_be; 159 | } 160 | 161 | __rte_always_inline uint32_t getPsIpBe(int i) { 162 | return ps_ips_be[i % num_ps]; 163 | } 164 | 165 | __rte_always_inline uint64_t getPsMacBe(int i) { 166 | return ps_macs_be[i % num_ps]; 167 | } 168 | 169 | bool setPs(string, string); 170 | 171 | __rte_always_inline uint32_t getNumPs() const { 172 | return num_ps; 173 | } 174 | 175 | #ifdef TIMERS 176 | __rte_always_inline double& getTimeout() { 177 | return timeout; 178 | } 179 | #endif 180 | }; 181 | 182 | extern daiet_params daiet_par; 183 | } -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/ps.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #ifdef COLOCATED 7 | #include "ps.hpp" 8 | #include "common.hpp" 9 | #include "utils.hpp" 10 | #include "params.hpp" 11 | #include "stats.hpp" 12 | 13 | using namespace std; 14 | 15 | namespace daiet { 16 | 17 | struct mac_ip_pair { 18 | struct rte_ether_addr mac; 19 | uint32_t be_ip; 20 | }; 21 | 22 | thread_local static uint32_t num_updates; 23 | thread_local static mac_ip_pair* ps_workers_ip_to_mac; 24 | thread_local static uint32_t known_workers = 0; 25 | 26 | thread_local static int32_t** ps_aggregated_messages; 27 | thread_local static uint32_t* ps_received_message_counters; 28 | 29 | thread_local static uint16_t ps_port_be; 30 | 31 | #ifdef DEBUG 32 | __rte_always_inline struct daiet_hdr * is_daiet_pkt_to_ps(struct rte_ether_hdr* eth_hdr, uint16_t size) { 33 | 34 | int idx; 35 | uint16_t etherType; 36 | struct rte_ipv4_hdr* ip_hdr; 37 | struct rte_udp_hdr* rte_udp_hdr; 38 | 
39 | idx = sizeof(struct rte_ether_hdr); 40 | etherType = rte_be_to_cpu_16(eth_hdr->ether_type); 41 | 42 | if (etherType == RTE_ETHER_TYPE_IPV4 && size >= idx + sizeof(struct rte_ipv4_hdr)) { 43 | 44 | idx += sizeof(struct rte_ipv4_hdr); 45 | ip_hdr = (struct rte_ipv4_hdr *) (eth_hdr + 1); 46 | 47 | if (ip_hdr->next_proto_id == IPPROTO_UDP && size >= idx + sizeof(struct rte_udp_hdr)) { 48 | idx += sizeof(struct rte_udp_hdr); 49 | rte_udp_hdr = (struct rte_udp_hdr *) (ip_hdr + 1); 50 | 51 | if (rte_udp_hdr->dst_port == ps_port_be && size >= idx + sizeof(struct daiet_hdr)) { 52 | 53 | return (struct daiet_hdr *) (rte_udp_hdr + 1); 54 | } 55 | } 56 | } 57 | return NULL; 58 | } 59 | #endif 60 | 61 | __rte_always_inline void ps_msg_setup(struct daiet_hdr * daiet, uint16_t pool_index) { 62 | 63 | struct entry_hdr *entry; 64 | int32_t* base_ptr = ps_aggregated_messages[pool_index]; 65 | 66 | entry = (struct entry_hdr *) (daiet + 1); 67 | for (uint32_t i = 0; i < num_updates; i++, entry++) { 68 | entry->upd = rte_cpu_to_be_32(base_ptr[i]); 69 | base_ptr[i] = 0; 70 | } 71 | } 72 | 73 | /* Returns true if the aggregation for the offset is complete */ 74 | __rte_always_inline bool ps_aggregate_message(struct daiet_hdr* daiet, uint32_t be_src_ip, struct rte_ether_addr src_mac, uint16_t pool_index, uint16_t num_workers) { 75 | 76 | struct entry_hdr * entry = (struct entry_hdr *) (daiet + 1); 77 | int32_t* base_ptr = ps_aggregated_messages[pool_index]; 78 | 79 | for (uint32_t i = 0; i < num_updates; i++, entry++) { 80 | base_ptr[i] += rte_be_to_cpu_32(entry->upd); 81 | } 82 | 83 | if (unlikely(known_workers < num_workers)) { 84 | 85 | bool found = false; 86 | 87 | for (uint32_t i = 0; i < known_workers && !found; i++) { 88 | 89 | if (ps_workers_ip_to_mac[i].be_ip==be_src_ip) 90 | found = true; 91 | } 92 | 93 | if (!found) { 94 | 95 | // New worker 96 | char ipstring[INET_ADDRSTRLEN]; 97 | 98 | if (unlikely(inet_ntop(AF_INET, &be_src_ip, ipstring, INET_ADDRSTRLEN) == NULL)) { 99 
| LOG_FATAL("Wrong IP: error " + to_string(errno)); 100 | } 101 | 102 | LOG_INFO("Worker: " + string(ipstring) + " " + mac_to_str(src_mac)); 103 | 104 | ps_workers_ip_to_mac[known_workers].mac = src_mac; 105 | ps_workers_ip_to_mac[known_workers].be_ip = be_src_ip; 106 | known_workers++; 107 | } 108 | } 109 | 110 | ps_received_message_counters[pool_index]--; 111 | 112 | if (unlikely(ps_received_message_counters[pool_index]==0)) { 113 | ps_received_message_counters[pool_index] = num_workers; 114 | return true; 115 | } 116 | 117 | return false; 118 | } 119 | 120 | void ps_setup() { 121 | } 122 | 123 | void ps_cleanup() { 124 | } 125 | 126 | int ps(void* num_worker_threads) { 127 | 128 | int ret; 129 | 130 | unsigned lcore_id; 131 | unsigned nb_rx = 0, j = 0, i = 0, nb_tx = 0, sent = 0; 132 | 133 | uint16_t ps_id, id_shift = *((uint16_t*)(num_worker_threads)); 134 | uint16_t num_workers = daiet_par.getNumWorkers(); 135 | const uint32_t max_num_pending_messages = daiet_par.getMaxNumPendingMessages(); 136 | num_updates = daiet_par.getNumUpdates(); 137 | uint64_t ps_tx = 0, ps_rx = 0; 138 | 139 | struct rte_mempool *pool; 140 | string pool_name = "ps_pool"; 141 | struct rte_mbuf** pkts_burst; 142 | struct rte_mbuf* m; 143 | struct rte_mbuf** clone_burst; 144 | 145 | struct rte_ether_hdr* eth; 146 | struct rte_ipv4_hdr * ip; 147 | struct rte_udp_hdr * udp; 148 | struct daiet_hdr* daiet; 149 | uint16_t pool_index = 0, start_pool_index = 0; 150 | 151 | // Get core ID 152 | lcore_id = rte_lcore_id(); 153 | ps_id = dpdk_data.core_to_thread_id[lcore_id]; 154 | LOG_DEBUG("PS core: " + to_string(lcore_id) + " PS id: " + to_string(ps_id)); 155 | 156 | start_pool_index = (ps_id - id_shift) * max_num_pending_messages; 157 | ps_port_be = rte_cpu_to_be_16(daiet_par.getBasePsPort() + ps_id - id_shift); 158 | 159 | ps_aggregated_messages = (int32_t**) rte_malloc_socket(NULL, max_num_pending_messages * sizeof(int32_t*), RTE_CACHE_LINE_SIZE, rte_socket_id()); 160 | if 
(ps_aggregated_messages == NULL) 161 | LOG_FATAL("Failed PS aggregated messages allocation!"); 162 | 163 | for (i = 0; i < max_num_pending_messages; i++) { 164 | ps_aggregated_messages[i] = (int32_t*) rte_zmalloc_socket(NULL, num_updates * sizeof(int32_t), RTE_CACHE_LINE_SIZE, rte_socket_id()); 165 | if (ps_aggregated_messages[i] == NULL) 166 | LOG_FATAL("Failed PS aggregated messages allocation: element " + to_string(i)); 167 | } 168 | 169 | ps_received_message_counters = (uint32_t*) rte_zmalloc_socket(NULL, max_num_pending_messages * sizeof(uint32_t), RTE_CACHE_LINE_SIZE, rte_socket_id()); 170 | if (ps_received_message_counters == NULL) 171 | LOG_FATAL("Failed PS aggregated messages allocation!"); 172 | 173 | for (i = 0; i < max_num_pending_messages; i++) { 174 | ps_received_message_counters[i] = num_workers; 175 | } 176 | 177 | ps_workers_ip_to_mac = (mac_ip_pair*) rte_zmalloc_socket(NULL, num_workers * sizeof(struct mac_ip_pair), RTE_CACHE_LINE_SIZE, rte_socket_id()); 178 | if (ps_workers_ip_to_mac == NULL) 179 | LOG_FATAL("PS thread: cannot allocate ps_workers_ip_to_mac"); 180 | 181 | pkts_burst = (rte_mbuf **) rte_malloc_socket(NULL, dpdk_par.burst_rx * sizeof(struct rte_mbuf*), RTE_CACHE_LINE_SIZE, rte_socket_id()); 182 | if (pkts_burst == NULL) 183 | LOG_FATAL("PS thread: cannot allocate pkts burst"); 184 | 185 | clone_burst = (rte_mbuf **) rte_malloc_socket(NULL, num_workers * sizeof(struct rte_mbuf*), RTE_CACHE_LINE_SIZE, rte_socket_id()); 186 | if (clone_burst == NULL) 187 | LOG_FATAL("PS thread: cannot allocate clone burst"); 188 | 189 | // Init the buffer pool 190 | pool_name = pool_name + to_string(ps_id); 191 | pool = rte_pktmbuf_pool_create(pool_name.c_str(), dpdk_par.pool_size, dpdk_par.pool_cache_size, 0, dpdk_data.pool_buffer_size, rte_socket_id()); 192 | if (pool == NULL) 193 | LOG_FATAL("Cannot init mbuf pool: " + string(rte_strerror(rte_errno))); 194 | 195 | while (!force_quit) { 196 | 197 | nb_rx = rte_eth_rx_burst(dpdk_par.portid, ps_id, 
pkts_burst, dpdk_par.burst_rx); 198 | 199 | for (j = 0; j < nb_rx; j++) { 200 | 201 | m = pkts_burst[j]; 202 | 203 | rte_prefetch0 (rte_pktmbuf_mtod(m, void *)); 204 | eth = rte_pktmbuf_mtod(m, struct rte_ether_hdr *); 205 | 206 | #ifdef DEBUG 207 | daiet = is_daiet_pkt_to_ps(eth, m->data_len); 208 | if (likely(daiet != NULL)) { 209 | #else 210 | daiet = (struct daiet_hdr *) ((uint8_t *) (eth+1) + sizeof(struct rte_ipv4_hdr) + sizeof(struct rte_udp_hdr)); 211 | #endif 212 | 213 | ps_rx++; 214 | ip = (struct rte_ipv4_hdr *) (eth + 1); 215 | udp = (struct rte_udp_hdr *) (ip + 1); 216 | 217 | pool_index = (rte_be_to_cpu_16(daiet->pool_index) & 0x7FFF) - start_pool_index; 218 | 219 | if (ps_aggregate_message(daiet, ip->src_addr, eth->s_addr, pool_index, num_workers)) { 220 | 221 | // Checksum offload 222 | m->l2_len = sizeof(struct rte_ether_hdr); 223 | m->l3_len = sizeof(struct rte_ipv4_hdr); 224 | m->ol_flags |= daiet_par.getTxFlags(); 225 | 226 | // Set src MAC 227 | rte_ether_addr_copy(&(eth->d_addr), &(eth->s_addr)); 228 | 229 | // Set src IP 230 | ip->hdr_checksum = 0; 231 | ip->src_addr = ip->dst_addr; 232 | 233 | // Swap ports 234 | swap((uint16_t&) (udp->dst_port), (uint16_t&) (udp->src_port)); 235 | udp->dgram_cksum = rte_ipv4_phdr_cksum(ip, m->ol_flags); 236 | 237 | ps_msg_setup(daiet, pool_index); 238 | 239 | // Allocate pkt burst 240 | ret = rte_pktmbuf_alloc_bulk(pool, clone_burst, num_workers); 241 | if (unlikely(ret < 0)) 242 | LOG_FATAL("Cannot allocate clone burst"); 243 | 244 | for (i = 0; i < num_workers; i++) { 245 | 246 | // Clone packet 247 | deep_copy_single_segment_pkt(clone_burst[i], m); 248 | 249 | eth = rte_pktmbuf_mtod(clone_burst[i], struct rte_ether_hdr *); 250 | 251 | // Set dst MAC 252 | rte_ether_addr_copy(&(ps_workers_ip_to_mac[i].mac), &(eth->d_addr)); 253 | 254 | // Set dst IP 255 | ip = (struct rte_ipv4_hdr *) (eth + 1); 256 | ip->dst_addr = ps_workers_ip_to_mac[i].be_ip; 257 | } 258 | 259 | // Send packet burst 260 | sent = 0; 261 
| do { 262 | nb_tx = rte_eth_tx_burst(dpdk_par.portid, ps_id,clone_burst, num_workers); 263 | 264 | sent += nb_tx; 265 | } while (sent < num_workers); 266 | 267 | ps_tx += num_workers; 268 | 269 | // Free original packet 270 | rte_pktmbuf_free(m); 271 | 272 | } else { 273 | // Free original packet 274 | rte_pktmbuf_free(m); 275 | } 276 | #ifdef DEBUG 277 | } else { 278 | // Free original packet 279 | rte_pktmbuf_free(m); 280 | } 281 | #endif 282 | } 283 | } 284 | 285 | // Set stats 286 | pkt_stats.set_ps(ps_id - id_shift, ps_tx, ps_rx); 287 | 288 | // Cleanup 289 | rte_free(clone_burst); 290 | rte_free(pkts_burst); 291 | 292 | rte_free(ps_workers_ip_to_mac); 293 | 294 | rte_free(ps_received_message_counters); 295 | 296 | for (uint32_t i = 0; i < max_num_pending_messages; i++) { 297 | rte_free(ps_aggregated_messages[i]); 298 | } 299 | 300 | rte_free(ps_aggregated_messages); 301 | 302 | return 0; 303 | } 304 | } 305 | #endif 306 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/ps.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #ifdef COLOCATED 7 | #pragma once 8 | 9 | namespace daiet { 10 | 11 | void ps_setup(); 12 | void ps_cleanup(); 13 | int ps(void*); 14 | } 15 | #endif 16 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/stats.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #include "stats.hpp" 7 | #include "utils.hpp" 8 | 9 | namespace daiet { 10 | 11 | pkt_statistics pkt_stats; 12 | 13 | pkt_statistics::pkt_statistics() : total_w_tx(0), total_w_rx(0), total_w_unsent(0) { 14 | #ifdef COLOCATED 15 | total_ps_tx = 0; 16 | total_ps_rx = 0; 17 | #endif 18 | } 19 | 20 | #ifndef COLOCATED 21 | void 
pkt_statistics::init(uint32_t nb_w) { 22 | #else 23 | void pkt_statistics::init(uint32_t nb_w, uint32_t nb_ps) { 24 | #endif 25 | 26 | total_w_tx = 0; 27 | total_w_rx = 0; 28 | total_w_unsent = 0; 29 | 30 | w_tx.resize(nb_w); 31 | w_rx.resize(nb_w); 32 | w_unsent.resize(nb_w); 33 | 34 | #ifdef COLOCATED 35 | total_ps_tx = 0; 36 | total_ps_rx = 0; 37 | 38 | ps_tx.resize(nb_ps); 39 | ps_rx.resize(nb_ps); 40 | #endif 41 | 42 | #ifdef TIMERS 43 | w_timeouts.resize(nb_w); 44 | #endif 45 | } 46 | 47 | void pkt_statistics::set_workers(uint16_t wid, uint64_t tx, uint64_t rx, uint64_t unsent) { 48 | 49 | boost::unique_lock lock(w_mutex); 50 | 51 | w_tx[wid] = tx; 52 | w_rx[wid] = rx; 53 | w_unsent[wid] = unsent; 54 | 55 | total_w_tx += tx; 56 | total_w_rx += rx; 57 | total_w_unsent += unsent; 58 | } 59 | 60 | #ifdef COLOCATED 61 | void pkt_statistics::set_ps(uint32_t psid, uint64_t tx, uint64_t rx) { 62 | 63 | boost::unique_lock lock(ps_mutex); 64 | 65 | ps_tx[psid] = tx; 66 | ps_rx[psid] = rx; 67 | 68 | total_ps_tx += tx; 69 | total_ps_rx += rx; 70 | } 71 | #endif 72 | 73 | #ifdef TIMERS 74 | void pkt_statistics::set_timeouts(uint32_t wid, uint64_t timeouts) { 75 | 76 | boost::unique_lock lock(timeouts_mutex); 77 | 78 | w_timeouts[wid] = timeouts; 79 | 80 | total_timeouts += timeouts; 81 | } 82 | #endif 83 | 84 | void pkt_statistics::dump(){ 85 | 86 | #ifndef COLOCATED 87 | LOG_INFO("TX " + to_string(total_w_tx)); 88 | LOG_INFO("RX " + to_string(total_w_rx)); 89 | LOG_INFO("UNSENT " + to_string(total_w_unsent)); 90 | #else 91 | LOG_INFO("Worker TX " + to_string(total_w_tx)); 92 | LOG_INFO("Worker RX " + to_string(total_w_rx)); 93 | LOG_INFO("Worker UNSENT " + to_string(total_w_unsent)); 94 | LOG_INFO("PS TX " + to_string(total_ps_tx)); 95 | LOG_INFO("PS RX " + to_string(total_ps_rx)); 96 | #endif 97 | 98 | #ifdef TIMERS 99 | LOG_INFO("Timeouts " + to_string(total_timeouts)); 100 | #endif 101 | 102 | for (uint32_t i = 0; i < w_tx.size(); i++) { 103 | 104 | LOG_INFO("## 
Worker " + to_string(i)); 105 | LOG_INFO("TX " + to_string(w_tx[i])); 106 | LOG_INFO("RX " + to_string(w_rx[i])); 107 | LOG_INFO("UNSENT " + to_string(w_unsent[i])); 108 | #ifdef TIMERS 109 | LOG_INFO("Timeouts " + to_string(w_timeouts[i])); 110 | #endif 111 | } 112 | 113 | #ifdef COLOCATED 114 | for (uint32_t i = 0; i < ps_tx.size(); i++) { 115 | 116 | LOG_INFO("## PS" + to_string(i)); 117 | LOG_INFO("TX " + to_string(ps_tx[i])); 118 | LOG_INFO("RX " + to_string(ps_rx[i])); 119 | } 120 | #endif 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/stats.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | 11 | #include "common.hpp" 12 | 13 | using namespace std; 14 | 15 | namespace daiet { 16 | 17 | class pkt_statistics { 18 | 19 | public: 20 | pkt_statistics(); 21 | void set_workers(uint16_t, uint64_t, uint64_t, uint64_t); 22 | void dump(); 23 | 24 | #ifndef COLOCATED 25 | void init(uint32_t); 26 | #else 27 | void init(uint32_t, uint32_t); 28 | void set_ps(uint32_t, uint64_t, uint64_t); 29 | #endif 30 | 31 | #ifdef TIMERS 32 | void set_timeouts(uint32_t, uint64_t); 33 | #endif 34 | 35 | private: 36 | 37 | boost::mutex w_mutex; 38 | uint64_t total_w_tx; 39 | uint64_t total_w_rx; 40 | uint64_t total_w_unsent; 41 | vector w_tx; 42 | vector w_rx; 43 | vector w_unsent; 44 | 45 | #ifdef COLOCATED 46 | boost::mutex ps_mutex; 47 | uint64_t total_ps_tx; 48 | uint64_t total_ps_rx; 49 | vector ps_tx; 50 | vector ps_rx; 51 | #endif 52 | 53 | #ifdef TIMERS 54 | boost::mutex timeouts_mutex; 55 | vector w_timeouts; 56 | uint64_t total_timeouts; 57 | #endif 58 | }; 59 | 60 | extern pkt_statistics pkt_stats; 61 | } 62 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/utils.hpp: 
-------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "common.hpp" 18 | 19 | using namespace std; 20 | 21 | namespace daiet { 22 | 23 | extern std::ofstream daiet_log; 24 | 25 | template 26 | void LOG_FATAL(T) __attribute__((used)); 27 | template 28 | void LOG_ERROR(T) __attribute__((used)); 29 | template 30 | void LOG_INFO(T) __attribute__((used)); 31 | 32 | #ifdef DEBUG 33 | template 34 | void LOG_DEBUG(T) __attribute__((used)); 35 | #else 36 | #define LOG_DEBUG(T) 37 | #endif 38 | 39 | template 40 | string to_hex(T); 41 | 42 | vector split(const string &); 43 | vector split(const string &, const string &); 44 | 45 | string mac_to_str(const rte_ether_addr); 46 | string mac_to_str(const uint64_t, bool = true); 47 | int64_t str_to_mac(string const&, bool = true); 48 | string ip_to_str(uint32_t); 49 | 50 | void swap_eth_addr(rte_ether_hdr *); 51 | void deep_copy_single_segment_pkt(rte_mbuf*, const rte_mbuf*); 52 | void check_port_link_status(uint16_t); 53 | void print_packet(struct rte_ether_hdr *, uint16_t); 54 | void print_dev_info(struct rte_eth_dev_info&); 55 | void print_dev_stats(uint16_t); 56 | void print_dev_xstats(uint16_t); 57 | 58 | } // End namespace 59 | -------------------------------------------------------------------------------- /omnireduce-DPDK/daiet/src/worker.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * DAIET project 3 | * author: amedeo.sapio@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "common.hpp" 9 | 10 | namespace daiet { 11 | 12 | void worker_setup(); 13 | void worker_cleanup(); 14 | int worker(void*); 15 | } 16 | -------------------------------------------------------------------------------- /omnireduce-DPDK/docker/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-devel-ubuntu18.04 2 | RUN apt-get update && \ 3 | DEBIAN_FRONTEND="noninteractive" apt-get install -qy \ 4 | autotools-dev \ 5 | bison \ 6 | build-essential \ 7 | ca-certificates \ 8 | chrpath \ 9 | coreutils \ 10 | debhelper \ 11 | dh-python \ 12 | dpatch \ 13 | ethtool \ 14 | flex \ 15 | gcc \ 16 | gfortran \ 17 | git \ 18 | graphviz \ 19 | iproute2 \ 20 | kmod \ 21 | libboost-program-options-dev \ 22 | libboost-chrono-dev \ 23 | libboost-system-dev \ 24 | libboost-thread-dev \ 25 | libc6-dev \ 26 | libelf1 \ 27 | libgfortran3 \ 28 | libglib2.0-0 \ 29 | libhiredis-dev \ 30 | libjpeg-dev \ 31 | libltdl-dev \ 32 | libmnl-dev \ 33 | libnl-3-200 \ 34 | libnl-3-dev \ 35 | libnl-route-3-200 \ 36 | libnl-route-3-dev \ 37 | libnuma-dev \ 38 | libnuma1 \ 39 | libpng-dev \ 40 | libpython3-dev \ 41 | libssl1.0.0 \ 42 | linux-headers-$(uname -r) \ 43 | linux-modules-$(uname -r) \ 44 | lsb-release \ 45 | lsof \ 46 | m4 \ 47 | net-tools \ 48 | openssh-client \ 49 | openssh-server \ 50 | pciutils \ 51 | perl \ 52 | pkg-config \ 53 | python3 \ 54 | python3-dev \ 55 | python3-distutils \ 56 | swig \ 57 | tk \ 58 | udev \ 59 | vim \ 60 | wget && rm -rf /var/lib/apt/lists/* 61 | 62 | 63 | # Allow OpenSSH to talk to containers without asking for confirmation 64 | RUN mkdir -p /var/run/sshd && cat /etc/ssh/ssh_config | grep -v 'StrictHostKeyChecking' > /etc/ssh/ssh_config.new && \ 65 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 66 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 67 | 68 | # MLNX driver 69 | ARG MOFED_VER=5.3-1.0.0.1 70 | RUN mkdir -p /tmp/mofed && cd /tmp/mofed && \ 71 | wget http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-ubuntu18.04-$(uname -m).tgz && \ 72 | tar -xzvf *.tgz && \ 73 | */mlnxofedinstall --user-space-only --without-fw-update --upstream-libs --dpdk --force && \ 74 | cd /tmp && \ 75 | rm 
-rf mofed 76 | 77 | # mamba 78 | RUN cd ~ && \ 79 | wget -O Mambaforge.sh https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh && \ 80 | bash Mambaforge.sh -b && \ 81 | /root/mambaforge/bin/mamba install \ 82 | pip \ 83 | python=3.7.*=*_cpython \ 84 | cudnn=7.6 \ 85 | nccl=2.4 \ 86 | cudatoolkit \ 87 | jupyter \ 88 | matplotlib \ 89 | astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses \ 90 | magma-cuda101 -y -c pytorch && \ 91 | rm Mambaforge.sh 92 | 93 | # Install Open MPI 94 | RUN mkdir /tmp/openmpi && \ 95 | cd /tmp/openmpi && \ 96 | wget -q https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-4.1.1.tar.gz && \ 97 | tar zxf openmpi-4.1.1.tar.gz && \ 98 | cd openmpi-4.1.1 && \ 99 | ./configure --enable-orterun-prefix-by-default && \ 100 | make -j $(nproc) all && \ 101 | make install && \ 102 | ldconfig && \ 103 | rm -rf /tmp/openmpi 104 | 105 | # Create a wrapper for OpenMPI to allow running as root by default 106 | # Configure OpenMPI to run good defaults: 107 | # --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 108 | RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 109 | echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 110 | echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 111 | chmod a+x /usr/local/bin/mpirun && \ 112 | echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 113 | echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \ 114 | echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf 115 | 116 | SHELL ["/root/mambaforge/bin/conda", "run", "--no-capture-output", "-n", "base", "/bin/bash", "-c"] 117 | ENV PATH="/root/mambaforge/bin:/root/mambaforge/condabin:${PATH}" 118 | ENV CPLUS_INCLUDE_PATH=/root/mambaforge/include LIBRARY_PATH=/root/mambaforge/lib LD_LIBRARY_PATH=/root/mambaforge/lib 
119 | ARG TORCH_CUDA_ARCH_LIST 120 | RUN cd ~ && git clone --branch docker --depth 1 https://github.com/ChenYuHo/omnireduce.git && cd omnireduce && ./prepare.sh --depth 1 && \ 121 | ./build_all.sh INSTALL MLX5 TIMERS CONDA OFFLOAD_BITMAP NOSCALING PYTORCH ALGO2 122 | 123 | ARG EXPS_BASE_PATH=/root 124 | ARG EXPS_PATH=$EXPS_BASE_PATH/exps 125 | ARG EXPS_GIT_LINK=https://github.com/Phlix1/exps.git 126 | 127 | RUN cd $EXPS_BASE_PATH && git clone $EXPS_GIT_LINK 128 | 129 | #For benchmark 130 | RUN cd $EXPS_PATH/benchmark && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 131 | 132 | #For DeepLight 133 | RUN mamba install scikit-learn python=3.7.*=*_cpython -y 134 | RUN cd $EXPS_PATH/models/DeepLight && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 135 | 136 | #For LSTM 137 | RUN mamba install cython python=3.7.*=*_cpython -y 138 | RUN cd $EXPS_PATH/models/LSTM/lm/log_uniform && make && python setup.py install 139 | RUN cd $EXPS_PATH/models/LSTM && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 140 | 141 | #For NCF 142 | RUN mamba install numpy-indexed python=3.7.*=*_cpython -y 143 | RUN pip install mlperf_compliance 144 | RUN cd $EXPS_PATH/models/NCF && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 145 | 146 | #For CNN 147 | RUN mamba install pillow python=3.7.*=*_cpython -y 148 | RUN mamba install torchvision=0.8.0 python=3.7.*=*_cpython -c pytorch --no-deps -y 149 | RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda100 150 | RUN cd /usr/local && git clone https://github.com/NVIDIA/apex && cd apex && git reset --hard a651e2c24ecf97cbf367fd3f330df36760e1c597 && \ 151 | pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 152 | RUN cd $EXPS_PATH/models/CNN && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg 153 | 154 | 155 | #For BERT 156 | RUN pip install nvidia-pyindex 157 | RUN pip install nvidia-dllogger 158 | 
RUN mamba install unzip -y 159 | RUN cd $EXPS_PATH/models/BERT/dataset/checkpoint && \ 160 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_large_qa_squad11_amp/versions/19.09.0/zip -O bert_pyt_ckpt_large_qa_squad11_amp_19.09.0.zip && \ 161 | unzip bert_pyt_ckpt_large_qa_squad11_amp_19.09.0.zip 162 | RUN cd $EXPS_PATH/models/BERT && ln -s ~/omnireduce/daiet/example/daiet.cfg daiet.cfg && mkdir results 163 | 164 | ARG OMNIREDUCE_CONTAINER_PORT=2222 165 | ENV OMNIREDUCE_CONTAINER_PORT ${OMNIREDUCE_CONTAINER_PORT} 166 | -------------------------------------------------------------------------------- /omnireduce-DPDK/docker/aggregator_Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | RUN apt-get update && \ 3 | DEBIAN_FRONTEND=noninteractive apt-get install -qy \ 4 | autotools-dev \ 5 | bison \ 6 | build-essential \ 7 | ca-certificates \ 8 | chrpath \ 9 | coreutils \ 10 | debhelper \ 11 | dh-python \ 12 | dpatch \ 13 | ethtool \ 14 | flex \ 15 | gcc \ 16 | gfortran \ 17 | git \ 18 | graphviz \ 19 | iproute2 \ 20 | kmod \ 21 | libboost-program-options-dev \ 22 | libboost-chrono-dev \ 23 | libboost-system-dev \ 24 | libboost-thread-dev \ 25 | libc6-dev \ 26 | libelf1 \ 27 | libgfortran3 \ 28 | libglib2.0-0 \ 29 | libhiredis-dev \ 30 | libjpeg-dev \ 31 | libltdl-dev \ 32 | libmnl-dev \ 33 | libnl-3-200 \ 34 | libnl-3-dev \ 35 | libnl-route-3-200 \ 36 | libnl-route-3-dev \ 37 | libnuma-dev \ 38 | libnuma1 \ 39 | libpng-dev \ 40 | libpython3-dev \ 41 | libssl1.0.0 \ 42 | linux-headers-$(uname -r) \ 43 | linux-modules-$(uname -r) \ 44 | lsb-release \ 45 | lsof \ 46 | m4 \ 47 | net-tools \ 48 | openssh-client \ 49 | openssh-server \ 50 | pciutils \ 51 | perl \ 52 | pkg-config \ 53 | python3 \ 54 | python3-dev \ 55 | python3-distutils \ 56 | swig \ 57 | tk \ 58 | udev \ 59 | vim \ 60 | wget && rm -rf /var/lib/apt/lists/* 61 | 62 | 63 | # Allow OpenSSH to talk to 
containers without asking for confirmation 64 | RUN mkdir -p /var/run/sshd && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ 65 | echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ 66 | mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config 67 | 68 | # MLNX driver 69 | ARG MOFED_VER=5.3-1.0.0.1 70 | RUN mkdir -p /tmp/mofed && cd /tmp/mofed && \ 71 | wget http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-ubuntu18.04-$(uname -m).tgz && \ 72 | tar -xzvf *.tgz && \ 73 | */mlnxofedinstall --user-space-only --without-fw-update --upstream-libs --dpdk --force && \ 74 | cd /tmp && \ 75 | rm -rf mofed 76 | 77 | ## Install Open MPI 78 | #RUN mkdir /tmp/openmpi && \ 79 | # cd /tmp/openmpi && \ 80 | # wget -q https://www.open-mpi.org/software/ompi/v4.1/downloads/openmpi-4.1.1.tar.gz && \ 81 | # tar zxf openmpi-4.1.1.tar.gz && \ 82 | # cd openmpi-4.1.1 && \ 83 | # ./configure --enable-orterun-prefix-by-default && \ 84 | # make -j $(nproc) all && \ 85 | # make install && \ 86 | # ldconfig && \ 87 | # rm -rf /tmp/openmpi 88 | # 89 | ## Create a wrapper for OpenMPI to allow running as root by default 90 | ## Configure OpenMPI to run good defaults: 91 | ## --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0 92 | #RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \ 93 | # echo '#!/bin/bash' > /usr/local/bin/mpirun && \ 94 | # echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \ 95 | # chmod a+x /usr/local/bin/mpirun && \ 96 | # echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ 97 | # echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \ 98 | # echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf 99 | 100 | RUN cd ~ && git clone --branch docker --depth 1 https://github.com/ChenYuHo/omnireduce.git && cd omnireduce && git submodule update --init --depth 1 --recursive 
daiet && \ 101 | ./build_all.sh INSTALL MLX5 TIMERS NOSCALING ALGO2 SKIP_DAIET SKIP_GLOO SKIP_EXPS SKIP_EXAMPLE 102 | 103 | ARG OMNIREDUCE_CONTAINER_PORT=2222 104 | ENV OMNIREDUCE_CONTAINER_PORT ${OMNIREDUCE_CONTAINER_PORT} 105 | -------------------------------------------------------------------------------- /omnireduce-DPDK/environment.yml: -------------------------------------------------------------------------------- 1 | name: omnireduce 2 | 3 | channels: 4 | - conda-forge 5 | - defaults 6 | - pytorch 7 | 8 | dependencies: 9 | - absl-py<0.9 # WideAndDeep 10 | # - apache-beam # WideAndDeep 11 | - defaults::boost-cpp=1.65 # compatible with daiet 12 | - cffi # horovod, pytorch 13 | - cloudpickle # horovod 14 | - cmake # pytorch 15 | - cudatoolkit-dev=10.1 16 | - cudnn=7.6 17 | - cupy # ncf 18 | - cython=0.28 # pytorch ssd detection 19 | - h5py 20 | - html2text # bert 21 | - libprotobuf=3.8 # tensorflow build 22 | - magma-cuda101 23 | - mkl # pytorch 24 | - mkl-include # pytorch 25 | - mpi4py 26 | - nccl=2.4 27 | - networkx # bert 28 | - ninja # pytorch 29 | - nltk # bert 30 | - numpy<2 # pytorch 31 | - openmpi=4.0 32 | - pandas # ncf 33 | - pip=20.0 34 | - pip: 35 | - git+git://github.com/NVIDIA/dllogger#egg=dllogger # ncf 36 | #- gluoncv 37 | #- mxnet-cu101mkl==1.6.0 38 | - opt_einsum # WideAndDeep 39 | - ray[rllib] 40 | - sacremoses==0.0.35 # Transformer-XL 41 | - scikit-learn # DLRM 42 | - tensorflow-transform==0.21.* # WideAndDeep 43 | - progressbar # bert 44 | - protobuf # WideAndDeep 45 | - psutil # horovod 46 | - pycocotools=2.0 # pytorch ssd detection 47 | - pycparser # horovod 48 | - pydot<2 # WideAndDeep 49 | - py-opencv # GAN 50 | - pyspark 51 | - pytest # bert 52 | - python=3.7 53 | - pyyaml # horovod, pytorch 54 | - scikit-image=0.15 # pytorch ssd detection 55 | - scipy 56 | - setuptools # pytorch 57 | - six<2 # WideAndDeep 58 | - tensorflow-gpu=1.15 59 | - toposort # bert 60 | - tqdm 61 | 
-------------------------------------------------------------------------------- /omnireduce-DPDK/get_cuda_arch_code.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create a 'here document' that is code we compile and use to probe the card 4 | cat << EOF > /tmp/cudaComputeVersion.cu 5 | #include <stdio.h> 6 | int main() 7 | { 8 | cudaDeviceProp prop; 9 | cudaGetDeviceProperties(&prop,0); 10 | printf("%d.%d\n", prop.major,prop.minor); 11 | } 12 | EOF 13 | 14 | # probe the card and cleanup 15 | /usr/local/cuda/bin/nvcc /tmp/cudaComputeVersion.cu -o /tmp/cudaComputeVersion 16 | /tmp/cudaComputeVersion 17 | rm /tmp/cudaComputeVersion.cu 18 | rm /tmp/cudaComputeVersion 19 | -------------------------------------------------------------------------------- /omnireduce-DPDK/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" 4 | git submodule update --init "$@" --recursive 5 | cd $SCRIPTPATH/pytorch 6 | git apply $SCRIPTPATH/pytorch.patch 7 | git rm third_party/gloo 8 | cd third_party/protobuf 9 | git fetch --unshallow 10 | git checkout 09745575a923640154bcf307fba8aedff47f240a 11 | cd $SCRIPTPATH/gloo 12 | git apply $SCRIPTPATH/gloo.patch 13 | -------------------------------------------------------------------------------- /omnireduce-RDMA/Makefile: -------------------------------------------------------------------------------- 1 | ifeq ($(OMNIREDUCE_PATH),) 2 | OMNIREDUCE_PATH = $(shell pwd) 3 | export OMNIREDUCE_PATH 4 | endif 5 | 6 | SOURCEDIR := ${OMNIREDUCE_PATH}/omnireduce 7 | DESTDIR := ${OMNIREDUCE_PATH}/build 8 | 9 | INCLUDE :=-I ${OMNIREDUCE_PATH} 10 | LDFLAGS := -shared -lstdc++ 11 | LDLIBS := -libverbs -lboost_system -lboost_thread -lboost_chrono -lboost_program_options 12 | CXXFLAGS := -O3 -std=c++11 13 | ifeq ($(USE_CUDA),ON) 14 | $(info "USE_CUDA ON") 15 | CXXFLAGS += 
-DUSE_CUDA --compiler-options -fPIC 16 | CC := nvcc 17 | LD := nvcc 18 | else 19 | CXXFLAGS += -fPIC 20 | CC := g++ 21 | LD := g++ 22 | endif 23 | 24 | SOURCE:=${wildcard ${SOURCEDIR}/*.cpp} 25 | OBJS:=${patsubst ${SOURCEDIR}/%.cpp,${SOURCEDIR}/%.o,${SOURCE}} 26 | 27 | ifeq ($(USE_CUDA),ON) 28 | SOURCE:=${wildcard ${SOURCEDIR}/*.cu} 29 | OBJS+=${patsubst ${SOURCEDIR}/%.cu,${SOURCEDIR}/%.o,${SOURCE}} 30 | endif 31 | 32 | TARGET_LIB := libomnireduce.so 33 | 34 | all:${OBJS} 35 | ${LD} ${LDFLAGS} -o ${SOURCEDIR}/${TARGET_LIB} ${OBJS} ${LDLIBS} 36 | mkdir -p ${DESTDIR}/include/omnireduce 37 | cp ${SOURCEDIR}/${TARGET_LIB} ${DESTDIR} 38 | cp ${SOURCEDIR}/*.hpp ${DESTDIR}/include/omnireduce 39 | 40 | ${SOURCEDIR}/%.o:${SOURCEDIR}/%.cpp 41 | ${CC} -c ${CXXFLAGS} $< -o ${SOURCEDIR}/$*.o ${INCLUDE} 42 | 43 | ${SOURCEDIR}/%.o:${SOURCEDIR}/%.cu 44 | ${CC} -c ${CXXFLAGS} $< -o ${SOURCEDIR}/$*.o ${INCLUDE} 45 | 46 | .PHONY: clean 47 | 48 | clean: 49 | rm ${SOURCEDIR}/*.so ${SOURCEDIR}/*.o -rf 50 | rm -rf ${DESTDIR} 51 | -------------------------------------------------------------------------------- /omnireduce-RDMA/README.md: -------------------------------------------------------------------------------- 1 | # OmniReduce-RDMA 2 | 3 | ## Getting Started 4 | The simplest way to start is to use our [docker image](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/docker). We provide a [tutorial](https://github.com/sands-lab/omnireduce/blob/master/omnireduce-RDMA/docs/tutorial.md) to help you run RDMA-based OmniReduce with docker image quickly. 5 | Below, we introduce how to build and use OmniReduce. 6 | 7 | ### Building 8 | OmniReduce is built to run on Linux and the dependencies include CUDA, ibverbs and Boost C++ library. 
9 | To build OmniReduce, run: 10 | 11 | git clone https://github.com/sands-lab/omnireduce 12 | cd omnireduce-RDMA 13 | make USE_CUDA=ON 14 | 15 | ### Examples 16 | Basic examples are provided under the [example](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/example) folder. 17 | To reproduce the evaluation in our SIGCOMM'21 paper, find the code at this [repo](https://github.com/sands-lab/omnireduce-experiments). 18 | 19 | ## Frameworks Integration 20 | OmniReduce is only integrated with PyTorch currently. The integration method is under the [frameworks_integration](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/frameworks_integration/pytorch_patch) folder. 21 | 22 | ## Limitations 23 | 24 | - Only support AllReduce operation 25 | - Only support int32 and float data type 26 | -------------------------------------------------------------------------------- /omnireduce-RDMA/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-devel-ubuntu18.04 2 | 3 | ARG https_proxy 4 | ARG http_proxy 5 | 6 | ARG OMNIREDUCE_BASE_PATH=/usr/local 7 | ARG OMNIREDUCE_PATH=$OMNIREDUCE_BASE_PATH/omnireduce 8 | ARG OMNIREDUCE_GIT_LINK=https://github.com/Phlix1/omnireduce.git 9 | ARG OMNIREDUCE_BRANCH=master 10 | 11 | ARG DEBIAN_FRONTEND=noninteractive 12 | RUN apt-get update 13 | RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 14 | build-essential \ 15 | tzdata \ 16 | ca-certificates \ 17 | git \ 18 | curl \ 19 | wget \ 20 | vim \ 21 | cmake \ 22 | lsb-release \ 23 | libcudnn7=7.6.0.64-1+cuda10.1 \ 24 | libnuma-dev \ 25 | ibverbs-providers \ 26 | librdmacm-dev \ 27 | ibverbs-utils \ 28 | rdmacm-utils \ 29 | libibverbs-dev \ 30 | python3 \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | libnccl2=2.4.7-1+cuda10.1 \ 35 | libnccl-dev=2.4.7-1+cuda10.1 \ 36 | iputils-ping \ 37 | net-tools \ 38 | perftest 39 | 40 | 
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \ 41 | mkdir -p /var/run/sshd 42 | 43 | RUN apt-get install -y --no-install-recommends libboost-all-dev=1.65.1.0ubuntu1 44 | 45 | RUN cd /usr/local && \ 46 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ./miniconda.sh && \ 47 | bash miniconda.sh -b -p /usr/local/conda && \ 48 | rm miniconda.sh 49 | ENV PATH $PATH:/usr/local/conda/bin 50 | 51 | RUN conda install -y -c conda-forge -c defaults -c pytorch magma-cuda101 mkl mkl-include ninja numpy=1.20.1 pyyaml scipy setuptools six=1.15.0 cffi typing_extensions future requests dataclasses 52 | 53 | RUN cd $OMNIREDUCE_BASE_PATH && git clone $OMNIREDUCE_GIT_LINK && cd $OMNIREDUCE_PATH && make USE_CUDA=ON && cp $OMNIREDUCE_PATH/build/libomnireduce.so /usr/lib/x86_64-linux-gnu/ && \ 54 | cp -r $OMNIREDUCE_PATH/build/include/omnireduce /usr/include/ && cd $OMNIREDUCE_PATH/example && CUDA_HOME=/usr/local/cuda/ make USE_CUDA=ON 55 | 56 | RUN cd $OMNIREDUCE_BASE_PATH && git clone --recursive https://github.com/pytorch/pytorch && cd $OMNIREDUCE_BASE_PATH/pytorch && git checkout 57bffc3a8e4fee0cce31e1ff1f662ccf7b16db57 && \ 57 | git submodule sync && git submodule update --init --recursive && git apply $OMNIREDUCE_PATH/frameworks_integration/pytorch_patch/omnireduce-pytorch.patch && \ 58 | USE_SYSTEM_NCCL=0 /usr/local/conda/bin/python setup.py install 59 | 60 | ##experiments 61 | 62 | ARG EXPS_BASE_PATH=/home 63 | ARG EXPS_PATH=$EXPS_BASE_PATH/exps 64 | ARG EXPS_GIT_LINK=https://github.com/Phlix1/exps.git 65 | 66 | RUN cd /usr/bin && rm python 67 | 68 | RUN cd $EXPS_BASE_PATH && git clone $EXPS_GIT_LINK 69 | 70 | #For benchmark 71 | RUN cd $EXPS_PATH/benchmark && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 72 | 73 | #For DeepLight 74 | RUN pip install -U scikit-learn 75 | RUN cd $EXPS_PATH/models/DeepLight && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 76 | 77 | #For LSTM 78 | RUN 
conda install -y -c conda-forge -c defaults -c pytorch cython 79 | RUN cd $EXPS_PATH/models/LSTM/lm/log_uniform && make && python setup.py install 80 | RUN cd $EXPS_PATH/models/LSTM && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 81 | 82 | #For NCF 83 | RUN conda install -y -c conda-forge numpy-indexed 84 | RUN pip install mlperf_compliance 85 | RUN cd $EXPS_PATH/models/NCF && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 86 | 87 | #For CNN 88 | RUN pip install Pillow 89 | RUN pip install torchvision===0.8.0 --no-dependencies 90 | RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist --upgrade nvidia-dali-cuda100 91 | RUN cd /usr/local && git clone https://github.com/NVIDIA/apex && cd apex && git reset --hard a651e2c24ecf97cbf367fd3f330df36760e1c597 && \ 92 | pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 93 | RUN cd $EXPS_PATH/models/CNN && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg 94 | 95 | #For BERT 96 | RUN pip install nvidia-pyindex 97 | RUN pip install nvidia-dllogger 98 | RUN conda install -y unzip 99 | RUN cd $EXPS_PATH/models/BERT/dataset/checkpoint && \ 100 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_large_qa_squad11_amp/versions/19.09.0/zip -O bert_pyt_ckpt_large_qa_squad11_amp_19.09.0.zip && \ 101 | unzip bert_pyt_ckpt_large_qa_squad11_amp_19.09.0.zip 102 | RUN cd $EXPS_PATH/models/BERT && ln -s $OMNIREDUCE_PATH/example/omnireduce.cfg omnireduce.cfg && mkdir results 103 | 104 | RUN pip install jupyter 105 | RUN pip install matplotlib 106 | RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config 107 | RUN sed -i 's/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/' /etc/ssh/ssh_config 108 | ENTRYPOINT /usr/sbin/sshd -p 2222 && /bin/bash 109 | 
-------------------------------------------------------------------------------- /omnireduce-RDMA/docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker usage 2 | The docker image includes PyTorch with OmniReduce and some experiments in [this repo](https://github.com/sands-lab/omnireduce-experiments). 3 | To build the docker image, run: 4 | 5 | docker build -t omnireduce/pytorch:exps . -f Dockerfile -------------------------------------------------------------------------------- /omnireduce-RDMA/docs/tutorial.md: -------------------------------------------------------------------------------- 1 | # Tutorial 2 | In this tutorial, we will introduce how to use OmniReduce. We use the docker image to ensure that you don't encounter problems with the system environment. We take the benchmark in [this repo](https://github.com/sands-lab/omnireduce-experiments) as an example to introduce how to use OmniReduce step by step. 3 | 4 | ## Build Image 5 | Build the docker image according to [this](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/docker). 6 | 7 | ## Distributed Training (RDMA) 8 | Let's say you have two workers and two aggregators. Each worker has one GPU. Assume that the network interface to use is `eth0` and the IP addresses are as follows: 9 | 10 | | Machine | IP address | 11 | |--|--| 12 | | worker-0 | 10.0.0.10 | 13 | | worker-1 | 10.0.0.11 | 14 | | aggregator-0 | 10.0.0.20 | 15 | | aggregator-1 | 10.0.0.21 | 16 | 17 | ### Create configuration file 18 | Firstly, you need to create the `omnireduce.cfg` according to [this](https://github.com/sands-lab/omnireduce/tree/master/omnireduce-RDMA/example#1-configuration-file). 
The following parameters need to be updated: 19 | 20 | | Parameter | Value | 21 | |--|--| 22 | | `num_worker` | 2 | 23 | | `num_aggregator` | 2 | 24 | | `worker_ips` | 10.0.0.10,10.0.0.11 | 25 | | `aggregator_ips` | 10.0.0.20,10.0.0.21 | 26 | 27 | If your worker supports GPUDirect, set `direct_memory` to be 1. With regard to RDMA configuration, you need to update the related parameters according to your system information. You can use the MLNX OFED's [show_gids](https://enterprise-support.nvidia.com/s/article/understanding-show-gids-script) script to get the device (`ib_hca`), port (`ib_port`) and GID index (`gid_idx`). 28 | 29 | ### Run benchmark 30 | 31 | For aggregator-0 and aggregator-1: 32 | 33 | docker run -it --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 34 | # now you are in docker environment 35 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 36 | # step 2: start aggregator 37 | cd /usr/local/omnireduce/example 38 | ./aggregator 39 | 40 | For worker-0 41 | 42 | docker run -it --gpus all --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 43 | # now you are in docker environment 44 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 45 | # step 2: start worker 0 46 | cd /home/exps/benchmark 47 | CUDA_VISIBLE_DEVICES=0 GLOO_SOCKET_IFNAME=eth0 python benchmark.py -d 1.0 --backend gloo -t 26214400 -r 0 -s 2 --ip 10.0.0.10 48 | 49 | For worker-1 50 | 51 | docker run -it --gpus all --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 52 | # now you are in docker environment 53 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 54 | # step 2: start worker 1 55 | cd /home/exps/benchmark 56 | CUDA_VISIBLE_DEVICES=0 GLOO_SOCKET_IFNAME=eth0 python benchmark.py -d 1.0 --backend gloo -t 26214400 -r 1 -s 2 --ip 10.0.0.10 57 | 58 | ### Run end-to-end 59 | To run the end-to-end experiments, please refer 
to [this](https://github.com/sands-lab/omnireduce-experiments/tree/master/models). Here we take LSTM training as an example. 60 | 61 | #### LSTM training 62 | 63 | For aggregator-0 and aggregator-1: 64 | 65 | docker run -it --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 66 | # now you are in docker environment 67 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 68 | # step 2: start aggregator 69 | cd /usr/local/omnireduce/example 70 | ./aggregator 71 | 72 | For worker-0 73 | 74 | docker run -it --gpus all --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 75 | # now you are in docker environment 76 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 77 | # step 2: start worker 0 78 | cd /home/exps/models/LSTM 79 | CUDA_VISIBLE_DEVICES=0 GLOO_SOCKET_IFNAME=eth0 OMPI_COMM_WORLD_SIZE=2 OMPI_COMM_WORLD_RANK=0 OMPI_COMM_WORLD_LOCAL_RANK=0 ./run.sh --init tcp://10.0.0.10:4000 --backend gloo 80 | 81 | For worker-1 82 | 83 | docker run -it --gpus all --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/uverbs1 omnireduce/pytorch:exps /bin/bash 84 | # now you are in docker environment 85 | # step 1: update /usr/local/omnireduce/example/omnireduce.cfg 86 | # step 2: start worker 1 87 | cd /home/exps/models/LSTM 88 | CUDA_VISIBLE_DEVICES=0 GLOO_SOCKET_IFNAME=eth0 OMPI_COMM_WORLD_SIZE=2 OMPI_COMM_WORLD_RANK=1 OMPI_COMM_WORLD_LOCAL_RANK=0 ./run.sh --init tcp://10.0.0.10:4000 --backend gloo 89 | -------------------------------------------------------------------------------- /omnireduce-RDMA/example/Makefile: -------------------------------------------------------------------------------- 1 | MPICC := mpicxx 2 | CC := g++ 3 | INCLUDE :=-I../build/include 4 | LIBPATH :=-L../build 5 | LDLIBS := -libverbs -lboost_system -lboost_thread -lboost_chrono -lboost_program_options -lomnireduce 6 | CXXFLAGS := -Wall -Wextra -fPIC -O3 -std=c++11 7 | 8 | APPS := worker 
aggregator 9 | ifeq ($(USE_CUDA),ON) 10 | INCLUDE += -I${CUDA_HOME}/include 11 | LIBPATH += -L${CUDA_HOME}/lib -L${CUDA_HOME}/lib64 12 | APPS += cuda_worker 13 | CXXFLAGS += -DUSE_CUDA 14 | LDLIBS += -lcudart 15 | endif 16 | 17 | all: ${APPS} 18 | 19 | worker: worker_test.cpp 20 | ${MPICC} ${INCLUDE} ${LIBPATH} ${CXXFLAGS} -o $@ $^ ${LDLIBS} 21 | 22 | cuda_worker: cuda_worker_test.cpp 23 | ${MPICC} ${INCLUDE} ${LIBPATH} ${CXXFLAGS} -o $@ $^ ${LDLIBS} 24 | 25 | aggregator: aggregator_test.cpp 26 | ${CC} ${INCLUDE} ${LIBPATH} ${CXXFLAGS} -o $@ $^ ${LDLIBS} 27 | 28 | clean: 29 | rm -f *.o ${APPS} cuda_worker 30 | -------------------------------------------------------------------------------- /omnireduce-RDMA/example/README.md: -------------------------------------------------------------------------------- 1 | # OmniReduce Examples 2 | ## Building 3 | MPI compiler (MPICH/OpenMPI) is required to build the example. 4 | To build example, run: 5 | ``` shell 6 | make USE_CUDA=ON 7 | ``` 8 | After building, the output programs include `worker`, `cuda_worker` and `aggregator`. 9 | ## Run example 10 | ### 1. Configuration file 11 | Before running the example, the [omnireduce.cfg](https://github.com/sands-lab/omnireduce/blob/master/omnireduce-RDMA/example/omnireduce.cfg) requires to be edited according to the cluster. This file needs to be copied to all the workers and aggregators. 12 | Below, we introduce the parameters in the configuration file. 13 | - **RDMA configuration** 14 | - **`ib_hca`** specify which RDMA interfaces to use for communication. Example:mlx5_1. 15 | - **`ib_port`**: specify the port number of the RDMA interface. 16 | - **`gid_idx`**: specify GID index. 17 | - **`sl`**: set the service level. 18 | - **`num_threads`**: number of threads used for communication for both workers and aggregators. 19 | - **`worker_cores`** and **`aggregator_cores`**: set CPU affinity for threads. The number of values should be equal to the `num_threads` parameter. 
value -1 means no CPU affinity setting and values $\geq$ 0 mean the core ids for different threads. 20 | - **Worker configuration** 21 | - **`num_workers`**: number of workers. 22 | - **`threshold`**: threshold for calculating block bitmap. 23 | - **`direct_memory`**: enable GPUDirect. Value 1 means using GDR. 24 | - **`buffer_size`**: send/recv buffer size (only used when `direct_memory`=1). 25 | - **`message_size`**: RDMA message size. 26 | - **`block_size`**: block size used in OmniReduce algorithm. 27 | - **`gpu_devId`**: index of the used GPU. 28 | - **`worker_ips`**: IP addresses of workers, used for negotiation. 29 | - **Aggregator configuration** 30 | - **`num_aggregators`**: number of aggregators. 31 | - **`aggregator_ips`**: IP addresses of aggregators, used for negotiation. 32 | 33 | `bitmap_chunk_size` and `adaptive_blocksize` are not used in the current version. 34 | 35 | ### 2. Run aggregators 36 | Copy the `aggregator` program to each aggregator and run: 37 | 38 | ./aggregator 39 | 40 | ### 3. Run workers 41 | Run the `worker` (or `cuda_worker`) program with `mpirun` from one worker. Here is an example with MPICH. 
42 | 43 | mpirun -n num_workers -hosts IP_1,...,IP_n ./cuda_worker 44 | -------------------------------------------------------------------------------- /omnireduce-RDMA/example/aggregator_test.cpp: -------------------------------------------------------------------------------- 1 | #include "omnireduce/aggcontext.hpp" 2 | int main() { 3 | omnireduce::AggContext& omniContext = omnireduce::AggContext::getInstance(); 4 | return 0; 5 | } 6 | -------------------------------------------------------------------------------- /omnireduce-RDMA/example/cuda_worker_test.cpp: -------------------------------------------------------------------------------- 1 | #include "omnireduce/context.hpp" 2 | #include 3 | #include 4 | #include "mpi.h" 5 | #include 6 | #define DATA_TYPE float 7 | //#define DATA_TYPE int 8 | 9 | int main(int argc, char *argv[]) { 10 | int devID=0; 11 | cudaSetDevice(devID); 12 | cudaDeviceProp deviceProps; 13 | cudaGetDeviceProperties(&deviceProps, devID); 14 | cudaStream_t stream; 15 | cudaStreamCreate(&stream); 16 | printf("CUDA device [%s]\n", deviceProps.name); 17 | MPI_Init(&argc, &argv); 18 | int myrank=0, worldsize=1; 19 | MPI_Comm_size(MPI_COMM_WORLD, &worldsize); 20 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank); 21 | omnireduce::OmniContext& omniContext = omnireduce::OmniContext::getInstance(); 22 | srand(omniContext.workerId+1); 23 | uint32_t block_size = omnireduce::omnireduce_par.getBlockSize(); 24 | uint32_t tensor_size = 67108864; 25 | uint32_t block_count = tensor_size/block_size; 26 | if (tensor_size%block_size!=0) 27 | block_count += 1; 28 | DATA_TYPE *input = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 29 | DATA_TYPE *d_input; 30 | cudaMalloc((void **)&d_input, tensor_size*sizeof(DATA_TYPE)); 31 | cudaMemset(d_input, 0, tensor_size*sizeof(DATA_TYPE)); 32 | DATA_TYPE *output = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 33 | DATA_TYPE *output_dev = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 34 | memset(input, 0, 
tensor_size*sizeof(DATA_TYPE)); 35 | uint8_t *bitmap = (uint8_t *)malloc(block_count*sizeof(uint8_t)); 36 | double density_ratio = 0.01; 37 | double rnum = 0; 38 | for(uint32_t i=0; i 3 | #include 4 | #include "mpi.h" 5 | #define DATA_TYPE float 6 | //#define DATA_TYPE int 7 | 8 | int main(int argc, char *argv[]) { 9 | MPI_Init(&argc, &argv); 10 | int myrank=0, worldsize=1; 11 | MPI_Comm_size(MPI_COMM_WORLD, &worldsize); 12 | MPI_Comm_rank(MPI_COMM_WORLD, &myrank); 13 | omnireduce::OmniContext& omniContext = omnireduce::OmniContext::getInstance(); 14 | srand(omniContext.workerId+1); 15 | uint32_t block_size = omnireduce::omnireduce_par.getBlockSize(); 16 | uint32_t tensor_size = 67108864; 17 | uint32_t block_count = tensor_size/block_size; 18 | if (tensor_size%block_size!=0) 19 | block_count += 1; 20 | DATA_TYPE *input = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 21 | DATA_TYPE *output = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 22 | DATA_TYPE *data = (DATA_TYPE *)malloc(tensor_size*sizeof(DATA_TYPE)); 23 | memset(input, 0, tensor_size*sizeof(int)); 24 | uint8_t *bitmap = (uint8_t *)malloc(block_count*sizeof(uint8_t)); 25 | double density_ratio = 0.01; 26 | double rnum = 0; 27 | for(uint32_t i=0; i=5.3.1` to complie boost from source: 34 | 35 | tar zxvf boost_1_65_1.tar.gz 36 | cd boost_1_65_1 37 | ./bootstrap.sh 38 | ./b2 install --with=all 39 | 40 | ## 4. Install OmniReduce 41 | Build OmniReduce and copy the omnireduce `dynami library` and `header files` in the `build` folder to the system `library` and `include` path. 42 | 43 | cd horovod/third_party/omnireduce/omnireduce-RDMA 44 | make USE_CUDA=ON 45 | cp ./build/libomnireduce.so SYSTEM_LIBRARY_PATH 46 | cp -r ./build/include/omnireduce SYSTEM_INCLUDE_PATH 47 | cd horovod/third_party/omnireduce/omnireduce-RDMA/example 48 | make USE_CUDA=ON 49 | 50 | ## 5. Apply patch to Horovod 51 | 52 | cd horovod 53 | git apply omnireduce-horovod.patch 54 | 55 | ## 6. 
Build Horovod with OmniReduce 56 | Before installing Horovod, we need to install torch and tensorflow. The recommended version environment is: 57 | - **gcc version:** gcc>=5.3.1 58 | - **Python version:** Python <= 3.7 (3.8 is not supported by tensorflow1.15) 59 | - **Torch version:** Pytorch 1.6 for CUDA10.0, pytorch 1.7 for CUDA10.1 or CUDA11.0, pytorch1.8 for CUDA11.0 60 | - **Tensorflow version:** Tensorflow == 1.15 for CUDA10.0 (TensorFlow officially does not support CUDA11.0 for tensorflow1.x). Besides that, we currently do not support tensorflow2.x. 61 | 62 | According to the Horovod official [repository](https://github.com/horovod/horovod/tree/v0.19.4#install), we only replace `HOROVOD_GPU_ALLREDUCE=NCCL` with `HOROVOD_GPU_ALLREDUCE=OMNI` and then compile Horovod from source. We support OmniReduce for both TensorFlow and PyTorch in Horovod. 63 | 64 | cd horovod 65 | CC=`which gcc` CXX=`which g++` HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_NCCL_LINK=SHARED HOROVOD_GPU_ALLREDUCE=OMNI HOROVOD_GPU_BROADCAST=NCCL python3.6 setup.py install 66 | 67 | ## 7. Running Horovod With OmniReduce 68 | 69 | Before running, we must configure the `omnireduce.cfg` file. Then: 70 | - Launch `./aggregator` in `horovod/third_party/omnireduce/omnireduce-RDMA/example` on all aggregator machines. 71 | - Configure the hostfile and use `mpirun` only on the master worker machine to launch Horovod. 
The entire command as follows: 72 | 73 | mpirun --hostfile ./hostfile -map-by slot --display-map --tag-output --timestamp-output --mca btl_tcp_if_exclude lo,docker0 -x NCCL_NET_GDR_READ=1 -x NCCL_IB_HCA=mlx5_0 -x NCCL_DEBUG=INFO -x NCCL_SOCKET_IFNAME=^lo,docker0 -x NCCL_IB_GID_INDEX=3 -x NCCL_IB_DISABLE=0 -x HOROVOD_MPI_THREADS_DISABLE=1 python3 test_hvd_torch.py (or test_hvd_tensorflow.py) 74 | 75 | The hostfile of 2 machines 2 gpus is configured as followed: 76 | 77 | machine1_ip port=xxx max_slots=1 78 | machine2_ip port=xxx max_slots=1 79 | 80 | Since each node/host uses 1 GPU, CUDA_VISIBLE_DEVICES is must be set one gpu device id. In order to GDR, we need to check the machine's topo by `nvidia-smi topo -m`and select the gpu device id which binds to the network card. 81 | 82 | ## 8. Horovod Timeline With OmniReduce 83 | The OmniReduce implements allreduce op contains synchronize which leads to inaccurate time-consuming at each stage (`MEMCPY_IN_FUSION_BUFFER`, `OMNI_ALLREDUCE` and `MEMCPY_OUT_FUSION_BUFFER`) in omniAllreduce's Execute function. For solving this problem, we support the feature in Horovod timeline which can accurate statistics time-consuming for synchronize's op. 84 | 85 | For example, the follow picture shows the `OMNI_ALLREDUCE` time-consuming. The red box marks the original horovod timeline time-consuming and the green box marks the time-consuming after correction. 
86 | 87 | ![image](https://user-images.githubusercontent.com/25579435/125772127-b00c1518-fe44-4461-bbd3-d92879d8d050.png) 88 | 89 | -------------------------------------------------------------------------------- /omnireduce-RDMA/frameworks_integration/horovod_patch/test_hvd_tensorflow.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import, division, print_function 3 | 4 | import argparse 5 | import os, sys 6 | import numpy as np 7 | import timeit 8 | 9 | import horovod.tensorflow as hvd 10 | from tensorflow.keras import applications 11 | import tensorflow as tf 12 | 13 | # Benchmark settings 14 | parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', 15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 16 | parser.add_argument('--fp16-pushpull', action='store_true', default=False, 17 | help='use fp16 compression during pushpull') 18 | 19 | parser.add_argument('--model', type=str, default='ResNet50', 20 | help='model to benchmark') 21 | parser.add_argument('--batch-size', type=int, default=32, 22 | help='input batch size') 23 | 24 | parser.add_argument('--num-warmup-batches', type=int, default=10, 25 | help='number of warm-up batches that don\'t count towards benchmark') 26 | parser.add_argument('--num-batches-per-iter', type=int, default=10, 27 | help='number of batches per benchmark iteration') 28 | parser.add_argument('--num-iters', type=int, default=10, 29 | help='number of benchmark iterations') 30 | 31 | parser.add_argument('--eager', action='store_true', default=False, 32 | help='enables eager execution') 33 | parser.add_argument('--no-cuda', action='store_true', default=False, 34 | help='disables CUDA training') 35 | 36 | args = parser.parse_args() 37 | args.cuda = not args.no_cuda 38 | 39 | hvd.init() 40 | 41 | # Horovod: pin GPU to be used to process local rank (one GPU per process) 42 | config = tf.ConfigProto() 43 | if args.cuda: 44 | 
config.gpu_options.allow_growth = True 45 | config.gpu_options.visible_device_list = str(hvd.local_rank()) 46 | else: 47 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 48 | config.gpu_options.allow_growth = False 49 | config.gpu_options.visible_device_list = '' 50 | 51 | if args.eager: 52 | tf.enable_eager_execution(config) 53 | 54 | # Set up standard model. 55 | # Check https://github.com/keras-team/keras-applications for all supported models, e.g., ResNet50, VGG16 56 | model = getattr(applications, args.model)(weights=None) 57 | 58 | opt = tf.train.GradientDescentOptimizer(0.01) 59 | 60 | # Horovod: (optional) compression algorithm. 61 | compression = hvd.Compression.fp16 if args.fp16_pushpull else hvd.Compression.none 62 | 63 | # Horovod: wrap optimizer with DistributedOptimizer. 64 | opt = hvd.DistributedOptimizer(opt, compression=compression) 65 | 66 | init = tf.global_variables_initializer() 67 | bcast_op = hvd.broadcast_global_variables(0) 68 | 69 | data = tf.random_uniform([args.batch_size, 224, 224, 3]) 70 | target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64) 71 | 72 | 73 | def loss_function(): 74 | logits = model(data, training=True) 75 | return tf.losses.sparse_softmax_cross_entropy(target, logits) 76 | 77 | 78 | def log(s, nl=True): 79 | if hvd.rank() != 0: 80 | return 81 | print(s, end='\n' if nl else '') 82 | sys.stdout.flush() 83 | 84 | log('Model: %s' % args.model) 85 | log('Batch size: %d' % args.batch_size) 86 | device = 'GPU' if args.cuda else 'CPU' 87 | log('Number of %ss: %d' % (device, hvd.size())) 88 | 89 | 90 | def run(benchmark_step): 91 | # Warm-up 92 | log('Running warmup...') 93 | timeit.timeit(benchmark_step, number=args.num_warmup_batches) 94 | 95 | # Benchmark 96 | log('Running benchmark...') 97 | img_secs = [] 98 | for x in range(args.num_iters): 99 | time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) 100 | img_sec = args.batch_size * args.num_batches_per_iter / time 101 | log('Iter 
#%d: %.1f img/sec per %s' % (x, img_sec, device)) 102 | img_secs.append(img_sec) 103 | 104 | # Results 105 | img_sec_mean = np.mean(img_secs) 106 | img_sec_conf = 1.96 * np.std(img_secs) 107 | log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) 108 | log('Total img/sec on %d %s(s): %.1f +-%.1f' % 109 | (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) 110 | 111 | 112 | if tf.executing_eagerly(): 113 | with tf.device(device): 114 | run(lambda: opt.minimize(loss_function, var_list=model.trainable_variables)) 115 | else: 116 | with tf.Session(config=config) as session: 117 | init.run() 118 | bcast_op.run() 119 | 120 | loss = loss_function() 121 | train_opt = opt.minimize(loss) 122 | run(lambda: session.run(train_opt)) 123 | -------------------------------------------------------------------------------- /omnireduce-RDMA/frameworks_integration/horovod_patch/test_hvd_torch.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import torch.backends.cudnn as cudnn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | import torch.utils.data.distributed 8 | from torchvision import models 9 | import horovod.torch as hvd 10 | import timeit 11 | import numpy as np 12 | import os, sys 13 | 14 | # Benchmark settings 15 | parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark', 16 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 17 | parser.add_argument('--fp16-pushpull', action='store_true', default=False, 18 | help='use fp16 compression during byteps pushpull') 19 | 20 | parser.add_argument('--model', type=str, default='resnet50', 21 | help='model to benchmark') 22 | parser.add_argument('--batch-size', type=int, default=64, 23 | help='input batch size') 24 | 25 | parser.add_argument('--num-warmup-batches', type=int, default=1, 26 | help='number of warm-up batches that don\'t count 
towards benchmark') 27 | parser.add_argument('--num-batches-per-iter', type=int, default=10, 28 | help='number of batches per benchmark iteration') 29 | parser.add_argument('--num-iters', type=int, default=10, 30 | help='number of benchmark iterations') 31 | parser.add_argument('--num-classes', type=int, default=1000, 32 | help='number of classes') 33 | 34 | parser.add_argument('--no-cuda', action='store_true', default=False, 35 | help='disables CUDA training') 36 | parser.add_argument('--profiler', action='store_true', default=False, 37 | help='disables profiler') 38 | parser.add_argument('--partition', type=int, default=None, 39 | help='partition size') 40 | 41 | 42 | args = parser.parse_args() 43 | args.cuda = not args.no_cuda and torch.cuda.is_available() 44 | 45 | hvd.init() 46 | 47 | if args.cuda: 48 | # Horovod: pin GPU to local rank. 49 | cuda_device = torch.device('cuda', hvd.local_rank() % hvd.size()) 50 | torch.cuda.set_device(cuda_device) 51 | 52 | cudnn.benchmark = True 53 | 54 | # Set up standard model. 55 | model = getattr(models, args.model)(num_classes=args.num_classes) 56 | 57 | if args.cuda: 58 | # Move model to GPU. 59 | model.cuda() 60 | 61 | optimizer = optim.SGD(model.parameters(), lr=0.01) 62 | 63 | # Horovod: (optional) compression algorithm. 64 | compression = hvd.Compression.fp16 if args.fp16_pushpull else hvd.Compression.none 65 | 66 | # Horovod: wrap optimizer with DistributedOptimizer. 67 | optimizer = hvd.DistributedOptimizer(optimizer, 68 | named_parameters=model.named_parameters()) 69 | 70 | # Horovod: broadcast parameters & optimizer state. 
71 | hvd.broadcast_parameters(model.state_dict(), root_rank=0) 72 | hvd.broadcast_optimizer_state(optimizer, root_rank=0) 73 | 74 | data_index = 0 75 | 76 | def benchmark_step(): 77 | global data_index 78 | global cuda_device 79 | 80 | #data = datasets[data_index%len(datasets)] 81 | data = torch.rand(args.batch_size, 3, 224, 224, device=cuda_device) 82 | target = torch.randint(0, 1000, (args.batch_size,), device=cuda_device) 83 | data_index += 1 84 | optimizer.zero_grad() 85 | output = model(data) 86 | loss = F.cross_entropy(output, target) 87 | loss.backward() 88 | optimizer.step() 89 | 90 | 91 | def log(s, nl=True): 92 | if hvd.rank() != 0: 93 | return 94 | print(s, end='\n' if nl else '') 95 | sys.stdout.flush() 96 | 97 | 98 | log('Model: %s' % args.model) 99 | log('Batch size: %d' % args.batch_size) 100 | device = 'GPU' if args.cuda else 'CPU' 101 | log('Number of %ss: %d' % (device, hvd.size())) 102 | 103 | # Warm-up 104 | log('Running warmup...') 105 | timeit.timeit(benchmark_step, number=args.num_warmup_batches) 106 | 107 | # Benchmark 108 | log('Running benchmark...') 109 | img_secs = [] 110 | 111 | for x in range(args.num_iters): 112 | time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) 113 | img_sec = args.batch_size * args.num_batches_per_iter / time 114 | log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) 115 | img_secs.append(img_sec) 116 | 117 | 118 | # Results 119 | img_sec_mean = np.mean(img_secs) 120 | img_sec_conf = 1.96 * np.std(img_secs) 121 | log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) 122 | log('Total img/sec on %d %s(s): %.1f +-%.1f' % 123 | (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) 124 | 125 | -------------------------------------------------------------------------------- /omnireduce-RDMA/frameworks_integration/pytorch_patch/README.md: -------------------------------------------------------------------------------- 1 | # Frameworks Integration 2 | By 
changing a few lines of code in PyTorch we are able to delegate allreduce SUM operations to OmniReduce. 3 | 4 | We take advantage of PyTorch's Gloo backend and customize it so that it uses OmniReduce instead of Gloo for operations and data types that OmniReduce supports. 5 | If a job is not supported by OmniReduce then PyTorch automatically falls back to using Gloo. 6 | 7 | For OmniReduce to take over from Gloo the following conditions must be met: 8 | - The allreduce operation must be a summation 9 | - The data type must be float or int32 10 | - Each node/host produces 1 tensor or, in other words, each node/host uses 1 GPU. 11 | 12 | ## 1. Install OmniReduce 13 | Build OmniReduce and copy the omnireduce `dynamic library` and `header files` in the `build` folder to the system `library` and `include` paths. 14 | 15 | cp ./build/libomnireduce.so SYSTEM_LIBRARY_PATH 16 | cp -r ./build/include/omnireduce SYSTEM_INCLUDE_PATH 17 | 18 | ## 2. Download PyTorch 19 | The PyTorch patch applies to a specific commit, which we must check out. 20 | 21 | 22 | git clone https://github.com/pytorch/pytorch.git 23 | cd pytorch 24 | git checkout 57bffc3 # The 1.7.1 version 25 | git submodule sync 26 | git submodule update --init --recursive 27 | 28 | This will also take a good while to clone and checkout all submodules. 29 | ## 3. Apply patch to PyTorch 30 | 31 | cd pytorch 32 | git apply omnireduce-pytorch.patch 33 | 34 | ## 4. Build PyTorch 35 | Install the Boost C++ library with the command below: 36 | 37 | apt-get install -y libboost-all-dev=1.65.1.0ubuntu1 38 | 39 | Install PyTorch dependencies and build PyTorch according to the official [repository](https://github.com/pytorch/pytorch#installation).
-------------------------------------------------------------------------------- /omnireduce-RDMA/frameworks_integration/pytorch_patch/omnireduce-pytorch.patch: -------------------------------------------------------------------------------- 1 | From be02e214e79f0a4326404c6de30c0fcfda252a8d Mon Sep 17 00:00:00 2001 2 | From: Phlix1 <819108840@qq.com> 3 | Date: Fri, 19 Mar 2021 18:18:15 +0000 4 | Subject: [PATCH] Add OmniReduce Support 5 | 6 | --- 7 | torch/csrc/distributed/c10d/init.cpp | 2 +- 8 | torch/lib/c10d/CMakeLists.txt | 5 ++ 9 | torch/lib/c10d/ProcessGroupGloo.cpp | 84 ++++++++++++++++++++-------- 10 | torch/lib/c10d/ProcessGroupGloo.hpp | 2 + 11 | 4 files changed, 70 insertions(+), 23 deletions(-) 12 | 13 | diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp 14 | index 715403ac57..147fd305d0 100644 15 | --- a/torch/csrc/distributed/c10d/init.cpp 16 | +++ b/torch/csrc/distributed/c10d/init.cpp 17 | @@ -913,7 +913,7 @@ Arguments: 18 | } 19 | 20 | options.timeout = timeout; 21 | - options.threads = options.devices.size() * 2; 22 | + options.threads = 1;//options.devices.size() * 2; 23 | return std::make_shared<::c10d::ProcessGroupGloo>( 24 | store, rank, size, options); 25 | }), 26 | diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt 27 | index 4b206f3801..b6db9afa9f 100644 28 | --- a/torch/lib/c10d/CMakeLists.txt 29 | +++ b/torch/lib/c10d/CMakeLists.txt 30 | @@ -76,6 +76,7 @@ if(USE_C10D_GLOO) 31 | endif() 32 | 33 | add_library(c10d STATIC ${C10D_SRCS}) 34 | +target_link_libraries(c10d PUBLIC boost_system boost_thread boost_chrono boost_program_options omnireduce) 35 | set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON) 36 | set_property(TARGET c10d PROPERTY CXX_STANDARD 14) 37 | 38 | diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp 39 | index c139ac7a34..d16a5cdab9 100644 40 | --- a/torch/lib/c10d/ProcessGroupGloo.cpp 41 | +++ 
b/torch/lib/c10d/ProcessGroupGloo.cpp 42 | @@ -1185,16 +1185,29 @@ class AsyncAllreduceCUDAWork : public AsyncAllreduceWork { 43 | const std::shared_ptr& context, 44 | std::vector& inputs, 45 | ReduceOp reduceOp, 46 | - uint32_t tag) 47 | - : AsyncAllreduceWork(context, inputs, reduceOp, tag) { 48 | - initializeStreamsEvents(inputs, streams, events); 49 | + uint32_t tag, 50 | + omnireduce::OmniContext& omniContext) 51 | + : AsyncAllreduceWork(context, inputs, reduceOp, tag), omniContext(omniContext) { 52 | + const auto& scalarType = inputs[0].scalar_type(); 53 | + if (reduceOp == ReduceOp::SUM && (scalarType == ::at::ScalarType::Float)) { 54 | + initializeStreamsEvents(inputs, streams, events); 55 | + at::cuda::OptionalCUDAStreamGuard guard; 56 | + for (size_t i = 0; i < inputs.size(); i++) { 57 | + guard.reset_stream(streams[i]); 58 | + } 59 | + use_omnireduce=true; 60 | + } 61 | + else { 62 | + initializeStreamsEvents(inputs, streams, events); 63 | 64 | - // Kick off copy from CUDA tensors to pinned CPU tensors. 65 | - tmp.reserve(inputs.size()); 66 | - at::cuda::OptionalCUDAStreamGuard guard; 67 | - for (size_t i = 0; i < inputs.size(); i++) { 68 | - guard.reset_stream(streams[i]); 69 | - tmp.push_back(pinnedLike(inputs[i]).copy_(inputs[i], true)); 70 | + // Kick off copy from CUDA tensors to pinned CPU tensors. 71 | + tmp.reserve(inputs.size()); 72 | + at::cuda::OptionalCUDAStreamGuard guard; 73 | + for (size_t i = 0; i < inputs.size(); i++) { 74 | + guard.reset_stream(streams[i]); 75 | + tmp.push_back(pinnedLike(inputs[i]).copy_(inputs[i], true)); 76 | + } 77 | + use_omnireduce=false; 78 | } 79 | } 80 | 81 | @@ -1207,13 +1220,33 @@ class AsyncAllreduceCUDAWork : public AsyncAllreduceWork { 82 | } 83 | 84 | // Run allreduce on host side tensors. 
85 | - allreduce(tmp); 86 | - 87 | - at::cuda::OptionalCUDAStreamGuard stream_guard; 88 | - for (size_t i = 0; i < inputs.size(); i++) { 89 | - stream_guard.reset_stream(streams[i]); 90 | - inputs[i].copy_(tmp[i], /* non_blocking */ true); 91 | - events[i].record(streams[i]); 92 | + if (use_omnireduce) { 93 | + const auto& scalarType = inputs[0].scalar_type(); 94 | + switch (scalarType) { 95 | + case ::at::ScalarType::Float: 96 | + //omniContext.AllReduce(getDataPointer(inputs[0]), int(inputs[0].numel()), streams[0].stream(), inputs[0].device().index(), true, false); 97 | + omniContext.AllReduce(getDataPointer(inputs[0]), int(inputs[0].numel()), streams[0].stream(), inputs[0].device().index()); 98 | + break; 99 | + case ::at::ScalarType::Int: 100 | + //omniContext.AllReduce(getDataPointer(inputs[0]), int(inputs[0].numel()), streams[0].stream(), inputs[0].device().index(), true, false); 101 | + omniContext.AllReduce(getDataPointer(inputs[0]), int(inputs[0].numel()), streams[0].stream(), inputs[0].device().index()); 102 | + break; 103 | + default: 104 | + std::cerr<<"Data type error"< tmp; 142 | std::vector streams; 143 | std::vector events; 144 | + std::vector events_bitmap; 145 | + std::vector bitmaps; 146 | + std::vector tmp_bitmap; 147 | + std::vector streams_bitmap; 148 | }; 149 | 150 | class AsyncSparseAllreduceCUDAWork : public AsyncSparseAllreduceWork { 151 | @@ -1344,7 +1384,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( 152 | } else if (device.type() == at::kCUDA) { 153 | if (layout == c10::kStrided) { 154 | work = std::make_shared( 155 | - std::move(context), inputs, opts.reduceOp, tag); 156 | + std::move(context), inputs, opts.reduceOp, tag, omniContext); 157 | } else if (layout == c10::kSparse) { 158 | work = std::make_shared( 159 | std::move(context), inputs, tag); 160 | diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp 161 | index dfae068de2..ac3c677dbb 100644 162 | --- a/torch/lib/c10d/ProcessGroupGloo.hpp 163 | 
+++ b/torch/lib/c10d/ProcessGroupGloo.hpp 164 | @@ -12,6 +12,7 @@ 165 | #include 166 | #include 167 | #include 168 | +#include 169 | 170 | #include 171 | 172 | @@ -235,6 +236,7 @@ class ProcessGroupGloo : public ProcessGroup { 173 | // In order to use more than one device (or allow for parallelism on 174 | // a single device), you need multiple contexts. 175 | std::vector> contexts_; 176 | + omnireduce::OmniContext& omniContext = omnireduce::OmniContext::getInstance(); 177 | std::vector threads_; 178 | bool stop_; 179 | 180 | -- 181 | 2.17.1 182 | 183 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/aggcontext.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "omnireduce/common.hpp" 3 | 4 | namespace omnireduce { 5 | class AggContext { 6 | public: 7 | static AggContext& getInstance() { 8 | static AggContext instance; 9 | return instance; 10 | } 11 | AggContext(AggContext const&) = delete; 12 | void operator=(AggContext const&) = delete; 13 | uint32_t num_server_threads; 14 | int ret; 15 | int serverId; 16 | int tensor_size; 17 | TensorUpdateType typecode; 18 | uint32_t element_size; 19 | int *socks; 20 | void *comm_buf; 21 | struct ibv_context *ib_ctx; 22 | struct ibv_port_attr port_attr; 23 | struct ibv_pd *pd; 24 | struct ibv_cq **cq; 25 | struct ibv_qp **qp; 26 | struct ibv_cq *cq_address; 27 | struct ibv_qp **qp_address; 28 | struct ibv_mr *mr; 29 | uint32_t **srcs_; 30 | struct ibv_mr **mrs_; 31 | uint32_t **current_offset_thread; 32 | struct remote_con_data_t *remote_props_array; 33 | std::atomic_uint_fast32_t threadid; 34 | AggContext(); 35 | ~AggContext(); 36 | void init(); 37 | void StartMaster(); 38 | void StopMaster(); 39 | int post_receive_address(uint32_t); 40 | int post_send_ready(uint32_t); 41 | void wait_master_ready(); 42 | void set_master_ready(); 43 | pthread_t aggmasterThread; 44 | boost::mutex master_ready_mutex; 45 | 
boost::condition_variable master_ready_event; 46 | uint32_t master_ready; 47 | }; 48 | } 49 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/aggregator.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * OmniReduce project 3 | * author: jiawei.fei@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | namespace omnireduce { 9 | void *aggregator(void*); 10 | void *dr_aggregator(void*); 11 | } -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/common.cpp: -------------------------------------------------------------------------------- 1 | #include "omnireduce/common.hpp" 2 | 3 | namespace omnireduce { 4 | 5 | volatile bool force_quit; 6 | } 7 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/common.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * OmniReduce project 3 | * author: jiawei.fei@kaust.edu.sa 4 | */ 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "omnireduce/params.hpp" 17 | 18 | #define likely(x) __builtin_expect(!!(x), 1) 19 | #define unlikely(x) __builtin_expect(!!(x), 0) 20 | 21 | #if __BYTE_ORDER == __LITTLE_ENDIAN 22 | static inline uint64_t htonll(uint64_t x) { return bswap_64(x); } 23 | static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); } 24 | #elif __BYTE_ORDER == __BIG_ENDIAN 25 | static inline uint64_t htonll(uint64_t x) { return x; } 26 | static inline uint64_t ntohll(uint64_t x) { return x; } 27 | #else 28 | #error __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN 29 | #endif 30 | 31 | namespace omnireduce { 32 | 33 | struct remote_con_data_t 34 | { 35 | int remoteId; 36 | uint64_t addr; /* Buffer address */ 37 | uint32_t rkey; /* Remote key */ 38 
| uint32_t qp_num[MAX_NUM_AGGS*MAX_NUM_QPS*MAX_NUM_THREADS+1]; /* QP number */ 39 | uint16_t lid; /* LID of the IB port */ 40 | uint8_t gid[16]; /* gid */ 41 | }; 42 | 43 | struct cm_con_data_t 44 | { 45 | int remoteId; 46 | uint32_t num_peers; 47 | uint64_t addr; /* Buffer address */ 48 | uint32_t rkey; /* Remote key */ 49 | uint32_t qp_num[MAX_NUM_AGGS*MAX_NUM_QPS*MAX_NUM_THREADS+1]; /* QP number */ 50 | uint16_t lid; /* LID of the IB port */ 51 | uint8_t gid[16]; /* gid */ 52 | } __attribute__((packed)); 53 | 54 | enum TensorUpdateType { 55 | NONE = 0, INT32 = 1, FLOAT32 = 2, FLOAT16 = 3 56 | }; 57 | enum OpType { 58 | NOP = 0, ALLREDUCE = 1, BROADCAST = 2, ACK = 3 59 | }; 60 | struct TensorUpdate { 61 | void* ptr; 62 | uint32_t count; 63 | uint32_t start_idx; 64 | int32_t id; 65 | uint32_t root; 66 | TensorUpdateType type; 67 | OpType op; 68 | uint8_t* bitmap_ptr; 69 | uint32_t block_count; 70 | int32_t devId; 71 | bool async; 72 | bool bitmap_async; 73 | }; 74 | extern volatile bool force_quit; 75 | } 76 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/context.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * OmniReduce project 3 | * author: jiawei.fei@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "omnireduce/common.hpp" 9 | 10 | #ifdef USE_CUDA 11 | #include 12 | #endif 13 | 14 | namespace omnireduce { 15 | void *OmniMaster(void *ctx); 16 | 17 | class OmniContext { 18 | public: 19 | static OmniContext& getInstance() { 20 | static OmniContext instance; 21 | return instance; 22 | } 23 | 24 | OmniContext(OmniContext const&) = delete; 25 | void operator=(OmniContext const&) = delete; 26 | void wait_master_ready(); 27 | void set_master_ready(); 28 | void set_num_worker_threads(uint32_t); 29 | uint32_t get_num_worker_threads(); 30 | void set_block_size(uint32_t); 31 | 32 | void receive_result(const int32_t); 33 | bool send_result(const 
int32_t); 34 | void send_tensor(TensorUpdate*); 35 | bool receive_tensor(TensorUpdate&, uint32_t); 36 | 37 | void init(); 38 | void StartMaster(); 39 | void StopMaster(); 40 | void send_address(int, TensorUpdateType); 41 | 42 | void AllReduce(float*, int, uint8_t*, int); 43 | void AllReduce(int32_t*, int, uint8_t*, int); 44 | #ifdef USE_CUDA 45 | void AllReduce(float*, int, uint8_t*, int, cudaStream_t, int); 46 | void AllReduce(int32_t*, int, uint8_t*, int, cudaStream_t, int); 47 | void AllReduce(float*, int, uint8_t*, int, cudaStream_t, int, bool); 48 | void AllReduce(int32_t*, int, uint8_t*, int, cudaStream_t, int, bool); 49 | void AllReduce_NGDR(float*, int, cudaStream_t, int, bool, bool); 50 | void AllReduce_NGDR(int32_t*, int, cudaStream_t, int, bool, bool); 51 | void AllReduce_GDR(float*, int, cudaStream_t, int); 52 | void AllReduce_GDR(int32_t*, int, cudaStream_t, int); 53 | void AllReduce(float*, int, cudaStream_t, int); 54 | void AllReduce(int32_t*, int, cudaStream_t, int); 55 | void *host_tensor; 56 | uint8_t *bitmap; 57 | #endif 58 | int workerId; 59 | int *socks; 60 | void *comm_buf; 61 | void *cuda_comm_buf; 62 | struct ibv_context *ib_ctx; 63 | struct ibv_port_attr port_attr; 64 | struct ibv_pd *pd; 65 | struct ibv_cq **cq; 66 | struct ibv_qp **qp; 67 | struct ibv_cq *cq_address; 68 | struct ibv_qp **qp_address; 69 | struct ibv_mr *mr; 70 | uint32_t *src_; 71 | struct ibv_mr *mr_; 72 | struct remote_con_data_t *remote_props_array; 73 | std::atomic_uint_fast32_t threadid; 74 | int ret; 75 | 76 | private: 77 | OmniContext(); 78 | virtual ~OmniContext(); 79 | 80 | pthread_t masterThread; 81 | 82 | std::atomic_uint_fast32_t tid_counter; 83 | boost::mutex master_ready_mutex, data_ready_mutex, result_mutex; 84 | boost::condition_variable master_ready_event, data_push_event, data_pop_event, result_push_event, result_pop_event; 85 | uint32_t num_worker_threads; 86 | 87 | uint32_t master_ready; 88 | uint32_t data_ready; 89 | uint32_t results; 90 | 
TensorUpdate* tensor_update_ptr; 91 | int32_t result_id; 92 | 93 | 94 | boost::chrono::milliseconds one_msec; 95 | boost::chrono::microseconds one_microsec; 96 | }; 97 | } 98 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/cuda_utils.cu: -------------------------------------------------------------------------------- 1 | #include "cuda_utils.hpp" 2 | 3 | template 4 | __global__ void bitmap_cuda_kernel(scalar_t* input, uint8_t* bitmap, int64_t len, scalar_t threshold) { 5 | const auto index = blockIdx.x * blockDim.x + threadIdx.x; 6 | __shared__ bool zero_block; 7 | if (threadIdx.x == 0) zero_block = true; 8 | __syncthreads(); 9 | if(index < len) { 10 | if(std::abs(input[index]) > threshold) zero_block=false; 11 | } 12 | __syncthreads(); 13 | if(index < len) { 14 | if(zero_block) { 15 | input[index]=0.0; 16 | bitmap[blockIdx.x]=1; 17 | } 18 | else { 19 | bitmap[blockIdx.x]=0; 20 | } 21 | } 22 | __syncthreads(); 23 | } 24 | 25 | void compute_bitmap(float* d_tensor, uint8_t* d_bitmap, int64_t tensor_size, uint32_t block_size, cudaStream_t stream, float threshold) { 26 | uint32_t block_num = tensor_size/block_size; 27 | if (tensor_size%block_size!=0) 28 | block_num += 1; 29 | bitmap_cuda_kernel<<>>(d_tensor, d_bitmap, tensor_size, threshold); 30 | } 31 | 32 | void compute_bitmap(int* d_tensor, uint8_t* d_bitmap, int64_t tensor_size, uint32_t block_size, cudaStream_t stream, int threshold) { 33 | uint32_t block_num = tensor_size/block_size; 34 | if (tensor_size%block_size!=0) 35 | block_num += 1; 36 | bitmap_cuda_kernel<<>>(d_tensor, d_bitmap, tensor_size, threshold); 37 | } -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/cuda_utils.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | void compute_bitmap(float* d_tensor, uint8_t* d_bitmap, int64_t tensor_size, uint32_t 
block_size, cudaStream_t stream, float threshold); 6 | void compute_bitmap(int* d_tensor, uint8_t* d_bitmap, int64_t tensor_size, uint32_t block_size, cudaStream_t stream, int threshold); -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/omnireduce.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * OmniReduce project 3 | * author: jiawei.fei@kaust.edu.sa 4 | */ 5 | 6 | #pragma once 7 | 8 | #include "omnireduce/context.hpp" 9 | #include "omnireduce/aggcontext.hpp" 10 | 11 | namespace omnireduce { 12 | int master(OmniContext* dctx); 13 | int aggmaster(AggContext* dctx); 14 | } 15 | -------------------------------------------------------------------------------- /omnireduce-RDMA/omnireduce/params.cpp: -------------------------------------------------------------------------------- 1 | #include "omnireduce/params.hpp" 2 | #include 3 | 4 | namespace po = boost::program_options; 5 | 6 | namespace omnireduce { 7 | std::unordered_map qp_num_revert {}; 8 | std::unordered_map qp_num_to_peerid {}; 9 | omnireduce_params omnireduce_par; 10 | 11 | void parse_parameters() 12 | { 13 | std::string config_file; 14 | std::ifstream ifs; 15 | uint32_t num_workers, num_aggregators, num_threads, buffer_size, chunk_size, bitmap_chunk_size, message_size, block_size, direct_memory, adaptive_blocksize, gpu_devId, tcp_port; 16 | int ib_port, gid_idx, sl; 17 | float threshold; 18 | std::string worker_ip_str, aggregator_ips_str, worker_cores, aggregator_cores, ib_hca; 19 | po::options_description omnireduce_options("OmniReduce options"); 20 | po::options_description config_file_options; 21 | omnireduce_options.add_options() 22 | ("omnireduce.num_workers", po::value(&num_workers)->default_value(1), "Number of workers") 23 | ("omnireduce.num_aggregators", po::value(&num_aggregators)->default_value(1), "Number of workers") 24 | ("omnireduce.num_threads", 
po::value(&num_threads)->default_value(1), "Number of threads") 25 | ("omnireduce.worker_cores", po::value(&worker_cores)->default_value("none"), "core id for each thread") 26 | ("omnireduce.aggregator_cores", po::value(&aggregator_cores)->default_value("none"), "core id for each thread") 27 | ("omnireduce.buffer_size", po::value(&buffer_size)->default_value(1024), "Buffer size(MB)") 28 | ("omnireduce.chunk_size", po::value(&chunk_size)->default_value(4194304), "Chunk size") 29 | ("omnireduce.bitmap_chunk_size", po::value(&bitmap_chunk_size)->default_value(4194304), "Bitmap chunk size") 30 | ("omnireduce.message_size", po::value(&message_size)->default_value(1024), "Message size") 31 | ("omnireduce.block_size", po::value(&block_size)->default_value(1024), "Block size") 32 | ("omnireduce.ib_port", po::value(&ib_port)->default_value(1), "IB port") 33 | ("omnireduce.gid_idx", po::value(&gid_idx)->default_value(2), "GID") 34 | ("omnireduce.sl", po::value(&sl)->default_value(2), "Service level") 35 | ("omnireduce.gpu_devId", po::value(&gpu_devId)->default_value(0), "GPU device ID") 36 | ("omnireduce.direct_memory", po::value(&direct_memory)->default_value(0), "Use direct memory") 37 | ("omnireduce.adaptive_blocksize", po::value(&adaptive_blocksize)->default_value(0), "Use adaptive block size") 38 | ("omnireduce.tcp_port", po::value(&tcp_port)->default_value(19875), "TCP PORT") 39 | ("omnireduce.worker_ips", po::value(&worker_ip_str)->default_value("10.0.0.1"), "Ip addresses of workers") 40 | ("omnireduce.aggregator_ips", po::value(&aggregator_ips_str)->default_value("10.0.0.1"), "Ip addresses of aggregators") 41 | ("omnireduce.threshold", po::value(&threshold)->default_value(0.0), "Threshold for bitmap calculation") 42 | ("omnireduce.ib_hca", po::value(&ib_hca)->default_value("mlx5_0"), "eth name"); 43 | config_file_options.add(omnireduce_options); 44 | config_file = "/etc/omnireduce.cfg"; 45 | ifs.open(config_file.c_str()); 46 | if(!ifs.good()){ 47 | ifs.close(); 48 | 
config_file = "omnireduce.cfg"; 49 | ifs.open(config_file.c_str()); 50 | if(!ifs.good()){ 51 | ifs.close(); 52 | std::cerr<<"No config file found!"< 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define MAX_NUM_QPS 2 17 | #define MAX_NUM_THREADS 8 18 | #define MAX_NUM_AGGS 8 19 | #define MAX_CONCURRENT_WRITES 4096 20 | #define QUEUE_DEPTH_DEFAULT 4096 21 | #define QPNUM_FACTOR 8 22 | 23 | namespace omnireduce { 24 | void parse_parameters(); 25 | extern std::unordered_map qp_num_revert; 26 | extern std::unordered_map qp_num_to_peerid; 27 | 28 | class omnireduce_params { 29 | private: 30 | uint32_t buff_unit_size; 31 | uint32_t num_worker_threads; 32 | uint32_t num_workers; 33 | uint32_t num_aggregators; 34 | uint32_t num_qps_per_aggregator_per_thread; 35 | uint32_t num_slots_per_thread; 36 | uint32_t buffer_size; 37 | uint32_t chunk_size; 38 | uint32_t bitmap_chunk_size; 39 | uint32_t message_size; 40 | uint32_t block_size; 41 | uint32_t num_comm_buff; 42 | uint32_t prepost_recv_num; 43 | uint32_t *inf_offset; 44 | uint32_t direct_memory; 45 | uint32_t adaptive_blocksize; 46 | uint32_t gpu_devId; 47 | uint32_t tcp_port; 48 | float threshold; 49 | char *ib_hca; 50 | int ib_port; 51 | int gid_idx; 52 | int sl; 53 | char **aggregator_ipaddr; 54 | char **worker_ipaddr; 55 | int *worker_cores; 56 | int *aggregator_cores; 57 | public: 58 | omnireduce_params(); 59 | ~omnireduce_params(); 60 | void setIbPort(int p) { 61 | ib_port=p; 62 | } 63 | void setGidIdx(int g) { 64 | gid_idx=g; 65 | } 66 | void setServiceLevel(int s) { 67 | sl=s; 68 | } 69 | void setInfOffset(uint32_t num_blocks_per_thread) { 70 | inf_offset = (uint32_t *)malloc(num_blocks_per_thread*sizeof(uint32_t)); 71 | for (uint32_t i=0; i ips; 112 | boost::split(ips, workerIps, boost::is_any_of(",")); 113 | if (num_workers!=ips.size()) 114 | { 115 | std::cerr<<"Worker number error!"< ips; 127 | boost::split(ips, aggregatorIps, boost::is_any_of(",")); 
128 | if (num_aggregators!=ips.size()) 129 | { 130 | std::cerr<<"Aggregator number error!"< coreids; 142 | boost::split(coreids, cores_str, boost::is_any_of(",")); 143 | if(num_worker_threads!=coreids.size()) 144 | { 145 | std::cerr<<"core id set error!"< coreids; 156 | boost::split(coreids, cores_str, boost::is_any_of(",")); 157 | if(num_worker_threads!=coreids.size()) 158 | { 159 | std::cerr<<"core id set error!"<