├── byteps ├── __init__.py ├── misc │ └── __init__.py ├── torch │ ├── parallel │ │ └── __init__.py │ ├── cuda_util.h │ ├── adapter.h │ ├── ready_event.h │ ├── handle_manager.h │ ├── cuda_util.cc │ ├── handle_manager.cc │ ├── ops.h │ ├── compression.py │ ├── adapter.cc │ └── ready_event.cc ├── tensorflow │ ├── distribute │ │ └── __init__.py │ ├── util.py │ ├── ops.h │ └── compression.py ├── __version__.py ├── server │ ├── __init__.py │ └── queue.h ├── mxnet │ ├── cuda_util.h │ ├── ready_event.cc │ ├── util.h │ ├── ready_event.h │ ├── adapter.h │ ├── cuda_util.cc │ ├── ops.h │ ├── adapter.cc │ └── tensor_util.h └── common │ ├── compressor │ ├── momentum.cc │ ├── error_feedback.cc │ ├── impl │ │ ├── nesterov_momentum.h │ │ ├── nesterov_momentum.cc │ │ ├── vanilla_error_feedback.h │ │ ├── vanilla_error_feedback.cc │ │ ├── topk.h │ │ ├── onebit.h │ │ ├── dithering.h │ │ └── randomk.h │ ├── compressor_registry.h │ ├── compressor_registry.cc │ ├── momentum.h │ └── error_feedback.h │ ├── core_loops.h │ ├── ready_table.cc │ ├── ready_table.h │ ├── scheduled_queue.h │ ├── shared_memory.h │ ├── thread_pool.h │ ├── operations.h │ ├── shared_memory.cc │ ├── nccl_manager.h │ ├── logging.h │ └── logging.cc ├── example ├── mxnet │ ├── common │ │ ├── __init__.py │ │ ├── find_mxnet.py │ │ └── util.py │ ├── symbols │ │ ├── __init__.py │ │ ├── README.md │ │ ├── mlp.py │ │ ├── lenet.py │ │ ├── alexnet.py │ │ └── vgg.py │ ├── data │ │ ├── imagenet1k-val.sh │ │ └── caltech256.sh │ └── train_imagenet_byteps.py ├── README.md ├── tensorflow │ ├── tensorflow2_mnist.py │ ├── tensorflow2_mnist_bps_MirroredStrategy.py │ ├── tensorflow_keras_mnist.py │ └── tensorflow2_keras_mnist.py ├── keras │ ├── keras_synthetic_benchmark_tf2.py │ └── keras_mnist.py └── pytorch │ └── mnist-distributed.py ├── byteps.exp ├── .clang-format ├── byteps.lds ├── .gitmodules ├── pre_setup.py ├── MANIFEST.in ├── CONTRIBUTING.md ├── docker ├── README.md └── Dockerfile ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── tests ├── run_byteps_test.sh ├── utils.py ├── meta_test.py ├── test_tensorflow_keras.py ├── test_topk.py └── test_onebit.py ├── launcher └── README.md ├── docs ├── MirroredStrategy.md ├── DistributedDataParallel.md ├── performance.md ├── cross-barrier.md ├── troubleshooting.md ├── running.md ├── architecture.md ├── best-practice.md ├── timeline.md └── faq.md ├── CHANGELOG.rst ├── .travis.yml ├── NOTICE └── .gitignore /byteps/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /byteps/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/mxnet/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/mxnet/symbols/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /byteps.exp: -------------------------------------------------------------------------------- 1 | *byteps* 2 | # PyTorch binding 3 | *PyInit* 4 | *initc_lib* 5 | -------------------------------------------------------------------------------- /example/README.md: 
-------------------------------------------------------------------------------- 1 | For more examples, see: https://github.com/byteps/examples -------------------------------------------------------------------------------- /byteps/torch/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedDataParallel 2 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | --- 4 | Language: Cpp 5 | ColumnLimit: 80 6 | -------------------------------------------------------------------------------- /byteps/tensorflow/distribute/__init__.py: -------------------------------------------------------------------------------- 1 | from .mirrored_strategy import MirroredStrategy 2 | -------------------------------------------------------------------------------- /byteps/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 2, 5) 2 | 3 | __version__ = '.'.join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /byteps.lds: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | *byteps*; 4 | # PyTorch binding 5 | *PyInit*; 6 | *initc_lib*; 7 | local: *; 8 | }; 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/ps-lite"] 2 | path = 3rdparty/ps-lite 3 | url = https://github.com/bytedance/ps-lite 4 | branch = byteps 5 | -------------------------------------------------------------------------------- /pre_setup.py: -------------------------------------------------------------------------------- 1 | # For internal use. Please do not modify this file. 2 | 3 | def setup(): 4 | return 5 | 6 | def extra_make_option(): 7 | return "" 8 | 9 | # absolute path to the ucx tar.gz file 10 | ucx_tarball_path = "" 11 | -------------------------------------------------------------------------------- /example/mxnet/symbols/README.md: -------------------------------------------------------------------------------- 1 | # Symbol 2 | 3 | This folder contains definitions of various networks. To add a new network, please 4 | use the following format. 5 | 6 | ## Python 7 | 8 | - A file implements one network proposed in a paper, with the network name as the 9 | filename. 10 | - Mention the paper and the modifications made, if any, at the beginning 11 | of the file.
12 | - Indicate how to reproduce the accuracy numbers in the paper if it is not straightforward. 13 | - Provide a function `get_symbol()` that returns the network. 14 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include */* LICENSE byteps.lds byteps.exp 2 | recursive-include * *.cc *.h 3 | prune .git 4 | prune dist 5 | prune bin 6 | prune __pycache__ 7 | prune 3rdparty 8 | graft 3rdparty/ps-lite 9 | prune 3rdparty/ps-lite/build 10 | prune 3rdparty/ps-lite/deps 11 | exclude 3rdparty/ps-lite/tests/test_benchmark 12 | exclude 3rdparty/ps-lite/tests/test_benchmark.d 13 | exclude 3rdparty/ps-lite/tests/test_ipc_benchmark 14 | exclude 3rdparty/ps-lite/tests/test_ipc_benchmark.d 15 | 16 | include pre_setup.py pre_setup_local.py zeromq-4.1.4.tar.gz ucx.tar.gz 17 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guidelines 2 | 3 | First of all, thanks for taking the time to contribute! 4 | 5 | Please refer to the following guidelines to contribute new functionality or bug fixes: 6 | 7 | 1. Use [autopep8](https://github.com/hhatto/autopep8) to format the Python code. 8 | 2. Use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to format C++ code. Changes to BytePS C++ code should conform to the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). 9 | 3. Add unit tests for any new code you write. 10 | 4. Run unit tests in both CI and GPU environments. 11 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Prebuilt Images 2 | 3 | Below are the prebuilt Docker images and the commands used to build them. These prebuilt images might not be up to date; 4 | you may need to build them manually from the Dockerfile to get the latest BytePS functionality. 5 | 6 | | Docker image | How to build | 7 | | --- | --- | 8 | | bytepsimage/tensorflow | docker build -t bytepsimage/tensorflow . -f Dockerfile --build-arg FRAMEWORK=tensorflow | 9 | | bytepsimage/pytorch | docker build -t bytepsimage/pytorch . -f Dockerfile --build-arg FRAMEWORK=pytorch | 10 | | bytepsimage/mxnet | docker build -t bytepsimage/mxnet . -f Dockerfile --build-arg FRAMEWORK=mxnet | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here.
21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. 16 | 2. 17 | 3. 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Environment (please complete the following information):** 27 | - OS: 28 | - GCC version: 29 | - CUDA and NCCL version: 30 | - Framework (TF, PyTorch, MXNet): 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 34 | -------------------------------------------------------------------------------- /tests/run_byteps_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="$(dirname $0)" 4 | 5 | export PATH=~/anaconda3/envs/mxnet_p36/bin:$PATH 6 | export DMLC_NUM_WORKER=1 7 | export DMLC_NUM_SERVER=1 8 | export DMLC_PS_ROOT_URI=127.0.0.1 9 | export DMLC_PS_ROOT_PORT=1234 10 | 11 | function cleanup() { 12 | rm -rf lr.s 13 | } 14 | 15 | trap cleanup EXIT 16 | 17 | pkill bpslaunch 18 | pkill python3 19 | 20 | echo "Launch scheduler" 21 | export DMLC_ROLE=scheduler 22 | bpslaunch & 23 | 24 | echo "Launch server" 25 | export DMLC_ROLE=server 26 | bpslaunch & 27 | 28 | export NVIDIA_VISIBLE_DEVICES=0 29 | export DMLC_WORKER_ID=0 30 | export DMLC_ROLE=worker 31 | export BYTEPS_THREADPOOL_SIZE=4 32 | export BYTEPS_FORCE_DISTRIBUTED=1 33 | export BYTEPS_LOG_LEVEL=WARNING 34 | 35 | if [ "$TEST_TYPE" == "keras" ]; then 36 | echo "TEST KERAS ..." 37 | python $path/test_tensorflow_keras.py $@ 38 | else 39 | echo "Error: unsupported $TEST_TYPE" 40 | exit 1 41 | fi 42 | -------------------------------------------------------------------------------- /launcher/README.md: -------------------------------------------------------------------------------- 1 | ### How to use distributed launcher 2 | 3 | Create two host files: `worker_hosts` and `server_hosts`, put your lists of hosts inside (one IP:port per line). 4 | 5 | For example, we want `10.0.0.1:12345` to be the scheduler, `10.0.0.2` and `10.0.0.3` to be the workers, `10.0.0.4` and `10.0.0.5` to be the servers. 6 | 7 | Then `worker_hosts` should be: 8 | ``` 9 | 10.0.0.2 10 | 10.0.0.3 11 | ``` 12 | 13 | And `server_hosts` should be: 14 | ``` 15 | 10.0.0.4 16 | 10.0.0.5 17 | ``` 18 | 19 | Finally, start the distributed ssh launcher by: 20 | 21 | ``` 22 | python dist_launcher.py --worker-hostfile worker_hosts --server-hostfile server_hosts \ 23 | --scheduler-ip 10.0.0.1 --scheduler-port 12345 \ 24 | --username root --env ENV1:1 --env ENV2:2 \ 25 | 'echo this is $DMLC_ROLE; python byteps/launcher/launch.py YOUR_COMMAND' 26 | ``` 27 | 28 | The script will automatically help you setup the necessary [environment variables](/docs/env.md) and launch BytePS processes. 
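
For reference, the launcher exports the coordination variables on each node before running your command. Below is a sketch of what a worker's environment would look like for the example above; the variable names come from the BytePS/ps-lite convention used elsewhere in this repo (e.g. tests/run_byteps_test.sh), while the mapping from launcher flags to values is inferred here, so treat it as illustrative and see the [environment variables](/docs/env.md) document for the authoritative list:

```bash
# Illustrative sketch of the environment on worker 10.0.0.2 (not a verbatim dump)
export DMLC_ROLE=worker           # scheduler / server / worker, per process
export DMLC_NUM_WORKER=2          # number of lines in worker_hosts
export DMLC_NUM_SERVER=2          # number of lines in server_hosts
export DMLC_PS_ROOT_URI=10.0.0.1  # from --scheduler-ip
export DMLC_PS_ROOT_PORT=12345    # from --scheduler-port
export DMLC_WORKER_ID=0           # index of this host in worker_hosts
export ENV1=1                     # from --env ENV1:1
export ENV2=2                     # from --env ENV2:2
```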
-------------------------------------------------------------------------------- /example/mxnet/common/find_mxnet.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import os, sys 19 | try: 20 | import mxnet as mx 21 | except ImportError: 22 | curr_path = os.path.abspath(os.path.dirname(__file__)) 23 | sys.path.append(os.path.join(curr_path, "../../../python")) 24 | import mxnet as mx 25 | -------------------------------------------------------------------------------- /byteps/server/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ctypes 17 | import os 18 | from byteps.common import get_ext_suffix 19 | 20 | 21 | def run(): 22 | dll_path = os.path.join(os.path.dirname(__file__), 23 | 'c_lib' + get_ext_suffix()) 24 | SERVER_LIB_CTYPES = ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL) 25 | SERVER_LIB_CTYPES.byteps_server() 26 | 27 | run() 28 | -------------------------------------------------------------------------------- /docs/MirroredStrategy.md: -------------------------------------------------------------------------------- 1 | # MirroredStrategy 2 | 3 | The BytePS MirroredStrategy module is compatible with TensorFlow's 4 | MultiWorkerMirroredStrategy for the most part. Instead of using the built-in 5 | TensorFlow collective communication implementation, it uses BytePS push-pull 6 | for gradient reduction between nodes. 7 | 8 | It currently supports the Single-Process Single-GPU mode. In this mode each 9 | process works with one GPU.
Example usage: 10 | 11 | 12 | ```python 13 | import byteps.tensorflow as bps 14 | from byteps.tensorflow.distribute import MirroredStrategy 15 | 16 | bps.init() 17 | tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU') 18 | strategy = MirroredStrategy(devices=["/gpu:0"]) 19 | 20 | with strategy.scope(): 21 | # Model building/compiling need to be within `strategy.scope()`. 22 | multi_worker_model = build_and_compile_cnn_model() 23 | 24 | multi_worker_model.fit(multi_worker_dataset, epochs=100, steps_per_epoch=70) 25 | ``` 26 | To run the program, use `bpslaunch` to launch one process for each device you 27 | wish to use. Refer to the [running](./running.md) document for how to use 28 | `bpslaunch`. 29 | -------------------------------------------------------------------------------- /byteps/tensorflow/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | from distutils.version import LooseVersion 16 | 17 | import tensorflow as tf 18 | 19 | 20 | if LooseVersion(tf.__version__) >= LooseVersion("1.9.0"): 21 | from tensorflow.python.eager import context 22 | _has_eager = True 23 | else: 24 | _has_eager = False 25 | 26 | 27 | def _executing_eagerly(): 28 | """Returns true if eager execution is supported and enabled.""" 29 | return _has_eager and context.in_eager_mode() 30 | -------------------------------------------------------------------------------- /byteps/mxnet/cuda_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_CUDA_UTIL_H 18 | #define BYTEPS_MXNET_CUDA_UTIL_H 19 | 20 | namespace byteps { 21 | namespace mxnet { 22 | 23 | class with_device { 24 | public: 25 | with_device(int device); 26 | ~with_device(); 27 | 28 | private: 29 | int restore_device_; 30 | }; 31 | 32 | } // namespace mxnet 33 | } // namespace byteps 34 | 35 | #endif // BYTEPS_MXNET_CUDA_UTIL_H 36 | -------------------------------------------------------------------------------- /byteps/torch/cuda_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_CUDA_UTIL_H 18 | #define BYTEPS_TORCH_CUDA_UTIL_H 19 | 20 | #include "../common/common.h" 21 | 22 | namespace byteps { 23 | namespace torch { 24 | 25 | class with_device { 26 | public: 27 | with_device(int device); 28 | ~with_device(); 29 | 30 | private: 31 | int restore_device_ = CPU_DEVICE_ID; 32 | }; 33 | 34 | } // namespace torch 35 | } // namespace byteps 36 | 37 | #endif // BYTEPS_TORCH_CUDA_UTIL_H 38 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2 | Changelog for BytePS 3 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 4 | 0.2.4 (2020-06) 5 | ------------------ 6 | * Fix compatibility issue with tf2 + standalone keras 7 | * Add support for tensorflow.keras 8 | * Improve robustness of broadcast 9 | 10 | 11 | 0.2.3 (2020-05) 12 | ------------------ 13 | * Add DistributedDataParallel module for PyTorch 14 | * Fix the problem of different CPU tensor using the same name 15 | * Add skip_synchronize api for PyTorch 16 | * Add the option for lazy/non-lazy init 17 | 18 | 19 | 0.2.0 (2020-02) 20 | ------------------ 21 | * Largely improve RDMA performance by enforcing page aligned memory. 22 | * Add IPC support for RDMA. Now support colocating servers and workers without sacrificing much performance. 23 | * Fix a hanging bug in BytePS server. 24 | * Fix RDMA-related segmentation fault problem during fork() (e.g., used by PyTorch data loader). 25 | * New feature: Enable mixing use of colocate and non-colocate servers, along with a smart tensor allocation strategy. 26 | * New feature: Add ``bpslaunch`` as the command to launch tasks. 27 | * Add support for pip install: ``pip3 install byteps`` 28 | 29 | 30 | 0.1.0 (2019-12) 31 | ------------------ 32 | * First official release. 
33 | -------------------------------------------------------------------------------- /byteps/common/compressor/momentum.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "momentum.h" 17 | 18 | namespace byteps { 19 | namespace common { 20 | namespace compressor { 21 | 22 | tensor_t Momentum::Compress(tensor_t grad) { 23 | // 1. m_t = \mu * m_{t-1} + g_t 24 | UpdateMom(grad); 25 | 26 | // 2. p_t = \mu m_t + g_t 27 | UpdateGradient(grad); 28 | 29 | // 3. compress 30 | return _cptr->Compress(grad); 31 | } 32 | 33 | tensor_t Momentum::Decompress(tensor_t compressed) { 34 | // directly forward to internal compressor 35 | return _cptr->Decompress(compressed); 36 | } 37 | 38 | } // namespace compressor 39 | } // namespace common 40 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/mxnet/ready_event.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #include 18 | 19 | #if HAVE_CUDA 20 | #include 21 | 22 | #include "ready_event.h" 23 | 24 | namespace byteps { 25 | namespace mxnet { 26 | 27 | template 28 | MXReadyEvent::MXReadyEvent(NDArray* tensor) : tensor_(tensor) { 29 | assert(tensor->ctx().real_dev_id() != CPU_DEVICE_ID); 30 | } 31 | 32 | template 33 | MXReadyEvent::~MXReadyEvent() {} 34 | 35 | template 36 | bool MXReadyEvent::Ready() const { 37 | return true; 38 | } 39 | 40 | template class MXReadyEvent; 41 | 42 | } // namespace mxnet 43 | } // namespace byteps 44 | #endif 45 | -------------------------------------------------------------------------------- /byteps/common/core_loops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_CORE_LOOPS_H 17 | #define BYTEPS_CORE_LOOPS_H 18 | 19 | namespace byteps { 20 | namespace common { 21 | 22 | void CoordinateReduceLoop(); 23 | 24 | void CoordinateBroadcastLoop(); 25 | 26 | void CoordinatePushLoop(); 27 | 28 | void PcieReduceLoop(); 29 | 30 | void RootNcclLoop(); 31 | 32 | void NonRootNcclLoop(); 33 | 34 | void SyncNcclLoop(); 35 | 36 | void CopyDevice2HostLoop(); 37 | 38 | void CompressLoop(); 39 | 40 | void PushLoop(); 41 | 42 | void PullLoop(); 43 | 44 | void DecompressLoop(); 45 | 46 | void RootCopyHost2DeviceLoop(); 47 | 48 | void NonRootCopyListenLoop(); 49 | 50 | void NonRootCopyHost2DeviceLoop(); 51 | 52 | } // namespace common 53 | } // namespace byteps 54 | 55 | #endif // BYTEPS_CORE_LOOPS_H 56 | -------------------------------------------------------------------------------- /byteps/mxnet/util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_UTIL_H 18 | #define BYTEPS_MXNET_UTIL_H 19 | 20 | #if HAVE_CUDA 21 | 22 | #include 23 | 24 | /*! 25 | * \brief Protected CUDA call. 26 | * \param func Expression to call. 27 | * 28 | * It checks for CUDA errors after invocation of the expression. 29 | */ 30 | #define CUDA_CALL(func) \ 31 | { \ 32 | cudaError_t e = (func); \ 33 | CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ 34 | << "CUDA: " << cudaGetErrorString(e); \ 35 | } 36 | 37 | #endif // HAVE_CUDA 38 | 39 | #endif // BYTEPS_MXNET_UTIL_H 40 | -------------------------------------------------------------------------------- /example/mxnet/symbols/mlp.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """ 19 | a simple multilayer perceptron 20 | """ 21 | import mxnet as mx 22 | 23 | def get_symbol(num_classes=10, **kwargs): 24 | data = mx.symbol.Variable('data') 25 | data = mx.sym.Flatten(data=data) 26 | fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) 27 | act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") 28 | fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) 29 | act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") 30 | fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) 31 | mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') 32 | return mlp 33 | -------------------------------------------------------------------------------- /byteps/torch/adapter.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_ADAPTER_H 18 | #define BYTEPS_TORCH_ADAPTER_H 19 | 20 | #include 21 | #include 22 | 23 | #include "../common/common.h" 24 | 25 | namespace byteps { 26 | namespace torch { 27 | 28 | using namespace byteps::common; 29 | 30 | class TorchTensor : public Tensor { 31 | public: 32 | TorchTensor(::torch::Tensor tensor); 33 | virtual const DataType dtype() const override; 34 | virtual const TensorShape shape() const override; 35 | virtual const void* data() const override; 36 | virtual int64_t size() const override; 37 | 38 | protected: 39 | ::torch::Tensor tensor_; 40 | }; 41 | 42 | void ThrowIfError(Status status); 43 | 44 | } // namespace torch 45 | } // namespace byteps 46 | 47 | #endif // BYTEPS_TORCH_ADAPTER_H 48 | -------------------------------------------------------------------------------- /byteps/torch/ready_event.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_READY_EVENT_H 18 | #define BYTEPS_TORCH_READY_EVENT_H 19 | 20 | #if HAVE_CUDA 21 | #include "cuda_runtime.h" 22 | #endif 23 | 24 | #include 25 | 26 | #include "../common/common.h" 27 | 28 | namespace byteps { 29 | namespace torch { 30 | 31 | using namespace byteps::common; 32 | 33 | #if HAVE_CUDA 34 | class TorchReadyEvent : public ReadyEvent { 35 | public: 36 | TorchReadyEvent(int device); 37 | ~TorchReadyEvent(); 38 | virtual bool Ready() const override; 39 | 40 | private: 41 | int device_ = CPU_DEVICE_ID; 42 | cudaEvent_t cuda_event_ = nullptr; 43 | }; 44 | #endif 45 | 46 | std::shared_ptr RecordReadyEvent(int device); 47 | 48 | } // namespace torch 49 | } // namespace byteps 50 | 51 | #endif // BYTEPS_TORCH_READY_EVENT_H 52 | -------------------------------------------------------------------------------- /byteps/mxnet/ready_event.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_READY_EVENT_H 18 | #define BYTEPS_MXNET_READY_EVENT_H 19 | 20 | #include 21 | 22 | #if HAVE_CUDA 23 | #include 24 | #include 25 | #include 26 | #include "cuda_runtime.h" 27 | 28 | #include "../common/common.h" 29 | 30 | namespace byteps { 31 | namespace mxnet { 32 | 33 | using namespace byteps::common; 34 | typedef ::mxnet::NDArray NDArray; 35 | 36 | template 37 | class MXReadyEvent : public ReadyEvent { 38 | public: 39 | MXReadyEvent(NDArray* tensor); 40 | ~MXReadyEvent(); 41 | virtual bool Ready() const override; 42 | 43 | private: 44 | NDArray* tensor_; 45 | }; 46 | 47 | } // namespace mxnet 48 | } // namespace byteps 49 | #endif 50 | 51 | #endif // BYTEPS_MXNET_READY_EVENT_H 52 | -------------------------------------------------------------------------------- /byteps/torch/handle_manager.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_HANDLE_MANAGER_H 18 | #define BYTEPS_TORCH_HANDLE_MANAGER_H 19 | 20 | #include <atomic> 21 | #include <memory> 22 | #include <mutex> 23 | #include <unordered_map> 24 | 25 | #include "../common/common.h" 26 | 27 | namespace byteps { 28 | namespace torch { 29 | 30 | using namespace byteps::common; 31 | 32 | class HandleManager { 33 | public: 34 | int AllocateHandle(); 35 | void MarkDone(int handle, const Status& status); 36 | bool PollHandle(int handle); 37 | std::shared_ptr<Status> ReleaseHandle(int handle); 38 | 39 | private: 40 | std::atomic_int last_handle_; 41 | std::unordered_map<int, std::shared_ptr<Status>> results_; 42 | std::mutex mutex_; 43 | }; 44 | 45 | } // namespace torch 46 | } // namespace byteps 47 | 48 | #endif // BYTEPS_TORCH_HANDLE_MANAGER_H 49 | -------------------------------------------------------------------------------- /docs/DistributedDataParallel.md: -------------------------------------------------------------------------------- 1 | # DistributedDataParallel 2 | 3 | The BytePS Distributed Data Parallel module is compatible with PyTorch Distributed 4 | Data Parallel for the most part. Instead of using PyTorch communication 5 | backends, it uses BytePS push-pull for gradient reduction between nodes. 6 | 7 | It currently supports the Single-Process Single-GPU mode. In this mode each 8 | process works with one GPU. Example usage: 9 | 10 | 11 | ```python 12 | # byteps_ddp_example.py 13 | from byteps.torch.parallel import DistributedDataParallel 14 | 15 | model = DistributedDataParallel(model, device_ids=[i]) 16 | output = model(data) 17 | loss = F.nll_loss(output, target) 18 | loss.backward() 19 | optimizer.step() 20 | ``` 21 | 22 | Some models have branches, and part of the model is skipped during the forward 23 | pass. In that case it's required to call the 24 | DistributedDataParallel.synchronize() function after loss.backward(), e.g.: 25 | 26 | ```python 27 | # byteps_ddp_example.py 28 | from byteps.torch.parallel import DistributedDataParallel 29 | 30 | # construct a model which skips some layers in the forward pass, then wrap the 31 | # model with DistributedDataParallel() 32 | model = DistributedDataParallel(model, device_ids=[i]) 33 | output = model(data) 34 | loss = F.nll_loss(output, target) 35 | loss.backward() 36 | # the synchronize() call here is required because some layers were skipped in 37 | # the forward pass 38 | model.synchronize() 39 | optimizer.step() 40 | ``` 41 | 42 | To run the program, use `bpslaunch` to launch one process for each device you 43 | wish to use. Refer to the [running](./running.md) document for how to use `bpslaunch`. 44 | -------------------------------------------------------------------------------- /byteps/torch/cuda_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #if HAVE_CUDA 17 | #include 18 | #include "cuda_runtime.h" 19 | #endif 20 | 21 | #include "../common/common.h" 22 | #include "cuda_util.h" 23 | 24 | namespace byteps { 25 | namespace torch { 26 | 27 | with_device::with_device(int device) { 28 | if (device == CPU_DEVICE_ID) { 29 | restore_device_ = CPU_DEVICE_ID; 30 | } else { 31 | #if HAVE_CUDA 32 | THCudaCheck(cudaGetDevice(&restore_device_)); 33 | THCudaCheck(cudaSetDevice(device)); 34 | #else 35 | throw std::logic_error( 36 | "Internal error. Requested device context manager " 37 | "with GPU device but not compiled with CUDA."); 38 | #endif 39 | } 40 | } 41 | 42 | with_device::~with_device() { 43 | #if HAVE_CUDA 44 | if (restore_device_ != CPU_DEVICE_ID) { 45 | THCudaCheck(cudaSetDevice(restore_device_)); 46 | } 47 | #endif 48 | } 49 | 50 | } // namespace torch 51 | } // namespace byteps 52 | -------------------------------------------------------------------------------- /byteps/common/compressor/error_feedback.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "error_feedback.h" 17 | 18 | namespace byteps { 19 | namespace common { 20 | namespace compressor { 21 | 22 | tensor_t ErrorFeedback::Compress(tensor_t grad) { 23 | // 1. grad <- grad + error 24 | UpdateGradient(grad); 25 | 26 | // 2. c <- Compress(grad) 27 | auto compressed = _cptr->Compress(grad); 28 | 29 | // 3. e <- grad - Decompress(c) 30 | UpdateError(grad, compressed); 31 | 32 | return compressed; 33 | } 34 | 35 | tensor_t ErrorFeedback::Decompress(tensor_t compressed) { 36 | // directly forward to internal compressor 37 | return _cptr->Decompress(compressed); 38 | } 39 | 40 | void ErrorFeedback::UpdateError(tensor_t corrected, tensor_t compressed) { 41 | tensor_t error{_error.get(), _size, corrected.dtype}; 42 | _cptr->FastUpdateError(error, corrected, compressed); 43 | } 44 | 45 | } // namespace compressor 46 | } // namespace common 47 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/mxnet/adapter.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 
2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_ADAPTER_H 18 | #define BYTEPS_MXNET_ADAPTER_H 19 | 20 | #include 21 | #include "../common/common.h" 22 | 23 | namespace byteps { 24 | namespace mxnet { 25 | 26 | using namespace byteps::common; 27 | 28 | template 29 | class MXTensor : public Tensor { 30 | public: 31 | MXTensor(T* tensor); 32 | virtual const DataType dtype() const override; 33 | virtual const TensorShape shape() const override; 34 | virtual const void* data() const override; 35 | virtual int64_t size() const override; 36 | 37 | protected: 38 | T* tensor_; 39 | }; 40 | 41 | inline void ThrowIfError(const Status& status) { 42 | if (!status.ok()) { 43 | throw dmlc::Error(status.reason()); 44 | } 45 | } 46 | 47 | } // namespace mxnet 48 | } // namespace byteps 49 | 50 | #endif // BYTEPS_MXNET_ADAPTER_H 51 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: bionic 2 | language: python 3 | cache: pip 4 | jobs: 5 | include: 6 | - python: 2.7 7 | - python: 3.7 8 | env: 9 | - CUDA=10.1.105-1 CUDA_APT=10-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 10 | before_install: 11 | - CUDA_REPO=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb 12 | - NCCL_REPO=nvidia-machine-learning-repo-${UBUNTU_VERSION}_1.0.0-1_amd64.deb 13 | - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${CUDA_REPO} 14 | - sudo dpkg -i ${CUDA_REPO} 15 | - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub 16 | - wget http://developer.download.nvidia.com/compute/machine-learning/repos/${UBUNTU_VERSION}/x86_64/${NCCL_REPO} 17 | - sudo dpkg -i ${NCCL_REPO} 18 | - sudo apt update -qq 19 | - sudo apt install -y cuda-10-1 libnccl2 libnccl-dev libnuma-dev 20 | - sudo apt clean 21 | - export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} 22 | - export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 23 | - export PATH=${CUDA_HOME}/bin:${PATH} 24 | - pip install future mxnet-cu101 tensorflow-gpu torch torchvision 25 | install: 26 | - export BYTEPS_CUDA_HOME=${CUDA_HOME} 27 | - python setup.py install 28 | - cd 3rdparty/ps-lite && make -j && cd - 29 | script: 30 | - export DMLC_NODE_HOST=127.0.0.1 31 | - export PORT=8000 32 | - 3rdparty/ps-lite/tests/local.sh 1 1 3rdparty/ps-lite/tests/test_benchmark 1024000 10 0 33 | - export PORT=8001 34 | - 3rdparty/ps-lite/tests/local.sh 2 2 3rdparty/ps-lite/tests/test_benchmark 1024000 10 0 35 | - export PORT=8002 36 | - 3rdparty/ps-lite/tests/local.sh 4 4 3rdparty/ps-lite/tests/test_benchmark 1024000 10 0 37 | -------------------------------------------------------------------------------- 
/example/mxnet/data/imagenet1k-val.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | # This file download the imagnet-1k validation dataset and convert it into a rec 22 | # file. One need to provide the URL for the ILSVRC2012_img_val.tar, which can be 23 | # find at http://www.image-net.org/download-images 24 | # 25 | # Example usage (replace the URL with the correct one): 26 | # ./imagenet1k-val.sh http://xxxxxx/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar 27 | 28 | if [ ! -e ILSVRC2012_img_val.tar ]; then 29 | wget $1 30 | fi 31 | mkdir -p val 32 | tar -xf ILSVRC2012_img_val.tar -C val 33 | wget http://data.mxnet.io/models/imagenet/resnet/val.lst -O imagenet1k-val.lst 34 | 35 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 36 | MX_DIR=${CUR_DIR}/../../../ 37 | 38 | python ${CUR_DIR}/../../../tools/im2rec.py --resize 256 --quality 90 --num-thread 16 imagenet1k-val val/ 39 | 40 | rm -rf val 41 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import mxnet.ndarray as nd 3 | import numpy as np 4 | from numba import jit 5 | 6 | 7 | def fake_data(dtype="float32", batch_size=32, height=224, width=224, depth=3, num_classes=1000): 8 | image_list = [] 9 | label_list = [] 10 | for _ in range(8): 11 | image = mx.ndarray.random.normal(-1, 1, 12 | shape=[1, depth, height, width], 13 | dtype=dtype) 14 | label = mx.ndarray.random.randint(0, num_classes, [1, 1]) 15 | 16 | images = mx.ndarray.repeat(image, 128, axis=0) 17 | labels = mx.ndarray.repeat(label, 128, axis=0) 18 | # print(labels) 19 | image_list.append(images) 20 | label_list.append(labels) 21 | 22 | images = nd.concat(*image_list, dim=0) 23 | labels = nd.concat(*label_list, dim=0) 24 | # print(labels) 25 | fake_dataset = mx.gluon.data.ArrayDataset(images, labels) 26 | 27 | return mx.gluon.data.DataLoader(fake_dataset, batch_size=batch_size, num_workers=4, 28 | shuffle=True, last_batch='discard') 29 | 30 | 31 | @jit(nopython=True) 32 | def xorshift128p(state): 33 | t = state[0] 34 | s = state[1] 35 | state[0] = s 36 | t ^= t << np.uint64(23) 37 | t ^= t >> np.uint64(17) 38 | t ^= s ^ (s >> np.uint64(26)) 39 | state[1] = t 40 | return int(t + s) 41 | 42 | 43 | @jit(nopython=True) 44 | def bernoulli(p, state): 45 | t = p * np.iinfo(np.uint64).max 46 | r = np.array([xorshift128p(state) for _ in range(len(p))], dtype=np.float32) 47 | return r < t 48 | 49 | 50 | @jit(nopython=True) 51 | def randint(low, high, state): 52 | return xorshift128p(state) % (high - low) + low 
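
# --- Usage sketch (added for illustration; not part of the original test
# utilities). xorshift128p is deterministic for a given two-word seed state,
# which is what lets the compression tests reproduce the same random draws
# across runs:
if __name__ == "__main__":
    state = np.array([1, 2], dtype=np.uint64)  # any nonzero 128-bit seed
    probs = np.full(8, 0.5, dtype=np.float32)  # keep each element w.p. 0.5
    mask = bernoulli(probs, state)             # deterministic boolean mask
    k = randint(0, 8, state)                   # deterministic int in [0, 8)
    print(mask, k)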
53 | -------------------------------------------------------------------------------- /byteps/common/ready_table.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "ready_table.h" 17 | 18 | #include "logging.h" 19 | 20 | namespace byteps { 21 | namespace common { 22 | 23 | // below are methods for accessing/modifying the _ready_table 24 | bool ReadyTable::IsKeyReady(uint64_t key) { 25 | std::lock_guard<std::mutex> lock(_table_mutex); 26 | return _ready_table[key] == (_ready_count); 27 | } 28 | 29 | int ReadyTable::AddReadyCount(uint64_t key) { 30 | std::lock_guard<std::mutex> lock(_table_mutex); 31 | BPS_CHECK_LT(_ready_table[key], _ready_count) 32 | << _table_name << ": " << _ready_table[key] << ", " << (_ready_count); 33 | return ++_ready_table[key]; 34 | } 35 | 36 | int ReadyTable::SetReadyCount(uint64_t key, int cnt) { 37 | std::lock_guard<std::mutex> lock(_table_mutex); 38 | _ready_table[key] = cnt; 39 | return cnt;  // return the count just set 40 | } 41 | 42 | void ReadyTable::ClearReadyCount(uint64_t key) { 43 | std::lock_guard<std::mutex> lock(_table_mutex); 44 | _ready_table[key] = 0; 45 | } 46 | 47 | } // namespace common 48 | } // namespace byteps 49 | -------------------------------------------------------------------------------- /byteps/common/ready_table.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_READY_TABLE_H 17 | #define BYTEPS_READY_TABLE_H 18 | 19 | #include <mutex> 20 | #include <string> 21 | #include <unordered_map> 22 | 23 | namespace byteps { 24 | namespace common { 25 | 26 | class ReadyTable { 27 | public: 28 | ReadyTable(int ready_count, const char* name) { 29 | _ready_count = ready_count; 30 | _table_name = std::string(name); 31 | } 32 | // methods to access or modify the _ready_table 33 | bool IsKeyReady(uint64_t key); 34 | int AddReadyCount(uint64_t key); 35 | int SetReadyCount(uint64_t key, int cnt); 36 | void ClearReadyCount(uint64_t key); 37 | 38 | private: 39 | // (key, ready_signal_count) pair, only valid for root device 40 | std::unordered_map<uint64_t, int> _ready_table; 41 | // use this mutex to access/modify the _ready_table 42 | std::mutex _table_mutex; 43 | int _ready_count; 44 | std::string _table_name; 45 | }; 46 | 47 | } // namespace common 48 | } // namespace byteps 49 | 50 | #endif // BYTEPS_READY_TABLE_H 51 | -------------------------------------------------------------------------------- /byteps/mxnet/cuda_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #if HAVE_CUDA 18 | #include 19 | #include "cuda_runtime.h" 20 | #endif 21 | 22 | #include "../common/common.h" 23 | #include "cuda_util.h" 24 | #include "util.h" 25 | 26 | namespace byteps { 27 | namespace mxnet { 28 | 29 | with_device::with_device(int device) { 30 | if (device == CPU_DEVICE_ID) { 31 | restore_device_ = CPU_DEVICE_ID; 32 | } else { 33 | #if HAVE_CUDA 34 | CUDA_CALL(cudaGetDevice(&restore_device_)); 35 | CUDA_CALL(cudaSetDevice(device)); 36 | #else 37 | throw std::logic_error( 38 | "Internal error. Requested device context manager " 39 | "with GPU device but not compiled with CUDA."); 40 | #endif 41 | } 42 | } 43 | 44 | with_device::~with_device() { 45 | #if HAVE_CUDA 46 | if (restore_device_ != CPU_DEVICE_ID) { 47 | CUDA_CALL(cudaSetDevice(restore_device_)); 48 | } 49 | #endif 50 | } 51 | 52 | } // namespace mxnet 53 | } // namespace byteps 54 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | BytePS includes derived work from the following: 2 | 3 | Horovod 4 | Copyright 2018 Uber Technologies, Inc. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | kennethreitz/setup.py 19 | Copyright 2019 Kenneth Reitz 20 | 21 | Permission is hereby granted, free of charge, to any person obtaining 22 | a copy of this software and associated documentation files (the 23 | "Software"), to deal in the Software without restriction, including 24 | without limitation the rights to use, copy, modify, merge, publish, 25 | distribute, sublicense, and/or sell copies of the Software, and to 26 | permit persons to whom the Software is furnished to do so, subject to 27 | the following conditions: 28 | 29 | The above copyright notice and this permission notice shall be included 30 | in all copies or substantial portions of the Software. 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 33 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 34 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 35 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 36 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 37 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 38 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 39 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-devel-ubuntu18.04 2 | 3 | ARG https_proxy 4 | ARG http_proxy 5 | 6 | ARG BYTEPS_BASE_PATH=/usr/local 7 | ARG BYTEPS_PATH=$BYTEPS_BASE_PATH/byteps 8 | ARG BYTEPS_GIT_LINK=https://github.com/bytedance/byteps 9 | ARG BYTEPS_BRANCH=master 10 | 11 | ARG DEBIAN_FRONTEND=noninteractive 12 | RUN apt-get update 13 | RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 14 | build-essential \ 15 | tzdata \ 16 | ca-certificates \ 17 | git \ 18 | curl \ 19 | wget \ 20 | vim \ 21 | cmake \ 22 | lsb-release \ 23 | libcudnn7=7.6.0.64-1+cuda10.0 \ 24 | libnuma-dev \ 25 | ibverbs-providers \ 26 | librdmacm-dev \ 27 | ibverbs-utils \ 28 | rdmacm-utils \ 29 | libibverbs-dev \ 30 | python3 \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | libnccl2=2.4.7-1+cuda10.0 \ 35 | libnccl-dev=2.4.7-1+cuda10.0 36 | 37 | # install framework 38 | # note: for tf <= 1.14, you need gcc-4.9 39 | ARG FRAMEWORK=tensorflow 40 | RUN if [ "$FRAMEWORK" = "tensorflow" ]; then \ 41 | pip3 install --upgrade pip; \ 42 | pip3 install -U tensorflow-gpu==1.15.0; \ 43 | elif [ "$FRAMEWORK" = "pytorch" ]; then \ 44 | pip3 install -U numpy==1.18.1 torchvision==0.5.0 torch==1.4.0; \ 45 | elif [ "$FRAMEWORK" = "mxnet" ]; then \ 46 | pip3 install -U mxnet-cu100==1.5.0; \ 47 | else \ 48 | echo "unknown framework: $FRAMEWORK"; \ 49 | exit 1; \ 50 | fi 51 | 52 | ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH 53 | 54 | RUN cd $BYTEPS_BASE_PATH &&\ 55 | git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK &&\ 56 | cd $BYTEPS_PATH &&\ 57 | python3 setup.py install 58 | -------------------------------------------------------------------------------- /byteps/mxnet/ops.h: 
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #ifndef BYTEPS_MXNET_OPS_H
18 | #define BYTEPS_MXNET_OPS_H
19 | 
20 | #include <mxnet/base.h>
21 | #include <mxnet/c_api.h>
22 | #include <mxnet/c_api_error.h>
23 | #include <mxnet/engine.h>
24 | #include <mxnet/ndarray.h>
25 | #include "../common/common.h"
26 | 
27 | namespace byteps {
28 | namespace mxnet {
29 | 
30 | using namespace byteps::common;
31 | 
32 | typedef ::mxnet::Engine Engine;
33 | typedef ::mxnet::NDArray NDArray;
34 | typedef ::mxnet::Engine::CallbackOnComplete Callback;
35 | 
36 | extern "C" int byteps_mxnet_push_pull_async(NDArray* input, char* name,
37 |                                             int version, int priority,
38 |                                             bool is_average);
39 | 
40 | extern "C" void byteps_mxnet_declare_tensor(char* name, int num_args,
41 |                                             char** args_keys,
42 |                                             char** args_vals);
43 | 
44 | }  // namespace mxnet
45 | }  // namespace byteps
46 | 
47 | #endif  // BYTEPS_MXNET_OPS_H
48 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/nesterov_momentum.h:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H
17 | #define BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H
18 | 
19 | #include "../momentum.h"
20 | 
21 | namespace byteps {
22 | namespace common {
23 | namespace compressor {
24 | 
25 | /*!
26 |  * \brief Nesterov Momentum Compressor
27 |  *
28 |  * paper: A method for solving the convex programming problem with convergence
29 |  * rate $O(1/k^2)$
30 |  *
31 |  * m_t <- \mu m_{t-1} + g_t
32 |  * g_t <- \mu m_t + g_t
33 |  *
34 |  */
35 | class NesterovMomentumCompressor : public Momentum {
36 |  public:
37 |   NesterovMomentumCompressor(size_t size, DataType dtype,
38 |                              std::unique_ptr<Compressor> cptr, float mu)
39 |       : Momentum(size, dtype, std::move(cptr), mu) {}
40 |   virtual ~NesterovMomentumCompressor() = default;
41 | 
42 |  protected:
43 |   void UpdateMom(tensor_t grad) override;
44 |   void UpdateGradient(tensor_t grad) override;
45 | };
46 | 
47 | }  // namespace compressor
48 | }  // namespace common
49 | }  // namespace byteps
50 | 
51 | #endif  // BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H
--------------------------------------------------------------------------------
/byteps/common/scheduled_queue.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_SCHEDULED_QUEUE_H
17 | #define BYTEPS_SCHEDULED_QUEUE_H
18 | 
19 | #include <atomic>
20 | #include <memory>
21 | #include <mutex>
22 | #include <vector>
23 | #include "common.h"
24 | #include "ready_table.h"
25 | 
26 | namespace byteps {
27 | namespace common {
28 | 
29 | class BytePSScheduledQueue {
30 |  public:
31 |   BytePSScheduledQueue(QueueType type);
32 |   QueueType getQueueType() { return _qt; }
33 |   void addTask(std::shared_ptr<TensorTableEntry>);
34 |   void recorderTs(std::shared_ptr<TensorTableEntry>);
35 |   std::shared_ptr<TensorTableEntry> getTask();
36 |   std::shared_ptr<TensorTableEntry> getTask(uint64_t key);
37 |   uint32_t pendingSize();
38 |   void reportFinish(int size);
39 |   void reset(uint64_t key, int cnt);
40 | 
41 |  private:
42 |   // TODO: use priority queue or heap
43 |   std::vector<std::shared_ptr<TensorTableEntry>> _sq;
44 |   std::mutex _mutex;
45 |   uint64_t _credits;
46 |   bool _is_scheduled;
47 |   QueueType _qt;
48 |   ReadyTable *_rt;
49 | };
50 | 
51 | }  // namespace common
52 | }  // namespace byteps
53 | 
54 | #endif  // BYTEPS_SCHEDULED_QUEUE_H
55 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/compressor_registry.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H
17 | #define BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H
18 | 
19 | #include "compressor.h"
20 | #include "utils.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | namespace compressor {
25 | 
26 | class CompressorRegistry {
27 |  public:
28 |   // constructor of compressor
29 |   using ctor_t = std::function<std::unique_ptr<Compressor>(
30 |       const kwargs_t& kwargs, size_t size, DataType dtype)>;
31 | 
32 |   using map_t = std::unordered_map<std::string, ctor_t>;
33 | 
34 |   struct Register {
35 |     Register(std::string name, ctor_t ctor);
36 |   };
37 | 
38 |   static ctor_t Find(const std::string& name);
39 | 
40 |   static std::unique_ptr<Compressor> Create(const kwargs_t& kwargs, size_t size,
41 |                                             DataType dtype);
42 | 
43 |  private:
44 |   static map_t _ctor_map;
45 | 
46 |   CompressorRegistry() = delete;
47 |   ~CompressorRegistry() = delete;
48 | };
49 | 
50 | }  // namespace compressor
51 | }  // namespace common
52 | }  // namespace byteps
53 | 
54 | #endif  // BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H
--------------------------------------------------------------------------------
/docs/performance.md:
--------------------------------------------------------------------------------
1 | # BytePS Performance when training CNN
2 | 
3 | ## NVLink + TCP
4 | 
5 | We test two models: VGG16 (communication-intensive) and Resnet50 (computation-intensive) on a popular public cloud. Both models are trained using fp32.
6 | 
7 | We use Tesla V100 16GB GPUs and set the batch size to 64 *per GPU*. The machines are VMs on the cloud. Each machine has 8 V100 GPUs with NVLink enabled. Machines are inter-connected by a 20 Gbps TCP/IP network.
8 | 
9 | BytePS outperforms Horovod (NCCL) by 44% for Resnet50, and 100% for VGG16.
10 | 
11 | ![vgg16_tcp](https://user-images.githubusercontent.com/13852819/69873424-41e37500-12f3-11ea-93b8-705215e3e901.png)
12 | ![resnet50_tcp](https://user-images.githubusercontent.com/13852819/69873419-40b24800-12f3-11ea-9ff3-0f11347c089e.png)
13 | 
14 | You can reproduce the results using the Dockerfiles and example scripts we provide; see the build sketch at the end of this page.
15 | 
16 | ## PCIe + RDMA
17 | 
18 | Note: here we present the *worst case scenario* for BytePS, i.e., 100Gbps RDMA + no NVLinks.
19 | 
20 | We get the results below on machines that are based on a PCIe-switch architecture -- 4 GPUs under one PCIe switch, and each machine contains two PCIe switches.
21 | The machines are inter-connected by 100 Gbps RoCEv2 networks.
22 | In this case, BytePS outperforms Horovod (NCCL) by 7% for Resnet50, and 17% for VGG16.
23 | 
24 | ![perf_rdma_pcie_resnet50](https://user-images.githubusercontent.com/13852819/68925125-57b64d80-07bd-11ea-9f72-d108cf4294ad.png)
25 | 
26 | ![perf_rdma_pcie_vgg16](https://user-images.githubusercontent.com/13852819/68925175-70befe80-07bd-11ea-98d6-ca7df3670bbd.png)
27 | 
28 | 
29 | The margin over NCCL is this small only when you have a 100Gbps RDMA network *and* no NVLinks. In this setup, communication is actually bottlenecked by the internal PCIe switches, not the network. BytePS includes optimizations for this topology, which is why it still outperforms NCCL; the gain is simply not as large as in the cases where the network is the bottleneck.
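
To reproduce these numbers, the worker images can be built from the repository's `docker/Dockerfile`. Below is a minimal sketch; the image tag is illustrative, and `FRAMEWORK` is the build argument defined in that Dockerfile (tensorflow / pytorch / mxnet):

```
# Build a CUDA 10.0 worker image with the TensorFlow framework installed.
docker build -t byteps/worker:tensorflow \
    --build-arg FRAMEWORK=tensorflow \
    -f docker/Dockerfile .
```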
30 | 
--------------------------------------------------------------------------------
/docs/cross-barrier.md:
--------------------------------------------------------------------------------
1 | # Cross Global Barrier
2 | 
3 | BytePS eliminates the global barrier between training iterations for distributed training frameworks (e.g.,
4 | PyTorch), so that the priority-based communication scheduling in BytePS can be effective.
5 | 
6 | ## Why Cross the Barrier?
7 | 
8 | Existing distributed training frameworks (PyTorch, TensorFlow, etc.) do not fully exploit the potential of overlapping
9 | computation and communication to speed up neural network training: they only overlap communication with
10 | backward propagation. But due to the layer-wise dependencies in DNN training, we can actually schedule the gradient
11 | synchronization order based on when each gradient is consumed in the next iteration, and hence overlap communication with
12 | the forward propagation of the next iteration! Read the paper https://dl.acm.org/citation.cfm?id=3359642 for more
13 | details on communication scheduling.
14 | 
15 | To make this idea work, the first step is to remove the global barrier between two iterations and build layer-wise
16 | dependencies, so that the forward computation of the next step can start without waiting for synchronization
17 | to complete for all parameters.
18 | 
19 | Fig. 1 shows the dependency graph with the global barrier. Machine learning frameworks such as PyTorch and TensorFlow have
20 | similar dependencies when using BytePS for push and pull.
21 | 
22 | ![dag_barrier](https://user-images.githubusercontent.com/13852819/69863244-4b5ee400-12d7-11ea-9356-2dd41dff95ab.png)
23 | 
24 | *Fig.1: Dependency Graph With Global Barrier*
25 | 
26 | Fig. 2 shows the dependency graph after removing the global barrier. What we do here is change the dependency
27 | graph from Fig. 1 to Fig. 2 by removing the barrier and building layer-wise dependencies, while guaranteeing computation correctness.
28 | 
29 | 
30 | ![dag_without_barrier](https://user-images.githubusercontent.com/13852819/69863268-5d408700-12d7-11ea-8b39-5e48e3d94c2b.png)
31 | *Fig.2: Dependency Graph After Removing Global Barrier*
32 | 
33 | 
34 | 
35 | 
--------------------------------------------------------------------------------
/docs/troubleshooting.md:
--------------------------------------------------------------------------------
1 | # Troubleshooting
2 | 
3 | We suggest you read the Horovod troubleshooting guide, especially for problems during the build process. BytePS has almost the same dependencies as Horovod, minus MPI.
4 | 
5 | https://github.com/horovod/horovod/blob/v0.16.4/docs/troubleshooting.rst
6 | 
7 | ## Network connectivity
8 | 
9 | When launching distributed jobs, if the job hangs at the beginning, one possible reason is a network connectivity problem. You can use the `ps-lite` benchmark to verify connectivity.
10 | 
11 | Install ps-lite:
12 | 
13 | ```
14 | git clone -b byteps https://github.com/bytedance/ps-lite.git
15 | cd ps-lite
16 | make -j
17 | ```
18 | 
19 | 
20 | For the scheduler:
21 | ```
22 | export DMLC_ROLE=scheduler
23 | export DMLC_NUM_WORKER=1
24 | export DMLC_NUM_SERVER=1
25 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP]
26 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT]
27 | export DMLC_INTERFACE=eth0
28 | ./ps-lite/tests/test_benchmark
29 | ```
30 | 
31 | For the server:
32 | ```
33 | export DMLC_ROLE=server
34 | export DMLC_NUM_WORKER=1
35 | export DMLC_NUM_SERVER=1
36 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP]
37 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT]
38 | export DMLC_INTERFACE=eth0
39 | ./ps-lite/tests/test_benchmark
40 | ```
41 | 
42 | For the worker:
43 | ```
44 | export DMLC_ROLE=worker
45 | export DMLC_NUM_WORKER=1
46 | export DMLC_NUM_SERVER=1
47 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP]
48 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT]
49 | export DMLC_INTERFACE=eth0
50 | ./ps-lite/tests/test_benchmark 1024000 100 0
51 | ```
52 | 
53 | If it succeeds, you should be able to see something like this on the worker:
54 | ```
55 | push_byte=4096000, repeat=100, total_time=128.842ms
56 | pull_byte=4096000, repeat=100, total_time=353.38ms
57 | ```
58 | 
59 | (Note: for RDMA networks, use `make -j USE_RDMA=1` to build, and `export DMLC_ENABLE_RDMA=1` when running the scheduler / server / worker)
60 | 
61 | If it still hangs, you may need to check your network connectivity.
62 | 
--------------------------------------------------------------------------------
/docs/running.md:
--------------------------------------------------------------------------------
1 | # Running BytePS
2 | 
3 | BytePS follows the same running model as MXNet's PS implementation, and provides a script, launcher/launcher.py, to help you start individual processes. **The instructions below, including the DMLC variables, apply to all frameworks.**
4 | 
5 | Let's say you have two worker machines (or docker containers) that have GPUs, plus one machine or container as a server, and one as a scheduler. The scheduler binds to 10.0.0.1 on port 9000. The workers and the server can connect to the scheduler via that IP and port using TCP.
6 | 
7 | To use launcher/launcher.py, NVIDIA_VISIBLE_DEVICES must be set -- either automatically by nvidia-docker, or manually by you.
8 | 
9 | On worker 0, run:
10 | 
11 | ```
12 | DMLC_ROLE=worker DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \
13 | DMLC_WORKER_ID=0 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \
14 | bpslaunch YOUR_COMMAND
15 | ```
16 | 
17 | On worker 1, run (only DMLC_WORKER_ID is different from above):
18 | 
19 | ```
20 | DMLC_ROLE=worker DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \
21 | DMLC_WORKER_ID=1 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \
22 | bpslaunch YOUR_COMMAND
23 | ```
24 | 
25 | **For servers and schedulers, we highly recommend you use the docker image we build:**
26 | 
27 | ```
28 | docker pull bytepsimage/byteps_server
29 | ```
30 | 
31 | Start server and scheduler docker instances with this image. On the server, run the following. Compared with the worker command, we remove DMLC_WORKER_ID and set the role to server.
32 | 33 | ``` 34 | DMLC_ROLE=server DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 35 | DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 bpslaunch 36 | ``` 37 | 38 | On the scheduler, run (we also remove DMLC_WORKER_ID, and set role to scheduler): 39 | 40 | ``` 41 | DMLC_ROLE=scheduler DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 42 | DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 bpslaunch 43 | ``` 44 | 45 | In this example, your scheduler must be able to bind to `10.0.0.1:9000`. 46 | 47 | The order of starting workers/servers/scheduler does not matter. 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | .vscode 3 | *.gz 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | bin/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | # pycharm 112 | .idea 113 | 114 | # mac 115 | .DS_Store 116 | 117 | # for development 118 | scripts/ 119 | exps/ 120 | 121 | # dependency tarballs 122 | ucx.tar.gz 123 | zeromq-4.1.4.tar.gz 124 | -------------------------------------------------------------------------------- /byteps/tensorflow/ops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_TENSORFLOW_OPS_H
17 | #define BYTEPS_TENSORFLOW_OPS_H
18 | 
19 | #include <memory>
20 | 
21 | #include "tensorflow/core/framework/op.h"
22 | #include "tensorflow/core/framework/op_kernel.h"
23 | #include "tensorflow/core/framework/shape_inference.h"
24 | 
25 | #define EIGEN_USE_THREADS
26 | #include "tensorflow/stream_executor/stream.h"
27 | 
28 | #include "../common/operations.h"
29 | 
30 | namespace byteps {
31 | namespace tensorflow {
32 | 
33 | class TFReadyEvent : public common::ReadyEvent {
34 |  public:
35 |   TFReadyEvent(::tensorflow::DeviceContext* device_context);
36 |   bool Ready() const override;
37 | 
38 |  private:
39 |   std::shared_ptr<perftools::gputools::Event> event_;
40 | };
41 | 
42 | class TFTensor : public common::Tensor {
43 |  public:
44 |   TFTensor(::tensorflow::Tensor& tensor);
45 |   virtual const common::DataType dtype() const override;
46 |   virtual const common::TensorShape shape() const override;
47 |   virtual const void* data() const override;
48 |   virtual int64_t size() const override;
49 | 
50 |  protected:
51 |   ::tensorflow::Tensor tensor_;
52 | };
53 | 
54 | extern "C" void byteps_tensorflow_declare_tensor(char* name);
55 | 
56 | }  // namespace tensorflow
57 | }  // namespace byteps
58 | 
59 | #endif  // BYTEPS_TENSORFLOW_OPS_H
60 | 
--------------------------------------------------------------------------------
/byteps/mxnet/adapter.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #if HAVE_CUDA
18 | #include "cuda.h"
19 | #endif
20 | 
21 | #include "adapter.h"
22 | #include "cuda_util.h"
23 | #include "tensor_util.h"
24 | 
25 | namespace byteps {
26 | namespace mxnet {
27 | 
28 | 
29 | template <class T>
30 | MXTensor<T>::MXTensor(T* tensor) : tensor_(tensor) {}
31 | 
32 | template <class T>
33 | const DataType MXTensor<T>::dtype() const {
34 |   return TensorUtil::GetDType(tensor_);
35 | }
36 | 
37 | template <class T>
38 | const TensorShape MXTensor<T>::shape() const {
39 |   auto shape = TensorUtil::GetShape(tensor_);
40 |   if (shape.dims() == 0) {
41 |     // Tensor with empty shape is a Tensor with no values in MXNet, unlike a
42 |     // constant in TensorFlow. So, we inject a dummy zero dimension to make sure
43 |     // that the number-of-elements calculation is correct.
44 |     shape.AddDim(0);
45 |   }
46 |   return shape;
47 | }
48 | 
49 | template <class T>
50 | const void* MXTensor<T>::data() const {
51 |   return TensorUtil::GetData(tensor_);
52 | }
53 | 
54 | template <class T>
55 | int64_t MXTensor<T>::size() const {
56 |   return TensorUtil::GetSize(tensor_);
57 | }
58 | 
59 | template class MXTensor<NDArray>;
60 | 
61 | }  // namespace mxnet
62 | }  // namespace byteps
63 | 
--------------------------------------------------------------------------------
/byteps/common/shared_memory.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_SHARED_MEMORY_H
17 | #define BYTEPS_SHARED_MEMORY_H
18 | 
19 | #include <cuda_runtime.h>
20 | #include <fcntl.h>
21 | #include <sys/mman.h>
22 | #include <sys/stat.h>
23 | #include <sys/types.h>
24 | #include <unistd.h>
25 | #include <mutex>
26 | #include <string>
27 | #include <unordered_map>
28 | #include <vector>
29 | #include "logging.h"
30 | 
31 | namespace byteps {
32 | namespace common {
33 | 
34 | class BytePSSharedMemory {
35 |  public:
36 |   BytePSSharedMemory() {}
37 | 
38 |   ~BytePSSharedMemory() {
39 |     for (auto &it : _key_shm_addr) {
40 |       CUDA_CALL(cudaHostUnregister(it.second));
41 |       munmap(it.second, _key_shm_size[it.first]);
42 |       shm_unlink(it.first.c_str());
43 |     }
44 | 
45 |     BPS_LOG(DEBUG) << "Clear shared memory: all BytePS shared memory "
46 |                       "released/unregistered.";
47 |   }
48 | 
49 |   void *openSharedMemory(const std::string &prefix, uint64_t key, size_t size);
50 |   std::vector<void *> openPcieSharedMemory(uint64_t key, size_t size);
51 | 
52 |  private:
53 |   std::unordered_map<std::string, void *> _key_shm_addr;
54 |   std::unordered_map<std::string, size_t> _key_shm_size;
55 | 
56 |   std::mutex _shm_mu;
57 | };
58 | 
59 | }  // namespace common
60 | }  // namespace byteps
61 | 
62 | #endif  // BYTEPS_SHARED_MEMORY_H
63 | 
--------------------------------------------------------------------------------
/byteps/common/thread_pool.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copied from https://github.com/progschj/ThreadPool/blob/master/ThreadPool.h
3 |  */
4 | #ifndef THREAD_POOL_H
5 | #define THREAD_POOL_H
6 | 
7 | #include <vector>
8 | #include <queue>
9 | #include <memory>
10 | #include <thread>
11 | #include <mutex>
12 | #include <condition_variable>
13 | #include <future>
14 | #include <functional>
15 | #include <stdexcept>
16 | 
17 | class ThreadPool {
18 |  public:
19 |   ThreadPool(size_t);
20 |   template <class F>
21 |   void enqueue(F&& f);
22 |   ~ThreadPool();
23 | 
24 |  private:
25 |   // need to keep track of threads so we can join them
26 |   std::vector<std::thread> workers;
27 |   // the task queue
28 |   std::queue<std::function<void()> > tasks;
29 | 
30 |   // synchronization
31 |   std::mutex queue_mutex;
32 |   std::condition_variable condition;
33 |   bool stop;
34 | };
35 | 
36 | // the constructor just launches some amount of workers
37 | inline ThreadPool::ThreadPool(size_t threads) : stop(false) {
38 |   for (size_t i = 0; i < threads; ++i)
39 |     workers.emplace_back([this] {
40 |       for (;;) {
41 |         std::function<void()> task;
42 | 
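        // Block here until a task is available or the pool is shutting down.
        // The braces below scope the lock, so it is released before the task
        // actually runs.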
43 |         {
44 |           std::unique_lock<std::mutex> lock(this->queue_mutex);
45 |           this->condition.wait(
46 |               lock, [this] { return this->stop || !this->tasks.empty(); });
47 |           if (this->stop && this->tasks.empty()) return;
48 |           task = std::move(this->tasks.front());
49 |           this->tasks.pop();
50 |         }
51 | 
52 |         task();
53 |       }
54 |     });
55 | }
56 | 
57 | // add new work item to the pool
58 | template <class F>
59 | void ThreadPool::enqueue(F&& f) {
60 |   {
61 |     std::lock_guard<std::mutex> lock(queue_mutex);
62 |     if (stop) throw std::runtime_error("enqueue on stopped ThreadPool");
63 |     tasks.emplace(std::forward<F>(f));
64 |   }
65 |   condition.notify_one();
66 | }
67 | // the destructor joins all threads
68 | inline ThreadPool::~ThreadPool() {
69 |   {
70 |     std::unique_lock<std::mutex> lock(queue_mutex);
71 |     stop = true;
72 |   }
73 |   condition.notify_all();
74 |   for (std::thread& worker : workers) worker.join();
75 | }
76 | 
77 | #endif
78 | 
--------------------------------------------------------------------------------
/byteps/torch/handle_manager.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. All Rights Reserved.
2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #include "handle_manager.h"
18 | 
19 | namespace byteps {
20 | namespace torch {
21 | 
22 | int HandleManager::AllocateHandle() {
23 |   int handle = last_handle_.fetch_add(1) + 1;
24 |   std::lock_guard<std::mutex> guard(mutex_);
25 |   results_[handle] = nullptr;
26 |   return handle;
27 | }
28 | 
29 | void HandleManager::MarkDone(int handle, const Status& status) {
30 |   std::lock_guard<std::mutex> guard(mutex_);
31 |   results_[handle] = std::make_shared<Status>(status);
32 | }
33 | 
34 | bool HandleManager::PollHandle(int handle) {
35 |   std::lock_guard<std::mutex> guard(mutex_);
36 |   if (results_.find(handle) == results_.end()) {
37 |     throw std::invalid_argument("Handle " + std::to_string(handle) +
38 |                                 " was not created or has been cleared.");
39 |   }
40 |   return results_[handle] != nullptr;
41 | }
42 | 
43 | std::shared_ptr<Status> HandleManager::ReleaseHandle(int handle) {
44 |   std::lock_guard<std::mutex> guard(mutex_);
45 |   if (results_.find(handle) == results_.end()) {
46 |     throw std::invalid_argument("Handle " + std::to_string(handle) +
47 |                                 " was not created or has been cleared.");
48 |   }
49 |   auto status = results_[handle];
50 |   results_.erase(handle);
51 |   return status;
52 | }
53 | 
54 | }  // namespace torch
55 | }  // namespace byteps
56 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/nesterov_momentum.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #include "nesterov_momentum.h"
17 | #include "../compressor_registry.h"
18 | 
19 | namespace byteps {
20 | namespace common {
21 | namespace compressor {
22 | namespace {
23 | CompressorRegistry::Register reg(
24 |     "nesterov_momentum",
25 |     [](const kwargs_t& kwargs, size_t size,
26 |        DataType dtype) -> std::unique_ptr<Compressor> {
27 |       // register cptr
28 |       auto kwargs_clone = kwargs;
29 |       kwargs_clone.erase("momentum_type");
30 |       auto cptr = CompressorRegistry::Create(kwargs_clone, size, dtype);
31 |       BPS_CHECK_NE(cptr, nullptr);
32 |       // find \mu
33 |       auto mu = HyperParamFinder<float>(kwargs, "momentum_mu");
34 |       return std::unique_ptr<Compressor>(
35 |           new NesterovMomentumCompressor(size, dtype, std::move(cptr), mu));
36 |     });
37 | }
38 | 
39 | void NesterovMomentumCompressor::UpdateMom(tensor_t grad) {
40 |   // m_t = \mu * m_{t-1} + g_t
41 |   this->_cpu_reducer->sum(_mom.get(), grad.data, _mom.get(), grad.size,
42 |                           static_cast<DataType>(grad.dtype), _mu);
43 | }
44 | 
45 | void NesterovMomentumCompressor::UpdateGradient(tensor_t grad) {
46 |   // p_t = \mu m_t + g_t
47 |   this->_cpu_reducer->sum(grad.data, _mom.get(), grad.size,
48 |                           static_cast<DataType>(grad.dtype), _mu);
49 | }
50 | 
51 | }  // namespace compressor
52 | }  // namespace common
53 | }  // namespace byteps
--------------------------------------------------------------------------------
/example/mxnet/common/util.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | 
18 | import subprocess
19 | import os
20 | import errno
21 | 
22 | def download_file(url, local_fname=None, force_write=False):
23 |     # requests is not installed by default
24 |     import requests
25 |     if local_fname is None:
26 |         local_fname = url.split('/')[-1]
27 |     if not force_write and os.path.exists(local_fname):
28 |         return local_fname
29 | 
30 |     dir_name = os.path.dirname(local_fname)
31 | 
32 |     if dir_name != "":
33 |         if not os.path.exists(dir_name):
34 |             try: # try to create the directory if it doesn't exist
35 |                 os.makedirs(dir_name)
36 |             except OSError as exc:
37 |                 if exc.errno != errno.EEXIST:
38 |                     raise
39 | 
40 |     r = requests.get(url, stream=True)
41 |     assert r.status_code == 200, "failed to open %s" % url
42 |     with open(local_fname, 'wb') as f:
43 |         for chunk in r.iter_content(chunk_size=1024):
44 |             if chunk: # filter out keep-alive new chunks
45 |                 f.write(chunk)
46 |     return local_fname
47 | 
48 | def get_gpus():
49 |     """
50 |     return a list of GPUs
51 |     """
52 |     try:
53 |         re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True)
54 |     except OSError:
55 |         return []
56 |     return range(len([i for i in re.split('\n') if 'GPU' in i]))
57 | 
--------------------------------------------------------------------------------
/byteps/mxnet/tensor_util.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #ifndef BYTEPS_MXNET_TENSOR_UTIL_H
18 | #define BYTEPS_MXNET_TENSOR_UTIL_H
19 | 
20 | #include <mxnet/base.h>
21 | #include <mxnet/c_api.h>
22 | #include <mxnet/ndarray.h>
23 | #include "../common/common.h"
24 | #include "cuda_util.h"
25 | #include "util.h"
26 | 
27 | namespace byteps {
28 | namespace mxnet {
29 | 
30 | using namespace byteps::common;
31 | using namespace ::mxnet;
32 | 
33 | class TensorUtil {
34 |  public:
35 |   static const DataType GetDType(NDArray* tensor);
36 |   static const TensorShape GetShape(NDArray* tensor);
37 |   static const void* GetData(NDArray* tensor);
38 |   static int64_t GetSize(NDArray* tensor);
39 |   static int GetDevice(NDArray* tensor);
40 | 
41 |   static NDArray* New(int device, int dtype);
42 |   static void Free(NDArray* tensor);
43 |   static void Copy(NDArray* output, NDArray* tensor);
44 |   static void DivideTensorInPlace(NDArray* tensor, int value);
45 | 
46 | #if HAVE_CUDA
47 |   static void CopyCPUToCuda(NDArray* cpu, NDArray* cuda);
48 |   static void AsyncCopyCudaToCPU(NDArray* cuda, NDArray* cpu);
49 | #endif
50 | 
51 |  private:
52 |   static const size_t kFloat32Size = 4;
53 |   static const size_t kFloat64Size = 8;
54 |   static const size_t kFloat16Size = 2;
55 |   static const size_t kUInt8Size = 1;
56 |   static const size_t kInt32Size = 4;
57 |   static const size_t kInt8Size = 1;
58 |   static const size_t kInt64Size = 8;
59 | };
60 | 
61 | }  // namespace mxnet
62 | }  // namespace byteps
63 | 
64 | #endif  // BYTEPS_MXNET_TENSOR_UTIL_H
65 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/compressor_registry.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #include "compressor_registry.h"
17 | 
18 | namespace byteps {
19 | namespace common {
20 | namespace compressor {
21 | 
22 | CompressorRegistry::map_t CompressorRegistry::_ctor_map;
23 | 
24 | CompressorRegistry::Register::Register(std::string name, ctor_t ctor) {
25 |   BPS_CHECK_EQ(_ctor_map.count(name), 0)
26 |       << "Duplicate registration of compressor under name " << name;
27 |   _ctor_map.emplace(name + "_type", std::move(ctor));
28 |   BPS_LOG(INFO) << name << " compressor is registered";
29 | }
30 | 
31 | CompressorRegistry::ctor_t CompressorRegistry::Find(const std::string& name) {
32 |   auto it = _ctor_map.find(name);
33 |   if (it == _ctor_map.end()) {
34 |     BPS_LOG(FATAL) << "No compressor registered under name:" << name;
35 |   }
36 |   return it->second;
37 | }
38 | 
39 | std::unique_ptr<Compressor> CompressorRegistry::Create(const kwargs_t& kwargs,
40 |                                                        size_t size, DataType dtype) {
41 | #ifndef BYTEPS_BUILDING_SERVER
42 |   const std::string types[] = {"momentum_type", "ef_type", "compressor_type"};
43 | #else
44 |   // the server does not need momentum
45 |   const std::string types[] = {"ef_type", "compressor_type"};
46 | #endif
47 |   for (auto& type : types) {
48 |     auto iter = kwargs.find(type);
49 |     if (iter != kwargs.end()) {
50 |       auto ctor = CompressorRegistry::Find(iter->second + "_" + type);
51 |       return ctor(kwargs, size, dtype);
52 |     }
53 |   }
54 | 
55 |   return nullptr;
56 | }
57 | 
58 | }  // namespace compressor
59 | }  // namespace common
60 | }  // namespace byteps
--------------------------------------------------------------------------------
/byteps/torch/ops.h:
--------------------------------------------------------------------------------
1 | // Copyright 2018 ByteDance, Inc. All Rights Reserved.
2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #ifndef BYTEPS_TORCH_OPS_H
18 | #define BYTEPS_TORCH_OPS_H
19 | 
20 | #include <TH/TH.h>
21 | 
22 | #if HAVE_CUDA
23 | #include <THC/THC.h>
24 | #endif
25 | 
26 | #include "../common/operations.h"
27 | 
28 | namespace byteps {
29 | namespace torch {
30 | 
31 | using namespace byteps::common;
32 | 
33 | std::mutex mutex_;
34 | /* total number of gradients to push-pull */
35 | size_t num_grads_;
36 | /* number of push-pulls that have been triggered */
37 | size_t grad_count_;
38 | 
39 | #define PUSHPULL_H(torch_Tensor, THTensor)                          \
40 |   extern "C" int byteps_torch_push_pull_async_##torch_Tensor(      \
41 |       THTensor* tensor, THTensor* output, int average, char* name, \
42 |       int version, int priority);
43 | 
44 | PUSHPULL_H(torch_ByteTensor, THByteTensor)
45 | PUSHPULL_H(torch_IntTensor, THIntTensor)
46 | PUSHPULL_H(torch_LongTensor, THLongTensor)
47 | PUSHPULL_H(torch_FloatTensor, THFloatTensor)
48 | PUSHPULL_H(torch_DoubleTensor, THDoubleTensor)
49 | 
50 | #if HAVE_CUDA
51 | PUSHPULL_H(torch_cuda_ByteTensor, THCudaByteTensor)
52 | PUSHPULL_H(torch_cuda_IntTensor, THCudaIntTensor)
53 | PUSHPULL_H(torch_cuda_LongTensor, THCudaLongTensor)
54 | PUSHPULL_H(torch_cuda_FloatTensor, THCudaTensor)
55 | PUSHPULL_H(torch_cuda_DoubleTensor, THCudaDoubleTensor)
56 | #endif
57 | 
58 | extern "C" int byteps_torch_poll(int handle);
59 | extern "C" void byteps_torch_wait_and_clear(int handle);
60 | extern "C" void byteps_torch_declare_tensor(char* name);
61 | 
62 | }  // namespace torch
63 | }  // namespace byteps
64 | 
65 | #endif  // BYTEPS_TORCH_OPS_H
66 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/vanilla_error_feedback.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H
17 | #define BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H
18 | 
19 | #include "../error_feedback.h"
20 | 
21 | namespace byteps {
22 | namespace common {
23 | namespace compressor {
24 | 
25 | /*!
26 |  * \brief Vanilla Error Feedback Compressor
27 |  *
28 |  * paper: Communication-efficient distributed blockwise momentum sgd with
29 |  * error-feedback
30 |  * https://arxiv.org/pdf/1905.10936.pdf
31 |  *
32 |  * each worker i:
33 |  *    p_{t,i} <- g_{t,i} + \frac{\eta_{t-1}}{\eta_t} e_{t,i}
34 |  *    c_{t,i} <- Compress(p_{t,i})
35 |  *    e_{t,i} <- p_{t,i} - c_{t,i}
36 |  *
37 |  * server:
38 |  *    \tilde{p}_{t} <- \frac{1}{M} \sum_{i=1}^{M} c_{t,i}
39 |  *                     + \frac{\eta_{t-1}}{\eta_{t}} \tilde{e}_{t}
40 |  *    \tilde{e}_{t+1} <- \tilde{p}_{t} - \tilde{c}_{t}
41 |  *
42 |  * Error-correction: the error needs to be scaled with \frac{\eta_{t-1}}{\eta_t}.
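 *
 * For example, with a constant learning rate the scale factor
 * \frac{\eta_{t-1}}{\eta_t} equals 1, and the worker update reduces to
 * plain error accumulation: p_{t,i} <- g_{t,i} + e_{t,i}.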
43 |  */
44 | class VanillaErrorFeedbackCompressor : public ErrorFeedback {
45 |  public:
46 |   VanillaErrorFeedbackCompressor(size_t size, DataType dtype,
47 |                                  std::unique_ptr<Compressor> cptr);
48 |   virtual ~VanillaErrorFeedbackCompressor();
49 | 
50 |  protected:
51 |   void UpdateGradient(tensor_t grad) override;
52 | 
53 |  private:
54 |   /*!
55 |    * \brief learning rate
56 |    *
57 |    * read from file each step
58 |    */
59 |   double _pre_lr, _cur_lr;
60 | 
61 |   int _fd;
62 |   void* _mm;
63 | };
64 | }  // namespace compressor
65 | }  // namespace common
66 | }  // namespace byteps
67 | 
68 | #endif  // BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H
--------------------------------------------------------------------------------
/example/mxnet/data/caltech256.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | 
20 | 
21 | # This file downloads the caltech 256 dataset
22 | # (http://www.vision.caltech.edu/Image_Datasets/Caltech256/), and splits it into
23 | # the train and val rec files.
24 | 
25 | # number of images per class for training
26 | IMG_TRAIN=60
27 | 
28 | # download
29 | if [ !
-e 256_ObjectCategories.tar ]; then
30 |     wget http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
31 | fi
32 | 
33 | # split into train and val set
34 | tar -xf 256_ObjectCategories.tar
35 | TRAIN_DIR=caltech_256_train
36 | mkdir -p ${TRAIN_DIR}
37 | for i in 256_ObjectCategories/*; do
38 |     c=`basename $i`
39 |     echo "splitting $c"
40 |     mkdir -p ${TRAIN_DIR}/$c
41 |     for j in `ls $i/*.jpg | shuf | head -n ${IMG_TRAIN}`; do
42 |         mv $j ${TRAIN_DIR}/$c/
43 |     done
44 | done
45 | 
46 | # generate lst files
47 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
48 | MX_DIR=${CUR_DIR}/../../../
49 | python ${MX_DIR}/tools/im2rec.py --list --recursive caltech256-train ${TRAIN_DIR}/
50 | python ${MX_DIR}/tools/im2rec.py --list --recursive caltech256-val 256_ObjectCategories/
51 | mv caltech256-train_train.lst caltech256-train.lst
52 | rm caltech256-train_*
53 | mv caltech256-val_train.lst caltech256-val.lst
54 | rm caltech256-val_*
55 | 
56 | # generate rec files
57 | python ${MX_DIR}/tools/im2rec.py --resize 256 --quality 95 --num-thread 16 caltech256-val 256_ObjectCategories/
58 | python ${MX_DIR}/tools/im2rec.py --resize 256 --quality 95 --num-thread 16 caltech256-train ${TRAIN_DIR}/
59 | 
60 | # clean
61 | rm -rf ${TRAIN_DIR} 256_ObjectCategories/
62 | 
--------------------------------------------------------------------------------
/example/mxnet/train_imagenet_byteps.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | 
20 | import os
21 | import argparse
22 | import logging
23 | logging.basicConfig(level=logging.DEBUG)
24 | from common import find_mxnet
25 | from common import data_byteps as data
26 | from common import fit_byteps as fit
27 | from common.util import download_file
28 | import byteps.mxnet as bps
29 | import mxnet as mx
30 | 
31 | if __name__ == '__main__':
32 |     # init byteps
33 |     bps.init()
34 | 
35 |     # parse args
36 |     parser = argparse.ArgumentParser(description="train imagenet-1k",
37 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
38 |     fit.add_fit_args(parser)
39 |     data.add_data_args(parser)
40 |     data.add_data_aug_args(parser)
41 |     # use a large aug level
42 |     data.set_data_aug_level(parser, 3)
43 |     parser.set_defaults(
44 |         # network
45 |         network = 'resnet',
46 |         num_layers = 50,
47 |         # data
48 |         num_classes = 1000,
49 |         num_examples = 1281167,
50 |         image_shape = '3,224,224',
51 |         min_random_scale = 1, # if input image has min size k, we suggest using
52 |                               # 256.0/x, e.g.
0.533 for 480
53 |         # train
54 |         num_epochs = 80,
55 |         lr_step_epochs = '30,60',
56 |         dtype = 'float32'
57 |     )
58 |     args = parser.parse_args()
59 | 
60 |     # load network
61 |     from importlib import import_module
62 |     net = import_module('symbols.'+args.network)
63 |     sym = net.get_symbol(**vars(args))
64 | 
65 |     # train
66 |     fit.fit(args, sym, data.get_rec_iter)
67 | 
--------------------------------------------------------------------------------
/example/tensorflow/tensorflow2_mnist.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import byteps.tensorflow as bps
3 | 
4 | bps.init()
5 | 
6 | # BytePS: pin GPU to be used to process local rank (one GPU per process)
7 | gpus = tf.config.experimental.list_physical_devices('GPU')
8 | for gpu in gpus:
9 |     tf.config.experimental.set_memory_growth(gpu, True)
10 | if gpus:
11 |     tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU')
12 | 
13 | # Before launching, you need to first download the dataset to ~/.keras/datasets
14 | (mnist_images, mnist_labels), _ = \
15 |     tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % bps.rank())
16 | 
17 | dataset = tf.data.Dataset.from_tensor_slices(
18 |     (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
19 |      tf.cast(mnist_labels, tf.int64))
20 | )
21 | dataset = dataset.repeat().shuffle(10000).batch(128)
22 | 
23 | mnist_model = tf.keras.Sequential([
24 |     tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
25 |     tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
26 |     tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
27 |     tf.keras.layers.Dropout(0.25),
28 |     tf.keras.layers.Flatten(),
29 |     tf.keras.layers.Dense(128, activation='relu'),
30 |     tf.keras.layers.Dropout(0.5),
31 |     tf.keras.layers.Dense(10, activation='softmax')
32 | ])
33 | loss = tf.losses.SparseCategoricalCrossentropy()
34 | 
35 | opt = tf.optimizers.Adam(0.001 * bps.size())
36 | 
37 | checkpoint_dir = './checkpoints'
38 | checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)
39 | 
40 | 
41 | @tf.function
42 | def training_step(images, labels, first_batch):
43 |     with tf.GradientTape() as tape:
44 |         probs = mnist_model(images, training=True)
45 |         loss_value = loss(labels, probs)
46 | 
47 |     tape = bps.DistributedGradientTape(tape)
48 | 
49 |     grads = tape.gradient(loss_value, mnist_model.trainable_variables)
50 |     opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
51 | 
52 |     # Note: broadcast should be done after the first gradient step to ensure optimizer
53 |     # initialization.
54 |     if first_batch:
55 |         bps.broadcast_variables(mnist_model.variables, root_rank=0)
56 |         bps.broadcast_variables(opt.variables(), root_rank=0)
57 | 
58 |     return loss_value
59 | 
60 | 
61 | # BytePS: adjust number of steps based on number of GPUs.
62 | for batch, (images, labels) in enumerate(dataset.take(10000 // bps.size())):
63 |     loss_value = training_step(images, labels, batch == 0)
64 | 
65 |     if batch % 10 == 0 and bps.local_rank() == 0:
66 |         print('Step #%d\tLoss: %.6f' % (batch, loss_value))
67 | 
68 | if bps.rank() == 0:
69 |     checkpoint.save(checkpoint_dir)
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/vanilla_error_feedback.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #include <fcntl.h>
17 | #include <sys/mman.h>
18 | #include <sys/stat.h>
19 | #include <unistd.h>
20 | 
21 | #include "../compressor_registry.h"
22 | #include "vanilla_error_feedback.h"
23 | 
24 | namespace byteps {
25 | namespace common {
26 | namespace compressor {
27 | namespace {
28 | CompressorRegistry::Register reg(
29 |     "vanilla_ef",
30 |     [](const kwargs_t& kwargs, size_t size,
31 |        DataType dtype) -> std::unique_ptr<Compressor> {
32 |       // register cptr
33 |       auto kwargs_clone = kwargs;
34 |       kwargs_clone.erase("ef_type");
35 |       auto cptr = CompressorRegistry::Create(kwargs_clone, size, dtype);
36 |       BPS_CHECK_NE(cptr, nullptr);
37 |       return std::unique_ptr<Compressor>(
38 |           new VanillaErrorFeedbackCompressor(size, dtype, std::move(cptr)));
39 |     });
40 | }
41 | 
42 | VanillaErrorFeedbackCompressor::VanillaErrorFeedbackCompressor(
43 |     size_t size, DataType dtype, std::unique_ptr<Compressor> cptr)
44 |     : ErrorFeedback(size, dtype, std::move(cptr)) {
45 |   _fd = open("lr.s", O_RDONLY);
46 |   BPS_CHECK(_fd > 0) << "open lr.s failed, errno=" << strerror(errno);
47 |   void* ptr = mmap(0, 8, PROT_READ, MAP_SHARED, _fd, 0);
48 |   BPS_CHECK_NE(ptr, MAP_FAILED) << "mmap failed, errno=" << strerror(errno);
49 |   _mm = ptr;
50 |   _pre_lr = _cur_lr = *reinterpret_cast<double*>(_mm);
51 | }
52 | 
53 | VanillaErrorFeedbackCompressor::~VanillaErrorFeedbackCompressor() {
54 |   munmap(_mm, 8);
55 |   close(_fd);
56 | }
57 | 
58 | void VanillaErrorFeedbackCompressor::UpdateGradient(tensor_t grad) {
59 |   _cur_lr = *reinterpret_cast<double*>(_mm);
60 |   this->_cpu_reducer->sum(grad.data, _error.get(), grad.size,
61 |                           static_cast<DataType>(grad.dtype),
62 |                           (_pre_lr / _cur_lr));
63 |   _pre_lr = _cur_lr;
64 | }
65 | 
66 | }  // namespace compressor
67 | }  // namespace common
68 | }  // namespace byteps
--------------------------------------------------------------------------------
/byteps/torch/compression.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Bytedance Inc. All Rights Reserved.
2 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================== 16 | """Gradient compression algorithms.""" 17 | 18 | import torch 19 | 20 | 21 | class Compressor(object): 22 | """Interface for compressing and decompressing a given tensor.""" 23 | @staticmethod 24 | def compress(tensor): 25 | """Compresses a tensor and returns it with the context needed to decompress it.""" 26 | pass 27 | 28 | @staticmethod 29 | def decompress(tensor, ctx): 30 | """Decompress the tensor with the given context.""" 31 | pass 32 | 33 | 34 | class NoneCompressor(Compressor): 35 | """Default no-op compression.""" 36 | @staticmethod 37 | def compress(tensor): 38 | """Returns the tensor unmodified.""" 39 | return tensor, None 40 | 41 | @staticmethod 42 | def decompress(tensor, ctx): 43 | """Returns the tensor unmodified.""" 44 | return tensor 45 | 46 | 47 | class FP16Compressor(Compressor): 48 | """Compress all floating point gradients to 16-bit.""" 49 | @staticmethod 50 | def compress(tensor): 51 | """Downcasts the tensor to 16-bit.""" 52 | tensor_compressed = tensor 53 | if tensor.dtype.is_floating_point: 54 | # Only allow compression from other floating point types 55 | tensor_compressed = tensor.type(torch.float16) 56 | return tensor_compressed, tensor.dtype 57 | 58 | @staticmethod 59 | def decompress(tensor, ctx): 60 | """Upcasts the tensor to the initialization dtype.""" 61 | tensor_decompressed = tensor 62 | dtype = ctx 63 | if dtype.is_floating_point: 64 | tensor_decompressed = tensor.type(dtype) 65 | return tensor_decompressed 66 | 67 | 68 | class Compression(object): 69 | """Optional gradient compression algorithm used during push_pull.""" 70 | 71 | """Do not compress the gradients. This is the default.""" 72 | none = NoneCompressor 73 | 74 | """Compress all floating point gradients to 16-bit.""" 75 | fp16 = FP16Compressor 76 | -------------------------------------------------------------------------------- /byteps/tensorflow/compression.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================== 16 | """Gradient compression algorithms.""" 17 | 18 | import tensorflow as tf 19 | 20 | 21 | class Compressor(object): 22 | """Interface for compressing and decompressing a given tensor.""" 23 | @staticmethod 24 | def compress(tensor): 25 | """Compresses a tensor and returns it with the context needed to decompress it.""" 26 | pass 27 | 28 | @staticmethod 29 | def decompress(tensor, ctx): 30 | """Decompress the tensor with the given context.""" 31 | pass 32 | 33 | 34 | class NoneCompressor(Compressor): 35 | """Default no-op compression.""" 36 | @staticmethod 37 | def compress(tensor): 38 | """Returns the tensor unmodified.""" 39 | return tensor, None 40 | 41 | @staticmethod 42 | def decompress(tensor, ctx): 43 | """Returns the tensor unmodified.""" 44 | return tensor 45 | 46 | 47 | class FP16Compressor(Compressor): 48 | """Compress all floating point gradients to 16-bit.""" 49 | @staticmethod 50 | def compress(tensor): 51 | """Downcasts the tensor to 16-bit.""" 52 | tensor_compressed = tensor 53 | if tensor.dtype.is_floating: 54 | # Only allow compression from other floating point types 55 | tensor_compressed = tf.cast(tensor, dtype=tf.float16) 56 | return tensor_compressed, tensor.dtype 57 | 58 | @staticmethod 59 | def decompress(tensor, ctx): 60 | """Upcasts the tensor to the initialization dtype.""" 61 | tensor_decompressed = tensor 62 | dtype = ctx 63 | if dtype.is_floating: 64 | tensor_decompressed = tf.cast(tensor, dtype=dtype) 65 | return tensor_decompressed 66 | 67 | 68 | class Compression(object): 69 | """Optional gradient compression algorithm used during push_pull.""" 70 | 71 | """Do not compress the gradients. This is the default.""" 72 | none = NoneCompressor 73 | 74 | """Compress all floating point gradients to 16-bit.""" 75 | fp16 = FP16Compressor 76 | -------------------------------------------------------------------------------- /byteps/torch/adapter.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #include "adapter.h" 18 | #include "cuda_util.h" 19 | 20 | namespace byteps { 21 | namespace torch { 22 | 23 | TorchTensor::TorchTensor(::torch::Tensor tensor) : tensor_(tensor) {} 24 | 25 | const DataType TorchTensor::dtype() const { 26 | switch (tensor_.scalar_type()) { 27 | case ::torch::kByte: 28 | return DataType::BYTEPS_UINT8; 29 | case ::torch::kChar: 30 | return DataType::BYTEPS_INT8; 31 | // case ::torch::kShort: 32 | // return DataType::BYTEPS_INT16; 33 | case ::torch::kInt: 34 | return DataType::BYTEPS_INT32; 35 | case ::torch::kLong: 36 | return DataType::BYTEPS_INT64; 37 | case ::torch::kHalf: 38 | return DataType::BYTEPS_FLOAT16; 39 | case ::torch::kFloat: 40 | return DataType::BYTEPS_FLOAT32; 41 | case ::torch::kDouble: 42 | return DataType::BYTEPS_FLOAT64; 43 | default: 44 | throw std::logic_error("Invalid or unsupported tensor type."); 45 | } 46 | } 47 | 48 | const TensorShape TorchTensor::shape() const { 49 | TensorShape shape; 50 | for (int idx = 0; idx < tensor_.dim(); ++idx) { 51 | shape.AddDim(tensor_.size(idx)); 52 | } 53 | return shape; 54 | } 55 | 56 | const void* TorchTensor::data() const { return tensor_.data_ptr(); } 57 | 58 | int64_t TorchTensor::size() const { 59 | #if TORCH_VERSION >= 1001000000 60 | return tensor_.element_size() * tensor_.numel(); 61 | #else 62 | return tensor_.type().elementSizeInBytes() * tensor_.numel(); 63 | #endif 64 | } 65 | 66 | void ThrowIfError(Status status) { 67 | switch (status.type()) { 68 | case StatusType::OK: 69 | return; 70 | case StatusType::PRECONDITION_ERROR: 71 | throw std::logic_error(status.reason()); 72 | case StatusType::ABORTED: 73 | throw std::runtime_error(status.reason()); 74 | case StatusType::INVALID_ARGUMENT: 75 | throw std::invalid_argument(status.reason()); 76 | default: // Includes UNKNOWN_ERROR 77 | throw std::runtime_error(status.reason()); 78 | } 79 | } 80 | 81 | } // namespace torch 82 | } // namespace byteps 83 | -------------------------------------------------------------------------------- /example/tensorflow/tensorflow2_mnist_bps_MirroredStrategy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import json 4 | import os 5 | import sys 6 | import argparse 7 | import byteps.tensorflow as bps 8 | from byteps.tensorflow.distribute import MirroredStrategy 9 | 10 | parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 12 | parser.add_argument('--rank', default=-1, type=int, 13 | help='node rank for distributed training') 14 | args = parser.parse_args() 15 | 16 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) 17 | 18 | bps.init() 19 | args.rank = bps.local_rank() 20 | print("my rank ", args.rank) 21 | 22 | gpus = tf.config.experimental.list_physical_devices('GPU') 23 | for gpu in gpus: 24 | tf.config.experimental.set_memory_growth(gpu, True) 25 | if gpus: 26 | tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU') 27 | 28 | def mnist_dataset(batch_size): 29 | (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() 30 | # The `x` arrays are in uint8 and have values in the range [0, 255]. 
31 |     # We need to convert them to float32 with values in the range [0, 1].
32 |     x_train = x_train / np.float32(255)
33 |     y_train = y_train.astype(np.int64)
34 |     train_dataset = tf.data.Dataset.from_tensor_slices(
35 |         (x_train, y_train)).shuffle(60000).repeat().batch(batch_size)
36 |     return train_dataset
37 | 
38 | def build_and_compile_cnn_model():
39 |     model = tf.keras.Sequential([
40 |         tf.keras.Input(shape=(28, 28)),
41 |         tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
42 |         tf.keras.layers.Conv2D(32, 3, activation='relu'),
43 |         tf.keras.layers.Flatten(),
44 |         tf.keras.layers.Dense(128, activation='relu'),
45 |         tf.keras.layers.Dense(10)
46 |     ])
47 |     model.compile(
48 |         loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
49 |         optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
50 |         metrics=['accuracy'])
51 |     return model
52 | 
53 | per_worker_batch_size = 64
54 | 
55 | strategy = MirroredStrategy(devices=["/gpu:0"])
56 | 
57 | 
58 | num_workers = 1
59 | 
60 | 
61 | # `tf.data.Dataset.batch` expects the global batch size, so we scale the
62 | # per-worker batch size by the number of workers. With num_workers = 1,
63 | # the global batch size stays at 64.
64 | global_batch_size = per_worker_batch_size * num_workers
65 | multi_worker_dataset = mnist_dataset(global_batch_size)
66 | 
67 | with strategy.scope():
68 |     # Model building/compiling need to be within `strategy.scope()`.
69 |     multi_worker_model = build_and_compile_cnn_model()
70 | 
71 | # Keras' `model.fit()` trains the model with the specified number of epochs and
72 | # steps per epoch. Note that the numbers here are for demonstration purposes
73 | # only and may not produce a model of good quality.
74 | multi_worker_model.fit(multi_worker_dataset, epochs=100, steps_per_epoch=70)
--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
1 | # BytePS Architecture
2 | 
3 | We highly recommend reading [BytePS's rationale](./rationale.md) before this doc.
4 | 
5 | From the application's point of view, BytePS is a communication library, just like Horovod. The plugins handle framework-specific transformations (e.g., on data structures) and
6 | put communication tasks into BytePS priority queues. The BytePS Core then picks up the tasks (priority-aware, not FIFO) and handles the actual communication.
7 | 
8 | ![byteps_architecture](https://user-images.githubusercontent.com/13852819/69873605-c3d39e00-12f3-11ea-942d-97af2606bb40.png)
9 | 
10 | 
11 | ## General Workflow
12 | To demonstrate the workflow of BytePS, below we use a common data-parallel training scenario as an example. Say we have multiple worker machines (we refer to them as "**workers**"), and each machine (worker) has multiple GPUs. We also have some CPU machines that serve as the PS (we refer to them as "**servers**").
13 | 
14 | In BytePS, a general walk-through of an iteration goes like this (we call each step a **stage**; a pseudocode sketch follows the list):
15 | 
16 | 1. **Computation**: Each GPU performs computation (forward/backward propagation), which is irrelevant to BytePS;
17 | 2. **Local Reduce**: Multiple GPUs on the same machine reduce the gradients;
18 | 3. **Push**: The workers push the aggregated gradients to the servers;
19 | 4. **Global Reduce**: Once the servers receive the gradients from different workers, they aggregate the gradients;
20 | 5. **Pull**: The workers pull the aggregated gradients from the servers;
21 | 6. **Local Broadcast**: The workers broadcast the aggregated gradients to their local GPUs;
22 | 7. Go to the next iteration and repeat from step 1.
23 | 
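24 | Here is a minimal pseudocode sketch of one iteration from a single worker's perspective (the function names are illustrative pseudocode, not BytePS APIs):
25 | 
26 | ```python
27 | for step in range(num_steps):
28 |     grads = [gpu.backward(next_batch()) for gpu in local_gpus]  # 1. Computation
29 |     shard = nccl_reduce_scatter(grads)   # 2. Local Reduce
30 |     push(shard)                          # 3. Push
31 |     # the servers sum the shards from all workers: 4. Global Reduce
32 |     agg = pull()                         # 5. Pull
33 |     full_grad = nccl_all_gather(agg)     # 6. Local Broadcast
34 |     optimizer.step(full_grad)            # the SGD update stays on the worker
35 | ```
36 | 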
37 | ## Local Communication
38 | 
39 | We use NCCL for local communication, i.e., for the **Local Reduce** and **Local Broadcast** stages.
40 | 
41 | For the **Local Reduce** stage, we use [ReduceScatter](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#reducescatter) to evenly distribute the gradients across multiple GPUs.
42 | 
43 | For the **Local Broadcast** stage, we use [AllGather](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allgather) to broadcast the gradients back to multiple GPUs.
44 | 
45 | ## Distributed Communication
46 | 
47 | We use [ps-lite](https://github.com/bytedance/ps-lite/tree/byteps) for the **Push** and **Pull** stages between workers and servers.
48 | 
49 | For the **Push** stage, the workers send the gradients to the servers, as in a traditional PS.
50 | 
51 | For the **Pull** stage, the workers pull gradients rather than parameters from the servers, which differs from a traditional PS. Here is why:
52 | 
53 | Traditionally, the SGD update is performed on the servers, so the workers need to tell the servers which SGD optimizer to use. However, across frameworks even the same optimizer algorithm may be implemented in completely different ways, not to mention the many user-defined optimizers. So BytePS moves the SGD update from the servers to the workers, leaving the servers to do only gradient reduction. We believe this is generic, because it applies to all frameworks we know so far.
54 | 
55 | 
56 | 
--------------------------------------------------------------------------------
/example/keras/keras_synthetic_benchmark_tf2.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 | 
3 | import argparse
4 | import os
5 | import numpy as np
6 | import timeit
7 | 
8 | import tensorflow as tf
9 | import byteps.tensorflow.keras as bps
10 | from tensorflow.keras import applications
11 | 
12 | tf.compat.v1.disable_eager_execution()
13 | 
14 | # Benchmark settings
15 | parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark',
16 |                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
17 | parser.add_argument('--fp16-allreduce', action='store_true', default=False,
18 |                     help='use fp16 compression during allreduce')
19 | 
20 | parser.add_argument('--model', type=str, default='ResNet50',
21 |                     help='model to benchmark')
22 | parser.add_argument('--batch-size', type=int, default=32,
23 |                     help='input batch size')
24 | 
25 | parser.add_argument('--num-warmup-batches', type=int, default=10,
26 |                     help='number of warm-up batches that don\'t count towards benchmark')
27 | parser.add_argument('--num-batches-per-iter', type=int, default=10,
28 |                     help='number of batches per benchmark iteration')
29 | parser.add_argument('--num-iters', type=int, default=10,
30 |                     help='number of benchmark iterations')
31 | 
32 | parser.add_argument('--no-cuda', action='store_true', default=False,
33 |                     help='disables CUDA training')
34 | 
35 | args = parser.parse_args()
36 | args.cuda = not args.no_cuda
37 | 
38 | bps.init()
39 | 
40 | # pin GPU to be used to process local rank (one GPU per process)
41 | if args.cuda:
42 |     gpus = tf.config.experimental.list_physical_devices('GPU')
43 |     for gpu in gpus:
44 |         tf.config.experimental.set_memory_growth(gpu, True)
45 |     if gpus:
46 |         tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU')
47 | else:
48 |     os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
49 | 
50 | data = tf.random.uniform([args.batch_size, 224, 224, 3])
51 | target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)
52 | 
53 | callbacks = [
54 |     # BytePS: broadcast initial variable states from rank 0 to all other processes.
55 |     # This is necessary to ensure consistent initialization of all workers when
56 |     # training is started with random weights or restored from a checkpoint.
57 |     bps.callbacks.BroadcastGlobalVariablesCallback(0),
58 | ]
59 | # Set up standard model.
60 | model = getattr(applications, args.model)(weights=None)
61 | opt = tf.keras.optimizers.Adam(0.01)
62 | opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, loss_scale="dynamic")
63 | opt = bps.DistributedOptimizer(opt)
64 | 
65 | model.compile(loss=tf.keras.losses.categorical_crossentropy,
66 |               optimizer=opt,
67 |               metrics=['accuracy', 'top_k_categorical_accuracy'],
68 |               experimental_run_tf_function=False)
69 | model.fit(data, target, epochs=10, steps_per_epoch=16, callbacks=callbacks)
70 | 
71 | test_loss, test_acc, test_topk = model.evaluate(data, target, verbose=2, steps=16)
72 | print('\nTest accuracy:', test_acc)
73 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/momentum.h:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_MOMENTUM_H
17 | #define BYTEPS_COMPRESSOR_MOMENTUM_H
18 | 
19 | #include "../cpu_reducer.h"
20 | #include "compressor.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | namespace compressor {
25 | /*!
26 |  * \brief Momentum
27 |  *
28 |  * Stochastic gradient descent with momentum
29 |  *
30 |  * \note
31 |  * The momentum is added to the gradient before compression. It should not be
32 |  * used together with the momentum implemented in frameworks such as MXNet,
33 |  * TensorFlow, or PyTorch. The key difference between the two is where the
34 |  * momentum is added to the gradients: here it is added before push_pull,
35 |  * whereas the framework's momentum is added after push_pull.
36 |  *
37 |  * \note
38 |  * The framework's momentum is disabled when using this momentum. Users do
39 |  * not need to disable it manually.
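40 |  *
41 |  * \par Example
42 |  * With \mu = 0.9 and per-element gradients g_1 = 1.0, g_2 = 0.5 (the numbers
43 |  * are illustrative), step 1 gives m_1 = 0.9 * 0 + 1.0 = 1.0 and sends
44 |  * g_1' = 0.9 * 1.0 + 1.0 = 1.9; step 2 gives m_2 = 0.9 * 1.0 + 0.5 = 1.4 and
45 |  * sends g_2' = 0.9 * 1.4 + 0.5 = 1.76. These follow the update rules
46 |  * documented at UpdateMom and UpdateGradient below.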
47 |  *
48 |  * \sa Compressor, NesterovMomentumCompressor
49 |  */
50 | class Momentum : public Compressor {
51 |  public:
52 |   // momentum should be cleared to zeros
53 |   Momentum(size_t size, DataType dtype, std::unique_ptr<Compressor> cptr,
54 |            float mu)
55 |       : Compressor(size, dtype),
56 |         _mom(new byte_t[size]()),
57 |         _mu(mu),
58 |         _cpu_reducer(new CpuReducer(nullptr)),
59 |         _cptr(std::move(cptr)){};
60 |   virtual ~Momentum() = default;
61 | 
62 |   virtual tensor_t Compress(tensor_t grad) final;
63 | 
64 |   virtual tensor_t Decompress(tensor_t compressed) final;
65 | 
66 |  protected:
67 |   /*!
68 |    * \brief Update momentum
69 |    *
70 |    * e.g. m_t = \mu * m_{t-1} + g_t
71 |    *
72 |    * \param grad refers to the gradient
73 |    */
74 |   virtual void UpdateMom(tensor_t grad) = 0;
75 | 
76 |   /*!
77 |    * \brief Update gradient with momentum
78 |    *
79 |    * e.g. g_t = \mu * m_t + g_t
80 |    *
81 |    * \param grad refers to the gradient, to which momentum is added in place.
82 |    */
83 |   virtual void UpdateGradient(tensor_t grad) = 0;
84 | 
85 |  protected:
86 |   /*! \brief buffer of momentum */
87 |   std::unique_ptr<byte_t[]> _mom;
88 | 
89 |   /*! \brief momentum factor */
90 |   float _mu;
91 | 
92 |   std::unique_ptr<CpuReducer> _cpu_reducer;
93 | 
94 |  private:
95 |   /*! \brief compressor pointer */
96 |   std::unique_ptr<Compressor> _cptr;
97 | };
98 | }  // namespace compressor
99 | }  // namespace common
100 | }  // namespace byteps
101 | 
102 | #endif  // BYTEPS_COMPRESSOR_MOMENTUM_H
--------------------------------------------------------------------------------
/example/mxnet/symbols/lenet.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements.  See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership.  The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License.  You may obtain a copy of the License at
8 | #
9 | #   http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied.  See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | 
18 | """
19 | LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
20 | Gradient-based learning applied to document recognition.
21 | Proceedings of the IEEE (1998)
22 | """
23 | import mxnet as mx
24 | 
25 | def get_loc(data, attr={'lr_mult': '0.01'}):
26 |     """
27 |     the localisation network in lenet-stn; it increases accuracy by more
28 |     than 1% when num-epoch >= 15
29 |     """
30 |     loc = mx.symbol.Convolution(data=data, num_filter=30, kernel=(5, 5), stride=(2, 2))
31 |     loc = mx.symbol.Activation(data=loc, act_type='relu')
32 |     loc = mx.symbol.Pooling(data=loc, kernel=(2, 2), stride=(2, 2), pool_type='max')
33 |     loc = mx.symbol.Convolution(data=loc, num_filter=60, kernel=(3, 3), stride=(1, 1), pad=(1, 1))
34 |     loc = mx.symbol.Activation(data=loc, act_type='relu')
35 |     loc = mx.symbol.Pooling(data=loc, global_pool=True, kernel=(2, 2), pool_type='avg')
36 |     loc = mx.symbol.Flatten(data=loc)
37 |     loc = mx.symbol.FullyConnected(data=loc, num_hidden=6, name="stn_loc", attr=attr)
38 |     return loc
39 | 
40 | 
41 | def get_symbol(num_classes=10, add_stn=False, **kwargs):
42 |     data = mx.symbol.Variable('data')
43 |     if add_stn:
44 |         data = mx.sym.SpatialTransformer(data=data, loc=get_loc(data), target_shape=(28, 28),
45 |                                          transform_type="affine", sampler_type="bilinear")
46 |     # first conv
47 |     conv1 = mx.symbol.Convolution(data=data, kernel=(5, 5), num_filter=20)
48 |     tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
49 |     pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max",
50 |                               kernel=(2, 2), stride=(2, 2))
51 |     # second conv
52 |     conv2 = mx.symbol.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
53 |     tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
54 |     pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max",
55 |                               kernel=(2, 2), stride=(2, 2))
56 |     # first fullc
57 |     flatten = mx.symbol.Flatten(data=pool2)
58 |     fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
59 |     tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
60 |     # second fullc
61 |     fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=num_classes)
62 |     # loss
63 |     lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax')
64 |     return lenet
65 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/topk.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_TOPK_H
17 | #define BYTEPS_COMPRESSOR_IMPL_TOPK_H
18 | 
19 | #include "../compressor.h"
20 | 
21 | namespace byteps {
22 | namespace common {
23 | namespace compressor {
24 | 
25 | /*!
26 |  * \brief TopK Compressor
27 |  *
28 |  * paper: Sparsified SGD with Memory
29 |  * https://arxiv.org/pdf/1809.07599.pdf
30 |  *
31 |  * sends the most significant entries of the stochastic gradient
32 |  *
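33 |  * Example: with k = 2 and gradient [0.1, -3.0, 2.0, 0.5], the two entries
34 |  * with the largest magnitudes are kept as (index, value) pairs, (1, -3.0)
35 |  * and (2, 2.0); decompression scatters them back into a zero tensor of the
36 |  * original shape. (The numbers are illustrative; the exact wire layout is
37 |  * defined by CompressImpl below.)
38 |  *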
39 |  */
40 | class TopkCompressor : public Compressor {
41 |  public:
42 |   TopkCompressor(size_t size, DataType dtype, unsigned int k)
43 |       : Compressor(size, dtype), _k(k){};
44 |   virtual ~TopkCompressor() = default;
45 | 
46 |   /*!
47 |    * \brief Compress function
48 |    *
49 |    * selects the top-k entries and their corresponding indices
50 |    *
51 |    * \note compares absolute values
52 |    *
53 |    * \param grad gradient tensor
54 |    * \param compressed compressed tensor
55 |    */
56 |   tensor_t Compress(tensor_t grad) override;
57 | 
58 |   /*!
59 |    * \brief Decompress function
60 |    *
61 |    * fills a zero tensor with the top-k entries at their corresponding indices
62 |    *
63 |    * \param compressed compressed tensor
64 |    * \param decompressed decompressed tensor
65 |    */
66 |   tensor_t Decompress(tensor_t compressed) override;
67 | 
68 |   /*!
69 |    * \brief faster version of `UpdateError`
70 |    *
71 |    * 1. e <- p (e is the error and p is the corrected gradient)
72 |    * 2. zero-fill e at the selected k indices
73 |    *
74 |    * \param corrected gradient corrected with error
75 |    * \param error error
76 |    * \param compressed compressed gradient
77 |    */
78 |   void FastUpdateError(tensor_t error, tensor_t corrected,
79 |                        tensor_t compressed) override;
80 | 
81 |  private:
82 |   template <typename index_t, typename scalar_t>
83 |   tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len);
84 | 
85 |   template <typename scalar_t, typename index_t>
86 |   tensor_t DecompressImpl(scalar_t* dst, const index_t* src,
87 |                           size_t compressed_size);
88 | 
89 |   template <typename scalar_t, typename index_t>
90 |   void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected,
91 |                            const index_t* compressed, size_t compressed_size);
92 | 
93 |  private:
94 |   unsigned int _k;
95 | };
96 | }  // namespace compressor
97 | }  // namespace common
98 | }  // namespace byteps
99 | 
100 | #endif  // BYTEPS_COMPRESSOR_IMPL_TOPK_H
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/onebit.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_ONEBIT_H
17 | #define BYTEPS_COMPRESSOR_IMPL_ONEBIT_H
18 | 
19 | #include "../compressor.h"
20 | 
21 | namespace byteps {
22 | namespace common {
23 | namespace compressor {
24 | 
25 | /*!
26 |  * \brief Onebit Compressor
27 |  *
28 |  * paper: SIGNSGD: Compressed Optimisation for Non-Convex Problems
29 |  * https://arxiv.org/pdf/1802.04434.pdf
30 |  *
31 |  * each worker i:
32 |  *    c_i <- sign(grad)
33 |  *
34 |  * server: majority vote
35 |  *    sign(\sum_i c_i)
36 |  *
37 |  * \note 0 represents positive and 1 represents negative.
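38 |  *
39 |  * Example: the gradient [0.5, -1.2, 0.3, -0.7] encodes as the sign bits
40 |  * 0101 (one bit per element, packed into bytes). With majority vote, the
41 |  * server sums the +/-1 votes from all workers and keeps only the sign of
42 |  * the sum. (The numbers are illustrative; see CompressImpl below for the
43 |  * actual packing.)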
44 |  */
45 | class OnebitCompressor : public Compressor {
46 |  public:
47 |   OnebitCompressor(size_t size, DataType dtype, bool use_scale = false)
48 |       : Compressor(size, dtype), _use_scale(use_scale) {}
49 |   virtual ~OnebitCompressor() = default;
50 | 
51 |   /*!
52 |    * \brief Compress function
53 |    *
54 |    * compresses and packs into a byte array;
55 |    * each bit represents a sign.
56 |    *
57 |    * \param grad gradient tensor
58 |    * \param compressed compressed tensor
59 |    */
60 |   tensor_t Compress(tensor_t grad) override;
61 | 
62 |   /*!
63 |    * \brief Decompress function
64 |    *
65 |    * unpacks from a byte array to an FP tensor
66 |    *
67 |    * \param compressed compressed tensor
68 |    * \param decompressed decompressed tensor
69 |    */
70 |   tensor_t Decompress(tensor_t compressed) override;
71 | 
72 |   /*!
73 |    * \brief helper function for error feedback `UpdateError`
74 |    *
75 |    * \param corrected gradient corrected with error
76 |    * \param error error
77 |    * \param compressed compressed gradient
78 |    */
79 |   void FastUpdateError(tensor_t error, tensor_t corrected,
80 |                        tensor_t compressed) override;
81 | 
82 |  private:
83 |   template <typename index_t, typename scalar_t>
84 |   tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len);
85 | 
86 |   template <typename scalar_t, typename index_t>
87 |   tensor_t DecompressImpl(scalar_t* dst, const index_t* src,
88 |                           size_t compressed_size);
89 | 
90 |   template <typename scalar_t, typename index_t>
91 |   void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected,
92 |                            const index_t* compressed, size_t compressed_size);
93 | 
94 |  private:
95 |   bool _use_scale;
96 | };
97 | }  // namespace compressor
98 | }  // namespace common
99 | }  // namespace byteps
100 | 
101 | #endif  // BYTEPS_COMPRESSOR_IMPL_ONEBIT_H
--------------------------------------------------------------------------------
/byteps/common/operations.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_OPERATIONS_H
17 | #define BYTEPS_OPERATIONS_H
18 | 
19 | #include <functional>
20 | #include "common.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | 
25 | // Check that byteps is initialized.
26 | Status CheckInitialized();
27 | 
28 | extern "C" {
29 | 
30 | // C interface to initialize byteps.
31 | void byteps_init();
32 | 
33 | // C interface to initialize byteps (without initializing ps-lite).
34 | void byteps_lazy_init();
35 | 
36 | // C interface to shut down byteps.
37 | void byteps_shutdown();
38 | 
39 | // C interface to restart byteps.
40 | void byteps_resume(int num_workers, int num_servers);
41 | 
42 | // C interface to suspend byteps.
43 | void byteps_suspend();
44 | 
45 | // C interface to get index of current byteps process.
46 | // Returns -1 if byteps is not initialized.
47 | int byteps_rank();
48 | 
49 | // C interface to get index of current byteps process in the node it is on.
50 | // Returns -1 if byteps is not initialized.
51 | int byteps_local_rank();
52 | 
53 | // C interface to return number of byteps processes.
54 | // Returns -1 if byteps is not initialized.
55 | int byteps_size();
56 | 
57 | // C interface to return number of byteps processes in the node it is on.
58 | // Returns -1 if byteps is not initialized.
59 | int byteps_local_size();
60 | }
61 | 
62 | extern "C" PyObject* byteps_get_pushpull_speed();
63 | 
64 | // Below are all for Framework plugins
65 | Status EnqueueTensor(BPSContext &context, std::shared_ptr<Tensor> input,
66 |                      std::shared_ptr<Tensor> output,
67 |                      std::shared_ptr<ReadyEvent> ready_event, const int device,
68 |                      const int priority, const int version,
69 |                      StatusCallback callback,
70 |                      std::shared_ptr<std::vector<QueueType>> queue_list);
71 | 
72 | void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff);
73 | 
74 | // Only call these in Framework plugins for the best performance
75 | bool IsTensorDeclared(const std::string &name);
76 | 
77 | void RegisterCompressor(const std::string &name,
78 |                         std::unordered_map<std::string, std::string> &kwargs);
79 | 
80 | BPSContext &GetContextFromName(const std::string &name);
81 | 
82 | std::shared_ptr<std::vector<QueueType>> GetPushQueueList(int device);
83 | 
84 | std::shared_ptr<std::vector<QueueType>> GetPullQueueList(int device);
85 | 
86 | }  // namespace common
87 | }  // namespace byteps
88 | 
89 | #endif  // BYTEPS_OPERATIONS_H
90 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/dithering.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H
17 | #define BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H
18 | 
19 | #include "../compressor.h"
20 | #include "../utils.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | namespace compressor {
25 | 
26 | /*!
27 |  * \brief Dithering Compressor
28 |  *
29 |  * paper: Natural Compression for Distributed Deep Learning
30 |  * https://arxiv.org/pdf/1905.10988.pdf
31 |  *
32 |  * two kinds of partition:
33 |  * 1. linear: {0, 1/s, 2/s, ..., (s-1)/s, 1}
34 |  *
35 |  * 2. natural: {0, 2^{1-s}, 2^{2-s}, ..., 2^{-1}, 1}
36 |  *
37 |  * two kinds of normalization:
38 |  * 1. max: it gives better accuracy but less sparsity.
39 |  *
40 |  * 2. l2 norm: it is more sparse but less accurate, and
41 |  * empirically we found it diverges with error-feedback.
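42 |  *
43 |  * Example (linear partition, s = 4, max normalization, illustrative
44 |  * numbers): an element with |x| / max = 0.3 falls between the levels 1/4
45 |  * and 2/4, so it is rounded up to 2/4 with probability
46 |  * (0.3 - 0.25) / 0.25 = 0.2 and down to 1/4 with probability 0.8; in
47 |  * expectation, 0.8 * 0.25 + 0.2 * 0.5 = 0.3, so the quantization is unbiased.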
48 |  */
49 | class DitheringCompressor : public Compressor {
50 |  public:
51 |   enum class PartitionType { LINEAR = 0, NATURAL = 1 };
52 |   enum class NomalizeType { MAX = 0, L2 = 1 };
53 | 
54 |   DitheringCompressor(size_t size, DataType dtype, unsigned int s,
55 |                       unsigned int seed = 0,
56 |                       PartitionType ptype = PartitionType::LINEAR,
57 |                       NomalizeType ntype = NomalizeType::MAX)
58 |       : Compressor(size, dtype), _s(s), _ptype(ptype), _ntype(ntype) {
59 |     if (seed) {
60 |       _rng.set_seed(seed);
61 |     }
62 |   };
63 |   virtual ~DitheringCompressor() = default;
64 | 
65 |   tensor_t Compress(tensor_t grad) override;
66 | 
67 |   tensor_t Decompress(tensor_t compressed) override;
68 | 
69 |   void FastUpdateError(tensor_t error, tensor_t corrected,
70 |                        tensor_t compressed) override;
71 | 
72 |  private:
73 |   template <typename index_t, typename scalar_t>
74 |   tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len);
75 | 
76 |   template <typename scalar_t, typename index_t>
77 |   tensor_t DecompressImpl(scalar_t* dst, const index_t* src,
78 |                           size_t compressed_size);
79 | 
80 |   template <typename scalar_t, typename index_t>
81 |   void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected,
82 |                            const index_t* compressed, size_t compressed_size);
83 | 
84 |   /*! \brief number of levels */
85 |   const unsigned int _s;
86 | 
87 |   PartitionType _ptype;
88 |   NomalizeType _ntype;
89 |   XorShift128PlusBitShifterRNG _rng;
90 | };
91 | }  // namespace compressor
92 | }  // namespace common
93 | }  // namespace byteps
94 | 
95 | #endif  // BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H
--------------------------------------------------------------------------------
/tests/meta_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Amazon Technologies, Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================== 15 | 16 | import copy 17 | import time 18 | import os 19 | import subprocess 20 | import sys 21 | import threading 22 | 23 | import byteps.mxnet as bps 24 | 25 | 26 | class MetaTest(type): 27 | BASE_ENV = {"DMLC_NUM_WORKER": "1", 28 | "DMLC_NUM_SERVER": "1", 29 | "DMLC_PS_ROOT_URI": "127.0.0.1", 30 | "DMLC_PS_ROOT_PORT": "1234", 31 | "BYTEPS_LOG_LEVEL": "INFO", 32 | "BYTEPS_MIN_COMPRESS_BYTES": "0", 33 | "BYTEPS_PARTITION_BYTES": "2147483647"} 34 | for name, value in os.environ.items(): 35 | if name not in BASE_ENV: 36 | BASE_ENV[name] = value 37 | SCHEDULER_ENV = copy.copy(BASE_ENV) 38 | SCHEDULER_ENV.update(DMLC_ROLE="scheduler") 39 | SERVER_ENV = copy.copy(BASE_ENV) 40 | SERVER_ENV.update(DMLC_ROLE="server") 41 | 42 | def __new__(cls, name, bases, dict): 43 | # decorate all test cases 44 | for k, v in dict.items(): 45 | if k.startswith("test_") and hasattr(v, "__call__"): 46 | dict[k] = cls.launch_bps(v) 47 | 48 | for k, v in cls.BASE_ENV.items(): 49 | os.environ[k] = v 50 | os.environ["NVIDIA_VISIBLE_DEVICES"] = "0" 51 | os.environ["DMLC_WORKER_ID"] = "0" 52 | os.environ["DMLC_ROLE"] = "worker" 53 | os.environ["BYTEPS_THREADPOOL_SIZE"] = "4" 54 | os.environ["BYTEPS_FORCE_DISTRIBUTED"] = "1" 55 | os.environ["BYTEPS_LOCAL_RANK"] = "0" 56 | os.environ["BYTEPS_LOCAL_SIZE"] = "1" 57 | return type(name, bases, dict) 58 | 59 | @classmethod 60 | def launch_bps(cls, func): 61 | def wrapper(*args, **kwargs): 62 | def run(env): 63 | subprocess.check_call(args=["bpslaunch"], shell=True, 64 | stdout=sys.stdout, stderr=sys.stderr, 65 | env=env) 66 | 67 | print("bps init") 68 | scheduler = threading.Thread(target=run, 69 | args=(cls.SCHEDULER_ENV,)) 70 | server = threading.Thread(target=run, args=(cls.SERVER_ENV,)) 71 | scheduler.daemon = True 72 | server.daemon = True 73 | scheduler.start() 74 | server.start() 75 | 76 | bps.init() 77 | func(*args, **kwargs) 78 | bps.shutdown() 79 | 80 | scheduler.join() 81 | server.join() 82 | print("bps shutdown") 83 | time.sleep(2) 84 | 85 | return wrapper 86 | -------------------------------------------------------------------------------- /byteps/common/shared_memory.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // =============================================================================
15 | 
16 | #include "shared_memory.h"
17 | #include <errno.h>
18 | #include <fcntl.h>
19 | #include <numa.h>
20 | #include <sys/mman.h>
21 | #include <sys/stat.h>
22 | #include <unistd.h>
23 | #include "global.h"
24 | 
25 | namespace byteps {
26 | namespace common {
27 | 
28 | void* BytePSSharedMemory::openSharedMemory(const std::string& prefix,
29 |                                            uint64_t key, size_t size) {
30 |   size = BytePSGlobal::RoundUpToPageSize(size);
31 |   std::string shm_name(prefix);
32 |   shm_name += std::to_string(key);
33 |   int shm_fd = shm_open(shm_name.c_str(), O_CREAT | O_RDWR, 0666);
34 |   BPS_CHECK_GE(shm_fd, 0) << "shm_open failed for " << shm_name << " " << strerror(errno);
35 | 
36 |   BPS_CHECK_GE(ftruncate(shm_fd, size), 0) << strerror(errno);
37 | 
38 |   void* ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
39 |   BPS_CHECK_NE(ptr, (void*)-1) << strerror(errno);  // validate before registering
40 |   CUDA_CALL(cudaHostRegister(ptr, size, cudaHostRegisterDefault));
41 |   // mlock(ptr, size);
42 | 
43 | 
44 |   BPS_LOG(TRACE) << "initialized shared memory, size " << size;
45 | 
46 |   std::lock_guard<std::mutex> lock(_shm_mu);
47 |   _key_shm_addr[shm_name] = ptr;
48 |   _key_shm_size[shm_name] = size;
49 |   return ptr;
50 | }
51 | 
52 | std::vector<void*> BytePSSharedMemory::openPcieSharedMemory(uint64_t key,
53 |                                                             size_t size) {
54 |   std::vector<void*> r;
55 |   for (int i = 0; i < BytePSGlobal::GetPcieSwitchNum(); i++) {
56 |     auto prefix = std::string("BytePS_Pcie") + std::to_string(i) + "_Shm_";
57 |     if (BytePSGlobal::IsDistributed()) {
58 |       if (BytePSGlobal::IsCrossPcieSwitch()) {
59 |         if (i <= numa_max_node()) {
60 |           numa_set_preferred(i);
61 |           r.push_back(openSharedMemory(prefix, key, size));
62 |           numa_set_preferred(-1);
63 |         } else {
64 |           numa_set_preferred(numa_max_node());
65 |           r.push_back(openSharedMemory(prefix, key, size));
66 |           numa_set_preferred(-1);
67 |         }
68 |       } else {
69 |         r.push_back(openSharedMemory(prefix, key, size));
70 |       }
71 |     } else {
72 |       if (BytePSGlobal::IsCrossPcieSwitch()) {
73 |         numa_set_interleave_mask(numa_all_nodes_ptr);
74 |         r.push_back(openSharedMemory(prefix, key, size));
75 |         numa_set_interleave_mask(numa_no_nodes_ptr);
76 |       } else {
77 |         r.push_back(openSharedMemory(prefix, key, size));
78 |       }
79 |     }
80 |   }
81 |   return r;
82 | }
83 | 
84 | }  // namespace common
85 | 
86 | }  // namespace byteps
87 | 
--------------------------------------------------------------------------------
/example/mxnet/symbols/alexnet.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements.  See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership.  The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License.  You may obtain a copy of the License at
8 | #
9 | #   http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied.  See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | 
18 | """
19 | Reference:
20 | 
21 | Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet classification with deep convolutional neural networks."
Advances in neural information processing systems. 2012. 22 | """ 23 | import mxnet as mx 24 | import numpy as np 25 | 26 | def get_symbol(num_classes, dtype='float32', **kwargs): 27 | input_data = mx.sym.Variable(name="data") 28 | if dtype == 'float16': 29 | input_data = mx.sym.Cast(data=input_data, dtype=np.float16) 30 | # stage 1 31 | conv1 = mx.sym.Convolution(name='conv1', 32 | data=input_data, kernel=(11, 11), stride=(4, 4), num_filter=96) 33 | relu1 = mx.sym.Activation(data=conv1, act_type="relu") 34 | lrn1 = mx.sym.LRN(data=relu1, alpha=0.0001, beta=0.75, knorm=2, nsize=5) 35 | pool1 = mx.sym.Pooling( 36 | data=lrn1, pool_type="max", kernel=(3, 3), stride=(2,2)) 37 | # stage 2 38 | conv2 = mx.sym.Convolution(name='conv2', 39 | data=pool1, kernel=(5, 5), pad=(2, 2), num_filter=256) 40 | relu2 = mx.sym.Activation(data=conv2, act_type="relu") 41 | lrn2 = mx.sym.LRN(data=relu2, alpha=0.0001, beta=0.75, knorm=2, nsize=5) 42 | pool2 = mx.sym.Pooling(data=lrn2, kernel=(3, 3), stride=(2, 2), pool_type="max") 43 | # stage 3 44 | conv3 = mx.sym.Convolution(name='conv3', 45 | data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=384) 46 | relu3 = mx.sym.Activation(data=conv3, act_type="relu") 47 | conv4 = mx.sym.Convolution(name='conv4', 48 | data=relu3, kernel=(3, 3), pad=(1, 1), num_filter=384) 49 | relu4 = mx.sym.Activation(data=conv4, act_type="relu") 50 | conv5 = mx.sym.Convolution(name='conv5', 51 | data=relu4, kernel=(3, 3), pad=(1, 1), num_filter=256) 52 | relu5 = mx.sym.Activation(data=conv5, act_type="relu") 53 | pool3 = mx.sym.Pooling(data=relu5, kernel=(3, 3), stride=(2, 2), pool_type="max") 54 | # stage 4 55 | flatten = mx.sym.Flatten(data=pool3) 56 | fc1 = mx.sym.FullyConnected(name='fc1', data=flatten, num_hidden=4096) 57 | relu6 = mx.sym.Activation(data=fc1, act_type="relu") 58 | dropout1 = mx.sym.Dropout(data=relu6, p=0.5) 59 | # stage 5 60 | fc2 = mx.sym.FullyConnected(name='fc2', data=dropout1, num_hidden=4096) 61 | relu7 = mx.sym.Activation(data=fc2, act_type="relu") 62 | dropout2 = mx.sym.Dropout(data=relu7, p=0.5) 63 | # stage 6 64 | fc3 = mx.sym.FullyConnected(name='fc3', data=dropout2, num_hidden=num_classes) 65 | if dtype == 'float16': 66 | fc3 = mx.sym.Cast(data=fc3, dtype=np.float32) 67 | softmax = mx.sym.SoftmaxOutput(data=fc3, name='softmax') 68 | return softmax 69 | -------------------------------------------------------------------------------- /byteps/common/compressor/error_feedback.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H
17 | #define BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H
18 | 
19 | #include "../cpu_reducer.h"
20 | #include "compressor.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | namespace compressor {
25 | 
26 | /*!
27 |  * \brief Error feedback Decorator
28 |  *
29 |  * paper: 1-bit stochastic gradient descent and its application to data-parallel
30 |  * distributed training of speech dnns
31 |  * https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/IS140694.pdf
32 |  *
33 |  * 1. UpdateGradient: g <- g + e
34 |  * 2. UpdateError: e <- g - c
35 |  *
36 |  * These two functions should be implemented in child classes.
37 |  *
38 |  * \par
39 |  * The caller does not need to allocate an additional buffer to store the
40 |  * error; a buffer is already kept inside the class.
41 |  *
42 |  * \par
43 |  * Adds error feedback behavior to any compressor at run time via the
44 |  * decorator pattern. It keeps the same interface as Compressor; Compress and
45 |  * Decompress are already implemented and cannot be changed in child classes.
46 |  *
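47 |  * \par Example
48 |  * Wrapping a sign compressor (the numbers are illustrative): with g = 0.9
49 |  * and stored error e = 0, the corrected gradient is 0.9, the compressed
50 |  * value is c = 1.0, and the new error is 0.9 - 1.0 = -0.1. On the next step,
51 |  * with g = 0.8, the corrected gradient is 0.7, so c = 1.0 and e = -0.3: the
52 |  * quantization error is carried forward instead of being lost.
53 |  *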
54 |  * \sa Compressor, VanillaErrorFeedbackCompressor
55 |  */
56 | class ErrorFeedback : public Compressor {
57 |  public:
58 |   // the error buffer should be cleared to zeros at the beginning.
59 |   ErrorFeedback(size_t size, DataType dtype, std::unique_ptr<Compressor> cptr)
60 |       : Compressor(size, dtype),
61 |         _error(new byte_t[size]()),
62 |         _cpu_reducer(new CpuReducer(nullptr)),
63 |         _cptr(std::move(cptr)) {}
64 |   virtual ~ErrorFeedback() = default;
65 | 
66 |   virtual tensor_t Compress(tensor_t grad) final;
67 | 
68 |   virtual tensor_t Decompress(tensor_t compressed) final;
69 | 
70 |  protected:
71 |   /*!
72 |    * \brief Correct gradient with error
73 |    *
74 |    * grad += error
75 |    *
76 |    * \note it is an in-place operation.
77 |    *
78 |    * \param grad input gradient, updated in place
79 |    */
80 |   virtual void UpdateGradient(tensor_t grad) = 0;
81 | 
82 |   /*!
83 |    * \brief Update error
84 |    *
85 |    * error = corrected_grad - decompressed
86 |    *
87 |    * \param corrected refers to gradient + error
88 |    * \param compressed compressed tensor
89 |    */
90 |   virtual void UpdateError(tensor_t corrected, tensor_t compressed);
91 | 
92 |  protected:
93 |   /*! \brief buffer of error */
94 |   std::unique_ptr<byte_t[]> _error;
95 | 
96 |   std::unique_ptr<CpuReducer> _cpu_reducer;
97 | 
98 |  private:
99 |   /*! \brief compressor pointer */
100 |   std::unique_ptr<Compressor> _cptr;
101 | };
102 | }  // namespace compressor
103 | }  // namespace common
104 | }  // namespace byteps
105 | 
106 | #endif  // BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H
--------------------------------------------------------------------------------
/example/keras/keras_mnist.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 | import keras
3 | from keras.datasets import mnist
4 | from keras.models import Sequential
5 | from keras.layers import Dense, Dropout, Flatten
6 | from keras.layers import Conv2D, MaxPooling2D
7 | from keras import backend as K
8 | import math
9 | import tensorflow as tf
10 | import byteps.keras as bps
11 | 
12 | # BytePS: initialize BytePS.
13 | bps.init()
14 | 
15 | # BytePS: pin GPU to be used to process local rank (one GPU per process)
16 | config = tf.ConfigProto()
17 | config.gpu_options.allow_growth = True
18 | config.gpu_options.visible_device_list = str(bps.local_rank())
19 | K.set_session(tf.Session(config=config))
20 | 
21 | batch_size = 128
22 | num_classes = 10
23 | 
24 | # BytePS: adjust number of epochs based on number of GPUs.
25 | epochs = int(math.ceil(12.0 / bps.size()))
26 | 
27 | # Input image dimensions
28 | img_rows, img_cols = 28, 28
29 | 
30 | # The data, shuffled and split between train and test sets
31 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
32 | 
33 | if K.image_data_format() == 'channels_first':
34 |     x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
35 |     x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
36 |     input_shape = (1, img_rows, img_cols)
37 | else:
38 |     x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
39 |     x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
40 |     input_shape = (img_rows, img_cols, 1)
41 | 
42 | x_train = x_train.astype('float32')
43 | x_test = x_test.astype('float32')
44 | x_train /= 255
45 | x_test /= 255
46 | print('x_train shape:', x_train.shape)
47 | print(x_train.shape[0], 'train samples')
48 | print(x_test.shape[0], 'test samples')
49 | 
50 | # Convert class vectors to binary class matrices
51 | y_train = keras.utils.to_categorical(y_train, num_classes)
52 | y_test = keras.utils.to_categorical(y_test, num_classes)
53 | 
54 | model = Sequential()
55 | model.add(Conv2D(32, kernel_size=(3, 3),
56 |                  activation='relu',
57 |                  input_shape=input_shape))
58 | model.add(Conv2D(64, (3, 3), activation='relu'))
59 | model.add(MaxPooling2D(pool_size=(2, 2)))
60 | model.add(Dropout(0.25))
61 | model.add(Flatten())
62 | model.add(Dense(128, activation='relu'))
63 | model.add(Dropout(0.5))
64 | model.add(Dense(num_classes, activation='softmax'))
65 | 
66 | # BytePS: adjust learning rate based on number of GPUs.
67 | opt = keras.optimizers.Adadelta(1.0 * bps.size())
68 | 
69 | # BytePS: add BytePS Distributed Optimizer.
70 | opt = bps.DistributedOptimizer(opt)
71 | 
72 | model.compile(loss=keras.losses.categorical_crossentropy,
73 |               optimizer=opt,
74 |               metrics=['accuracy'])
75 | 
76 | callbacks = [
77 |     # BytePS: broadcast initial variable states from rank 0 to all other processes.
78 |     # This is necessary to ensure consistent initialization of all workers when
79 |     # training is started with random weights or restored from a checkpoint.
80 |     bps.callbacks.BroadcastGlobalVariablesCallback(0),
81 | ]
82 | 
83 | # BytePS: save checkpoints only on worker 0 to prevent other workers from corrupting them.
84 | if bps.rank() == 0:
85 |     callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
86 | 
87 | model.fit(x_train, y_train,
88 |           batch_size=batch_size,
89 |           callbacks=callbacks,
90 |           epochs=epochs,
91 |           verbose=1 if bps.rank() == 0 else 0,
92 |           validation_data=(x_test, y_test))
93 | score = model.evaluate(x_test, y_test, verbose=0)
94 | print('Test loss:', score[0])
95 | print('Test accuracy:', score[1])
96 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/randomk.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_RANDOMK_H
17 | #define BYTEPS_COMPRESSOR_IMPL_RANDOMK_H
18 | 
19 | #include <random>
20 | 
21 | #include "../compressor.h"
22 | #include "../utils.h"
23 | 
24 | namespace byteps {
25 | namespace common {
26 | namespace compressor {
27 | 
28 | /*!
29 |  * \brief RandomK Compressor
30 |  *
31 |  * paper: Sparsified SGD with Memory
32 |  * https://arxiv.org/pdf/1809.07599.pdf
33 |  *
34 |  * randomly sends k entries of the stochastic gradient
35 |  *
36 |  * \note it is a stochastic algorithm. If you want deterministic behavior,
37 |  * please set a seed in the configurations.
38 |  */
39 | class RandomkCompressor : public Compressor {
40 |  public:
41 |   RandomkCompressor(size_t size, DataType dtype, unsigned int k, unsigned int seed = 0)
42 |       : Compressor(size, dtype), _k(k) {
43 |     if (seed != 0) {
44 |       BPS_LOG(INFO) << "SET SEED = " << seed;
45 |       _rng.set_seed(seed);
46 |     }
47 |   };
48 |   virtual ~RandomkCompressor() = default;
49 | 
50 |   /*!
51 |    * \brief Compress function
52 |    *
53 |    * randomly selects k entries and their corresponding indices
54 |    *
55 |    * \param grad gradient tensor
56 |    * \param compressed compressed tensor
57 |    */
58 |   tensor_t Compress(tensor_t grad) override;
59 | 
60 |   /*!
61 |    * \brief Decompress function
62 |    *
63 |    * fills a zero tensor with the k entries at their corresponding indices
64 |    *
65 |    * \param compressed compressed tensor
66 |    * \param decompressed decompressed tensor
67 |    */
68 |   tensor_t Decompress(tensor_t compressed) override;
69 | 
70 |   /*!
71 |    * \brief faster version of `UpdateError`
72 |    *
73 |    * 1. e <- p (e is the error and p is the corrected gradient)
74 |    * 2. zero-fill e at the selected k indices
75 |    *
76 |    * \param corrected gradient corrected with error
77 |    * \param error error
78 |    * \param compressed compressed gradient
79 |    */
80 |   void FastUpdateError(tensor_t error, tensor_t corrected,
81 |                        tensor_t compressed) override;
82 | 
83 |  private:
84 |   template <typename index_t, typename scalar_t>
85 |   tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len);
86 | 
87 |   template <typename scalar_t, typename index_t>
88 |   tensor_t DecompressImpl(scalar_t* dst, const index_t* src,
89 |                           size_t compressed_size);
90 | 
91 |   template <typename scalar_t, typename index_t>
92 |   void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected,
93 |                            const index_t* compressed, size_t compressed_size);
94 | 
95 |  private:
96 |   unsigned int _k;
97 |   std::random_device _rd;
98 |   XorShift128PlusBitShifterRNG _rng;
99 | };
100 | }  // namespace compressor
101 | }  // namespace common
102 | }  // namespace byteps
103 | 
104 | #endif  // BYTEPS_COMPRESSOR_IMPL_RANDOMK_H
--------------------------------------------------------------------------------
/byteps/server/queue.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_SERVER_QUEUE_H
17 | #define BYTEPS_SERVER_QUEUE_H
18 | 
19 | #include <algorithm>
20 | #include <condition_variable>
21 | #include <mutex>
22 | #include <unordered_map>
23 | #include <vector>
24 | 
25 | namespace byteps {
26 | namespace server {
27 | 
28 | /**
29 |  * \brief thread-safe queue allowing push and waited pop
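30 |  *
31 |  * When scheduling is enabled, Push() keeps the backing vector as a heap
32 |  * ordered by ComparePriority(): messages whose key has been pushed fewer
33 |  * times pop first, and the smaller id wins ties. A usage sketch
34 |  * (illustrative; make_message() is a hypothetical producer, and fields of
35 |  * BytePSEngineMessage other than key/id are omitted):
36 |  *
37 |  *   PriorityQueue q(true);            // true enables priority scheduling
38 |  *   BytePSEngineMessage msg = make_message();
39 |  *   q.Push(msg);                      // called by producer threads
40 |  *   BytePSEngineMessage out;
41 |  *   q.WaitAndPop(&out);               // consumer blocks until a message is ready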
tensorflow.keras.models import Sequential 5 | from tensorflow.keras.layers import Dense, Dropout, Flatten 6 | from tensorflow.keras.layers import Conv2D, MaxPooling2D 7 | from tensorflow.keras import backend as K 8 | import math 9 | import tensorflow as tf 10 | import byteps.keras as bps 11 | 12 | # BytePS: initialize BytePS. 13 | bps.init() 14 | 15 | # BytePS: pin GPU to be used to process local rank (one GPU per process) 16 | config = tf.ConfigProto() 17 | config.gpu_options.allow_growth = True 18 | config.gpu_options.visible_device_list = str(bps.local_rank()) 19 | K.set_session(tf.Session(config=config)) 20 | 21 | batch_size = 128 22 | num_classes = 10 23 | 24 | # BytePS: adjust number of epochs based on number of GPUs. 25 | epochs = int(math.ceil(12.0 / bps.size())) 26 | 27 | # Input image dimensions 28 | img_rows, img_cols = 28, 28 29 | 30 | # The data, shuffled and split between train and test sets 31 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 32 | 33 | if K.image_data_format() == 'channels_first': 34 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) 35 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) 36 | input_shape = (1, img_rows, img_cols) 37 | else: 38 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 39 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 40 | input_shape = (img_rows, img_cols, 1) 41 | 42 | x_train = x_train.astype('float32') 43 | x_test = x_test.astype('float32') 44 | x_train /= 255 45 | x_test /= 255 46 | print('x_train shape:', x_train.shape) 47 | print(x_train.shape[0], 'train samples') 48 | print(x_test.shape[0], 'test samples') 49 | 50 | # Convert class vectors to binary class matrices 51 | y_train = keras.utils.to_categorical(y_train, num_classes) 52 | y_test = keras.utils.to_categorical(y_test, num_classes) 53 | 54 | model = Sequential() 55 | model.add(Conv2D(32, kernel_size=(3, 3), 56 | activation='relu', 57 | input_shape=input_shape)) 58 | model.add(Conv2D(64, (3, 3), activation='relu')) 59 | model.add(MaxPooling2D(pool_size=(2, 2))) 60 | model.add(Dropout(0.25)) 61 | model.add(Flatten()) 62 | model.add(Dense(128, activation='relu')) 63 | model.add(Dropout(0.5)) 64 | model.add(Dense(num_classes, activation='softmax')) 65 | 66 | # BytePS: adjust learning rate based on number of GPUs. 67 | opt = keras.optimizers.Adadelta(1.0 * bps.size()) 68 | 69 | # BytePS: add BytePS Distributed Optimizer. 70 | opt = bps.DistributedOptimizer(opt) 71 | 72 | model.compile(loss=keras.losses.categorical_crossentropy, 73 | optimizer=opt, 74 | metrics=['accuracy']) 75 | 76 | callbacks = [ 77 | # BytePS: broadcast initial variable states from rank 0 to all other processes. 78 | # This is necessary to ensure consistent initialization of all workers when 79 | # training is started with random weights or restored from a checkpoint. 80 | bps.callbacks.BroadcastGlobalVariablesCallback(0), 81 | ] 82 | 83 | # BytePS: save checkpoints only on worker 0 to prevent other workers from corrupting them. 
84 | if bps.rank() == 0: 85 | callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) 86 | 87 | model.fit(x_train, y_train, 88 | batch_size=batch_size, 89 | callbacks=callbacks, 90 | epochs=epochs, 91 | verbose=1, 92 | validation_data=(x_test, y_test)) 93 | score = model.evaluate(x_test, y_test, verbose=0) 94 | print('Test loss:', score[0]) 95 | print('Test accuracy:', score[1]) 96 | -------------------------------------------------------------------------------- /byteps/common/nccl_manager.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_NCCL_MANAGER_H 17 | #define BYTEPS_NCCL_MANAGER_H 18 | 19 | #include <memory> 20 | #include <queue> 21 | #include <vector> 22 | #include "common.h" 23 | #include "communicator.h" 24 | #include "scheduled_queue.h" 25 | 26 | namespace byteps { 27 | namespace common { 28 | 29 | class NcclGroupEntry { 30 | public: 31 | void RecordEvents(); 32 | void SynchronizeEvents(); 33 | void DestroyEvents(); 34 | 35 | std::vector<std::shared_ptr<TensorTableEntry>> tasks; 36 | std::vector<BytePSScheduledQueue*> queues; 37 | 38 | private: 39 | std::vector<cudaEvent_t> _events; 40 | }; 41 | 42 | class NcclManager { 43 | public: 44 | NcclManager(std::shared_ptr<BytePSComm> comm); 45 | ~NcclManager() { 46 | if (_nccl_stream) { 47 | CUDA_CALL(cudaStreamDestroy(*_nccl_stream)); 48 | } 49 | if (_nccl_id) { 50 | free(_nccl_id); 51 | } 52 | if (_nccl_comm) { 53 | free(_nccl_comm); 54 | } 55 | if (_signal_comm) { 56 | _signal_comm.reset(); 57 | } 58 | if (_global_comm) { 59 | _global_comm.reset(); 60 | } 61 | while (!_nccl_pipeline.empty()) _nccl_pipeline.pop(); 62 | 63 | BPS_LOG(DEBUG) << "Clear NcclManager"; 64 | } 65 | 66 | int GetGroupSize() { return _nccl_group_size; } 67 | void EnqueueGroup(std::shared_ptr<NcclGroupEntry> e); 68 | std::shared_ptr<NcclGroupEntry> DequeueGroup(); 69 | 70 | virtual cudaStream_t GetStream(uint64_t key, QueueType op); 71 | virtual ncclComm_t GetComm(uint64_t key, QueueType op); 72 | virtual int GetRoot(uint64_t key, QueueType op); 73 | virtual int GetRank(uint64_t key, QueueType op); 74 | 75 | int GetSize() { return _nccl_size; } 76 | std::shared_ptr<BytePSComm> GetSignalComm() { return _signal_comm; } 77 | bool IsSignalRoot(); 78 | 79 | protected: 80 | void InitGlobalEnv(); 81 | virtual void ConstructRings(); 82 | 83 | cudaStream_t* _nccl_stream; 84 | ncclUniqueId* _nccl_id; 85 | ncclComm_t* _nccl_comm; 86 | 87 | // global user-defined env 88 | size_t _nccl_group_size; 89 | size_t _nccl_pcie_size; 90 | size_t _nccl_pcie_num; 91 | size_t _nccl_num_rings; 92 | 93 | int _nccl_size; 94 | 95 | // for pipelining nccl 96 | std::mutex _nccl_mutex; 97 | std::queue<std::shared_ptr<NcclGroupEntry>> _nccl_pipeline; 98 | 99 | std::shared_ptr<BytePSComm> _signal_comm; 100 | std::shared_ptr<BytePSComm> _global_comm; 101 | }; 102 | 103 | class NcclManagerExpr : public NcclManager { 104 | public: 105 | cudaStream_t GetStream(uint64_t 
key, QueueType op); 106 | ncclComm_t GetComm(uint64_t key, QueueType op); 107 | int GetRoot(uint64_t key, QueueType op); 108 | int GetRank(uint64_t key, QueueType op); 109 | 110 | protected: 111 | void ConstructRings(); 112 | 113 | // for multi-ring 114 | std::vector<std::vector<int>> _rings; 115 | }; 116 | 117 | } // namespace common 118 | } // namespace byteps 119 | 120 | #endif // BYTEPS_NCCL_MANAGER_H 121 | -------------------------------------------------------------------------------- /byteps/torch/ready_event.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #if HAVE_CUDA 18 | #if TORCH_VERSION >= 1005000000 19 | #include <c10/cuda/CUDAException.h> 20 | #include <c10/cuda/CUDAStream.h> 21 | #else 22 | #include <THC/THC.h> 23 | #endif 24 | #include <cassert> 25 | #include <mutex> 26 | #include <queue> 27 | #include <unordered_map> 28 | #endif 29 | 30 | #include "cuda_util.h" 31 | #include "ready_event.h" 32 | 33 | #if TORCH_VERSION < 1005000000 34 | #if HAVE_CUDA 35 | extern THCState* state; 36 | #endif 37 | #endif 38 | 39 | namespace byteps { 40 | namespace torch { 41 | 42 | #if HAVE_CUDA 43 | struct ReadyEventRegistry { 44 | std::unordered_map<int, std::queue<cudaEvent_t>> cuda_events; 45 | std::mutex mutex; 46 | }; 47 | 48 | static ReadyEventRegistry ready_event_registry; 49 | 50 | TorchReadyEvent::TorchReadyEvent(int device) : device_(device) { 51 | assert(device_ != CPU_DEVICE_ID); 52 | 53 | with_device device_context(device_); 54 | { 55 | std::lock_guard<std::mutex> guard(ready_event_registry.mutex); 56 | auto& queue = ready_event_registry.cuda_events[device_]; 57 | if (!queue.empty()) { 58 | cuda_event_ = queue.front(); 59 | queue.pop(); 60 | } else { 61 | #if TORCH_VERSION >= 1005000000 62 | C10_CUDA_CHECK(cudaEventCreateWithFlags( 63 | &cuda_event_, cudaEventBlockingSync | cudaEventDisableTiming)); 64 | #else 65 | THCudaCheck(cudaEventCreateWithFlags( 66 | &cuda_event_, cudaEventBlockingSync | cudaEventDisableTiming)); 67 | #endif 68 | } 69 | } 70 | #if TORCH_VERSION >= 1005000000 71 | auto stream = c10::cuda::getCurrentCUDAStream(device_); 72 | C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream)); 73 | #else 74 | auto stream = THCState_getCurrentStreamOnDevice(state, device_); 75 | THCudaCheck(cudaEventRecord(cuda_event_, stream)); 76 | #endif 77 | } 78 | 79 | TorchReadyEvent::~TorchReadyEvent() { 80 | { 81 | std::lock_guard<std::mutex> guard(ready_event_registry.mutex); 82 | auto& queue = ready_event_registry.cuda_events[device_]; 83 | queue.push(cuda_event_); 84 | } 85 | } 86 | 87 | bool TorchReadyEvent::Ready() const { 88 | auto status = cudaEventQuery(cuda_event_); 89 | if (status == cudaErrorNotReady) { 90 | return false; 91 | } 92 | #if TORCH_VERSION >= 1005000000 93 | C10_CUDA_CHECK(status); 94 | #else 95 | THCudaCheck(status); 96 | #endif 97 | return true; 98 | } 99 | #endif 100 | 101 | // 
On GPU this event will signal that GPU computations are done and data is 102 | // ready. 103 | std::shared_ptr<ReadyEvent> RecordReadyEvent(int device) { 104 | if (device == CPU_DEVICE_ID) { 105 | return std::shared_ptr<ReadyEvent>(); 106 | } else { 107 | #if HAVE_CUDA 108 | return std::make_shared<TorchReadyEvent>(device); 109 | #else 110 | throw std::logic_error( 111 | "Internal error. Requested ReadyEvent " 112 | "with GPU device but not compiled with CUDA."); 113 | #endif 114 | } 115 | } 116 | 117 | } // namespace torch 118 | } // namespace byteps -------------------------------------------------------------------------------- /example/mxnet/symbols/vgg.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """References: 19 | 20 | Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for 21 | large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014). 22 | """ 23 | 24 | import mxnet as mx 25 | import numpy as np 26 | 27 | def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs): 28 | for i, num in enumerate(layers): 29 | for j in range(num): 30 | internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1)) 31 | if batch_norm: 32 | internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1)) 33 | internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1)) 34 | internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1)) 35 | return internel_layer 36 | 37 | def get_classifier(input_data, num_classes, **kwargs): 38 | flatten = mx.sym.Flatten(data=input_data, name="flatten") 39 | fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6") 40 | relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6") 41 | drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6") 42 | fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7") 43 | relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7") 44 | drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7") 45 | fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8") 46 | return fc8 47 | 48 | def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs): 49 | """ 50 | Parameters 51 | ---------- 52 | num_classes : int, default 1000 53 | Number of classification classes. 54 | num_layers : int 55 | Number of layers for the variant of VGG. Options are 11, 13, 16, 19. 56 | batch_norm : bool, default False 57 | Use batch normalization. 
58 | dtype: str, float32 or float16 59 | Data precision. 60 | """ 61 | vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]), 62 | 13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]), 63 | 16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), 64 | 19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])} 65 | if num_layers not in vgg_spec: 66 | raise ValueError("Invalid num_layers {}. Possible choices are 11, 13, 16, 19.".format(num_layers)) 67 | layers, filters = vgg_spec[num_layers] 68 | data = mx.sym.Variable(name="data") 69 | if dtype == 'float16': 70 | data = mx.sym.Cast(data=data, dtype=np.float16) 71 | feature = get_feature(data, layers, filters, batch_norm) 72 | classifier = get_classifier(feature, num_classes) 73 | if dtype == 'float16': 74 | classifier = mx.sym.Cast(data=classifier, dtype=np.float32) 75 | symbol = mx.sym.SoftmaxOutput(data=classifier, name='softmax') 76 | return symbol 77 | -------------------------------------------------------------------------------- /docs/best-practice.md: -------------------------------------------------------------------------------- 1 | # BytePS Best Practice 2 | 3 | ## Single machine (non-distributed mode) 4 | 5 | When `DMLC_NUM_WORKER=1`, BytePS will not use the parameter servers or schedulers at all. In this case, BytePS runs in non-distributed mode. You do not even need to run a server or scheduler. 6 | 7 | In non-distributed mode, BytePS is basically doing NCCL allreduce, so it will not outperform Horovod/NCCL much. BytePS implements priority-based scheduling, which may improve the training speed by 0%~15%, depending on your training task. 8 | 9 | The only thing you can tune is `BYTEPS_PCIE_SWITCH_SIZE`. If you know your hardware topology, e.g., say you have 8 GPUs in total, with 4 GPUs connected to one PCI-e switch and the other 4 GPUs connected to another PCI-e switch, then you should set `BYTEPS_PCIE_SWITCH_SIZE=4`. In this case, you may see a 20%~30% performance improvement compared with Horovod/NCCL. 10 | 11 | If you have NVLinks, leave `BYTEPS_PCIE_SWITCH_SIZE` unmodified. If you don't know your hardware topology, leave `BYTEPS_PCIE_SWITCH_SIZE` unmodified. 12 | 13 | 14 | ## Multi-machine (distributed mode) 15 | 16 | ### With additional CPU servers 17 | 18 | This mode requires at least **4** physical machines. Two of the machines should have GPUs and run as workers. The other two run as CPU servers and do not need GPUs. The scheduler can run on any machine. 19 | 20 | The key here is to make sure the following: 21 | * Servers must be on different physical machines from workers. 22 | * The total bandwidth of the servers must be equal to or larger than the total bandwidth of the workers. 23 | 24 | If you are using RDMA, this should be sufficient. However, with TCP and >=25Gbps networks, it's possible that BytePS cannot fully utilize the bandwidth, because a single TCP connection usually cannot run up to 25Gbps. 25 | 26 | To address this, you can try running more BytePS server instances on the server machines. For example, you can try running two server instances per server machine. This effectively doubles the number of TCP connections and should be sufficient for 25Gbps networks. For 40Gbps/50Gbps networks, you need three server instances per server machine, and so on. 27 | 28 | ### No additional CPU servers 29 | 30 | When you don't have additional CPU servers, then on each physical machine you should launch both a worker and a server process. We call this *co-locate* mode, and the resource consumption is the same as Horovod (no additional servers); a minimal launch sketch follows below. 
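Below is a minimal sketch of what a per-machine launch script for co-locate mode might look like. The scheduler IP/port, worker id, and `train.py` entry point are placeholders, and the environment variables follow the usual `bpslaunch` conventions from the running guide; treat this as an illustration under those assumptions rather than the official launcher.

```python
# Hypothetical co-locate launch sketch: one worker plus one server per machine.
# The scheduler IP/port, DMLC_WORKER_ID, and train.py are placeholder values.
import os
import subprocess

common = {
    "DMLC_NUM_WORKER": "2",          # two physical machines in this example
    "DMLC_NUM_SERVER": "2",          # one co-located server per machine
    "DMLC_PS_ROOT_URI": "10.0.0.1",  # scheduler IP (placeholder)
    "DMLC_PS_ROOT_PORT": "1234",     # scheduler port (placeholder)
}

# Server process: with DMLC_ROLE=server, bpslaunch needs no training command.
server_env = dict(os.environ, **common, DMLC_ROLE="server")
subprocess.Popen(["bpslaunch"], env=server_env)

# Worker process on the same machine; DMLC_WORKER_ID identifies this machine.
worker_env = dict(os.environ, **common, DMLC_ROLE="worker", DMLC_WORKER_ID="0")
subprocess.Popen(["bpslaunch", "python3", "train.py"], env=worker_env)
```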
31 | 32 | If you are using TCP, you will probably get near-identical performance to Horovod over TCP. However, if you are using RDMA, you can set `BYTEPS_ENABLE_IPC=1` to enable IPC communication between the co-located worker and server, which should eventually give you higher end-to-end performance than Horovod. 33 | 34 | ## The expected performance 35 | 36 | In the single machine case, if you leave `BYTEPS_PCIE_SWITCH_SIZE` unmodified, BytePS performance should never be lower than Horovod/NCCL. 37 | 38 | In the multi-machine case, if the deployment satisfies the two requirements above, you should see that BytePS is at least as fast as Horovod or the native PS of TF and MXNet. If each of your workers has two or more GPUs, you should see a significant improvement, e.g., 40%-100% over other existing solutions. 39 | 40 | If you have to deploy server instances on the same physical machines as workers, the performance will be similar to Horovod/NCCL. 41 | 42 | If you have fewer servers than workers, the performance will be proportionally lower. For example, if you have only 1 server and 2 workers, you'll only get half the performance of 2 servers + 2 workers. 43 | 44 | ## How to compare with other solutions 45 | 46 | Comparing with Horovod is simple: install Horovod, and change `bps` back to `hvd`. 47 | 48 | To compare with other PS architectures, make sure that you use the same hardware setup. Most existing PS implementations cannot run as fast as Horovod/NCCL, so usually you just need to compare with Horovod/NCCL. 49 | -------------------------------------------------------------------------------- /tests/test_tensorflow_keras.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Tests for byteps.keras.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | import numpy as np 24 | import warnings 25 | 26 | from distutils.version import LooseVersion 27 | from tensorflow import keras 28 | from tensorflow.python.keras import backend as K 29 | 30 | import byteps.tensorflow.keras as bps 31 | 32 | class TfKerasTests(tf.test.TestCase):  # test_session() requires tf.test.TestCase 33 | """ 34 | Tests for ops in byteps.keras. 
35 | """ 36 | 37 | def __init__(self, *args, **kwargs): 38 | super(TfKerasTests, self).__init__(*args, **kwargs) 39 | warnings.simplefilter('module') 40 | bps.init() 41 | 42 | self.config = tf.ConfigProto() 43 | self.config.gpu_options.allow_growth = True 44 | self.config.gpu_options.visible_device_list = str(bps.local_rank()) 45 | 46 | def test_train_model(self): 47 | with self.test_session(config=self.config) as sess: 48 | K.set_session(sess) 49 | 50 | opt = keras.optimizers.RMSprop(lr=0.0001) 51 | opt = bps.DistributedOptimizer(opt) 52 | 53 | model = keras.models.Sequential() 54 | model.add(keras.layers.Dense(2, input_shape=(3,))) 55 | model.add(keras.layers.RepeatVector(3)) 56 | model.add(keras.layers.ThresholdedReLU(0.5)) 57 | model.compile(loss=keras.losses.mean_squared_error, 58 | optimizer=opt, 59 | metrics=[keras.metrics.categorical_accuracy], 60 | sample_weight_mode='temporal') 61 | 62 | x = np.random.random((1, 3)) 63 | y = np.random.random((1, 3, 3)) 64 | 65 | def generator(): 66 | while 1: 67 | yield (x, y) 68 | 69 | print('x is:', x) 70 | print('y is:', y) 71 | # No assertions, we just need to verify that it doesn't hang 72 | callbacks = [bps.callbacks.BroadcastGlobalVariablesCallback(0)] 73 | model.fit_generator(generator(), 74 | steps_per_epoch=10, 75 | callbacks=callbacks, 76 | epochs=0, 77 | verbose=0, 78 | workers=4, 79 | initial_epoch=1) 80 | print('x-trained is:', x) 81 | print('y-trained is:', y) 82 | 83 | def test_sparse_as_dense(self): 84 | with self.test_session(config=self.config) as sess: 85 | K.set_session(sess) 86 | 87 | opt = keras.optimizers.RMSprop(lr=0.0001) 88 | opt = bps.DistributedOptimizer(opt, sparse_as_dense=True) 89 | 90 | model = keras.models.Sequential() 91 | model.add(keras.layers.Embedding(1000, 64, input_length=10)) 92 | model.compile(loss=keras.losses.mean_squared_error, 93 | optimizer=opt) 94 | 95 | x = np.random.randint(1000, size=(32, 10)) 96 | y = np.random.random((32, 10, 64)) 97 | # No assertions, we just need to verify that it doesn't hang 98 | model.train_on_batch(x, y) 99 | 100 | 101 | if __name__ == '__main__': 102 | keras_test = TfKerasTests('test_train_model') 103 | keras_test.test_train_model() 104 | -------------------------------------------------------------------------------- /example/tensorflow/tensorflow2_keras_mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Uber Technologies, Inc. All Rights Reserved. 2 | # Copyright 2019 Uber Technologies, Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | from __future__ import absolute_import, division, print_function 17 | 18 | import tensorflow as tf 19 | import byteps.tensorflow.keras as bps 20 | 21 | # tf.compat.v1.disable_eager_execution() 22 | 23 | # byteps: initialize byteps. 
24 | bps.init() 25 | 26 | # byteps: pin GPU to be used to process local rank (one GPU per process) 27 | gpus = tf.config.experimental.list_physical_devices('GPU') 28 | for gpu in gpus: 29 | tf.config.experimental.set_memory_growth(gpu, True) 30 | if gpus: 31 | tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU') 32 | 33 | (mnist_images, mnist_labels), _ = \ 34 | tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % bps.rank()) 35 | 36 | dataset = tf.data.Dataset.from_tensor_slices( 37 | (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), 38 | tf.cast(mnist_labels, tf.int64)) 39 | ) 40 | dataset = dataset.repeat().shuffle(10000).batch(128) 41 | 42 | mnist_model = tf.keras.Sequential([ 43 | tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), 44 | tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), 45 | tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), 46 | tf.keras.layers.Dropout(0.25), 47 | tf.keras.layers.Flatten(), 48 | tf.keras.layers.Dense(128, activation='relu'), 49 | tf.keras.layers.Dropout(0.5), 50 | tf.keras.layers.Dense(10, activation='softmax') 51 | ]) 52 | 53 | # byteps: adjust learning rate based on number of GPUs. 54 | scaled_lr = 0.001 * bps.size() 55 | opt = tf.optimizers.Adam(scaled_lr) 56 | 57 | # byteps: add byteps DistributedOptimizer. 58 | opt = bps.DistributedOptimizer(opt) 59 | 60 | # byteps: Specify `experimental_run_tf_function=False` to ensure TensorFlow 61 | # uses bps.DistributedOptimizer() to compute gradients. 62 | mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(), 63 | optimizer=opt, 64 | metrics=['accuracy'], 65 | experimental_run_tf_function=False) 66 | 67 | callbacks = [ 68 | # byteps: broadcast initial variable states from rank 0 to all other processes. 69 | # This is necessary to ensure consistent initialization of all workers when 70 | # training is started with random weights or restored from a checkpoint. 71 | bps.callbacks.BroadcastGlobalVariablesCallback(0, device="GPU:0"), 72 | 73 | # byteps: average metrics among workers at the end of every epoch. 74 | # 75 | # Note: This callback must be in the list before the ReduceLROnPlateau, 76 | # TensorBoard or other metrics-based callbacks. 77 | bps.callbacks.MetricAverageCallback(device="GPU:0"), 78 | 79 | # byteps: using `lr = 1.0 * bps.size()` from the very beginning leads to worse final 80 | # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * bps.size()` during 81 | # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. 82 | bps.callbacks.LearningRateWarmupCallback(warmup_epochs=3, initial_lr=scaled_lr, verbose=1), 83 | ] 84 | 85 | # byteps: save checkpoints only on worker 0 to prevent other workers from corrupting them. 86 | if bps.rank() == 0: 87 | callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) 88 | 89 | # byteps: write logs on worker 0. 90 | verbose = 1 if bps.rank() == 0 else 0 91 | 92 | # Train the model. 93 | # byteps: adjust number of steps based on number of GPUs. 94 | mnist_model.fit(dataset, steps_per_epoch=500 // bps.size(), callbacks=callbacks, epochs=24, verbose=verbose) 95 | -------------------------------------------------------------------------------- /byteps/common/logging.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_LOGGING_H 18 | #define BYTEPS_LOGGING_H 19 | 20 | #include <sstream> 21 | #include <string> 22 | 23 | namespace byteps { 24 | namespace common { 25 | 26 | enum class LogLevel { TRACE, DEBUG, INFO, WARNING, ERROR, FATAL }; 27 | 28 | #define LOG_LEVELS "TDIWEF" 29 | 30 | // Always-on checking 31 | #define BPS_CHECK(x) \ 32 | if (!(x)) \ 33 | common::LogMessageFatal(__FILE__, __LINE__) << "Check failed: " #x << ' ' 34 | 35 | #define BPS_CHECK_LT(x, y) BPS_CHECK((x) < (y)) 36 | #define BPS_CHECK_GT(x, y) BPS_CHECK((x) > (y)) 37 | #define BPS_CHECK_LE(x, y) BPS_CHECK((x) <= (y)) 38 | #define BPS_CHECK_GE(x, y) BPS_CHECK((x) >= (y)) 39 | #define BPS_CHECK_EQ(x, y) BPS_CHECK((x) == (y)) 40 | #define BPS_CHECK_NE(x, y) BPS_CHECK((x) != (y)) 41 | #define BPS_CHECK_NOTNULL(x) \ 42 | ((x) == NULL ? common::LogMessageFatal(__FILE__, __LINE__) \ 43 | << "Check notnull: " #x << ' ', \ 44 | (x) : (x)) // NOLINT(*) 45 | 46 | /*! 47 | * \brief Protected CUDA call. 48 | * \param func Expression to call. 49 | * 50 | * It checks for CUDA errors after invocation of the expression. 51 | */ 52 | #define CUDA_CALL(func) \ 53 | { \ 54 | cudaError_t e = (func); \ 55 | BPS_CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ 56 | << "CUDA: " << cudaGetErrorString(e); \ 57 | } 58 | 59 | /* 60 | * \brief Protected NCCL call. 61 | */ 62 | #define NCCLCHECK(cmd) \ 63 | { \ 64 | ncclResult_t r = (cmd); \ 65 | BPS_CHECK(r == ncclSuccess) << "NCCL error: " << ncclGetErrorString(r); \ 66 | } 67 | 68 | class LogMessage : public std::basic_ostringstream<char> { 69 | public: 70 | LogMessage(const char* fname, int line, LogLevel severity); 71 | ~LogMessage(); 72 | 73 | protected: 74 | void GenerateLogMessage(bool log_time); 75 | 76 | private: 77 | const char* fname_; 78 | int line_; 79 | LogLevel severity_; 80 | }; 81 | 82 | // LogMessageFatal ensures the process will exit in failure after 83 | // logging this message. 84 | class LogMessageFatal : public LogMessage { 85 | public: 86 | LogMessageFatal(const char* file, int line); 87 | ~LogMessageFatal(); 88 | }; 89 | 90 | #define _BPS_LOG_TRACE LogMessage(__FILE__, __LINE__, LogLevel::TRACE) 91 | #define _BPS_LOG_DEBUG LogMessage(__FILE__, __LINE__, LogLevel::DEBUG) 92 | #define _BPS_LOG_INFO LogMessage(__FILE__, __LINE__, LogLevel::INFO) 93 | #define _BPS_LOG_WARNING LogMessage(__FILE__, __LINE__, LogLevel::WARNING) 94 | #define _BPS_LOG_ERROR LogMessage(__FILE__, __LINE__, LogLevel::ERROR) 95 | #define _BPS_LOG_FATAL LogMessageFatal(__FILE__, __LINE__) 96 | 97 | #define _LOG(severity) _BPS_LOG_##severity 98 | 99 | #define _LOG_RANK(severity, rank) _BPS_LOG_##severity << "[" << rank << "]: " 100 | 101 | #define GET_LOG(_1, _2, NAME, ...) NAME 102 | #define BPS_LOG(...) 
GET_LOG(__VA_ARGS__, _LOG_RANK, _LOG)(__VA_ARGS__) 103 | 104 | LogLevel MinLogLevelFromEnv(); 105 | bool LogTimeFromEnv(); 106 | 107 | } // namespace common 108 | } // namespace byteps 109 | 110 | #endif // BYTEPS_LOGGING_H 111 | -------------------------------------------------------------------------------- /tests/test_topk.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import itertools 17 | import random 18 | import unittest 19 | 20 | import byteps.mxnet as bps 21 | import mxnet as mx 22 | import mxnet.ndarray as nd 23 | import numpy as np 24 | from gluoncv.model_zoo import get_model 25 | from mxnet import autograd, gluon 26 | from parameterized import parameterized 27 | from tqdm import tqdm 28 | 29 | from meta_test import MetaTest 30 | from utils import fake_data 31 | 32 | 33 | def topk(x, k): 34 | y = x.flatten() 35 | indices = np.argsort(np.abs(y))[-k:][::-1] 36 | vals = y[indices] 37 | y.fill(0) 38 | for idx, val in zip(indices, vals): 39 | y[idx] = val 40 | return y.reshape(x.shape) 41 | 42 | 43 | class TopkTestCase(unittest.TestCase, metaclass=MetaTest): 44 | @parameterized.expand(itertools.product([1, 3, 5])) 45 | def test_topk(self, k): 46 | ctx = mx.gpu(0) 47 | net = get_model("resnet18_v2") 48 | net.initialize(mx.init.Xavier(), ctx=ctx) 49 | net.summary(nd.ones((1, 3, 224, 224), ctx=ctx)) 50 | 51 | # hyper-params 52 | batch_size = 32 53 | optimizer_params = {'momentum': 0, 'wd': 0, 54 | 'learning_rate': 0.01} 55 | 56 | compression_params = { 57 | "compressor": "topk", 58 | "k": k, 59 | } 60 | 61 | trainer = bps.DistributedTrainer(net.collect_params( 62 | ), "sgd", optimizer_params, compression_params=compression_params) 63 | 64 | loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() 65 | 66 | train_data = fake_data(batch_size=batch_size) 67 | 68 | params = {} 69 | 70 | for i, param in enumerate(trainer._params): 71 | if param.grad_req != 'null': 72 | params[i] = param._data[0].asnumpy() 73 | 74 | for it, batch in tqdm(enumerate(train_data)): 75 | data = batch[0].as_in_context(ctx) 76 | label = batch[1].as_in_context(ctx) 77 | 78 | with autograd.record(): 79 | output = net(data) 80 | loss = loss_fn(output, label) 81 | 82 | loss.backward() 83 | 84 | gs = {} 85 | xs = {} 86 | 87 | for i, param in enumerate(trainer._params): 88 | if param.grad_req != 'null': 89 | gs[i] = param._grad[0].asnumpy() 90 | xs[i] = param._data[0].asnumpy() 91 | 92 | trainer.step(batch_size) 93 | 94 | for i, param in enumerate(trainer._params): 95 | if param.grad_req != "null": 96 | g = gs[i] / (batch_size * bps.size()) 97 | c = topk(g, k) 98 | 99 | cs = topk(c, k) 100 | c = cs 101 | 102 | params[i] -= optimizer_params["learning_rate"] * c 103 | 104 | cnt = 0 105 | tot = 0 106 | for i, param in enumerate(trainer._params): 107 | if 
param.grad_req != "null": 108 | x = param._data[0].asnumpy() 109 | tot += len(x.flatten()) 110 | if not np.allclose(params[i], x, atol=np.finfo(np.float32).eps): 111 | diff = np.abs(x.flatten() - params[i].flatten()) 112 | idx = np.where(diff > np.finfo(np.float32).eps) 113 | cnt += len(idx[0]) 114 | 115 | assert cnt == 0, "false/tot=%d/%d=%f" % (cnt, tot, cnt/tot) 116 | 117 | 118 | if __name__ == '__main__': 119 | unittest.main() 120 | -------------------------------------------------------------------------------- /docs/timeline.md: -------------------------------------------------------------------------------- 1 | # Performance Analysis of BytePS 2 | 3 | You can analyze the fine-grained performance of BytePS with the profiling tool. 4 | 5 | ## For Communication Operations 6 | 7 | ### Usage 8 | 9 | Use the following environment variables to enable profiling the communication operations: 10 | 11 | ``` python 12 | "BYTEPS_TRACE_ON" = "1" 13 | "BYTEPS_TRACE_END_STEP" = "20" 14 | "BYTEPS_TRACE_START_STEP"="10" 15 | "BYTEPS_TRACE_DIR"= "./traces" 16 | ``` 17 | First, set `BYTEPS_TRACE_ON` to `1` to enable profiling of communication traces. `BYTEPS_TRACE_START_STEP` and `BYTEPS_TRACE_END_STEP` define the step interval to profile: traces from step `BYTEPS_TRACE_START_STEP` to step `BYTEPS_TRACE_END_STEP` are collected automatically, and the resulting traces are output in the Chrome trace format. `BYTEPS_TRACE_DIR` denotes the path where you want to store the traces. 18 | 19 | The result directory is organized as follows. 20 | ``` 21 | traces/ 22 | ├── 0 23 | │   └── comm.json 24 | │  25 | └── 1 26 | └── comm.json 27 | ``` 28 | 29 | Here, `traces/` is the trace directory we defined using `BYTEPS_TRACE_DIR`. `traces/` contains several sub-directories, each of which corresponds to one GPU and is named with the local rank of that GPU, e.g., the path `./traces/0/` stores the trace results of the GPU whose local rank is `0`. Each sub-directory contains the following file: 30 | * `comm.json`: the final trace file, which contains the communication traces of all gradients. 31 | 32 | ### Trace Format 33 | Let's look deep into the traces. 34 | ``` json 35 | { 36 | "ph": "X", 37 | "args": { 38 | "name": "Comm.byteps.gradient_0" 39 | }, 40 | "pid": "Comm.byteps.gradient_0", 41 | "name": "Comm.byteps.gradient_0", 42 | "ts": 1574685989504865, 43 | "dur": 24026, 44 | "tid": "total" 45 | }, 46 | { 47 | "ph": "X", 48 | "args": { 49 | "name": "Comm.byteps.gradient_0" 50 | }, 51 | "pid": "Comm.byteps.gradient_0", 52 | "name": "Comm.byteps.gradient_0.BROADCAST", 53 | "ts": 1574685984662375, 54 | "dur": 1074, 55 | "tid": "26148864" 56 | } 57 | ``` 58 | Basically, the trace event format is the same as the standard [Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit). Here, `name` is the name of one event, which can be shown on `chrome://tracing/`. BytePS divides each gradient into multiple partitions if necessary, and each partition goes through several of the following operations, namely `QueueType`s. 59 | ``` 60 | "COORDINATE_REDUCE", 61 | "REDUCE", 62 | "COPYD2H", 63 | "PCIE_REDUCE", 64 | "COORDINATE_PUSH", 65 | "PUSH", 66 | "PULL", 67 | "COPYH2D", 68 | "COORDINATE_BROADCAST", 69 | "BROADCAST" 70 | ``` 71 | So there are two types of events: 72 | 1. If `tid` is `total`, the event records the entire interval to synchronize one gradient, including the queue time. In this case, `name` ends with the gradient index. 73 | 2. 
If `tid` is a number, the event records the interval for each `QueueType` of each partition of one gradient. In this case, `name` ends with the gradient index and the corresponding `QueueType`, and `tid` denotes the partition id. 74 | 75 | Note that with multiple GPUs on one worker, only the root GPU is responsible for synchronizing with the servers, and the GPUs on one worker update parameters through local all-reduce. Therefore, you can observe `PUSH` and `PULL` operations only in the traces of the root GPU. By default, the root GPU is the one with the largest local rank. 76 | 77 | Below is a visualization example of `comm.json`. 78 | 79 | 80 | ### Overhead 81 | Below is the latency when running the [`bert_12_768_12`](https://github.com/joapolarbear/gluon-nlp/tree/bert-byteprofile/scripts/bert) model with 2 workers, each containing 2 V100 GPUs with 16GB of memory. BytePS Timeline collects traces from step 10 to step 20; after step 20 it asynchronously outputs the trace results, which may also cause extra overhead. Ignoring the warm-up phase (the first 10 steps), the overhead induced by BytePS Timeline is small. 82 | 83 | 84 | -------------------------------------------------------------------------------- /byteps/common/logging.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #include "logging.h" 18 | #include <chrono> 19 | #include <cstdlib> 20 | #include <ctime> 21 | #include <iostream> 22 | 23 | namespace byteps { 24 | namespace common { 25 | 26 | LogMessage::LogMessage(const char* fname, int line, LogLevel severity) 27 | : fname_(fname), line_(line), severity_(severity) {} 28 | 29 | void LogMessage::GenerateLogMessage(bool log_time) { 30 | bool use_cout = 31 | static_cast<int>(severity_) <= static_cast<int>(LogLevel::INFO); 32 | std::ostream& os = use_cout ? std::cout : std::cerr; 33 | if (log_time) { 34 | auto now = std::chrono::system_clock::now(); 35 | auto as_time_t = std::chrono::system_clock::to_time_t(now); 36 | 37 | auto duration = now.time_since_epoch(); 38 | auto seconds = std::chrono::duration_cast<std::chrono::seconds>(duration); 39 | auto micros_remainder = 40 | std::chrono::duration_cast<std::chrono::microseconds>(duration - 41 | seconds); 42 | 43 | const size_t time_buffer_size = 30; 44 | char time_buffer[time_buffer_size]; 45 | strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S", 46 | localtime(&as_time_t)); 47 | os << "[" << time_buffer << "." 
<< std::setw(6) << micros_remainder.count() 48 | << ": " << LOG_LEVELS[static_cast<int>(severity_)] << " " << fname_ 49 | << ":" << line_ << "] " << str() << std::endl; 50 | } else { 51 | os << "[" << LOG_LEVELS[static_cast<int>(severity_)] << " " << fname_ << ":" 52 | << line_ << "] " << str() << std::endl; 53 | } 54 | } 55 | 56 | LogMessage::~LogMessage() { 57 | static LogLevel min_log_level = MinLogLevelFromEnv(); 58 | static bool log_time = LogTimeFromEnv(); 59 | if (severity_ >= min_log_level) { 60 | GenerateLogMessage(log_time); 61 | } 62 | } 63 | 64 | LogMessageFatal::LogMessageFatal(const char* file, int line) 65 | : LogMessage(file, line, LogLevel::FATAL) {} 66 | 67 | LogMessageFatal::~LogMessageFatal() { 68 | static bool log_time = LogTimeFromEnv(); 69 | GenerateLogMessage(log_time); 70 | abort(); 71 | } 72 | 73 | LogLevel ParseLogLevelStr(const char* env_var_val) { 74 | std::string min_log_level(env_var_val); 75 | std::transform(min_log_level.begin(), min_log_level.end(), 76 | min_log_level.begin(), ::tolower); 77 | if (min_log_level == "trace") { 78 | return LogLevel::TRACE; 79 | } else if (min_log_level == "debug") { 80 | return LogLevel::DEBUG; 81 | } else if (min_log_level == "info") { 82 | return LogLevel::INFO; 83 | } else if (min_log_level == "warning") { 84 | return LogLevel::WARNING; 85 | } else if (min_log_level == "error") { 86 | return LogLevel::ERROR; 87 | } else if (min_log_level == "fatal") { 88 | return LogLevel::FATAL; 89 | } else { 90 | return LogLevel::WARNING; 91 | } 92 | } 93 | 94 | LogLevel MinLogLevelFromEnv() { 95 | const char* env_var_val = getenv("BYTEPS_LOG_LEVEL"); 96 | if (env_var_val == nullptr) { 97 | // default to WARNING 98 | return LogLevel::WARNING; 99 | } 100 | return ParseLogLevelStr(env_var_val); 101 | } 102 | 103 | bool LogTimeFromEnv() { 104 | const char* env_var_val = getenv("BYTEPS_LOG_HIDE_TIME"); 105 | if (env_var_val != nullptr && std::strtol(env_var_val, nullptr, 10) > 0) { 106 | return false; 107 | } else { 108 | return true; 109 | } 110 | } 111 | 112 | } // namespace common 113 | } // namespace byteps 114 | -------------------------------------------------------------------------------- /example/pytorch/mnist-distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | import argparse 4 | import torch.multiprocessing as mp 5 | import torchvision 6 | import torchvision.transforms as transforms 7 | import torch 8 | import torch.nn as nn 9 | import torch.distributed as dist 10 | from torch.nn.parallel import DistributedDataParallel as DDP 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-n', '--nodes', default=4, type=int, metavar='N', 16 | help='number of nodes (default: 4)') 17 | parser.add_argument('-g', '--gpus', default=1, type=int, 18 | help='number of gpus per node') 19 | parser.add_argument('-nr', '--nr', default=0, type=int, 20 | help='ranking within the nodes') 21 | parser.add_argument('--epochs', default=2, type=int, metavar='N', 22 | help='number of total epochs to run') 23 | args = parser.parse_args() 24 | args.world_size = args.gpus * args.nodes 25 | os.environ['MASTER_ADDR'] = '10.57.23.164' 26 | os.environ['MASTER_PORT'] = '8888' 27 | mp.spawn(train, nprocs=args.gpus, args=(args,)) 28 | 29 | 30 | class ConvNet(nn.Module): 31 | def __init__(self, num_classes=10): 32 | super(ConvNet, self).__init__() 33 | self.layer1 = nn.Sequential( 34 | nn.Conv2d(1, 16, kernel_size=5, stride=1, 
padding=2), 35 | nn.BatchNorm2d(16), 36 | nn.ReLU(), 37 | nn.MaxPool2d(kernel_size=2, stride=2)) 38 | self.layer2 = nn.Sequential( 39 | nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2), 40 | nn.BatchNorm2d(32), 41 | nn.ReLU(), 42 | nn.MaxPool2d(kernel_size=2, stride=2)) 43 | self.fc = nn.Linear(7*7*32, num_classes) 44 | 45 | def forward(self, x): 46 | out = self.layer1(x) 47 | out = self.layer2(out) 48 | out = out.reshape(out.size(0), -1) 49 | out = self.fc(out) 50 | return out 51 | 52 | 53 | def train(gpu, args): 54 | rank = args.nr * args.gpus + gpu 55 | dist.init_process_group( 56 | backend='nccl', 57 | init_method='env://', 58 | world_size=args.world_size, 59 | rank=rank) 60 | torch.manual_seed(0) 61 | model = ConvNet() 62 | torch.cuda.set_device(gpu) 63 | model.cuda(gpu) 64 | batch_size = 100 65 | # define loss function (criterion) and optimizer 66 | criterion = nn.CrossEntropyLoss().cuda(gpu) 67 | optimizer = torch.optim.SGD(model.parameters(), 1e-4) 68 | # Wrap the model 69 | 70 | model = DDP(model, device_ids=[gpu]) 71 | # Data loading code 72 | train_dataset = torchvision.datasets.MNIST( 73 | root='./data', 74 | train=True, 75 | transform=transforms.ToTensor(), 76 | download=True 77 | ) 78 | train_sampler = torch.utils.data.distributed.DistributedSampler( 79 | train_dataset, 80 | num_replicas=args.world_size, 81 | rank=rank) 82 | train_loader = torch.utils.data.DataLoader( 83 | dataset=train_dataset, 84 | batch_size=batch_size, 85 | shuffle=False, 86 | num_workers=0, 87 | pin_memory=True, 88 | sampler=train_sampler 89 | ) 90 | 91 | start = datetime.now() 92 | total_step = len(train_loader) 93 | for epoch in range(args.epochs): 94 | for i, (images, labels) in enumerate(train_loader): 95 | images = images.cuda(non_blocking=True) 96 | labels = labels.cuda(non_blocking=True) 97 | # Forward pass 98 | outputs = model(images) 99 | loss = criterion(outputs, labels) 100 | 101 | # Backward and optimize 102 | optimizer.zero_grad() 103 | loss.backward() 104 | optimizer.step() 105 | if (i + 1) % 100 == 0 and gpu == 0: 106 | print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step, 107 | loss.item())) 108 | if gpu == 0: 109 | print("Training complete in: " + str(datetime.now() - start)) 110 | 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ about PS vs Allreduce 2 | 3 | Below we summarize a list of questions and *incorrect* statements that often confuse users. 4 | 5 | ### **BytePS has a better performance because of some data path details, e.g., less copying?** 6 | 7 | Not really. BytePS uses shared memory in a similar way to NCCL, and BytePS copies the data as many times as NCCL does. In addition, both BytePS and NCCL use tensor partitioning/segmentation internally, which hides most of the copying delay. 8 | 9 | Both BytePS and NCCL are close to the theoretical optimum of their respective communication patterns. 10 | 11 | ### **BytePS has a better performance because it has a hierarchical strategy, i.e., local reduce followed by inter-machine transfers?** 12 | 13 | The hierarchical strategy does help a bit. However, it is not as fundamental as the PS communication pattern. Consider the following example: 14 | 15 | You have N worker machines connected to the same network switch, and each worker has only 1 GPU. 
In this case, the topology is flat -- no hierarchy at all. You'll find that the analysis in [rationale.md](/docs/rationale.md) still applies: the PS communication pattern has less traffic volume *from the workers' standpoint*. 16 | 17 | ### **Allreduce is equivalent to PS, as long as you find the correct allreduce strategy?** 18 | 19 | Not really. Consider the above flat N-worker example again. No matter which allreduce strategy you use, the conclusion of PS vs. allreduce does not change. 20 | 21 | ### **Okay, I get that PS has less traffic from workers. But PS Push and Pull are not duplex, and waste half the bandwidth?** 22 | 23 | BytePS does not have this problem. It can fully utilize bi-directional network bandwidth. The key ideas are tensor partitioning and pipelining (see the sketch at the end of this FAQ). For example, suppose you have a 100MB tensor to be pushed and pulled. Inside BytePS, we partition the tensor into small pieces. After pushing the first piece, we start pulling the first piece and, at the same time, start pushing the second piece, and so on. For most of the time, except for the first and the last piece, the bi-directional bandwidth is fully utilized. 24 | 25 | ### **Since the bottleneck is the NIC of GPU machines, why not add more NICs?** 26 | 27 | There are indeed [specialized physical server designs](https://images.nvidia.com/content/pdf/dgx1-v100-system-architecture-whitepaper.pdf) doing that. Unfortunately, cloud or shared clusters usually prefer not to do this. This is because, as a matter of fact, many training jobs are not distributed. For these jobs, users want the GPUs to be deployed as densely as possible, and the network bandwidth requirement is low. 28 | 29 | If you are building your own cluster for a *single* dedicated *distributed* training job, of course you can go the HPC route: carefully calculate the best ratio between GPUs and NICs, build a homogeneous cluster, and use allreduce. However, please realize that cloud and shared clusters are not HPC. This is the whole point of BytePS. 30 | 31 | ### **Does the PS architecture impose heavier cross-rack traffic, and can it be impacted by the physical network over-subscription ratio?** 32 | 33 | This is true. For a large job whose workers and PS cannot fit inside a rack, PS does have more cross-rack traffic. 34 | 35 | However, the comparison with allreduce in real life is more complicated. It depends on how well you can control the physical job placement and the allreduce rings. If you don't have full control of placement, or your MPI/NCCL rank assignment is not aware of the physical network topology, allreduce faces exactly the same problem. NCCL and most MPIs today are unaware of the physical network topology, unless specifically designed for a given HPC. 36 | 37 | Don't be scared of the over-subscription ratio. It exists for a reason -- usually, not all servers in a rack are simultaneously busy on networking. Multiple studies from major cloud providers show that the average bandwidth utilization is low. Remember, this is a shared cluster; not everyone is running distributed training. 38 | 39 | ### **Final remarks** 40 | 41 | With BytePS, we want to share two key insights -- 42 | 43 | * Cloud, either public or private, is different from HPC. Using ideas from HPC is a shortcut, but not optimal. 44 | * In a (public or private) cloud, the PS architecture is theoretically better than allreduce, with minimal additional costs. 45 | 46 | BytePS is a realization of these ideas. 
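To make the push/pull pipelining answer above concrete, here is a toy sketch of the idea. It is not BytePS code: `async_push` and `async_pull` are hypothetical asynchronous primitives that return waitable handles, and the sketch only illustrates how pulls of earlier partitions overlap with pushes of later ones.

```python
# Toy illustration of push/pull pipelining over tensor partitions.
# async_push/async_pull are hypothetical async primitives (not BytePS APIs);
# each returns a handle whose wait() blocks until the transfer finishes.
def sync_gradient(partitions, async_push, async_pull):
    # Issue all pushes up front; they proceed in order on the uplink.
    push_handles = [async_push(p) for p in partitions]
    pull_handles = []
    for i, handle in enumerate(push_handles):
        handle.wait()  # partition i has been pushed...
        # ...so its pull can start now, overlapping with the push of
        # partition i+1, keeping both directions busy except at the two ends.
        pull_handles.append(async_pull(partitions[i]))
    return [h.wait() for h in pull_handles]
```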
47 | -------------------------------------------------------------------------------- /tests/test_onebit.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import itertools 17 | import unittest 18 | 19 | import byteps.mxnet as bps 20 | import mxnet as mx 21 | import mxnet.ndarray as nd 22 | import numpy as np 23 | from gluoncv.model_zoo import get_model 24 | from mxnet import autograd, gluon 25 | from parameterized import parameterized 26 | from tqdm import tqdm 27 | 28 | from meta_test import MetaTest 29 | from utils import fake_data 30 | 31 | 32 | def onebit(x, scaling): 33 | if scaling: 34 | l1 = np.linalg.norm(x.flatten(), 1) 35 | sign = x < 0 36 | sign = -((sign << 1) - 1) 37 | if scaling: 38 | return l1 / len(x.flatten()) * sign 39 | else: 40 | return sign 41 | 42 | 43 | class OnebitTestCase(unittest.TestCase, metaclass=MetaTest): 44 | @parameterized.expand(itertools.product([True, False])) 45 | def test_onebit(self, scaling): 46 | bps.init() 47 | ctx = mx.gpu(0) 48 | net = get_model("resnet18_v2") 49 | net.initialize(mx.init.Xavier(), ctx=ctx) 50 | net.summary(nd.ones((1, 3, 224, 224), ctx=ctx)) 51 | 52 | # hyper-params 53 | batch_size = 32 54 | optimizer_params = {'momentum': 0, 'wd': 0, 55 | 'learning_rate': 0.01} 56 | 57 | compression_params = { 58 | "compressor": "onebit", 59 | "scaling": scaling, 60 | } 61 | 62 | trainer = bps.DistributedTrainer(net.collect_params( 63 | ), "sgd", optimizer_params, compression_params=compression_params) 64 | 65 | loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() 66 | 67 | train_data = fake_data(batch_size=batch_size) 68 | 69 | params = {} 70 | 71 | for i, param in enumerate(trainer._params): 72 | if param.grad_req != 'null': 73 | params[i] = param._data[0].asnumpy() 74 | 75 | for it, batch in tqdm(enumerate(train_data)): 76 | data = batch[0].as_in_context(ctx) 77 | label = batch[1].as_in_context(ctx) 78 | 79 | with autograd.record(): 80 | output = net(data) 81 | loss = loss_fn(output, label) 82 | 83 | loss.backward() 84 | 85 | gs = {} 86 | xs = {} 87 | 88 | for i, param in enumerate(trainer._params): 89 | if param.grad_req != 'null': 90 | gs[i] = param._grad[0].asnumpy() 91 | xs[i] = param._data[0].asnumpy() 92 | 93 | trainer.step(batch_size) 94 | 95 | for i, param in enumerate(trainer._params): 96 | if param.grad_req != "null": 97 | g = gs[i] / (batch_size * bps.size()) 98 | c = onebit(g, scaling) 99 | 100 | cs = onebit(c, scaling) 101 | c = cs 102 | 103 | params[i] -= optimizer_params["learning_rate"] * c 104 | 105 | cnt = 0 106 | tot = 0 107 | for i, param in enumerate(trainer._params): 108 | if param.grad_req != "null": 109 | x = param._data[0].asnumpy() 110 | tot += len(x.flatten()) 111 | if not np.allclose(params[i], x, atol=np.finfo(np.float32).eps): 112 | diff = 
np.abs(x.flatten() - params[i].flatten()) 113 | idx = np.where(diff > np.finfo(np.float32).eps) 114 | cnt += len(idx[0]) 115 | 116 | assert cnt == 0, "false/tot=%d/%d=%f" % (cnt, tot, cnt/tot) 117 | 118 | 119 | if __name__ == '__main__': 120 | unittest.main() 121 | --------------------------------------------------------------------------------
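As a quick sanity check of the `onebit()` reference implementation in the test above, the following sketch works out the arithmetic for a tiny gradient by hand. It assumes only `numpy` and the `onebit()` defined in `tests/test_onebit.py` being in scope; the values are illustrative.

```python
import numpy as np

# Sanity-check onebit() from tests/test_onebit.py on a tiny "gradient".
g = np.array([0.5, -2.0, 1.5], dtype=np.float32)

sign = onebit(g, scaling=False)   # negative entries -> -1, others -> +1
scaled = onebit(g, scaling=True)  # scaled by l1/n = (0.5 + 2.0 + 1.5)/3 = 4/3

assert np.array_equal(sign, np.array([1, -1, 1]))
assert np.allclose(scaled, (4.0 / 3.0) * sign)
```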