├── byteps ├── __init__.py ├── misc │ └── __init__.py ├── torch │ ├── parallel │ │ └── __init__.py │ ├── cuda_util.h │ ├── adapter.h │ ├── ready_event.h │ ├── handle_manager.h │ ├── cuda_util.cc │ ├── handle_manager.cc │ ├── ops.h │ ├── compression.py │ ├── adapter.cc │ └── ready_event.cc ├── tensorflow │ ├── distribute │ │ └── __init__.py │ ├── util.py │ ├── ops.h │ └── compression.py ├── __version__.py ├── server │ ├── __init__.py │ └── queue.h ├── mxnet │ ├── cuda_util.h │ ├── ready_event.cc │ ├── util.h │ ├── ready_event.h │ ├── adapter.h │ ├── cuda_util.cc │ ├── ops.h │ ├── adapter.cc │ └── tensor_util.h └── common │ ├── compressor │ ├── momentum.cc │ ├── error_feedback.cc │ ├── impl │ │ ├── nesterov_momentum.h │ │ ├── nesterov_momentum.cc │ │ ├── vanilla_error_feedback.h │ │ ├── vanilla_error_feedback.cc │ │ ├── topk.h │ │ ├── onebit.h │ │ ├── dithering.h │ │ └── randomk.h │ ├── compressor_registry.h │ ├── compressor_registry.cc │ ├── momentum.h │ └── error_feedback.h │ ├── core_loops.h │ ├── ready_table.cc │ ├── ready_table.h │ ├── scheduled_queue.h │ ├── shared_memory.h │ ├── thread_pool.h │ ├── operations.h │ ├── shared_memory.cc │ ├── nccl_manager.h │ ├── logging.h │ └── logging.cc ├── example ├── mxnet │ ├── common │ │ ├── __init__.py │ │ ├── find_mxnet.py │ │ └── util.py │ ├── symbols │ │ ├── __init__.py │ │ ├── README.md │ │ ├── mlp.py │ │ ├── lenet.py │ │ ├── alexnet.py │ │ └── vgg.py │ ├── data │ │ ├── imagenet1k-val.sh │ │ └── caltech256.sh │ └── train_imagenet_byteps.py ├── README.md ├── tensorflow │ ├── tensorflow2_mnist.py │ ├── tensorflow2_mnist_bps_MirroredStrategy.py │ ├── tensorflow_keras_mnist.py │ └── tensorflow2_keras_mnist.py ├── keras │ ├── keras_synthetic_benchmark_tf2.py │ └── keras_mnist.py └── pytorch │ └── mnist-distributed.py ├── byteps.exp ├── .clang-format ├── byteps.lds ├── .gitmodules ├── pre_setup.py ├── MANIFEST.in ├── CONTRIBUTING.md ├── docker ├── README.md └── Dockerfile ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── tests ├── run_byteps_test.sh ├── utils.py ├── meta_test.py ├── test_tensorflow_keras.py ├── test_topk.py └── test_onebit.py ├── launcher └── README.md ├── docs ├── MirroredStrategy.md ├── DistributedDataParallel.md ├── performance.md ├── cross-barrier.md ├── troubleshooting.md ├── running.md ├── architecture.md ├── best-practice.md ├── timeline.md └── faq.md ├── CHANGELOG.rst ├── .travis.yml ├── NOTICE └── .gitignore /byteps/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /byteps/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/mxnet/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /example/mxnet/symbols/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /byteps.exp: -------------------------------------------------------------------------------- 1 | *byteps* 2 | # PyTorch binding 3 | *PyInit* 4 | *initc_lib* 5 | -------------------------------------------------------------------------------- /example/README.md: 
-------------------------------------------------------------------------------- 1 | For more examples, see: https://github.com/byteps/examples -------------------------------------------------------------------------------- /byteps/torch/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedDataParallel 2 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | --- 4 | Language: Cpp 5 | ColumnLimit: 80 6 | -------------------------------------------------------------------------------- /byteps/tensorflow/distribute/__init__.py: -------------------------------------------------------------------------------- 1 | from .mirrored_strategy import MirroredStrategy 2 | -------------------------------------------------------------------------------- /byteps/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 2, 5) 2 | 3 | __version__ = '.'.join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /byteps.lds: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | *byteps*; 4 | # PyTorch binding 5 | *PyInit*; 6 | *initc_lib*; 7 | local: *; 8 | }; 9 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/ps-lite"] 2 | path = 3rdparty/ps-lite 3 | url = https://github.com/bytedance/ps-lite 4 | branch = byteps 5 | -------------------------------------------------------------------------------- /pre_setup.py: -------------------------------------------------------------------------------- 1 | # For internal use. Please do not modify this file. 2 | 3 | def setup(): 4 | return 5 | 6 | def extra_make_option(): 7 | return "" 8 | 9 | # absolute path to the ucx tar.gz file 10 | ucx_tarball_path = "" 11 | -------------------------------------------------------------------------------- /example/mxnet/symbols/README.md: -------------------------------------------------------------------------------- 1 | # Symbol 2 | 3 | This folder contains definitions of various networks. To add a new network, please 4 | use the following format. 5 | 6 | ## Python 7 | 8 | - A file implements one network proposed in a paper, with the network name as the 9 | filename. 10 | - Mention the paper and the modifications made, if any, at the beginning 11 | of the file.
12 | - Indicate how to reproduce the accuracy numbers in the paper if it is not straightforward. 13 | - Provide a function `get_symbol()` that returns the network. 14 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include */* LICENSE byteps.lds byteps.exp 2 | recursive-include * *.cc *.h 3 | prune .git 4 | prune dist 5 | prune bin 6 | prune __pycache__ 7 | prune 3rdparty 8 | graft 3rdparty/ps-lite 9 | prune 3rdparty/ps-lite/build 10 | prune 3rdparty/ps-lite/deps 11 | exclude 3rdparty/ps-lite/tests/test_benchmark 12 | exclude 3rdparty/ps-lite/tests/test_benchmark.d 13 | exclude 3rdparty/ps-lite/tests/test_ipc_benchmark 14 | exclude 3rdparty/ps-lite/tests/test_ipc_benchmark.d 15 | 16 | include pre_setup.py pre_setup_local.py zeromq-4.1.4.tar.gz ucx.tar.gz 17 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guidelines 2 | 3 | First of all, thanks for taking the time to contribute! 4 | 5 | Please refer to the following guidelines to contribute new functionality or bug fixes: 6 | 7 | 1. Use [autopep8](https://github.com/hhatto/autopep8) to format the Python code. 8 | 2. Use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to format C++ code. Changes to BytePS C++ code should conform to the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). 9 | 3. Add unit tests for any new code you write. 10 | 4. Run unit tests in both CI and GPU environments. 11 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Prebuilt Images 2 | 3 | Below are the prebuilt Docker images and the commands used to build them. These prebuilt images might not be up to date; 4 | you may need to build them manually from the Dockerfile to get the latest BytePS functionality. 5 | 6 | | Docker image | How to build | 7 | | --- | --- | 8 | | bytepsimage/tensorflow | docker build -t bytepsimage/tensorflow . -f Dockerfile --build-arg FRAMEWORK=tensorflow | 9 | | bytepsimage/pytorch | docker build -t bytepsimage/pytorch . -f Dockerfile --build-arg FRAMEWORK=pytorch | 10 | | bytepsimage/mxnet | docker build -t bytepsimage/mxnet . -f Dockerfile --build-arg FRAMEWORK=mxnet | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here.
21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. 16 | 2. 17 | 3. 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Environment (please complete the following information):** 27 | - OS: 28 | - GCC version: 29 | - CUDA and NCCL version: 30 | - Framework (TF, PyTorch, MXNet): 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 34 | -------------------------------------------------------------------------------- /tests/run_byteps_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="$(dirname $0)" 4 | 5 | export PATH=~/anaconda3/envs/mxnet_p36/bin:$PATH 6 | export DMLC_NUM_WORKER=1 7 | export DMLC_NUM_SERVER=1 8 | export DMLC_PS_ROOT_URI=127.0.0.1 9 | export DMLC_PS_ROOT_PORT=1234 10 | 11 | function cleanup() { 12 | rm -rf lr.s 13 | } 14 | 15 | trap cleanup EXIT 16 | 17 | pkill bpslaunch 18 | pkill python3 19 | 20 | echo "Launch scheduler" 21 | export DMLC_ROLE=scheduler 22 | bpslaunch & 23 | 24 | echo "Launch server" 25 | export DMLC_ROLE=server 26 | bpslaunch & 27 | 28 | export NVIDIA_VISIBLE_DEVICES=0 29 | export DMLC_WORKER_ID=0 30 | export DMLC_ROLE=worker 31 | export BYTEPS_THREADPOOL_SIZE=4 32 | export BYTEPS_FORCE_DISTRIBUTED=1 33 | export BYTEPS_LOG_LEVEL=WARNING 34 | 35 | if [ "$TEST_TYPE" == "keras" ]; then 36 | echo "TEST KERAS ..." 37 | python $path/test_tensorflow_keras.py $@ 38 | else 39 | echo "Error: unsupported $TEST_TYPE" 40 | exit 1 41 | fi 42 | -------------------------------------------------------------------------------- /launcher/README.md: -------------------------------------------------------------------------------- 1 | ### How to use distributed launcher 2 | 3 | Create two host files: `worker_hosts` and `server_hosts`, put your lists of hosts inside (one IP:port per line). 4 | 5 | For example, we want `10.0.0.1:12345` to be the scheduler, `10.0.0.2` and `10.0.0.3` to be the workers, `10.0.0.4` and `10.0.0.5` to be the servers. 6 | 7 | Then `worker_hosts` should be: 8 | ``` 9 | 10.0.0.2 10 | 10.0.0.3 11 | ``` 12 | 13 | And `server_hosts` should be: 14 | ``` 15 | 10.0.0.4 16 | 10.0.0.5 17 | ``` 18 | 19 | Finally, start the distributed ssh launcher by: 20 | 21 | ``` 22 | python dist_launcher.py --worker-hostfile worker_hosts --server-hostfile server_hosts \ 23 | --scheduler-ip 10.0.0.1 --scheduler-port 12345 \ 24 | --username root --env ENV1:1 --env ENV2:2 \ 25 | 'echo this is $DMLC_ROLE; python byteps/launcher/launch.py YOUR_COMMAND' 26 | ``` 27 | 28 | The script will automatically help you setup the necessary [environment variables](/docs/env.md) and launch BytePS processes. 
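
For reference, the launcher exports the coordination variables on each node before running your command. Below is a sketch of what a worker's environment would look like for the example above; the variable names come from the BytePS/ps-lite convention used elsewhere in this repo (e.g. tests/run_byteps_test.sh), while the mapping from launcher flags to values is inferred here, so treat it as illustrative and see the [environment variables](/docs/env.md) document for the authoritative list:

```bash
# Illustrative sketch of the environment on worker 10.0.0.2 (not a verbatim dump)
export DMLC_ROLE=worker           # scheduler / server / worker, per process
export DMLC_NUM_WORKER=2          # number of lines in worker_hosts
export DMLC_NUM_SERVER=2          # number of lines in server_hosts
export DMLC_PS_ROOT_URI=10.0.0.1  # from --scheduler-ip
export DMLC_PS_ROOT_PORT=12345    # from --scheduler-port
export DMLC_WORKER_ID=0           # index of this host in worker_hosts
export ENV1=1                     # from --env ENV1:1
export ENV2=2                     # from --env ENV2:2
```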
-------------------------------------------------------------------------------- /example/mxnet/common/find_mxnet.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import os, sys 19 | try: 20 | import mxnet as mx 21 | except ImportError: 22 | curr_path = os.path.abspath(os.path.dirname(__file__)) 23 | sys.path.append(os.path.join(curr_path, "../../../python")) 24 | import mxnet as mx 25 | -------------------------------------------------------------------------------- /byteps/server/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ctypes 17 | import os 18 | from byteps.common import get_ext_suffix 19 | 20 | 21 | def run(): 22 | dll_path = os.path.join(os.path.dirname(__file__), 23 | 'c_lib' + get_ext_suffix()) 24 | SERVER_LIB_CTYPES = ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL) 25 | SERVER_LIB_CTYPES.byteps_server() 26 | 27 | run() 28 | -------------------------------------------------------------------------------- /docs/MirroredStrategy.md: -------------------------------------------------------------------------------- 1 | # MirroredStrategy 2 | 3 | The BytePS MirroredStrategy module is compatible with TensorFlow's 4 | MultiWorkerMirroredStrategy for the most part. Instead of using the built-in 5 | TensorFlow collective communication implementation, it uses BytePS push-pull 6 | for gradient reduction between nodes. 7 | 8 | It currently supports the Single-Process Single-GPU mode. In this mode each 9 | process works with one GPU.
Example usage: 10 | 11 | 12 | ```python 13 | import byteps.tensorflow as bps 14 | from byteps.tensorflow.distribute import MirroredStrategy 15 | 16 | bps.init() 17 | tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU') 18 | strategy = MirroredStrategy(devices=["/gpu:0"]) 19 | 20 | with strategy.scope(): 21 | # Model building/compiling need to be within `strategy.scope()`. 22 | multi_worker_model = build_and_compile_cnn_model() 23 | 24 | multi_worker_model.fit(multi_worker_dataset, epochs=100, steps_per_epoch=70) 25 | ``` 26 | To run the program, use `bpslaunch` to launch one process for each device you 27 | wish to use. Refer to the [running](./running.md) document for how to use 28 | `bpslaunch`. 29 | -------------------------------------------------------------------------------- /byteps/tensorflow/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | from distutils.version import LooseVersion 16 | 17 | import tensorflow as tf 18 | 19 | 20 | if LooseVersion(tf.__version__) >= LooseVersion("1.9.0"): 21 | from tensorflow.python.eager import context 22 | _has_eager = True 23 | else: 24 | _has_eager = False 25 | 26 | 27 | def _executing_eagerly(): 28 | """Returns true if eager execution is supported and enabled.""" 29 | return _has_eager and context.in_eager_mode() 30 | -------------------------------------------------------------------------------- /byteps/mxnet/cuda_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_CUDA_UTIL_H 18 | #define BYTEPS_MXNET_CUDA_UTIL_H 19 | 20 | namespace byteps { 21 | namespace mxnet { 22 | 23 | class with_device { 24 | public: 25 | with_device(int device); 26 | ~with_device(); 27 | 28 | private: 29 | int restore_device_; 30 | }; 31 | 32 | } // namespace mxnet 33 | } // namespace byteps 34 | 35 | #endif // BYTEPS_MXNET_CUDA_UTIL_H 36 | -------------------------------------------------------------------------------- /byteps/torch/cuda_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_CUDA_UTIL_H 18 | #define BYTEPS_TORCH_CUDA_UTIL_H 19 | 20 | #include "../common/common.h" 21 | 22 | namespace byteps { 23 | namespace torch { 24 | 25 | class with_device { 26 | public: 27 | with_device(int device); 28 | ~with_device(); 29 | 30 | private: 31 | int restore_device_ = CPU_DEVICE_ID; 32 | }; 33 | 34 | } // namespace torch 35 | } // namespace byteps 36 | 37 | #endif // BYTEPS_TORCH_CUDA_UTIL_H 38 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2 | Changelog for BytePS 3 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 4 | 0.2.4 (2020-06) 5 | ------------------ 6 | * Fix compatibility issue with tf2 + standalone keras 7 | * Add support for tensorflow.keras 8 | * Improve robustness of broadcast 9 | 10 | 11 | 0.2.3 (2020-05) 12 | ------------------ 13 | * Add DistributedDataParallel module for PyTorch 14 | * Fix the problem of different CPU tensor using the same name 15 | * Add skip_synchronize api for PyTorch 16 | * Add the option for lazy/non-lazy init 17 | 18 | 19 | 0.2.0 (2020-02) 20 | ------------------ 21 | * Largely improve RDMA performance by enforcing page aligned memory. 22 | * Add IPC support for RDMA. Now support colocating servers and workers without sacrificing much performance. 23 | * Fix a hanging bug in BytePS server. 24 | * Fix RDMA-related segmentation fault problem during fork() (e.g., used by PyTorch data loader). 25 | * New feature: Enable mixing use of colocate and non-colocate servers, along with a smart tensor allocation strategy. 26 | * New feature: Add ``bpslaunch`` as the command to launch tasks. 27 | * Add support for pip install: ``pip3 install byteps`` 28 | 29 | 30 | 0.1.0 (2019-12) 31 | ------------------ 32 | * First official release. 
33 | -------------------------------------------------------------------------------- /byteps/common/compressor/momentum.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "momentum.h" 17 | 18 | namespace byteps { 19 | namespace common { 20 | namespace compressor { 21 | 22 | tensor_t Momentum::Compress(tensor_t grad) { 23 | // 1. m_t = \mu * m_{t-1} + g_t 24 | UpdateMom(grad); 25 | 26 | // 2. p_t = \mu m_t + g_t 27 | UpdateGradient(grad); 28 | 29 | // 3. compress 30 | return _cptr->Compress(grad); 31 | } 32 | 33 | tensor_t Momentum::Decompress(tensor_t compressed) { 34 | // directly forward to internal compressor 35 | return _cptr->Decompress(compressed); 36 | } 37 | 38 | } // namespace compressor 39 | } // namespace common 40 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/mxnet/ready_event.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #include 18 | 19 | #if HAVE_CUDA 20 | #include 21 | 22 | #include "ready_event.h" 23 | 24 | namespace byteps { 25 | namespace mxnet { 26 | 27 | template 28 | MXReadyEvent::MXReadyEvent(NDArray* tensor) : tensor_(tensor) { 29 | assert(tensor->ctx().real_dev_id() != CPU_DEVICE_ID); 30 | } 31 | 32 | template 33 | MXReadyEvent::~MXReadyEvent() {} 34 | 35 | template 36 | bool MXReadyEvent::Ready() const { 37 | return true; 38 | } 39 | 40 | template class MXReadyEvent; 41 | 42 | } // namespace mxnet 43 | } // namespace byteps 44 | #endif 45 | -------------------------------------------------------------------------------- /byteps/common/core_loops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_CORE_LOOPS_H 17 | #define BYTEPS_CORE_LOOPS_H 18 | 19 | namespace byteps { 20 | namespace common { 21 | 22 | void CoordinateReduceLoop(); 23 | 24 | void CoordinateBroadcastLoop(); 25 | 26 | void CoordinatePushLoop(); 27 | 28 | void PcieReduceLoop(); 29 | 30 | void RootNcclLoop(); 31 | 32 | void NonRootNcclLoop(); 33 | 34 | void SyncNcclLoop(); 35 | 36 | void CopyDevice2HostLoop(); 37 | 38 | void CompressLoop(); 39 | 40 | void PushLoop(); 41 | 42 | void PullLoop(); 43 | 44 | void DecompressLoop(); 45 | 46 | void RootCopyHost2DeviceLoop(); 47 | 48 | void NonRootCopyListenLoop(); 49 | 50 | void NonRootCopyHost2DeviceLoop(); 51 | 52 | } // namespace common 53 | } // namespace byteps 54 | 55 | #endif // BYTEPS_CORE_LOOPS_H 56 | -------------------------------------------------------------------------------- /byteps/mxnet/util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_UTIL_H 18 | #define BYTEPS_MXNET_UTIL_H 19 | 20 | #if HAVE_CUDA 21 | 22 | #include 23 | 24 | /*! 25 | * \brief Protected CUDA call. 26 | * \param func Expression to call. 27 | * 28 | * It checks for CUDA errors after invocation of the expression. 29 | */ 30 | #define CUDA_CALL(func) \ 31 | { \ 32 | cudaError_t e = (func); \ 33 | CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ 34 | << "CUDA: " << cudaGetErrorString(e); \ 35 | } 36 | 37 | #endif // HAVE_CUDA 38 | 39 | #endif // BYTEPS_MXNET_UTIL_H 40 | -------------------------------------------------------------------------------- /example/mxnet/symbols/mlp.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. 
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """ 19 | a simple multilayer perceptron 20 | """ 21 | import mxnet as mx 22 | 23 | def get_symbol(num_classes=10, **kwargs): 24 | data = mx.symbol.Variable('data') 25 | data = mx.sym.Flatten(data=data) 26 | fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) 27 | act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") 28 | fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) 29 | act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") 30 | fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) 31 | mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') 32 | return mlp 33 | -------------------------------------------------------------------------------- /byteps/torch/adapter.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_ADAPTER_H 18 | #define BYTEPS_TORCH_ADAPTER_H 19 | 20 | #include 21 | #include 22 | 23 | #include "../common/common.h" 24 | 25 | namespace byteps { 26 | namespace torch { 27 | 28 | using namespace byteps::common; 29 | 30 | class TorchTensor : public Tensor { 31 | public: 32 | TorchTensor(::torch::Tensor tensor); 33 | virtual const DataType dtype() const override; 34 | virtual const TensorShape shape() const override; 35 | virtual const void* data() const override; 36 | virtual int64_t size() const override; 37 | 38 | protected: 39 | ::torch::Tensor tensor_; 40 | }; 41 | 42 | void ThrowIfError(Status status); 43 | 44 | } // namespace torch 45 | } // namespace byteps 46 | 47 | #endif // BYTEPS_TORCH_ADAPTER_H 48 | -------------------------------------------------------------------------------- /byteps/torch/ready_event.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_READY_EVENT_H 18 | #define BYTEPS_TORCH_READY_EVENT_H 19 | 20 | #if HAVE_CUDA 21 | #include "cuda_runtime.h" 22 | #endif 23 | 24 | #include 25 | 26 | #include "../common/common.h" 27 | 28 | namespace byteps { 29 | namespace torch { 30 | 31 | using namespace byteps::common; 32 | 33 | #if HAVE_CUDA 34 | class TorchReadyEvent : public ReadyEvent { 35 | public: 36 | TorchReadyEvent(int device); 37 | ~TorchReadyEvent(); 38 | virtual bool Ready() const override; 39 | 40 | private: 41 | int device_ = CPU_DEVICE_ID; 42 | cudaEvent_t cuda_event_ = nullptr; 43 | }; 44 | #endif 45 | 46 | std::shared_ptr RecordReadyEvent(int device); 47 | 48 | } // namespace torch 49 | } // namespace byteps 50 | 51 | #endif // BYTEPS_TORCH_READY_EVENT_H 52 | -------------------------------------------------------------------------------- /byteps/mxnet/ready_event.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_READY_EVENT_H 18 | #define BYTEPS_MXNET_READY_EVENT_H 19 | 20 | #include 21 | 22 | #if HAVE_CUDA 23 | #include 24 | #include 25 | #include 26 | #include "cuda_runtime.h" 27 | 28 | #include "../common/common.h" 29 | 30 | namespace byteps { 31 | namespace mxnet { 32 | 33 | using namespace byteps::common; 34 | typedef ::mxnet::NDArray NDArray; 35 | 36 | template 37 | class MXReadyEvent : public ReadyEvent { 38 | public: 39 | MXReadyEvent(NDArray* tensor); 40 | ~MXReadyEvent(); 41 | virtual bool Ready() const override; 42 | 43 | private: 44 | NDArray* tensor_; 45 | }; 46 | 47 | } // namespace mxnet 48 | } // namespace byteps 49 | #endif 50 | 51 | #endif // BYTEPS_MXNET_READY_EVENT_H 52 | -------------------------------------------------------------------------------- /byteps/torch/handle_manager.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_HANDLE_MANAGER_H 18 | #define BYTEPS_TORCH_HANDLE_MANAGER_H 19 | 20 | #include <atomic> 21 | #include <memory> 22 | #include <mutex> 23 | #include <unordered_map> 24 | 25 | #include "../common/common.h" 26 | 27 | namespace byteps { 28 | namespace torch { 29 | 30 | using namespace byteps::common; 31 | 32 | class HandleManager { 33 | public: 34 | int AllocateHandle(); 35 | void MarkDone(int handle, const Status& status); 36 | bool PollHandle(int handle); 37 | std::shared_ptr<Status> ReleaseHandle(int handle); 38 | 39 | private: 40 | std::atomic_int last_handle_; 41 | std::unordered_map<int, std::shared_ptr<Status>> results_; 42 | std::mutex mutex_; 43 | }; 44 | 45 | } // namespace torch 46 | } // namespace byteps 47 | 48 | #endif // BYTEPS_TORCH_HANDLE_MANAGER_H 49 | -------------------------------------------------------------------------------- /docs/DistributedDataParallel.md: -------------------------------------------------------------------------------- 1 | # DistributedDataParallel 2 | 3 | The BytePS Distributed Data Parallel module is compatible with PyTorch Distributed 4 | Data Parallel for the most part. Instead of using PyTorch communication 5 | backends, it uses BytePS push-pull for gradient reduction between nodes. 6 | 7 | It currently supports the Single-Process Single-GPU mode. In this mode each 8 | process works with one GPU. Example usage: 9 | 10 | 11 | ```python 12 | # byteps_ddp_example.py 13 | from byteps.torch.parallel import DistributedDataParallel 14 | 15 | model = DistributedDataParallel(model, device_ids=[i]) 16 | output = model(data) 17 | loss = F.nll_loss(output, target) 18 | loss.backward() 19 | optimizer.step() 20 | ``` 21 | 22 | Some models have branches, and part of the model is skipped during the forward 23 | pass. In that case it's required to call the 24 | DistributedDataParallel.synchronize() function after loss.backward(), e.g.: 25 | 26 | ```python 27 | # byteps_ddp_example.py 28 | from byteps.torch.parallel import DistributedDataParallel 29 | 30 | # construct a model which skips some layers in the forward pass, then wrap the 31 | # model with DistributedDataParallel() 32 | model = DistributedDataParallel(model, device_ids=[i]) 33 | output = model(data) 34 | loss = F.nll_loss(output, target) 35 | loss.backward() 36 | # the synchronize() call here is required because some layers were skipped in 37 | # the forward pass 38 | model.synchronize() 39 | optimizer.step() 40 | ``` 41 | 42 | To run the program, use `bpslaunch` to launch one process for each device you 43 | wish to use. Refer to the [running](./running.md) document for how to use `bpslaunch`. 44 | -------------------------------------------------------------------------------- /byteps/torch/cuda_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #if HAVE_CUDA 17 | #include 18 | #include "cuda_runtime.h" 19 | #endif 20 | 21 | #include "../common/common.h" 22 | #include "cuda_util.h" 23 | 24 | namespace byteps { 25 | namespace torch { 26 | 27 | with_device::with_device(int device) { 28 | if (device == CPU_DEVICE_ID) { 29 | restore_device_ = CPU_DEVICE_ID; 30 | } else { 31 | #if HAVE_CUDA 32 | THCudaCheck(cudaGetDevice(&restore_device_)); 33 | THCudaCheck(cudaSetDevice(device)); 34 | #else 35 | throw std::logic_error( 36 | "Internal error. Requested device context manager " 37 | "with GPU device but not compiled with CUDA."); 38 | #endif 39 | } 40 | } 41 | 42 | with_device::~with_device() { 43 | #if HAVE_CUDA 44 | if (restore_device_ != CPU_DEVICE_ID) { 45 | THCudaCheck(cudaSetDevice(restore_device_)); 46 | } 47 | #endif 48 | } 49 | 50 | } // namespace torch 51 | } // namespace byteps 52 | -------------------------------------------------------------------------------- /byteps/common/compressor/error_feedback.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "error_feedback.h" 17 | 18 | namespace byteps { 19 | namespace common { 20 | namespace compressor { 21 | 22 | tensor_t ErrorFeedback::Compress(tensor_t grad) { 23 | // 1. grad <- grad + error 24 | UpdateGradient(grad); 25 | 26 | // 2. c <- Compress(grad) 27 | auto compressed = _cptr->Compress(grad); 28 | 29 | // 3. e <- grad - Decompress(c) 30 | UpdateError(grad, compressed); 31 | 32 | return compressed; 33 | } 34 | 35 | tensor_t ErrorFeedback::Decompress(tensor_t compressed) { 36 | // directly forward to internal compressor 37 | return _cptr->Decompress(compressed); 38 | } 39 | 40 | void ErrorFeedback::UpdateError(tensor_t corrected, tensor_t compressed) { 41 | tensor_t error{_error.get(), _size, corrected.dtype}; 42 | _cptr->FastUpdateError(error, corrected, compressed); 43 | } 44 | 45 | } // namespace compressor 46 | } // namespace common 47 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/mxnet/adapter.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 
2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_ADAPTER_H 18 | #define BYTEPS_MXNET_ADAPTER_H 19 | 20 | #include 21 | #include "../common/common.h" 22 | 23 | namespace byteps { 24 | namespace mxnet { 25 | 26 | using namespace byteps::common; 27 | 28 | template 29 | class MXTensor : public Tensor { 30 | public: 31 | MXTensor(T* tensor); 32 | virtual const DataType dtype() const override; 33 | virtual const TensorShape shape() const override; 34 | virtual const void* data() const override; 35 | virtual int64_t size() const override; 36 | 37 | protected: 38 | T* tensor_; 39 | }; 40 | 41 | inline void ThrowIfError(const Status& status) { 42 | if (!status.ok()) { 43 | throw dmlc::Error(status.reason()); 44 | } 45 | } 46 | 47 | } // namespace mxnet 48 | } // namespace byteps 49 | 50 | #endif // BYTEPS_MXNET_ADAPTER_H 51 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: bionic 2 | language: python 3 | cache: pip 4 | jobs: 5 | include: 6 | - python: 2.7 7 | - python: 3.7 8 | env: 9 | - CUDA=10.1.105-1 CUDA_APT=10-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 10 | before_install: 11 | - CUDA_REPO=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb 12 | - NCCL_REPO=nvidia-machine-learning-repo-${UBUNTU_VERSION}_1.0.0-1_amd64.deb 13 | - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${CUDA_REPO} 14 | - sudo dpkg -i ${CUDA_REPO} 15 | - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub 16 | - wget http://developer.download.nvidia.com/compute/machine-learning/repos/${UBUNTU_VERSION}/x86_64/${NCCL_REPO} 17 | - sudo dpkg -i ${NCCL_REPO} 18 | - sudo apt update -qq 19 | - sudo apt install -y cuda-10-1 libnccl2 libnccl-dev libnuma-dev 20 | - sudo apt clean 21 | - export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} 22 | - export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 23 | - export PATH=${CUDA_HOME}/bin:${PATH} 24 | - pip install future mxnet-cu101 tensorflow-gpu torch torchvision 25 | install: 26 | - export BYTEPS_CUDA_HOME=${CUDA_HOME} 27 | - python setup.py install 28 | - cd 3rdparty/ps-lite && make -j && cd - 29 | script: 30 | - export DMLC_NODE_HOST=127.0.0.1 31 | - export PORT=8000 32 | - 3rdparty/ps-lite/tests/local.sh 1 1 3rdparty/ps-lite/tests/test_benchmark 1024000 10 0 33 | - export PORT=8001 34 | - 3rdparty/ps-lite/tests/local.sh 2 2 3rdparty/ps-lite/tests/test_benchmark 1024000 10 0 35 | - export PORT=8002 36 | - 3rdparty/ps-lite/tests/local.sh 4 4 3rdparty/ps-lite/tests/test_benchmark 1024000 10 0 37 | -------------------------------------------------------------------------------- 
/example/mxnet/data/imagenet1k-val.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | # This file download the imagnet-1k validation dataset and convert it into a rec 22 | # file. One need to provide the URL for the ILSVRC2012_img_val.tar, which can be 23 | # find at http://www.image-net.org/download-images 24 | # 25 | # Example usage (replace the URL with the correct one): 26 | # ./imagenet1k-val.sh http://xxxxxx/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar 27 | 28 | if [ ! -e ILSVRC2012_img_val.tar ]; then 29 | wget $1 30 | fi 31 | mkdir -p val 32 | tar -xf ILSVRC2012_img_val.tar -C val 33 | wget http://data.mxnet.io/models/imagenet/resnet/val.lst -O imagenet1k-val.lst 34 | 35 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 36 | MX_DIR=${CUR_DIR}/../../../ 37 | 38 | python ${CUR_DIR}/../../../tools/im2rec.py --resize 256 --quality 90 --num-thread 16 imagenet1k-val val/ 39 | 40 | rm -rf val 41 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import mxnet.ndarray as nd 3 | import numpy as np 4 | from numba import jit 5 | 6 | 7 | def fake_data(dtype="float32", batch_size=32, height=224, width=224, depth=3, num_classes=1000): 8 | image_list = [] 9 | label_list = [] 10 | for _ in range(8): 11 | image = mx.ndarray.random.normal(-1, 1, 12 | shape=[1, depth, height, width], 13 | dtype=dtype) 14 | label = mx.ndarray.random.randint(0, num_classes, [1, 1]) 15 | 16 | images = mx.ndarray.repeat(image, 128, axis=0) 17 | labels = mx.ndarray.repeat(label, 128, axis=0) 18 | # print(labels) 19 | image_list.append(images) 20 | label_list.append(labels) 21 | 22 | images = nd.concat(*image_list, dim=0) 23 | labels = nd.concat(*label_list, dim=0) 24 | # print(labels) 25 | fake_dataset = mx.gluon.data.ArrayDataset(images, labels) 26 | 27 | return mx.gluon.data.DataLoader(fake_dataset, batch_size=batch_size, num_workers=4, 28 | shuffle=True, last_batch='discard') 29 | 30 | 31 | @jit(nopython=True) 32 | def xorshift128p(state): 33 | t = state[0] 34 | s = state[1] 35 | state[0] = s 36 | t ^= t << np.uint64(23) 37 | t ^= t >> np.uint64(17) 38 | t ^= s ^ (s >> np.uint64(26)) 39 | state[1] = t 40 | return int(t + s) 41 | 42 | 43 | @jit(nopython=True) 44 | def bernoulli(p, state): 45 | t = p * np.iinfo(np.uint64).max 46 | r = np.array([xorshift128p(state) for _ in range(len(p))], dtype=np.float32) 47 | return r < t 48 | 49 | 50 | @jit(nopython=True) 51 | def randint(low, high, state): 52 | return xorshift128p(state) % (high - low) + low 
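
# --- Usage sketch (added for illustration; not part of the original test
# utilities). xorshift128p is deterministic for a given two-word seed state,
# which is what lets the compression tests reproduce the same random draws
# across runs:
if __name__ == "__main__":
    state = np.array([1, 2], dtype=np.uint64)  # any nonzero 128-bit seed
    probs = np.full(8, 0.5, dtype=np.float32)  # keep each element w.p. 0.5
    mask = bernoulli(probs, state)             # deterministic boolean mask
    k = randint(0, 8, state)                   # deterministic int in [0, 8)
    print(mask, k)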
53 | -------------------------------------------------------------------------------- /byteps/common/ready_table.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "ready_table.h" 17 | 18 | #include "logging.h" 19 | 20 | namespace byteps { 21 | namespace common { 22 | 23 | // below are methods for accessing/modifying the _ready_table 24 | bool ReadyTable::IsKeyReady(uint64_t key) { 25 | std::lock_guard<std::mutex> lock(_table_mutex); 26 | return _ready_table[key] == (_ready_count); 27 | } 28 | 29 | int ReadyTable::AddReadyCount(uint64_t key) { 30 | std::lock_guard<std::mutex> lock(_table_mutex); 31 | BPS_CHECK_LT(_ready_table[key], _ready_count) 32 | << _table_name << ": " << _ready_table[key] << ", " << (_ready_count); 33 | return ++_ready_table[key]; 34 | } 35 | 36 | int ReadyTable::SetReadyCount(uint64_t key, int cnt) { 37 | std::lock_guard<std::mutex> lock(_table_mutex); 38 | _ready_table[key] = cnt; 39 | return cnt;  // return the count just set 40 | } 41 | 42 | void ReadyTable::ClearReadyCount(uint64_t key) { 43 | std::lock_guard<std::mutex> lock(_table_mutex); 44 | _ready_table[key] = 0; 45 | } 46 | 47 | } // namespace common 48 | } // namespace byteps 49 | -------------------------------------------------------------------------------- /byteps/common/ready_table.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_READY_TABLE_H 17 | #define BYTEPS_READY_TABLE_H 18 | 19 | #include <mutex> 20 | #include <string> 21 | #include <unordered_map> 22 | 23 | namespace byteps { 24 | namespace common { 25 | 26 | class ReadyTable { 27 | public: 28 | ReadyTable(int ready_count, const char* name) { 29 | _ready_count = ready_count; 30 | _table_name = std::string(name); 31 | } 32 | // methods to access or modify the _ready_table 33 | bool IsKeyReady(uint64_t key); 34 | int AddReadyCount(uint64_t key); 35 | int SetReadyCount(uint64_t key, int cnt); 36 | void ClearReadyCount(uint64_t key); 37 | 38 | private: 39 | // (key, ready_signal_count) pair, only valid for root device 40 | std::unordered_map<uint64_t, int> _ready_table; 41 | // use this mutex to access/modify the _ready_table 42 | std::mutex _table_mutex; 43 | int _ready_count; 44 | std::string _table_name; 45 | }; 46 | 47 | } // namespace common 48 | } // namespace byteps 49 | 50 | #endif // BYTEPS_READY_TABLE_H 51 | -------------------------------------------------------------------------------- /byteps/mxnet/cuda_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #if HAVE_CUDA 18 | #include 19 | #include "cuda_runtime.h" 20 | #endif 21 | 22 | #include "../common/common.h" 23 | #include "cuda_util.h" 24 | #include "util.h" 25 | 26 | namespace byteps { 27 | namespace mxnet { 28 | 29 | with_device::with_device(int device) { 30 | if (device == CPU_DEVICE_ID) { 31 | restore_device_ = CPU_DEVICE_ID; 32 | } else { 33 | #if HAVE_CUDA 34 | CUDA_CALL(cudaGetDevice(&restore_device_)); 35 | CUDA_CALL(cudaSetDevice(device)); 36 | #else 37 | throw std::logic_error( 38 | "Internal error. Requested device context manager " 39 | "with GPU device but not compiled with CUDA."); 40 | #endif 41 | } 42 | } 43 | 44 | with_device::~with_device() { 45 | #if HAVE_CUDA 46 | if (restore_device_ != CPU_DEVICE_ID) { 47 | CUDA_CALL(cudaSetDevice(restore_device_)); 48 | } 49 | #endif 50 | } 51 | 52 | } // namespace mxnet 53 | } // namespace byteps 54 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | BytePS includes derived work from the following: 2 | 3 | Horovod 4 | Copyright 2018 Uber Technologies, Inc. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | kennethreitz/setup.py 19 | Copyright 2019 Kenneth Reitz 20 | 21 | Permission is hereby granted, free of charge, to any person obtaining 22 | a copy of this software and associated documentation files (the 23 | "Software"), to deal in the Software without restriction, including 24 | without limitation the rights to use, copy, modify, merge, publish, 25 | distribute, sublicense, and/or sell copies of the Software, and to 26 | permit persons to whom the Software is furnished to do so, subject to 27 | the following conditions: 28 | 29 | The above copyright notice and this permission notice shall be included 30 | in all copies or substantial portions of the Software. 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 33 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 34 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 35 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 36 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 37 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 38 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 39 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-devel-ubuntu18.04 2 | 3 | ARG https_proxy 4 | ARG http_proxy 5 | 6 | ARG BYTEPS_BASE_PATH=/usr/local 7 | ARG BYTEPS_PATH=$BYTEPS_BASE_PATH/byteps 8 | ARG BYTEPS_GIT_LINK=https://github.com/bytedance/byteps 9 | ARG BYTEPS_BRANCH=master 10 | 11 | ARG DEBIAN_FRONTEND=noninteractive 12 | RUN apt-get update 13 | RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 14 | build-essential \ 15 | tzdata \ 16 | ca-certificates \ 17 | git \ 18 | curl \ 19 | wget \ 20 | vim \ 21 | cmake \ 22 | lsb-release \ 23 | libcudnn7=7.6.0.64-1+cuda10.0 \ 24 | libnuma-dev \ 25 | ibverbs-providers \ 26 | librdmacm-dev \ 27 | ibverbs-utils \ 28 | rdmacm-utils \ 29 | libibverbs-dev \ 30 | python3 \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | libnccl2=2.4.7-1+cuda10.0 \ 35 | libnccl-dev=2.4.7-1+cuda10.0 36 | 37 | # install framework 38 | # note: for tf <= 1.14, you need gcc-4.9 39 | ARG FRAMEWORK=tensorflow 40 | RUN if [ "$FRAMEWORK" = "tensorflow" ]; then \ 41 | pip3 install --upgrade pip; \ 42 | pip3 install -U tensorflow-gpu==1.15.0; \ 43 | elif [ "$FRAMEWORK" = "pytorch" ]; then \ 44 | pip3 install -U numpy==1.18.1 torchvision==0.5.0 torch==1.4.0; \ 45 | elif [ "$FRAMEWORK" = "mxnet" ]; then \ 46 | pip3 install -U mxnet-cu100==1.5.0; \ 47 | else \ 48 | echo "unknown framework: $FRAMEWORK"; \ 49 | exit 1; \ 50 | fi 51 | 52 | ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH 53 | 54 | RUN cd $BYTEPS_BASE_PATH &&\ 55 | git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK &&\ 56 | cd $BYTEPS_PATH &&\ 57 | python3 setup.py install 58 | -------------------------------------------------------------------------------- /byteps/mxnet/ops.h: 
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #ifndef BYTEPS_MXNET_OPS_H
18 | #define BYTEPS_MXNET_OPS_H
19 | 
20 | #include <mxnet/base.h>
21 | #include <mxnet/c_api.h>
22 | #include <mxnet/c_api_error.h>
23 | #include <mxnet/engine.h>
24 | #include <mxnet/ndarray.h>
25 | #include "../common/common.h"
26 | 
27 | namespace byteps {
28 | namespace mxnet {
29 | 
30 | using namespace byteps::common;
31 | 
32 | typedef ::mxnet::Engine Engine;
33 | typedef ::mxnet::NDArray NDArray;
34 | typedef ::mxnet::Engine::CallbackOnComplete Callback;
35 | 
36 | extern "C" int byteps_mxnet_push_pull_async(NDArray* input, char* name,
37 |                                             int version, int priority,
38 |                                             bool is_average);
39 | 
40 | extern "C" void byteps_mxnet_declare_tensor(char* name, int num_args,
41 |                                             char** args_keys,
42 |                                             char** args_vals);
43 | 
44 | }  // namespace mxnet
45 | }  // namespace byteps
46 | 
47 | #endif  // BYTEPS_MXNET_OPS_H
48 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/nesterov_momentum.h:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H
17 | #define BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H
18 | 
19 | #include "../momentum.h"
20 | 
21 | namespace byteps {
22 | namespace common {
23 | namespace compressor {
24 | 
25 | /*!
26 |  * \brief Nesterov Momentum Compressor
27 |  *
28 |  * paper: A method for solving the convex programming problem with convergence
29 |  * rate $O(1/k^2)$
30 |  *
31 |  * m_t <- \mu m_{t-1} + g_t
32 |  * g_t <- \mu m_t + g_t
33 |  *
34 |  */
35 | class NesterovMomentumCompressor : public Momentum {
36 |  public:
37 |   NesterovMomentumCompressor(size_t size, DataType dtype,
38 |                              std::unique_ptr<Compressor> cptr, float mu)
39 |       : Momentum(size, dtype, std::move(cptr), mu) {}
40 |   virtual ~NesterovMomentumCompressor() = default;
41 | 
42 |  protected:
43 |   void UpdateMom(tensor_t grad) override;
44 |   void UpdateGradient(tensor_t grad) override;
45 | };
46 | 
47 | }  // namespace compressor
48 | }  // namespace common
49 | }  // namespace byteps
50 | 
51 | #endif  // BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H
--------------------------------------------------------------------------------
/byteps/common/scheduled_queue.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_SCHEDULED_QUEUE_H
17 | #define BYTEPS_SCHEDULED_QUEUE_H
18 | 
19 | #include <atomic>
20 | #include <memory>
21 | #include <mutex>
22 | #include <vector>
23 | #include "common.h"
24 | #include "ready_table.h"
25 | 
26 | namespace byteps {
27 | namespace common {
28 | 
29 | class BytePSScheduledQueue {
30 |  public:
31 |   BytePSScheduledQueue(QueueType type);
32 |   QueueType getQueueType() { return _qt; }
33 |   void addTask(std::shared_ptr<TensorTableEntry>);
34 |   void recorderTs(std::shared_ptr<TensorTableEntry>);
35 |   std::shared_ptr<TensorTableEntry> getTask();
36 |   std::shared_ptr<TensorTableEntry> getTask(uint64_t key);
37 |   uint32_t pendingSize();
38 |   void reportFinish(int size);
39 |   void reset(uint64_t key, int cnt);
40 | 
41 |  private:
42 |   // TODO: use priority queue or heap
43 |   std::vector<std::shared_ptr<TensorTableEntry>> _sq;
44 |   std::mutex _mutex;
45 |   uint64_t _credits;
46 |   bool _is_scheduled;
47 |   QueueType _qt;
48 |   ReadyTable *_rt;
49 | };
50 | 
51 | }  // namespace common
52 | }  // namespace byteps
53 | 
54 | #endif  // BYTEPS_SCHEDULED_QUEUE_H
55 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/compressor_registry.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H
17 | #define BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H
18 | 
19 | #include "compressor.h"
20 | #include "utils.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | namespace compressor {
25 | 
26 | class CompressorRegistry {
27 |  public:
28 |   // constructor of compressor
29 |   using ctor_t = std::function<std::unique_ptr<Compressor>(
30 |       const kwargs_t& kwargs, size_t size, DataType dtype)>;
31 | 
32 |   using map_t = std::unordered_map<std::string, ctor_t>;
33 | 
34 |   struct Register {
35 |     Register(std::string name, ctor_t ctor);
36 |   };
37 | 
38 |   static ctor_t Find(const std::string& name);
39 | 
40 |   static std::unique_ptr<Compressor> Create(const kwargs_t& kwargs, size_t size,
41 |                                             DataType dtype);
42 | 
43 |  private:
44 |   static map_t _ctor_map;
45 | 
46 |   CompressorRegistry() = delete;
47 |   ~CompressorRegistry() = delete;
48 | };
49 | 
50 | }  // namespace compressor
51 | }  // namespace common
52 | }  // namespace byteps
53 | 
54 | #endif  // BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H
--------------------------------------------------------------------------------
/docs/performance.md:
--------------------------------------------------------------------------------
1 | # BytePS Performance when training CNN
2 | 
3 | ## NVLink + TCP
4 | 
5 | We test two models: VGG16 (communication-intensive) and Resnet50 (computation-intensive) on a popular public cloud. Both models are trained using fp32.
6 | 
7 | We use Tesla V100 16GB GPUs and set the batch size to 64 *per GPU*. The machines are VMs on the cloud. Each machine has 8 V100 GPUs with NVLink enabled. Machines are inter-connected by a 20 Gbps TCP/IP network.
8 | 
9 | BytePS outperforms Horovod (NCCL) by 44% for Resnet50, and 100% for VGG16.
10 | 
11 | ![vgg16_tcp](https://user-images.githubusercontent.com/13852819/69873424-41e37500-12f3-11ea-93b8-705215e3e901.png)
12 | ![resnet50_tcp](https://user-images.githubusercontent.com/13852819/69873419-40b24800-12f3-11ea-9ff3-0f11347c089e.png)
13 | 
14 | You can reproduce the results using the Dockerfiles and example scripts we provide; see the build sketch at the end of this page.
15 | 
16 | ## PCIe + RDMA
17 | 
18 | Note: here we present the *worst case scenario* for BytePS, i.e., 100Gbps RDMA + no NVLinks.
19 | 
20 | We get the results below on machines that are based on a PCIe-switch architecture -- 4 GPUs under one PCIe switch, and each machine contains two PCIe switches.
21 | The machines are inter-connected by 100 Gbps RoCEv2 networks.
22 | In this case, BytePS outperforms Horovod (NCCL) by 7% for Resnet50, and 17% for VGG16.
23 | 
24 | ![perf_rdma_pcie_resnet50](https://user-images.githubusercontent.com/13852819/68925125-57b64d80-07bd-11ea-9f72-d108cf4294ad.png)
25 | 
26 | ![perf_rdma_pcie_vgg16](https://user-images.githubusercontent.com/13852819/68925175-70befe80-07bd-11ea-98d6-ca7df3670bbd.png)
27 | 
28 | 
29 | The margin over NCCL is this small only when you have a 100Gbps RDMA network *and* no NVLinks. In this setup, communication is actually bottlenecked by the internal PCIe switches, not the network. BytePS includes optimizations for this topology, which is why it still outperforms NCCL; the gain is simply not as large as in the cases where the network is the bottleneck.
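
To reproduce these numbers, the worker images can be built from the repository's `docker/Dockerfile`. Below is a minimal sketch; the image tag is illustrative, and `FRAMEWORK` is the build argument defined in that Dockerfile (tensorflow / pytorch / mxnet):

```
# Build a CUDA 10.0 worker image with the TensorFlow framework installed.
docker build -t byteps/worker:tensorflow \
    --build-arg FRAMEWORK=tensorflow \
    -f docker/Dockerfile .
```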
30 | 
--------------------------------------------------------------------------------
/docs/cross-barrier.md:
--------------------------------------------------------------------------------
1 | # Cross Global Barrier
2 | 
3 | BytePS eliminates the global barrier between training iterations for distributed training frameworks (e.g.,
4 | PyTorch), so that the priority-based communication scheduling in BytePS can be effective.
5 | 
6 | ## Why Cross the Barrier?
7 | 
8 | Existing distributed training frameworks (PyTorch, TensorFlow, etc.) do not fully exploit the potential of overlapping
9 | computation and communication to speed up neural network training: they only overlap communication with
10 | backward propagation. But due to the layer-wise dependencies in DNN training, we can actually schedule the gradient
11 | synchronization order based on when each gradient is consumed in the next iteration, and hence overlap communication with
12 | the forward propagation of the next iteration! Read the paper https://dl.acm.org/citation.cfm?id=3359642 for more
13 | details on communication scheduling.
14 | 
15 | To make this idea work, the first step is to remove the global barrier between two iterations and build layer-wise
16 | dependencies, so that the forward computation of the next step can start without waiting for synchronization
17 | to complete for all parameters.
18 | 
19 | Fig. 1 shows the dependency graph with the global barrier. Machine learning frameworks such as PyTorch and TensorFlow have
20 | similar dependencies when using BytePS for push and pull.
21 | 
22 | ![dag_barrier](https://user-images.githubusercontent.com/13852819/69863244-4b5ee400-12d7-11ea-9356-2dd41dff95ab.png)
23 | 
24 | *Fig.1: Dependency Graph With Global Barrier*
25 | 
26 | Fig. 2 shows the dependency graph after removing the global barrier. What we do here is change the dependency
27 | graph from Fig. 1 to Fig. 2 by removing the barrier and building layer-wise dependencies, while guaranteeing computation correctness.
28 | 
29 | 
30 | ![dag_without_barrier](https://user-images.githubusercontent.com/13852819/69863268-5d408700-12d7-11ea-8b39-5e48e3d94c2b.png)
31 | *Fig.2: Dependency Graph After Removing Global Barrier*
32 | 
33 | 
34 | 
35 | 
--------------------------------------------------------------------------------
/docs/troubleshooting.md:
--------------------------------------------------------------------------------
1 | # Troubleshooting
2 | 
3 | We suggest you read the Horovod troubleshooting guide, especially for problems during the build process. BytePS has almost the same dependencies as Horovod, minus MPI.
4 | 
5 | https://github.com/horovod/horovod/blob/v0.16.4/docs/troubleshooting.rst
6 | 
7 | ## Network connectivity
8 | 
9 | When launching distributed jobs, if the job hangs at the beginning, one possible reason is a network connectivity problem. You can use the `ps-lite` benchmark to verify connectivity.
10 | 
11 | Install ps-lite:
12 | 
13 | ```
14 | git clone -b byteps https://github.com/bytedance/ps-lite.git
15 | cd ps-lite
16 | make -j
17 | ```
18 | 
19 | 
20 | For the scheduler:
21 | ```
22 | export DMLC_ROLE=scheduler
23 | export DMLC_NUM_WORKER=1
24 | export DMLC_NUM_SERVER=1
25 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP]
26 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT]
27 | export DMLC_INTERFACE=eth0
28 | ./ps-lite/tests/test_benchmark
29 | ```
30 | 
31 | For the server:
32 | ```
33 | export DMLC_ROLE=server
34 | export DMLC_NUM_WORKER=1
35 | export DMLC_NUM_SERVER=1
36 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP]
37 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT]
38 | export DMLC_INTERFACE=eth0
39 | ./ps-lite/tests/test_benchmark
40 | ```
41 | 
42 | For the worker:
43 | ```
44 | export DMLC_ROLE=worker
45 | export DMLC_NUM_WORKER=1
46 | export DMLC_NUM_SERVER=1
47 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP]
48 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT]
49 | export DMLC_INTERFACE=eth0
50 | ./ps-lite/tests/test_benchmark 1024000 100 0
51 | ```
52 | 
53 | If it succeeds, you should be able to see something like this on the worker:
54 | ```
55 | push_byte=4096000, repeat=100, total_time=128.842ms
56 | pull_byte=4096000, repeat=100, total_time=353.38ms
57 | ```
58 | 
59 | (Note: for RDMA networks, use `make -j USE_RDMA=1` to build, and `export DMLC_ENABLE_RDMA=1` when running the scheduler / server / worker)
60 | 
61 | If it still hangs, you may need to check your network connectivity.
62 | 
--------------------------------------------------------------------------------
/docs/running.md:
--------------------------------------------------------------------------------
1 | # Running BytePS
2 | 
3 | BytePS follows the same running model as MXNet's PS implementation, and provides a script, launcher/launcher.py, to help you start individual processes. **The instructions below, including the DMLC variables, apply to all frameworks.**
4 | 
5 | Let's say you have two worker machines (or docker containers) that have GPUs, plus one machine or container as a server, and one as a scheduler. The scheduler binds to 10.0.0.1 on port 9000. The workers and the server can connect to the scheduler via that IP and port using TCP.
6 | 
7 | To use launcher/launcher.py, NVIDIA_VISIBLE_DEVICES must be set -- either automatically by nvidia-docker, or manually by you.
8 | 
9 | On worker 0, run:
10 | 
11 | ```
12 | DMLC_ROLE=worker DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \
13 | DMLC_WORKER_ID=0 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \
14 | bpslaunch YOUR_COMMAND
15 | ```
16 | 
17 | On worker 1, run (only DMLC_WORKER_ID is different from above):
18 | 
19 | ```
20 | DMLC_ROLE=worker DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \
21 | DMLC_WORKER_ID=1 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \
22 | bpslaunch YOUR_COMMAND
23 | ```
24 | 
25 | **For servers and schedulers, we highly recommend you use the docker image we build:**
26 | 
27 | ```
28 | docker pull bytepsimage/byteps_server
29 | ```
30 | 
31 | Start server and scheduler docker instances with this image. On the server, run the following. Compared with the worker command, we remove DMLC_WORKER_ID and set the role to server.
32 | 33 | ``` 34 | DMLC_ROLE=server DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 35 | DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 bpslaunch 36 | ``` 37 | 38 | On the scheduler, run (we also remove DMLC_WORKER_ID, and set role to scheduler): 39 | 40 | ``` 41 | DMLC_ROLE=scheduler DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 42 | DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 bpslaunch 43 | ``` 44 | 45 | In this example, your scheduler must be able to bind to `10.0.0.1:9000`. 46 | 47 | The order of starting workers/servers/scheduler does not matter. 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | .vscode 3 | *.gz 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | bin/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | # pycharm 112 | .idea 113 | 114 | # mac 115 | .DS_Store 116 | 117 | # for development 118 | scripts/ 119 | exps/ 120 | 121 | # dependency tarballs 122 | ucx.tar.gz 123 | zeromq-4.1.4.tar.gz 124 | -------------------------------------------------------------------------------- /byteps/tensorflow/ops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_TENSORFLOW_OPS_H
17 | #define BYTEPS_TENSORFLOW_OPS_H
18 | 
19 | #include <memory>
20 | 
21 | #include "tensorflow/core/framework/op.h"
22 | #include "tensorflow/core/framework/op_kernel.h"
23 | #include "tensorflow/core/framework/shape_inference.h"
24 | 
25 | #define EIGEN_USE_THREADS
26 | #include "tensorflow/stream_executor/stream.h"
27 | 
28 | #include "../common/operations.h"
29 | 
30 | namespace byteps {
31 | namespace tensorflow {
32 | 
33 | class TFReadyEvent : public common::ReadyEvent {
34 |  public:
35 |   TFReadyEvent(::tensorflow::DeviceContext* device_context);
36 |   bool Ready() const override;
37 | 
38 |  private:
39 |   std::shared_ptr<perftools::gputools::Event> event_;
40 | };
41 | 
42 | class TFTensor : public common::Tensor {
43 |  public:
44 |   TFTensor(::tensorflow::Tensor& tensor);
45 |   virtual const common::DataType dtype() const override;
46 |   virtual const common::TensorShape shape() const override;
47 |   virtual const void* data() const override;
48 |   virtual int64_t size() const override;
49 | 
50 |  protected:
51 |   ::tensorflow::Tensor tensor_;
52 | };
53 | 
54 | extern "C" void byteps_tensorflow_declare_tensor(char* name);
55 | 
56 | }  // namespace tensorflow
57 | }  // namespace byteps
58 | 
59 | #endif  // BYTEPS_TENSORFLOW_OPS_H
60 | 
--------------------------------------------------------------------------------
/byteps/mxnet/adapter.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #if HAVE_CUDA
18 | #include "cuda.h"
19 | #endif
20 | 
21 | #include "adapter.h"
22 | #include "cuda_util.h"
23 | #include "tensor_util.h"
24 | 
25 | namespace byteps {
26 | namespace mxnet {
27 | 
28 | 
29 | template <class T>
30 | MXTensor<T>::MXTensor(T* tensor) : tensor_(tensor) {}
31 | 
32 | template <class T>
33 | const DataType MXTensor<T>::dtype() const {
34 |   return TensorUtil::GetDType(tensor_);
35 | }
36 | 
37 | template <class T>
38 | const TensorShape MXTensor<T>::shape() const {
39 |   auto shape = TensorUtil::GetShape(tensor_);
40 |   if (shape.dims() == 0) {
41 |     // Tensor with empty shape is a Tensor with no values in MXNet, unlike a
42 |     // constant in TensorFlow. So, we inject a dummy zero dimension to make sure
43 |     // that the number-of-elements calculation is correct.
44 |     shape.AddDim(0);
45 |   }
46 |   return shape;
47 | }
48 | 
49 | template <class T>
50 | const void* MXTensor<T>::data() const {
51 |   return TensorUtil::GetData(tensor_);
52 | }
53 | 
54 | template <class T>
55 | int64_t MXTensor<T>::size() const {
56 |   return TensorUtil::GetSize(tensor_);
57 | }
58 | 
59 | template class MXTensor<NDArray>;
60 | 
61 | }  // namespace mxnet
62 | }  // namespace byteps
63 | 
--------------------------------------------------------------------------------
/byteps/common/shared_memory.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_SHARED_MEMORY_H
17 | #define BYTEPS_SHARED_MEMORY_H
18 | 
19 | #include <cuda_runtime.h>
20 | #include <fcntl.h>
21 | #include <sys/mman.h>
22 | #include <sys/stat.h>
23 | #include <sys/types.h>
24 | #include <unistd.h>
25 | #include <mutex>
26 | #include <string>
27 | #include <unordered_map>
28 | #include <vector>
29 | #include "logging.h"
30 | 
31 | namespace byteps {
32 | namespace common {
33 | 
34 | class BytePSSharedMemory {
35 |  public:
36 |   BytePSSharedMemory() {}
37 | 
38 |   ~BytePSSharedMemory() {
39 |     for (auto &it : _key_shm_addr) {
40 |       CUDA_CALL(cudaHostUnregister(it.second));
41 |       munmap(it.second, _key_shm_size[it.first]);
42 |       shm_unlink(it.first.c_str());
43 |     }
44 | 
45 |     BPS_LOG(DEBUG) << "Clear shared memory: all BytePS shared memory "
46 |                       "released/unregistered.";
47 |   }
48 | 
49 |   void *openSharedMemory(const std::string &prefix, uint64_t key, size_t size);
50 |   std::vector<void *> openPcieSharedMemory(uint64_t key, size_t size);
51 | 
52 |  private:
53 |   std::unordered_map<std::string, void *> _key_shm_addr;
54 |   std::unordered_map<std::string, size_t> _key_shm_size;
55 | 
56 |   std::mutex _shm_mu;
57 | };
58 | 
59 | }  // namespace common
60 | }  // namespace byteps
61 | 
62 | #endif  // BYTEPS_SHARED_MEMORY_H
63 | 
--------------------------------------------------------------------------------
/byteps/common/thread_pool.h:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copied from https://github.com/progschj/ThreadPool/blob/master/ThreadPool.h
3 |  */
4 | #ifndef THREAD_POOL_H
5 | #define THREAD_POOL_H
6 | 
7 | #include <vector>
8 | #include <queue>
9 | #include <memory>
10 | #include <thread>
11 | #include <mutex>
12 | #include <condition_variable>
13 | #include <future>
14 | #include <functional>
15 | #include <stdexcept>
16 | 
17 | class ThreadPool {
18 |  public:
19 |   ThreadPool(size_t);
20 |   template <class F>
21 |   void enqueue(F&& f);
22 |   ~ThreadPool();
23 | 
24 |  private:
25 |   // need to keep track of threads so we can join them
26 |   std::vector<std::thread> workers;
27 |   // the task queue
28 |   std::queue<std::function<void()> > tasks;
29 | 
30 |   // synchronization
31 |   std::mutex queue_mutex;
32 |   std::condition_variable condition;
33 |   bool stop;
34 | };
35 | 
36 | // the constructor just launches some amount of workers
37 | inline ThreadPool::ThreadPool(size_t threads) : stop(false) {
38 |   for (size_t i = 0; i < threads; ++i)
39 |     workers.emplace_back([this] {
40 |       for (;;) {
41 |         std::function<void()> task;
42 | 
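        // Block here until a task is available or the pool is shutting down.
        // The braces below scope the lock, so it is released before the task
        // actually runs.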
43 |         {
44 |           std::unique_lock<std::mutex> lock(this->queue_mutex);
45 |           this->condition.wait(
46 |               lock, [this] { return this->stop || !this->tasks.empty(); });
47 |           if (this->stop && this->tasks.empty()) return;
48 |           task = std::move(this->tasks.front());
49 |           this->tasks.pop();
50 |         }
51 | 
52 |         task();
53 |       }
54 |     });
55 | }
56 | 
57 | // add new work item to the pool
58 | template <class F>
59 | void ThreadPool::enqueue(F&& f) {
60 |   {
61 |     std::lock_guard<std::mutex> lock(queue_mutex);
62 |     if (stop) throw std::runtime_error("enqueue on stopped ThreadPool");
63 |     tasks.emplace(std::forward<F>(f));
64 |   }
65 |   condition.notify_one();
66 | }
67 | // the destructor joins all threads
68 | inline ThreadPool::~ThreadPool() {
69 |   {
70 |     std::unique_lock<std::mutex> lock(queue_mutex);
71 |     stop = true;
72 |   }
73 |   condition.notify_all();
74 |   for (std::thread& worker : workers) worker.join();
75 | }
76 | 
77 | #endif
78 | 
--------------------------------------------------------------------------------
/byteps/torch/handle_manager.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. All Rights Reserved.
2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #include "handle_manager.h"
18 | 
19 | namespace byteps {
20 | namespace torch {
21 | 
22 | int HandleManager::AllocateHandle() {
23 |   int handle = last_handle_.fetch_add(1) + 1;
24 |   std::lock_guard<std::mutex> guard(mutex_);
25 |   results_[handle] = nullptr;
26 |   return handle;
27 | }
28 | 
29 | void HandleManager::MarkDone(int handle, const Status& status) {
30 |   std::lock_guard<std::mutex> guard(mutex_);
31 |   results_[handle] = std::make_shared<Status>(status);
32 | }
33 | 
34 | bool HandleManager::PollHandle(int handle) {
35 |   std::lock_guard<std::mutex> guard(mutex_);
36 |   if (results_.find(handle) == results_.end()) {
37 |     throw std::invalid_argument("Handle " + std::to_string(handle) +
38 |                                 " was not created or has been cleared.");
39 |   }
40 |   return results_[handle] != nullptr;
41 | }
42 | 
43 | std::shared_ptr<Status> HandleManager::ReleaseHandle(int handle) {
44 |   std::lock_guard<std::mutex> guard(mutex_);
45 |   if (results_.find(handle) == results_.end()) {
46 |     throw std::invalid_argument("Handle " + std::to_string(handle) +
47 |                                 " was not created or has been cleared.");
48 |   }
49 |   auto status = results_[handle];
50 |   results_.erase(handle);
51 |   return status;
52 | }
53 | 
54 | }  // namespace torch
55 | }  // namespace byteps
56 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/nesterov_momentum.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #include "nesterov_momentum.h"
17 | #include "../compressor_registry.h"
18 | 
19 | namespace byteps {
20 | namespace common {
21 | namespace compressor {
22 | namespace {
23 | CompressorRegistry::Register reg(
24 |     "nesterov_momentum",
25 |     [](const kwargs_t& kwargs, size_t size,
26 |        DataType dtype) -> std::unique_ptr<Compressor> {
27 |       // register cptr
28 |       auto kwargs_clone = kwargs;
29 |       kwargs_clone.erase("momentum_type");
30 |       auto cptr = CompressorRegistry::Create(kwargs_clone, size, dtype);
31 |       BPS_CHECK_NE(cptr, nullptr);
32 |       // find \mu
33 |       auto mu = HyperParamFinder<float>(kwargs, "momentum_mu");
34 |       return std::unique_ptr<Compressor>(
35 |           new NesterovMomentumCompressor(size, dtype, std::move(cptr), mu));
36 |     });
37 | }
38 | 
39 | void NesterovMomentumCompressor::UpdateMom(tensor_t grad) {
40 |   // m_t = \mu * m_{t-1} + g_t
41 |   this->_cpu_reducer->sum(_mom.get(), grad.data, _mom.get(), grad.size,
42 |                           static_cast<DataType>(grad.dtype), _mu);
43 | }
44 | 
45 | void NesterovMomentumCompressor::UpdateGradient(tensor_t grad) {
46 |   // p_t = \mu m_t + g_t
47 |   this->_cpu_reducer->sum(grad.data, _mom.get(), grad.size,
48 |                           static_cast<DataType>(grad.dtype), _mu);
49 | }
50 | 
51 | }  // namespace compressor
52 | }  // namespace common
53 | }  // namespace byteps
--------------------------------------------------------------------------------
/example/mxnet/common/util.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements. See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership. The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied. See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | 
18 | import subprocess
19 | import os
20 | import errno
21 | 
22 | def download_file(url, local_fname=None, force_write=False):
23 |     # requests is not installed by default
24 |     import requests
25 |     if local_fname is None:
26 |         local_fname = url.split('/')[-1]
27 |     if not force_write and os.path.exists(local_fname):
28 |         return local_fname
29 | 
30 |     dir_name = os.path.dirname(local_fname)
31 | 
32 |     if dir_name != "":
33 |         if not os.path.exists(dir_name):
34 |             try: # try to create the directory if it doesn't exist
35 |                 os.makedirs(dir_name)
36 |             except OSError as exc:
37 |                 if exc.errno != errno.EEXIST:
38 |                     raise
39 | 
40 |     r = requests.get(url, stream=True)
41 |     assert r.status_code == 200, "failed to open %s" % url
42 |     with open(local_fname, 'wb') as f:
43 |         for chunk in r.iter_content(chunk_size=1024):
44 |             if chunk: # filter out keep-alive new chunks
45 |                 f.write(chunk)
46 |     return local_fname
47 | 
48 | def get_gpus():
49 |     """
50 |     return a list of GPUs
51 |     """
52 |     try:
53 |         re = subprocess.check_output(["nvidia-smi", "-L"], universal_newlines=True)
54 |     except OSError:
55 |         return []
56 |     return range(len([i for i in re.split('\n') if 'GPU' in i]))
57 | 
--------------------------------------------------------------------------------
/byteps/mxnet/tensor_util.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #ifndef BYTEPS_MXNET_TENSOR_UTIL_H
18 | #define BYTEPS_MXNET_TENSOR_UTIL_H
19 | 
20 | #include <mxnet/base.h>
21 | #include <mxnet/c_api.h>
22 | #include <mxnet/ndarray.h>
23 | #include "../common/common.h"
24 | #include "cuda_util.h"
25 | #include "util.h"
26 | 
27 | namespace byteps {
28 | namespace mxnet {
29 | 
30 | using namespace byteps::common;
31 | using namespace ::mxnet;
32 | 
33 | class TensorUtil {
34 |  public:
35 |   static const DataType GetDType(NDArray* tensor);
36 |   static const TensorShape GetShape(NDArray* tensor);
37 |   static const void* GetData(NDArray* tensor);
38 |   static int64_t GetSize(NDArray* tensor);
39 |   static int GetDevice(NDArray* tensor);
40 | 
41 |   static NDArray* New(int device, int dtype);
42 |   static void Free(NDArray* tensor);
43 |   static void Copy(NDArray* output, NDArray* tensor);
44 |   static void DivideTensorInPlace(NDArray* tensor, int value);
45 | 
46 | #if HAVE_CUDA
47 |   static void CopyCPUToCuda(NDArray* cpu, NDArray* cuda);
48 |   static void AsyncCopyCudaToCPU(NDArray* cuda, NDArray* cpu);
49 | #endif
50 | 
51 |  private:
52 |   static const size_t kFloat32Size = 4;
53 |   static const size_t kFloat64Size = 8;
54 |   static const size_t kFloat16Size = 2;
55 |   static const size_t kUInt8Size = 1;
56 |   static const size_t kInt32Size = 4;
57 |   static const size_t kInt8Size = 1;
58 |   static const size_t kInt64Size = 8;
59 | };
60 | 
61 | }  // namespace mxnet
62 | }  // namespace byteps
63 | 
64 | #endif  // BYTEPS_MXNET_TENSOR_UTIL_H
65 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/compressor_registry.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #include "compressor_registry.h"
17 | 
18 | namespace byteps {
19 | namespace common {
20 | namespace compressor {
21 | 
22 | CompressorRegistry::map_t CompressorRegistry::_ctor_map;
23 | 
24 | CompressorRegistry::Register::Register(std::string name, ctor_t ctor) {
25 |   BPS_CHECK_EQ(_ctor_map.count(name), 0)
26 |       << "Duplicate registration of compressor under name " << name;
27 |   _ctor_map.emplace(name + "_type", std::move(ctor));
28 |   BPS_LOG(INFO) << name << " compressor is registered";
29 | }
30 | 
31 | CompressorRegistry::ctor_t CompressorRegistry::Find(const std::string& name) {
32 |   auto it = _ctor_map.find(name);
33 |   if (it == _ctor_map.end()) {
34 |     BPS_LOG(FATAL) << "No compressor registered under name:" << name;
35 |   }
36 |   return it->second;
37 | }
38 | 
39 | std::unique_ptr<Compressor> CompressorRegistry::Create(const kwargs_t& kwargs,
40 |                                                        size_t size, DataType dtype) {
41 | #ifndef BYTEPS_BUILDING_SERVER
42 |   const std::string types[] = {"momentum_type", "ef_type", "compressor_type"};
43 | #else
44 |   // the server does not need momentum
45 |   const std::string types[] = {"ef_type", "compressor_type"};
46 | #endif
47 |   for (auto& type : types) {
48 |     auto iter = kwargs.find(type);
49 |     if (iter != kwargs.end()) {
50 |       auto ctor = CompressorRegistry::Find(iter->second + "_" + type);
51 |       return ctor(kwargs, size, dtype);
52 |     }
53 |   }
54 | 
55 |   return nullptr;
56 | }
57 | 
58 | }  // namespace compressor
59 | }  // namespace common
60 | }  // namespace byteps
--------------------------------------------------------------------------------
/byteps/torch/ops.h:
--------------------------------------------------------------------------------
1 | // Copyright 2018 ByteDance, Inc. All Rights Reserved.
2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
3 | //
4 | // Licensed under the Apache License, Version 2.0 (the "License");
5 | // you may not use this file except in compliance with the License.
6 | // You may obtain a copy of the License at
7 | //
8 | // http://www.apache.org/licenses/LICENSE-2.0
9 | //
10 | // Unless required by applicable law or agreed to in writing, software
11 | // distributed under the License is distributed on an "AS IS" BASIS,
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | // See the License for the specific language governing permissions and
14 | // limitations under the License.
15 | // =============================================================================
16 | 
17 | #ifndef BYTEPS_TORCH_OPS_H
18 | #define BYTEPS_TORCH_OPS_H
19 | 
20 | #include <TH/TH.h>
21 | 
22 | #if HAVE_CUDA
23 | #include <THC/THC.h>
24 | #endif
25 | 
26 | #include "../common/operations.h"
27 | 
28 | namespace byteps {
29 | namespace torch {
30 | 
31 | using namespace byteps::common;
32 | 
33 | std::mutex mutex_;
34 | /* total number of gradients to push-pull */
35 | size_t num_grads_;
36 | /* number of push-pulls that have been triggered */
37 | size_t grad_count_;
38 | 
39 | #define PUSHPULL_H(torch_Tensor, THTensor)                          \
40 |   extern "C" int byteps_torch_push_pull_async_##torch_Tensor(      \
41 |       THTensor* tensor, THTensor* output, int average, char* name, \
42 |       int version, int priority);
43 | 
44 | PUSHPULL_H(torch_ByteTensor, THByteTensor)
45 | PUSHPULL_H(torch_IntTensor, THIntTensor)
46 | PUSHPULL_H(torch_LongTensor, THLongTensor)
47 | PUSHPULL_H(torch_FloatTensor, THFloatTensor)
48 | PUSHPULL_H(torch_DoubleTensor, THDoubleTensor)
49 | 
50 | #if HAVE_CUDA
51 | PUSHPULL_H(torch_cuda_ByteTensor, THCudaByteTensor)
52 | PUSHPULL_H(torch_cuda_IntTensor, THCudaIntTensor)
53 | PUSHPULL_H(torch_cuda_LongTensor, THCudaLongTensor)
54 | PUSHPULL_H(torch_cuda_FloatTensor, THCudaTensor)
55 | PUSHPULL_H(torch_cuda_DoubleTensor, THCudaDoubleTensor)
56 | #endif
57 | 
58 | extern "C" int byteps_torch_poll(int handle);
59 | extern "C" void byteps_torch_wait_and_clear(int handle);
60 | extern "C" void byteps_torch_declare_tensor(char* name);
61 | 
62 | }  // namespace torch
63 | }  // namespace byteps
64 | 
65 | #endif  // BYTEPS_TORCH_OPS_H
66 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/vanilla_error_feedback.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H
17 | #define BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H
18 | 
19 | #include "../error_feedback.h"
20 | 
21 | namespace byteps {
22 | namespace common {
23 | namespace compressor {
24 | 
25 | /*!
26 |  * \brief Vanilla Error Feedback Compressor
27 |  *
28 |  * paper: Communication-efficient distributed blockwise momentum sgd with
29 |  * error-feedback
30 |  * https://arxiv.org/pdf/1905.10936.pdf
31 |  *
32 |  * each worker i:
33 |  *    p_{t,i} <- g_{t,i} + \frac{\eta_{t-1}}{\eta_t} e_{t,i}
34 |  *    c_{t,i} <- Compress(p_{t,i})
35 |  *    e_{t,i} <- p_{t,i} - c_{t,i}
36 |  *
37 |  * server:
38 |  *    \tilde{p}_{t} <- \frac{1}{M} \sum_{i=1}^{M} c_{t,i}
39 |  *                     + \frac{\eta_{t-1}}{\eta_{t}} \tilde{e}_{t}
40 |  *    \tilde{e}_{t+1} <- \tilde{p}_{t} - \tilde{c}_{t}
41 |  *
42 |  * Error-correction: the error needs to be scaled with \frac{\eta_{t-1}}{\eta_t}.
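 *
 * For example, with a constant learning rate the scale factor
 * \frac{\eta_{t-1}}{\eta_t} equals 1, and the worker update reduces to
 * plain error accumulation: p_{t,i} <- g_{t,i} + e_{t,i}.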
43 |  */
44 | class VanillaErrorFeedbackCompressor : public ErrorFeedback {
45 |  public:
46 |   VanillaErrorFeedbackCompressor(size_t size, DataType dtype,
47 |                                  std::unique_ptr<Compressor> cptr);
48 |   virtual ~VanillaErrorFeedbackCompressor();
49 | 
50 |  protected:
51 |   void UpdateGradient(tensor_t grad) override;
52 | 
53 |  private:
54 |   /*!
55 |    * \brief learning rate
56 |    *
57 |    * read from file each step
58 |    */
59 |   double _pre_lr, _cur_lr;
60 | 
61 |   int _fd;
62 |   void* _mm;
63 | };
64 | }  // namespace compressor
65 | }  // namespace common
66 | }  // namespace byteps
67 | 
68 | #endif  // BYTEPS_COMPRESSOR_IMPL_VANILLA_ERROR_FEEDBACK_H
--------------------------------------------------------------------------------
/example/mxnet/data/caltech256.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | 
20 | 
21 | # This file downloads the caltech 256 dataset
22 | # (http://www.vision.caltech.edu/Image_Datasets/Caltech256/), and splits it into
23 | # the train and val rec files.
24 | 
25 | # number of images per class for training
26 | IMG_TRAIN=60
27 | 
28 | # download
29 | if [ !
-e 256_ObjectCategories.tar ]; then
30 |     wget http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar
31 | fi
32 | 
33 | # split into train and val set
34 | tar -xf 256_ObjectCategories.tar
35 | TRAIN_DIR=caltech_256_train
36 | mkdir -p ${TRAIN_DIR}
37 | for i in 256_ObjectCategories/*; do
38 |     c=`basename $i`
39 |     echo "splitting $c"
40 |     mkdir -p ${TRAIN_DIR}/$c
41 |     for j in `ls $i/*.jpg | shuf | head -n ${IMG_TRAIN}`; do
42 |         mv $j ${TRAIN_DIR}/$c/
43 |     done
44 | done
45 | 
46 | # generate lst files
47 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
48 | MX_DIR=${CUR_DIR}/../../../
49 | python ${MX_DIR}/tools/im2rec.py --list --recursive caltech256-train ${TRAIN_DIR}/
50 | python ${MX_DIR}/tools/im2rec.py --list --recursive caltech256-val 256_ObjectCategories/
51 | mv caltech256-train_train.lst caltech256-train.lst
52 | rm caltech256-train_*
53 | mv caltech256-val_train.lst caltech256-val.lst
54 | rm caltech256-val_*
55 | 
56 | # generate rec files
57 | python ${MX_DIR}/tools/im2rec.py --resize 256 --quality 95 --num-thread 16 caltech256-val 256_ObjectCategories/
58 | python ${MX_DIR}/tools/im2rec.py --resize 256 --quality 95 --num-thread 16 caltech256-train ${TRAIN_DIR}/
59 | 
60 | # clean
61 | rm -rf ${TRAIN_DIR} 256_ObjectCategories/
62 | 
--------------------------------------------------------------------------------
/example/mxnet/train_imagenet_byteps.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | 
20 | import os
21 | import argparse
22 | import logging
23 | logging.basicConfig(level=logging.DEBUG)
24 | from common import find_mxnet
25 | from common import data_byteps as data
26 | from common import fit_byteps as fit
27 | from common.util import download_file
28 | import byteps.mxnet as bps
29 | import mxnet as mx
30 | 
31 | if __name__ == '__main__':
32 |     # init byteps
33 |     bps.init()
34 | 
35 |     # parse args
36 |     parser = argparse.ArgumentParser(description="train imagenet-1k",
37 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
38 |     fit.add_fit_args(parser)
39 |     data.add_data_args(parser)
40 |     data.add_data_aug_args(parser)
41 |     # use a large aug level
42 |     data.set_data_aug_level(parser, 3)
43 |     parser.set_defaults(
44 |         # network
45 |         network = 'resnet',
46 |         num_layers = 50,
47 |         # data
48 |         num_classes = 1000,
49 |         num_examples = 1281167,
50 |         image_shape = '3,224,224',
51 |         min_random_scale = 1, # if input image has min size k, we suggest using
52 |                               # 256.0/x, e.g.
0.533 for 480
53 |         # train
54 |         num_epochs = 80,
55 |         lr_step_epochs = '30,60',
56 |         dtype = 'float32'
57 |     )
58 |     args = parser.parse_args()
59 | 
60 |     # load network
61 |     from importlib import import_module
62 |     net = import_module('symbols.'+args.network)
63 |     sym = net.get_symbol(**vars(args))
64 | 
65 |     # train
66 |     fit.fit(args, sym, data.get_rec_iter)
67 | 
--------------------------------------------------------------------------------
/example/tensorflow/tensorflow2_mnist.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import byteps.tensorflow as bps
3 | 
4 | bps.init()
5 | 
6 | # BytePS: pin GPU to be used to process local rank (one GPU per process)
7 | gpus = tf.config.experimental.list_physical_devices('GPU')
8 | for gpu in gpus:
9 |     tf.config.experimental.set_memory_growth(gpu, True)
10 | if gpus:
11 |     tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU')
12 | 
13 | # Before launching, you need to first download the dataset to ~/.keras/datasets
14 | (mnist_images, mnist_labels), _ = \
15 |     tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % bps.rank())
16 | 
17 | dataset = tf.data.Dataset.from_tensor_slices(
18 |     (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
19 |      tf.cast(mnist_labels, tf.int64))
20 | )
21 | dataset = dataset.repeat().shuffle(10000).batch(128)
22 | 
23 | mnist_model = tf.keras.Sequential([
24 |     tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
25 |     tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
26 |     tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
27 |     tf.keras.layers.Dropout(0.25),
28 |     tf.keras.layers.Flatten(),
29 |     tf.keras.layers.Dense(128, activation='relu'),
30 |     tf.keras.layers.Dropout(0.5),
31 |     tf.keras.layers.Dense(10, activation='softmax')
32 | ])
33 | loss = tf.losses.SparseCategoricalCrossentropy()
34 | 
35 | opt = tf.optimizers.Adam(0.001 * bps.size())
36 | 
37 | checkpoint_dir = './checkpoints'
38 | checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)
39 | 
40 | 
41 | @tf.function
42 | def training_step(images, labels, first_batch):
43 |     with tf.GradientTape() as tape:
44 |         probs = mnist_model(images, training=True)
45 |         loss_value = loss(labels, probs)
46 | 
47 |     tape = bps.DistributedGradientTape(tape)
48 | 
49 |     grads = tape.gradient(loss_value, mnist_model.trainable_variables)
50 |     opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
51 | 
52 |     # Note: broadcast should be done after the first gradient step to ensure optimizer
53 |     # initialization.
54 |     if first_batch:
55 |         bps.broadcast_variables(mnist_model.variables, root_rank=0)
56 |         bps.broadcast_variables(opt.variables(), root_rank=0)
57 | 
58 |     return loss_value
59 | 
60 | 
61 | # BytePS: adjust number of steps based on number of GPUs.
62 | for batch, (images, labels) in enumerate(dataset.take(10000 // bps.size())):
63 |     loss_value = training_step(images, labels, batch == 0)
64 | 
65 |     if batch % 10 == 0 and bps.local_rank() == 0:
66 |         print('Step #%d\tLoss: %.6f' % (batch, loss_value))
67 | 
68 | if bps.rank() == 0:
69 |     checkpoint.save(checkpoint_dir)
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/vanilla_error_feedback.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #include <fcntl.h>
17 | #include <sys/mman.h>
18 | #include <sys/stat.h>
19 | #include <unistd.h>
20 | 
21 | #include "../compressor_registry.h"
22 | #include "vanilla_error_feedback.h"
23 | 
24 | namespace byteps {
25 | namespace common {
26 | namespace compressor {
27 | namespace {
28 | CompressorRegistry::Register reg(
29 |     "vanilla_ef",
30 |     [](const kwargs_t& kwargs, size_t size,
31 |        DataType dtype) -> std::unique_ptr<Compressor> {
32 |       // register cptr
33 |       auto kwargs_clone = kwargs;
34 |       kwargs_clone.erase("ef_type");
35 |       auto cptr = CompressorRegistry::Create(kwargs_clone, size, dtype);
36 |       BPS_CHECK_NE(cptr, nullptr);
37 |       return std::unique_ptr<Compressor>(
38 |           new VanillaErrorFeedbackCompressor(size, dtype, std::move(cptr)));
39 |     });
40 | }
41 | 
42 | VanillaErrorFeedbackCompressor::VanillaErrorFeedbackCompressor(
43 |     size_t size, DataType dtype, std::unique_ptr<Compressor> cptr)
44 |     : ErrorFeedback(size, dtype, std::move(cptr)) {
45 |   _fd = open("lr.s", O_RDONLY);
46 |   BPS_CHECK(_fd > 0) << "open lr.s failed, errno=" << strerror(errno);
47 |   void* ptr = mmap(0, 8, PROT_READ, MAP_SHARED, _fd, 0);
48 |   BPS_CHECK_NE(ptr, MAP_FAILED) << "mmap failed, errno=" << strerror(errno);
49 |   _mm = ptr;
50 |   _pre_lr = _cur_lr = *reinterpret_cast<double*>(_mm);
51 | }
52 | 
53 | VanillaErrorFeedbackCompressor::~VanillaErrorFeedbackCompressor() {
54 |   munmap(_mm, 8);
55 |   close(_fd);
56 | }
57 | 
58 | void VanillaErrorFeedbackCompressor::UpdateGradient(tensor_t grad) {
59 |   _cur_lr = *reinterpret_cast<double*>(_mm);
60 |   this->_cpu_reducer->sum(grad.data, _error.get(), grad.size,
61 |                           static_cast<DataType>(grad.dtype),
62 |                           (_pre_lr / _cur_lr));
63 |   _pre_lr = _cur_lr;
64 | }
65 | 
66 | }  // namespace compressor
67 | }  // namespace common
68 | }  // namespace byteps
--------------------------------------------------------------------------------
/byteps/torch/compression.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Bytedance Inc. All Rights Reserved.
2 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ============================================================================== 16 | """Gradient compression algorithms.""" 17 | 18 | import torch 19 | 20 | 21 | class Compressor(object): 22 | """Interface for compressing and decompressing a given tensor.""" 23 | @staticmethod 24 | def compress(tensor): 25 | """Compresses a tensor and returns it with the context needed to decompress it.""" 26 | pass 27 | 28 | @staticmethod 29 | def decompress(tensor, ctx): 30 | """Decompress the tensor with the given context.""" 31 | pass 32 | 33 | 34 | class NoneCompressor(Compressor): 35 | """Default no-op compression.""" 36 | @staticmethod 37 | def compress(tensor): 38 | """Returns the tensor unmodified.""" 39 | return tensor, None 40 | 41 | @staticmethod 42 | def decompress(tensor, ctx): 43 | """Returns the tensor unmodified.""" 44 | return tensor 45 | 46 | 47 | class FP16Compressor(Compressor): 48 | """Compress all floating point gradients to 16-bit.""" 49 | @staticmethod 50 | def compress(tensor): 51 | """Downcasts the tensor to 16-bit.""" 52 | tensor_compressed = tensor 53 | if tensor.dtype.is_floating_point: 54 | # Only allow compression from other floating point types 55 | tensor_compressed = tensor.type(torch.float16) 56 | return tensor_compressed, tensor.dtype 57 | 58 | @staticmethod 59 | def decompress(tensor, ctx): 60 | """Upcasts the tensor to the initialization dtype.""" 61 | tensor_decompressed = tensor 62 | dtype = ctx 63 | if dtype.is_floating_point: 64 | tensor_decompressed = tensor.type(dtype) 65 | return tensor_decompressed 66 | 67 | 68 | class Compression(object): 69 | """Optional gradient compression algorithm used during push_pull.""" 70 | 71 | """Do not compress the gradients. This is the default.""" 72 | none = NoneCompressor 73 | 74 | """Compress all floating point gradients to 16-bit.""" 75 | fp16 = FP16Compressor 76 | -------------------------------------------------------------------------------- /byteps/tensorflow/compression.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ============================================================================== 16 | """Gradient compression algorithms.""" 17 | 18 | import tensorflow as tf 19 | 20 | 21 | class Compressor(object): 22 | """Interface for compressing and decompressing a given tensor.""" 23 | @staticmethod 24 | def compress(tensor): 25 | """Compresses a tensor and returns it with the context needed to decompress it.""" 26 | pass 27 | 28 | @staticmethod 29 | def decompress(tensor, ctx): 30 | """Decompress the tensor with the given context.""" 31 | pass 32 | 33 | 34 | class NoneCompressor(Compressor): 35 | """Default no-op compression.""" 36 | @staticmethod 37 | def compress(tensor): 38 | """Returns the tensor unmodified.""" 39 | return tensor, None 40 | 41 | @staticmethod 42 | def decompress(tensor, ctx): 43 | """Returns the tensor unmodified.""" 44 | return tensor 45 | 46 | 47 | class FP16Compressor(Compressor): 48 | """Compress all floating point gradients to 16-bit.""" 49 | @staticmethod 50 | def compress(tensor): 51 | """Downcasts the tensor to 16-bit.""" 52 | tensor_compressed = tensor 53 | if tensor.dtype.is_floating: 54 | # Only allow compression from other floating point types 55 | tensor_compressed = tf.cast(tensor, dtype=tf.float16) 56 | return tensor_compressed, tensor.dtype 57 | 58 | @staticmethod 59 | def decompress(tensor, ctx): 60 | """Upcasts the tensor to the initialization dtype.""" 61 | tensor_decompressed = tensor 62 | dtype = ctx 63 | if dtype.is_floating: 64 | tensor_decompressed = tf.cast(tensor, dtype=dtype) 65 | return tensor_decompressed 66 | 67 | 68 | class Compression(object): 69 | """Optional gradient compression algorithm used during push_pull.""" 70 | 71 | """Do not compress the gradients. This is the default.""" 72 | none = NoneCompressor 73 | 74 | """Compress all floating point gradients to 16-bit.""" 75 | fp16 = FP16Compressor 76 | -------------------------------------------------------------------------------- /byteps/torch/adapter.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #include "adapter.h" 18 | #include "cuda_util.h" 19 | 20 | namespace byteps { 21 | namespace torch { 22 | 23 | TorchTensor::TorchTensor(::torch::Tensor tensor) : tensor_(tensor) {} 24 | 25 | const DataType TorchTensor::dtype() const { 26 | switch (tensor_.scalar_type()) { 27 | case ::torch::kByte: 28 | return DataType::BYTEPS_UINT8; 29 | case ::torch::kChar: 30 | return DataType::BYTEPS_INT8; 31 | // case ::torch::kShort: 32 | // return DataType::BYTEPS_INT16; 33 | case ::torch::kInt: 34 | return DataType::BYTEPS_INT32; 35 | case ::torch::kLong: 36 | return DataType::BYTEPS_INT64; 37 | case ::torch::kHalf: 38 | return DataType::BYTEPS_FLOAT16; 39 | case ::torch::kFloat: 40 | return DataType::BYTEPS_FLOAT32; 41 | case ::torch::kDouble: 42 | return DataType::BYTEPS_FLOAT64; 43 | default: 44 | throw std::logic_error("Invalid or unsupported tensor type."); 45 | } 46 | } 47 | 48 | const TensorShape TorchTensor::shape() const { 49 | TensorShape shape; 50 | for (int idx = 0; idx < tensor_.dim(); ++idx) { 51 | shape.AddDim(tensor_.size(idx)); 52 | } 53 | return shape; 54 | } 55 | 56 | const void* TorchTensor::data() const { return tensor_.data_ptr(); } 57 | 58 | int64_t TorchTensor::size() const { 59 | #if TORCH_VERSION >= 1001000000 60 | return tensor_.element_size() * tensor_.numel(); 61 | #else 62 | return tensor_.type().elementSizeInBytes() * tensor_.numel(); 63 | #endif 64 | } 65 | 66 | void ThrowIfError(Status status) { 67 | switch (status.type()) { 68 | case StatusType::OK: 69 | return; 70 | case StatusType::PRECONDITION_ERROR: 71 | throw std::logic_error(status.reason()); 72 | case StatusType::ABORTED: 73 | throw std::runtime_error(status.reason()); 74 | case StatusType::INVALID_ARGUMENT: 75 | throw std::invalid_argument(status.reason()); 76 | default: // Includes UNKNOWN_ERROR 77 | throw std::runtime_error(status.reason()); 78 | } 79 | } 80 | 81 | } // namespace torch 82 | } // namespace byteps 83 | -------------------------------------------------------------------------------- /example/tensorflow/tensorflow2_mnist_bps_MirroredStrategy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import json 4 | import os 5 | import sys 6 | import argparse 7 | import byteps.tensorflow as bps 8 | from byteps.tensorflow.distribute import MirroredStrategy 9 | 10 | parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark', 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 12 | parser.add_argument('--rank', default=-1, type=int, 13 | help='node rank for distributed training') 14 | args = parser.parse_args() 15 | 16 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) 17 | 18 | bps.init() 19 | args.rank = bps.local_rank() 20 | print("my rank ", args.rank) 21 | 22 | gpus = tf.config.experimental.list_physical_devices('GPU') 23 | for gpu in gpus: 24 | tf.config.experimental.set_memory_growth(gpu, True) 25 | if gpus: 26 | tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU') 27 | 28 | def mnist_dataset(batch_size): 29 | (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() 30 | # The `x` arrays are in uint8 and have values in the range [0, 255]. 
31 |     # We need to convert them to float32 with values in the range [0, 1].
32 |     x_train = x_train / np.float32(255)
33 |     y_train = y_train.astype(np.int64)
34 |     train_dataset = tf.data.Dataset.from_tensor_slices(
35 |         (x_train, y_train)).shuffle(60000).repeat().batch(batch_size)
36 |     return train_dataset
37 | 
38 | def build_and_compile_cnn_model():
39 |     model = tf.keras.Sequential([
40 |         tf.keras.Input(shape=(28, 28)),
41 |         tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
42 |         tf.keras.layers.Conv2D(32, 3, activation='relu'),
43 |         tf.keras.layers.Flatten(),
44 |         tf.keras.layers.Dense(128, activation='relu'),
45 |         tf.keras.layers.Dense(10)
46 |     ])
47 |     model.compile(
48 |         loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
49 |         optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
50 |         metrics=['accuracy'])
51 |     return model
52 | 
53 | per_worker_batch_size = 64
54 | 
55 | strategy = MirroredStrategy(devices=["/gpu:0"])
56 | 
57 | 
58 | num_workers = 1
59 | 
60 | 
61 | # `tf.data.Dataset.batch` expects the global batch size, so we scale the
62 | # per-worker batch size by the number of workers. With num_workers = 1,
63 | # the global batch size stays at 64.
64 | global_batch_size = per_worker_batch_size * num_workers
65 | multi_worker_dataset = mnist_dataset(global_batch_size)
66 | 
67 | with strategy.scope():
68 |     # Model building/compiling need to be within `strategy.scope()`.
69 |     multi_worker_model = build_and_compile_cnn_model()
70 | 
71 | # Keras' `model.fit()` trains the model with the specified number of epochs and
72 | # steps per epoch. Note that the numbers here are for demonstration purposes
73 | # only and may not produce a model of good quality.
74 | multi_worker_model.fit(multi_worker_dataset, epochs=100, steps_per_epoch=70)
--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
1 | # BytePS Architecture
2 | 
3 | We highly recommend reading [BytePS's rationale](./rationale.md) before this doc.
4 | 
5 | From the application's point of view, BytePS is a communication library, just like Horovod. The plugins handle framework-specific transformations (e.g., on data structures) and
6 | put communication tasks into BytePS priority queues. The BytePS Core then picks up the tasks (priority-aware, not FIFO) and handles the actual communication.
7 | 
8 | ![byteps_architecture](https://user-images.githubusercontent.com/13852819/69873605-c3d39e00-12f3-11ea-942d-97af2606bb40.png)
9 | 
10 | 
11 | ## General Workflow
12 | To demonstrate the workflow of BytePS, below we use a common data-parallel training scenario as an example. Say we have multiple worker machines (we refer to them as "**workers**"), and each machine (worker) has multiple GPUs. We also have some CPU machines that serve as the PS (we refer to them as "**servers**").
13 | 
14 | In BytePS, a general walk-through of an iteration goes like this (we call each step a **stage**; a pseudocode sketch follows the list):
15 | 
16 | 1. **Computation**: Each GPU performs computation (forward/backward propagation), which is irrelevant to BytePS;
17 | 2. **Local Reduce**: Multiple GPUs on the same machine reduce the gradients;
18 | 3. **Push**: The workers push the aggregated gradients to the servers;
19 | 4. **Global Reduce**: Once the servers receive the gradients from different workers, they aggregate the gradients;
20 | 5. **Pull**: The workers pull the aggregated gradients from the servers;
21 | 6. **Local Broadcast**: The workers broadcast the aggregated gradients to their local GPUs;
22 | 7. Go to the next iteration and repeat from step 1.
23 | 
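24 | Here is a minimal pseudocode sketch of one iteration from a single worker's perspective (the function names are illustrative pseudocode, not BytePS APIs):
25 | 
26 | ```python
27 | for step in range(num_steps):
28 |     grads = [gpu.backward(next_batch()) for gpu in local_gpus]  # 1. Computation
29 |     shard = nccl_reduce_scatter(grads)   # 2. Local Reduce
30 |     push(shard)                          # 3. Push
31 |     # the servers sum the shards from all workers: 4. Global Reduce
32 |     agg = pull()                         # 5. Pull
33 |     full_grad = nccl_all_gather(agg)     # 6. Local Broadcast
34 |     optimizer.step(full_grad)            # the SGD update stays on the worker
35 | ```
36 | 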
37 | ## Local Communication
38 | 
39 | We use NCCL for local communication, i.e., for the **Local Reduce** and **Local Broadcast** stages.
40 | 
41 | For the **Local Reduce** stage, we use [ReduceScatter](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#reducescatter) to evenly distribute the gradients across multiple GPUs.
42 | 
43 | For the **Local Broadcast** stage, we use [AllGather](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allgather) to broadcast the gradients back to multiple GPUs.
44 | 
45 | ## Distributed Communication
46 | 
47 | We use [ps-lite](https://github.com/bytedance/ps-lite/tree/byteps) for the **Push** and **Pull** stages between workers and servers.
48 | 
49 | For the **Push** stage, the workers send the gradients to the servers, as in a traditional PS.
50 | 
51 | For the **Pull** stage, the workers pull gradients rather than parameters from the servers, which differs from a traditional PS. Here is why:
52 | 
53 | Traditionally, the SGD update is performed on the servers, so the workers need to tell the servers which SGD optimizer to use. However, across frameworks even the same optimizer algorithm may be implemented in completely different ways, not to mention the many user-defined optimizers. So BytePS moves the SGD update from the servers to the workers, leaving the servers to do only gradient reduction. We believe this is generic, because it applies to all frameworks we know so far.
54 | 
55 | 
56 | 
--------------------------------------------------------------------------------
/example/keras/keras_synthetic_benchmark_tf2.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 | 
3 | import argparse
4 | import os
5 | import numpy as np
6 | import timeit
7 | 
8 | import tensorflow as tf
9 | import byteps.tensorflow.keras as bps
10 | from tensorflow.keras import applications
11 | 
12 | tf.compat.v1.disable_eager_execution()
13 | 
14 | # Benchmark settings
15 | parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark',
16 |                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
17 | parser.add_argument('--fp16-allreduce', action='store_true', default=False,
18 |                     help='use fp16 compression during allreduce')
19 | 
20 | parser.add_argument('--model', type=str, default='ResNet50',
21 |                     help='model to benchmark')
22 | parser.add_argument('--batch-size', type=int, default=32,
23 |                     help='input batch size')
24 | 
25 | parser.add_argument('--num-warmup-batches', type=int, default=10,
26 |                     help='number of warm-up batches that don\'t count towards benchmark')
27 | parser.add_argument('--num-batches-per-iter', type=int, default=10,
28 |                     help='number of batches per benchmark iteration')
29 | parser.add_argument('--num-iters', type=int, default=10,
30 |                     help='number of benchmark iterations')
31 | 
32 | parser.add_argument('--no-cuda', action='store_true', default=False,
33 |                     help='disables CUDA training')
34 | 
35 | args = parser.parse_args()
36 | args.cuda = not args.no_cuda
37 | 
38 | bps.init()
39 | 
40 | # pin GPU to be used to process local rank (one GPU per process)
41 | if args.cuda:
42 |     gpus = tf.config.experimental.list_physical_devices('GPU')
43 |     for gpu in gpus:
44 |         tf.config.experimental.set_memory_growth(gpu, True)
45 |     if gpus:
46 |         tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU')
47 | else:
48 |     os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
49 | 
50 | data = tf.random.uniform([args.batch_size, 224, 224, 3])
51 | target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)
52 | 
53 | callbacks = [
54 |     # BytePS: broadcast initial variable states from rank 0 to all other processes.
55 |     # This is necessary to ensure consistent initialization of all workers when
56 |     # training is started with random weights or restored from a checkpoint.
57 |     bps.callbacks.BroadcastGlobalVariablesCallback(0),
58 | ]
59 | # Set up standard model.
60 | model = getattr(applications, args.model)(weights=None)
61 | opt = tf.keras.optimizers.Adam(0.01)
62 | opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, loss_scale="dynamic")
63 | opt = bps.DistributedOptimizer(opt)
64 | 
65 | model.compile(loss=tf.keras.losses.categorical_crossentropy,
66 |               optimizer=opt,
67 |               metrics=['accuracy', 'top_k_categorical_accuracy'],
68 |               experimental_run_tf_function=False)
69 | model.fit(data, target, epochs=10, steps_per_epoch=16, callbacks=callbacks)
70 | 
71 | test_loss, test_acc, test_topk = model.evaluate(data, target, verbose=2, steps=16)
72 | print('\nTest accuracy:', test_acc)
73 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/momentum.h:
--------------------------------------------------------------------------------
1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_MOMENTUM_H
17 | #define BYTEPS_COMPRESSOR_MOMENTUM_H
18 | 
19 | #include "../cpu_reducer.h"
20 | #include "compressor.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | namespace compressor {
25 | /*!
26 |  * \brief Momentum
27 |  *
28 |  * Stochastic gradient descent with momentum
29 |  *
30 |  * \note
31 |  * The momentum is added to the gradient before compression. It should not be
32 |  * used together with the momentum implemented in frameworks such as MXNet,
33 |  * TensorFlow, or PyTorch. The key difference between the two is where the
34 |  * momentum is added to the gradients: here it is added before push_pull,
35 |  * whereas the framework's momentum is added after push_pull.
36 |  *
37 |  * \note
38 |  * The framework's momentum is disabled when using this momentum. Users do
39 |  * not need to disable it manually.
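40 |  *
41 |  * \par Example
42 |  * With \mu = 0.9 and per-element gradients g_1 = 1.0, g_2 = 0.5 (the numbers
43 |  * are illustrative), step 1 gives m_1 = 0.9 * 0 + 1.0 = 1.0 and sends
44 |  * g_1' = 0.9 * 1.0 + 1.0 = 1.9; step 2 gives m_2 = 0.9 * 1.0 + 0.5 = 1.4 and
45 |  * sends g_2' = 0.9 * 1.4 + 0.5 = 1.76. These follow the update rules
46 |  * documented at UpdateMom and UpdateGradient below.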
47 |  *
48 |  * \sa Compressor, NesterovMomentumCompressor
49 |  */
50 | class Momentum : public Compressor {
51 |  public:
52 |   // momentum should be cleared to zeros
53 |   Momentum(size_t size, DataType dtype, std::unique_ptr<Compressor> cptr,
54 |            float mu)
55 |       : Compressor(size, dtype),
56 |         _mom(new byte_t[size]()),
57 |         _mu(mu),
58 |         _cpu_reducer(new CpuReducer(nullptr)),
59 |         _cptr(std::move(cptr)){};
60 |   virtual ~Momentum() = default;
61 | 
62 |   virtual tensor_t Compress(tensor_t grad) final;
63 | 
64 |   virtual tensor_t Decompress(tensor_t compressed) final;
65 | 
66 |  protected:
67 |   /*!
68 |    * \brief Update momentum
69 |    *
70 |    * e.g. m_t = \mu * m_{t-1} + g_t
71 |    *
72 |    * \param grad refers to the gradient
73 |    */
74 |   virtual void UpdateMom(tensor_t grad) = 0;
75 | 
76 |   /*!
77 |    * \brief Update gradient with momentum
78 |    *
79 |    * e.g. g_t = \mu * m_t + g_t
80 |    *
81 |    * \param grad refers to the gradient, to which momentum is added in place.
82 |    */
83 |   virtual void UpdateGradient(tensor_t grad) = 0;
84 | 
85 |  protected:
86 |   /*! \brief buffer of momentum */
87 |   std::unique_ptr<byte_t[]> _mom;
88 | 
89 |   /*! \brief momentum factor */
90 |   float _mu;
91 | 
92 |   std::unique_ptr<CpuReducer> _cpu_reducer;
93 | 
94 |  private:
95 |   /*! \brief compressor pointer */
96 |   std::unique_ptr<Compressor> _cptr;
97 | };
98 | }  // namespace compressor
99 | }  // namespace common
100 | }  // namespace byteps
101 | 
102 | #endif  // BYTEPS_COMPRESSOR_MOMENTUM_H
--------------------------------------------------------------------------------
/example/mxnet/symbols/lenet.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements.  See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership.  The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License.  You may obtain a copy of the License at
8 | #
9 | #   http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied.  See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | 
18 | """
19 | LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
20 | Gradient-based learning applied to document recognition.
21 | Proceedings of the IEEE (1998)
22 | """
23 | import mxnet as mx
24 | 
25 | def get_loc(data, attr={'lr_mult': '0.01'}):
26 |     """
27 |     the localisation network in lenet-stn; it increases accuracy by more
28 |     than 1% when num-epoch >= 15
29 |     """
30 |     loc = mx.symbol.Convolution(data=data, num_filter=30, kernel=(5, 5), stride=(2, 2))
31 |     loc = mx.symbol.Activation(data=loc, act_type='relu')
32 |     loc = mx.symbol.Pooling(data=loc, kernel=(2, 2), stride=(2, 2), pool_type='max')
33 |     loc = mx.symbol.Convolution(data=loc, num_filter=60, kernel=(3, 3), stride=(1, 1), pad=(1, 1))
34 |     loc = mx.symbol.Activation(data=loc, act_type='relu')
35 |     loc = mx.symbol.Pooling(data=loc, global_pool=True, kernel=(2, 2), pool_type='avg')
36 |     loc = mx.symbol.Flatten(data=loc)
37 |     loc = mx.symbol.FullyConnected(data=loc, num_hidden=6, name="stn_loc", attr=attr)
38 |     return loc
39 | 
40 | 
41 | def get_symbol(num_classes=10, add_stn=False, **kwargs):
42 |     data = mx.symbol.Variable('data')
43 |     if add_stn:
44 |         data = mx.sym.SpatialTransformer(data=data, loc=get_loc(data), target_shape=(28, 28),
45 |                                          transform_type="affine", sampler_type="bilinear")
46 |     # first conv
47 |     conv1 = mx.symbol.Convolution(data=data, kernel=(5, 5), num_filter=20)
48 |     tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
49 |     pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max",
50 |                               kernel=(2, 2), stride=(2, 2))
51 |     # second conv
52 |     conv2 = mx.symbol.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
53 |     tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh")
54 |     pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max",
55 |                               kernel=(2, 2), stride=(2, 2))
56 |     # first fullc
57 |     flatten = mx.symbol.Flatten(data=pool2)
58 |     fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500)
59 |     tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh")
60 |     # second fullc
61 |     fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=num_classes)
62 |     # loss
63 |     lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax')
64 |     return lenet
65 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/topk.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_TOPK_H
17 | #define BYTEPS_COMPRESSOR_IMPL_TOPK_H
18 | 
19 | #include "../compressor.h"
20 | 
21 | namespace byteps {
22 | namespace common {
23 | namespace compressor {
24 | 
25 | /*!
26 |  * \brief TopK Compressor
27 |  *
28 |  * paper: Sparsified SGD with Memory
29 |  * https://arxiv.org/pdf/1809.07599.pdf
30 |  *
31 |  * sends the most significant entries of the stochastic gradient
32 |  *
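33 |  * Example: with k = 2 and gradient [0.1, -3.0, 2.0, 0.5], the two entries
34 |  * with the largest magnitudes are kept as (index, value) pairs, (1, -3.0)
35 |  * and (2, 2.0); decompression scatters them back into a zero tensor of the
36 |  * original shape. (The numbers are illustrative; the exact wire layout is
37 |  * defined by CompressImpl below.)
38 |  *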
39 |  */
40 | class TopkCompressor : public Compressor {
41 |  public:
42 |   TopkCompressor(size_t size, DataType dtype, unsigned int k)
43 |       : Compressor(size, dtype), _k(k){};
44 |   virtual ~TopkCompressor() = default;
45 | 
46 |   /*!
47 |    * \brief Compress function
48 |    *
49 |    * selects the top-k entries and their corresponding indices
50 |    *
51 |    * \note compares absolute values
52 |    *
53 |    * \param grad gradient tensor
54 |    * \param compressed compressed tensor
55 |    */
56 |   tensor_t Compress(tensor_t grad) override;
57 | 
58 |   /*!
59 |    * \brief Decompress function
60 |    *
61 |    * fills a zero tensor with the top-k entries at their corresponding indices
62 |    *
63 |    * \param compressed compressed tensor
64 |    * \param decompressed decompressed tensor
65 |    */
66 |   tensor_t Decompress(tensor_t compressed) override;
67 | 
68 |   /*!
69 |    * \brief faster version of `UpdateError`
70 |    *
71 |    * 1. e <- p (e is the error and p is the corrected gradient)
72 |    * 2. zero-fill e at the selected k indices
73 |    *
74 |    * \param corrected gradient corrected with error
75 |    * \param error error
76 |    * \param compressed compressed gradient
77 |    */
78 |   void FastUpdateError(tensor_t error, tensor_t corrected,
79 |                        tensor_t compressed) override;
80 | 
81 |  private:
82 |   template <typename index_t, typename scalar_t>
83 |   tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len);
84 | 
85 |   template <typename scalar_t, typename index_t>
86 |   tensor_t DecompressImpl(scalar_t* dst, const index_t* src,
87 |                           size_t compressed_size);
88 | 
89 |   template <typename scalar_t, typename index_t>
90 |   void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected,
91 |                            const index_t* compressed, size_t compressed_size);
92 | 
93 |  private:
94 |   unsigned int _k;
95 | };
96 | }  // namespace compressor
97 | }  // namespace common
98 | }  // namespace byteps
99 | 
100 | #endif  // BYTEPS_COMPRESSOR_IMPL_TOPK_H
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/onebit.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_ONEBIT_H
17 | #define BYTEPS_COMPRESSOR_IMPL_ONEBIT_H
18 | 
19 | #include "../compressor.h"
20 | 
21 | namespace byteps {
22 | namespace common {
23 | namespace compressor {
24 | 
25 | /*!
26 |  * \brief Onebit Compressor
27 |  *
28 |  * paper: SIGNSGD: Compressed Optimisation for Non-Convex Problems
29 |  * https://arxiv.org/pdf/1802.04434.pdf
30 |  *
31 |  * each worker i:
32 |  *    c_i <- sign(grad)
33 |  *
34 |  * server: majority vote
35 |  *    sign(\sum_i c_i)
36 |  *
37 |  * \note 0 represents positive and 1 represents negative.
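38 |  *
39 |  * Example: the gradient [0.5, -1.2, 0.3, -0.7] encodes as the sign bits
40 |  * 0101 (one bit per element, packed into bytes). With majority vote, the
41 |  * server sums the +/-1 votes from all workers and keeps only the sign of
42 |  * the sum. (The numbers are illustrative; see CompressImpl below for the
43 |  * actual packing.)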
44 |  */
45 | class OnebitCompressor : public Compressor {
46 |  public:
47 |   OnebitCompressor(size_t size, DataType dtype, bool use_scale = false)
48 |       : Compressor(size, dtype), _use_scale(use_scale) {}
49 |   virtual ~OnebitCompressor() = default;
50 | 
51 |   /*!
52 |    * \brief Compress function
53 |    *
54 |    * compresses and packs into a byte array;
55 |    * each bit represents a sign.
56 |    *
57 |    * \param grad gradient tensor
58 |    * \param compressed compressed tensor
59 |    */
60 |   tensor_t Compress(tensor_t grad) override;
61 | 
62 |   /*!
63 |    * \brief Decompress function
64 |    *
65 |    * unpacks from a byte array to an FP tensor
66 |    *
67 |    * \param compressed compressed tensor
68 |    * \param decompressed decompressed tensor
69 |    */
70 |   tensor_t Decompress(tensor_t compressed) override;
71 | 
72 |   /*!
73 |    * \brief helper function for error feedback `UpdateError`
74 |    *
75 |    * \param corrected gradient corrected with error
76 |    * \param error error
77 |    * \param compressed compressed gradient
78 |    */
79 |   void FastUpdateError(tensor_t error, tensor_t corrected,
80 |                        tensor_t compressed) override;
81 | 
82 |  private:
83 |   template <typename index_t, typename scalar_t>
84 |   tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len);
85 | 
86 |   template <typename scalar_t, typename index_t>
87 |   tensor_t DecompressImpl(scalar_t* dst, const index_t* src,
88 |                           size_t compressed_size);
89 | 
90 |   template <typename scalar_t, typename index_t>
91 |   void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected,
92 |                            const index_t* compressed, size_t compressed_size);
93 | 
94 |  private:
95 |   bool _use_scale;
96 | };
97 | }  // namespace compressor
98 | }  // namespace common
99 | }  // namespace byteps
100 | 
101 | #endif  // BYTEPS_COMPRESSOR_IMPL_ONEBIT_H
--------------------------------------------------------------------------------
/byteps/common/operations.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_OPERATIONS_H
17 | #define BYTEPS_OPERATIONS_H
18 | 
19 | #include <functional>
20 | #include "common.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | 
25 | // Check that byteps is initialized.
26 | Status CheckInitialized();
27 | 
28 | extern "C" {
29 | 
30 | // C interface to initialize byteps.
31 | void byteps_init();
32 | 
33 | // C interface to initialize byteps (without initializing ps-lite).
34 | void byteps_lazy_init();
35 | 
36 | // C interface to shut down byteps.
37 | void byteps_shutdown();
38 | 
39 | // C interface to restart byteps.
40 | void byteps_resume(int num_workers, int num_servers);
41 | 
42 | // C interface to suspend byteps.
43 | void byteps_suspend();
44 | 
45 | // C interface to get index of current byteps process.
46 | // Returns -1 if byteps is not initialized.
47 | int byteps_rank();
48 | 
49 | // C interface to get index of current byteps process in the node it is on.
50 | // Returns -1 if byteps is not initialized.
51 | int byteps_local_rank();
52 | 
53 | // C interface to return number of byteps processes.
54 | // Returns -1 if byteps is not initialized.
55 | int byteps_size();
56 | 
57 | // C interface to return number of byteps processes in the node it is on.
58 | // Returns -1 if byteps is not initialized.
59 | int byteps_local_size();
60 | }
61 | 
62 | extern "C" PyObject* byteps_get_pushpull_speed();
63 | 
64 | // Below are all for Framework plugins
65 | Status EnqueueTensor(BPSContext &context, std::shared_ptr<Tensor> input,
66 |                      std::shared_ptr<Tensor> output,
67 |                      std::shared_ptr<ReadyEvent> ready_event, const int device,
68 |                      const int priority, const int version,
69 |                      StatusCallback callback,
70 |                      std::shared_ptr<std::vector<QueueType>> queue_list);
71 | 
72 | void InitTensor(BPSContext &context, size_t size, int dtype, void *cpubuff);
73 | 
74 | // Only call these in Framework plugins for the best performance
75 | bool IsTensorDeclared(const std::string &name);
76 | 
77 | void RegisterCompressor(const std::string &name,
78 |                         std::unordered_map<std::string, std::string> &kwargs);
79 | 
80 | BPSContext &GetContextFromName(const std::string &name);
81 | 
82 | std::shared_ptr<std::vector<QueueType>> GetPushQueueList(int device);
83 | 
84 | std::shared_ptr<std::vector<QueueType>> GetPullQueueList(int device);
85 | 
86 | }  // namespace common
87 | }  // namespace byteps
88 | 
89 | #endif  // BYTEPS_OPERATIONS_H
90 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/dithering.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H
17 | #define BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H
18 | 
19 | #include "../compressor.h"
20 | #include "../utils.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | namespace compressor {
25 | 
26 | /*!
27 |  * \brief Dithering Compressor
28 |  *
29 |  * paper: Natural Compression for Distributed Deep Learning
30 |  * https://arxiv.org/pdf/1905.10988.pdf
31 |  *
32 |  * two kinds of partition:
33 |  * 1. linear: {0, 1/s, 2/s, ..., (s-1)/s, 1}
34 |  *
35 |  * 2. natural: {0, 2^{1-s}, 2^{2-s}, ..., 2^{-1}, 1}
36 |  *
37 |  * two kinds of normalization:
38 |  * 1. max: it gives better accuracy but less sparsity.
39 |  *
40 |  * 2. l2 norm: it is more sparse but less accurate, and
41 |  * empirically we found it diverges with error-feedback.
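42 |  *
43 |  * Example (linear partition, s = 4, max normalization, illustrative
44 |  * numbers): an element with |x| / max = 0.3 falls between the levels 1/4
45 |  * and 2/4, so it is rounded up to 2/4 with probability
46 |  * (0.3 - 0.25) / 0.25 = 0.2 and down to 1/4 with probability 0.8; in
47 |  * expectation, 0.8 * 0.25 + 0.2 * 0.5 = 0.3, so the quantization is unbiased.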
48 |  */
49 | class DitheringCompressor : public Compressor {
50 |  public:
51 |   enum class PartitionType { LINEAR = 0, NATURAL = 1 };
52 |   enum class NomalizeType { MAX = 0, L2 = 1 };
53 | 
54 |   DitheringCompressor(size_t size, DataType dtype, unsigned int s,
55 |                       unsigned int seed = 0,
56 |                       PartitionType ptype = PartitionType::LINEAR,
57 |                       NomalizeType ntype = NomalizeType::MAX)
58 |       : Compressor(size, dtype), _s(s), _ptype(ptype), _ntype(ntype) {
59 |     if (seed) {
60 |       _rng.set_seed(seed);
61 |     }
62 |   };
63 |   virtual ~DitheringCompressor() = default;
64 | 
65 |   tensor_t Compress(tensor_t grad) override;
66 | 
67 |   tensor_t Decompress(tensor_t compressed) override;
68 | 
69 |   void FastUpdateError(tensor_t error, tensor_t corrected,
70 |                        tensor_t compressed) override;
71 | 
72 |  private:
73 |   template <typename index_t, typename scalar_t>
74 |   tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len);
75 | 
76 |   template <typename scalar_t, typename index_t>
77 |   tensor_t DecompressImpl(scalar_t* dst, const index_t* src,
78 |                           size_t compressed_size);
79 | 
80 |   template <typename scalar_t, typename index_t>
81 |   void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected,
82 |                            const index_t* compressed, size_t compressed_size);
83 | 
84 |   /*! \brief number of levels */
85 |   const unsigned int _s;
86 | 
87 |   PartitionType _ptype;
88 |   NomalizeType _ntype;
89 |   XorShift128PlusBitShifterRNG _rng;
90 | };
91 | }  // namespace compressor
92 | }  // namespace common
93 | }  // namespace byteps
94 | 
95 | #endif  // BYTEPS_COMPRESSOR_IMPL_MULTIBIT_H
--------------------------------------------------------------------------------
/tests/meta_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Amazon Technologies, Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ============================================================================== 15 | 16 | import copy 17 | import time 18 | import os 19 | import subprocess 20 | import sys 21 | import threading 22 | 23 | import byteps.mxnet as bps 24 | 25 | 26 | class MetaTest(type): 27 | BASE_ENV = {"DMLC_NUM_WORKER": "1", 28 | "DMLC_NUM_SERVER": "1", 29 | "DMLC_PS_ROOT_URI": "127.0.0.1", 30 | "DMLC_PS_ROOT_PORT": "1234", 31 | "BYTEPS_LOG_LEVEL": "INFO", 32 | "BYTEPS_MIN_COMPRESS_BYTES": "0", 33 | "BYTEPS_PARTITION_BYTES": "2147483647"} 34 | for name, value in os.environ.items(): 35 | if name not in BASE_ENV: 36 | BASE_ENV[name] = value 37 | SCHEDULER_ENV = copy.copy(BASE_ENV) 38 | SCHEDULER_ENV.update(DMLC_ROLE="scheduler") 39 | SERVER_ENV = copy.copy(BASE_ENV) 40 | SERVER_ENV.update(DMLC_ROLE="server") 41 | 42 | def __new__(cls, name, bases, dict): 43 | # decorate all test cases 44 | for k, v in dict.items(): 45 | if k.startswith("test_") and hasattr(v, "__call__"): 46 | dict[k] = cls.launch_bps(v) 47 | 48 | for k, v in cls.BASE_ENV.items(): 49 | os.environ[k] = v 50 | os.environ["NVIDIA_VISIBLE_DEVICES"] = "0" 51 | os.environ["DMLC_WORKER_ID"] = "0" 52 | os.environ["DMLC_ROLE"] = "worker" 53 | os.environ["BYTEPS_THREADPOOL_SIZE"] = "4" 54 | os.environ["BYTEPS_FORCE_DISTRIBUTED"] = "1" 55 | os.environ["BYTEPS_LOCAL_RANK"] = "0" 56 | os.environ["BYTEPS_LOCAL_SIZE"] = "1" 57 | return type(name, bases, dict) 58 | 59 | @classmethod 60 | def launch_bps(cls, func): 61 | def wrapper(*args, **kwargs): 62 | def run(env): 63 | subprocess.check_call(args=["bpslaunch"], shell=True, 64 | stdout=sys.stdout, stderr=sys.stderr, 65 | env=env) 66 | 67 | print("bps init") 68 | scheduler = threading.Thread(target=run, 69 | args=(cls.SCHEDULER_ENV,)) 70 | server = threading.Thread(target=run, args=(cls.SERVER_ENV,)) 71 | scheduler.daemon = True 72 | server.daemon = True 73 | scheduler.start() 74 | server.start() 75 | 76 | bps.init() 77 | func(*args, **kwargs) 78 | bps.shutdown() 79 | 80 | scheduler.join() 81 | server.join() 82 | print("bps shutdown") 83 | time.sleep(2) 84 | 85 | return wrapper 86 | -------------------------------------------------------------------------------- /byteps/common/shared_memory.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // =============================================================================
15 | 
16 | #include "shared_memory.h"
17 | #include <errno.h>
18 | #include <fcntl.h>
19 | #include <numa.h>
20 | #include <sys/mman.h>
21 | #include <sys/stat.h>
22 | #include <unistd.h>
23 | #include "global.h"
24 | 
25 | namespace byteps {
26 | namespace common {
27 | 
28 | void* BytePSSharedMemory::openSharedMemory(const std::string& prefix,
29 |                                            uint64_t key, size_t size) {
30 |   size = BytePSGlobal::RoundUpToPageSize(size);
31 |   std::string shm_name(prefix);
32 |   shm_name += std::to_string(key);
33 |   int shm_fd = shm_open(shm_name.c_str(), O_CREAT | O_RDWR, 0666);
34 |   BPS_CHECK_GE(shm_fd, 0) << "shm_open failed for " << shm_name << " " << strerror(errno);
35 | 
36 |   BPS_CHECK_GE(ftruncate(shm_fd, size), 0) << strerror(errno);
37 | 
38 |   void* ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
39 |   BPS_CHECK_NE(ptr, (void*)-1) << strerror(errno);  // validate before registering
40 |   CUDA_CALL(cudaHostRegister(ptr, size, cudaHostRegisterDefault));
41 |   // mlock(ptr, size);
42 | 
43 | 
44 |   BPS_LOG(TRACE) << "initialized shared memory, size " << size;
45 | 
46 |   std::lock_guard<std::mutex> lock(_shm_mu);
47 |   _key_shm_addr[shm_name] = ptr;
48 |   _key_shm_size[shm_name] = size;
49 |   return ptr;
50 | }
51 | 
52 | std::vector<void*> BytePSSharedMemory::openPcieSharedMemory(uint64_t key,
53 |                                                             size_t size) {
54 |   std::vector<void*> r;
55 |   for (int i = 0; i < BytePSGlobal::GetPcieSwitchNum(); i++) {
56 |     auto prefix = std::string("BytePS_Pcie") + std::to_string(i) + "_Shm_";
57 |     if (BytePSGlobal::IsDistributed()) {
58 |       if (BytePSGlobal::IsCrossPcieSwitch()) {
59 |         if (i <= numa_max_node()) {
60 |           numa_set_preferred(i);
61 |           r.push_back(openSharedMemory(prefix, key, size));
62 |           numa_set_preferred(-1);
63 |         } else {
64 |           numa_set_preferred(numa_max_node());
65 |           r.push_back(openSharedMemory(prefix, key, size));
66 |           numa_set_preferred(-1);
67 |         }
68 |       } else {
69 |         r.push_back(openSharedMemory(prefix, key, size));
70 |       }
71 |     } else {
72 |       if (BytePSGlobal::IsCrossPcieSwitch()) {
73 |         numa_set_interleave_mask(numa_all_nodes_ptr);
74 |         r.push_back(openSharedMemory(prefix, key, size));
75 |         numa_set_interleave_mask(numa_no_nodes_ptr);
76 |       } else {
77 |         r.push_back(openSharedMemory(prefix, key, size));
78 |       }
79 |     }
80 |   }
81 |   return r;
82 | }
83 | 
84 | }  // namespace common
85 | 
86 | }  // namespace byteps
87 | 
--------------------------------------------------------------------------------
/example/mxnet/symbols/alexnet.py:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one
2 | # or more contributor license agreements.  See the NOTICE file
3 | # distributed with this work for additional information
4 | # regarding copyright ownership.  The ASF licenses this file
5 | # to you under the Apache License, Version 2.0 (the
6 | # "License"); you may not use this file except in compliance
7 | # with the License.  You may obtain a copy of the License at
8 | #
9 | #   http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing,
12 | # software distributed under the License is distributed on an
13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | # KIND, either express or implied.  See the License for the
15 | # specific language governing permissions and limitations
16 | # under the License.
17 | 
18 | """
19 | Reference:
20 | 
21 | Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet classification with deep convolutional neural networks."
Advances in neural information processing systems. 2012. 22 | """ 23 | import mxnet as mx 24 | import numpy as np 25 | 26 | def get_symbol(num_classes, dtype='float32', **kwargs): 27 | input_data = mx.sym.Variable(name="data") 28 | if dtype == 'float16': 29 | input_data = mx.sym.Cast(data=input_data, dtype=np.float16) 30 | # stage 1 31 | conv1 = mx.sym.Convolution(name='conv1', 32 | data=input_data, kernel=(11, 11), stride=(4, 4), num_filter=96) 33 | relu1 = mx.sym.Activation(data=conv1, act_type="relu") 34 | lrn1 = mx.sym.LRN(data=relu1, alpha=0.0001, beta=0.75, knorm=2, nsize=5) 35 | pool1 = mx.sym.Pooling( 36 | data=lrn1, pool_type="max", kernel=(3, 3), stride=(2,2)) 37 | # stage 2 38 | conv2 = mx.sym.Convolution(name='conv2', 39 | data=pool1, kernel=(5, 5), pad=(2, 2), num_filter=256) 40 | relu2 = mx.sym.Activation(data=conv2, act_type="relu") 41 | lrn2 = mx.sym.LRN(data=relu2, alpha=0.0001, beta=0.75, knorm=2, nsize=5) 42 | pool2 = mx.sym.Pooling(data=lrn2, kernel=(3, 3), stride=(2, 2), pool_type="max") 43 | # stage 3 44 | conv3 = mx.sym.Convolution(name='conv3', 45 | data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=384) 46 | relu3 = mx.sym.Activation(data=conv3, act_type="relu") 47 | conv4 = mx.sym.Convolution(name='conv4', 48 | data=relu3, kernel=(3, 3), pad=(1, 1), num_filter=384) 49 | relu4 = mx.sym.Activation(data=conv4, act_type="relu") 50 | conv5 = mx.sym.Convolution(name='conv5', 51 | data=relu4, kernel=(3, 3), pad=(1, 1), num_filter=256) 52 | relu5 = mx.sym.Activation(data=conv5, act_type="relu") 53 | pool3 = mx.sym.Pooling(data=relu5, kernel=(3, 3), stride=(2, 2), pool_type="max") 54 | # stage 4 55 | flatten = mx.sym.Flatten(data=pool3) 56 | fc1 = mx.sym.FullyConnected(name='fc1', data=flatten, num_hidden=4096) 57 | relu6 = mx.sym.Activation(data=fc1, act_type="relu") 58 | dropout1 = mx.sym.Dropout(data=relu6, p=0.5) 59 | # stage 5 60 | fc2 = mx.sym.FullyConnected(name='fc2', data=dropout1, num_hidden=4096) 61 | relu7 = mx.sym.Activation(data=fc2, act_type="relu") 62 | dropout2 = mx.sym.Dropout(data=relu7, p=0.5) 63 | # stage 6 64 | fc3 = mx.sym.FullyConnected(name='fc3', data=dropout2, num_hidden=num_classes) 65 | if dtype == 'float16': 66 | fc3 = mx.sym.Cast(data=fc3, dtype=np.float32) 67 | softmax = mx.sym.SoftmaxOutput(data=fc3, name='softmax') 68 | return softmax 69 | -------------------------------------------------------------------------------- /byteps/common/compressor/error_feedback.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H
17 | #define BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H
18 | 
19 | #include "../cpu_reducer.h"
20 | #include "compressor.h"
21 | 
22 | namespace byteps {
23 | namespace common {
24 | namespace compressor {
25 | 
26 | /*!
27 |  * \brief Error feedback Decorator
28 |  *
29 |  * paper: 1-bit stochastic gradient descent and its application to data-parallel
30 |  * distributed training of speech dnns
31 |  * https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/IS140694.pdf
32 |  *
33 |  * 1. UpdateGradient: g <- g + e
34 |  * 2. UpdateError: e <- g - c
35 |  *
36 |  * These two functions should be implemented in child classes.
37 |  *
38 |  * \par
39 |  * The caller does not need to allocate an additional buffer to store the
40 |  * error; a buffer is already kept inside the class.
41 |  *
42 |  * \par
43 |  * Adds error feedback behavior to any compressor at run time via the
44 |  * decorator pattern. It keeps the same interface as Compressor; Compress and
45 |  * Decompress are already implemented and cannot be changed in child classes.
46 |  *
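47 |  * \par Example
48 |  * Wrapping a sign compressor (the numbers are illustrative): with g = 0.9
49 |  * and stored error e = 0, the corrected gradient is 0.9, the compressed
50 |  * value is c = 1.0, and the new error is 0.9 - 1.0 = -0.1. On the next step,
51 |  * with g = 0.8, the corrected gradient is 0.7, so c = 1.0 and e = -0.3: the
52 |  * quantization error is carried forward instead of being lost.
53 |  *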
54 |  * \sa Compressor, VanillaErrorFeedbackCompressor
55 |  */
56 | class ErrorFeedback : public Compressor {
57 |  public:
58 |   // the error buffer should be cleared to zeros at the beginning.
59 |   ErrorFeedback(size_t size, DataType dtype, std::unique_ptr<Compressor> cptr)
60 |       : Compressor(size, dtype),
61 |         _error(new byte_t[size]()),
62 |         _cpu_reducer(new CpuReducer(nullptr)),
63 |         _cptr(std::move(cptr)) {}
64 |   virtual ~ErrorFeedback() = default;
65 | 
66 |   virtual tensor_t Compress(tensor_t grad) final;
67 | 
68 |   virtual tensor_t Decompress(tensor_t compressed) final;
69 | 
70 |  protected:
71 |   /*!
72 |    * \brief Correct gradient with error
73 |    *
74 |    * grad += error
75 |    *
76 |    * \note it is an in-place operation.
77 |    *
78 |    * \param grad input gradient, updated in place
79 |    */
80 |   virtual void UpdateGradient(tensor_t grad) = 0;
81 | 
82 |   /*!
83 |    * \brief Update error
84 |    *
85 |    * error = corrected_grad - decompressed
86 |    *
87 |    * \param corrected refers to gradient + error
88 |    * \param compressed compressed tensor
89 |    */
90 |   virtual void UpdateError(tensor_t corrected, tensor_t compressed);
91 | 
92 |  protected:
93 |   /*! \brief buffer of error */
94 |   std::unique_ptr<byte_t[]> _error;
95 | 
96 |   std::unique_ptr<CpuReducer> _cpu_reducer;
97 | 
98 |  private:
99 |   /*! \brief compressor pointer */
100 |   std::unique_ptr<Compressor> _cptr;
101 | };
102 | }  // namespace compressor
103 | }  // namespace common
104 | }  // namespace byteps
105 | 
106 | #endif  // BYTEPS_COMPRESSOR_ERROR_FEEDBACK_H
--------------------------------------------------------------------------------
/example/keras/keras_mnist.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import, division, print_function
2 | import keras
3 | from keras.datasets import mnist
4 | from keras.models import Sequential
5 | from keras.layers import Dense, Dropout, Flatten
6 | from keras.layers import Conv2D, MaxPooling2D
7 | from keras import backend as K
8 | import math
9 | import tensorflow as tf
10 | import byteps.keras as bps
11 | 
12 | # BytePS: initialize BytePS.
13 | bps.init()
14 | 
15 | # BytePS: pin GPU to be used to process local rank (one GPU per process)
16 | config = tf.ConfigProto()
17 | config.gpu_options.allow_growth = True
18 | config.gpu_options.visible_device_list = str(bps.local_rank())
19 | K.set_session(tf.Session(config=config))
20 | 
21 | batch_size = 128
22 | num_classes = 10
23 | 
24 | # BytePS: adjust number of epochs based on number of GPUs.
25 | epochs = int(math.ceil(12.0 / bps.size()))
26 | 
27 | # Input image dimensions
28 | img_rows, img_cols = 28, 28
29 | 
30 | # The data, shuffled and split between train and test sets
31 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
32 | 
33 | if K.image_data_format() == 'channels_first':
34 |     x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
35 |     x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
36 |     input_shape = (1, img_rows, img_cols)
37 | else:
38 |     x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
39 |     x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
40 |     input_shape = (img_rows, img_cols, 1)
41 | 
42 | x_train = x_train.astype('float32')
43 | x_test = x_test.astype('float32')
44 | x_train /= 255
45 | x_test /= 255
46 | print('x_train shape:', x_train.shape)
47 | print(x_train.shape[0], 'train samples')
48 | print(x_test.shape[0], 'test samples')
49 | 
50 | # Convert class vectors to binary class matrices
51 | y_train = keras.utils.to_categorical(y_train, num_classes)
52 | y_test = keras.utils.to_categorical(y_test, num_classes)
53 | 
54 | model = Sequential()
55 | model.add(Conv2D(32, kernel_size=(3, 3),
56 |                  activation='relu',
57 |                  input_shape=input_shape))
58 | model.add(Conv2D(64, (3, 3), activation='relu'))
59 | model.add(MaxPooling2D(pool_size=(2, 2)))
60 | model.add(Dropout(0.25))
61 | model.add(Flatten())
62 | model.add(Dense(128, activation='relu'))
63 | model.add(Dropout(0.5))
64 | model.add(Dense(num_classes, activation='softmax'))
65 | 
66 | # BytePS: adjust learning rate based on number of GPUs.
67 | opt = keras.optimizers.Adadelta(1.0 * bps.size())
68 | 
69 | # BytePS: add BytePS Distributed Optimizer.
70 | opt = bps.DistributedOptimizer(opt)
71 | 
72 | model.compile(loss=keras.losses.categorical_crossentropy,
73 |               optimizer=opt,
74 |               metrics=['accuracy'])
75 | 
76 | callbacks = [
77 |     # BytePS: broadcast initial variable states from rank 0 to all other processes.
78 |     # This is necessary to ensure consistent initialization of all workers when
79 |     # training is started with random weights or restored from a checkpoint.
80 |     bps.callbacks.BroadcastGlobalVariablesCallback(0),
81 | ]
82 | 
83 | # BytePS: save checkpoints only on worker 0 to prevent other workers from corrupting them.
84 | if bps.rank() == 0:
85 |     callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
86 | 
87 | model.fit(x_train, y_train,
88 |           batch_size=batch_size,
89 |           callbacks=callbacks,
90 |           epochs=epochs,
91 |           verbose=1 if bps.rank() == 0 else 0,
92 |           validation_data=(x_test, y_test))
93 | score = model.evaluate(x_test, y_test, verbose=0)
94 | print('Test loss:', score[0])
95 | print('Test accuracy:', score[1])
96 | 
--------------------------------------------------------------------------------
/byteps/common/compressor/impl/randomk.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_COMPRESSOR_IMPL_RANDOMK_H
17 | #define BYTEPS_COMPRESSOR_IMPL_RANDOMK_H
18 | 
19 | #include <random>
20 | 
21 | #include "../compressor.h"
22 | #include "../utils.h"
23 | 
24 | namespace byteps {
25 | namespace common {
26 | namespace compressor {
27 | 
28 | /*!
29 |  * \brief RandomK Compressor
30 |  *
31 |  * paper: Sparsified SGD with Memory
32 |  * https://arxiv.org/pdf/1809.07599.pdf
33 |  *
34 |  * randomly sends k entries of the stochastic gradient
35 |  *
36 |  * \note it is a stochastic algorithm. If you want deterministic behavior,
37 |  * please set a seed in the configurations.
38 |  */
39 | class RandomkCompressor : public Compressor {
40 |  public:
41 |   RandomkCompressor(size_t size, DataType dtype, unsigned int k, unsigned int seed = 0)
42 |       : Compressor(size, dtype), _k(k) {
43 |     if (seed != 0) {
44 |       BPS_LOG(INFO) << "SET SEED = " << seed;
45 |       _rng.set_seed(seed);
46 |     }
47 |   };
48 |   virtual ~RandomkCompressor() = default;
49 | 
50 |   /*!
51 |    * \brief Compress function
52 |    *
53 |    * randomly selects k entries and their corresponding indices
54 |    *
55 |    * \param grad gradient tensor
56 |    * \param compressed compressed tensor
57 |    */
58 |   tensor_t Compress(tensor_t grad) override;
59 | 
60 |   /*!
61 |    * \brief Decompress function
62 |    *
63 |    * fills a zero tensor with the k entries at their corresponding indices
64 |    *
65 |    * \param compressed compressed tensor
66 |    * \param decompressed decompressed tensor
67 |    */
68 |   tensor_t Decompress(tensor_t compressed) override;
69 | 
70 |   /*!
71 |    * \brief faster version of `UpdateError`
72 |    *
73 |    * 1. e <- p (e is the error and p is the corrected gradient)
74 |    * 2. zero-fill e at the selected k indices
75 |    *
76 |    * \param corrected gradient corrected with error
77 |    * \param error error
78 |    * \param compressed compressed gradient
79 |    */
80 |   void FastUpdateError(tensor_t error, tensor_t corrected,
81 |                        tensor_t compressed) override;
82 | 
83 |  private:
84 |   template <typename index_t, typename scalar_t>
85 |   tensor_t CompressImpl(index_t* dst, const scalar_t* src, size_t len);
86 | 
87 |   template <typename scalar_t, typename index_t>
88 |   tensor_t DecompressImpl(scalar_t* dst, const index_t* src,
89 |                           size_t compressed_size);
90 | 
91 |   template <typename scalar_t, typename index_t>
92 |   void FastUpdateErrorImpl(scalar_t* error, scalar_t* corrected,
93 |                            const index_t* compressed, size_t compressed_size);
94 | 
95 |  private:
96 |   unsigned int _k;
97 |   std::random_device _rd;
98 |   XorShift128PlusBitShifterRNG _rng;
99 | };
100 | }  // namespace compressor
101 | }  // namespace common
102 | }  // namespace byteps
103 | 
104 | #endif  // BYTEPS_COMPRESSOR_IMPL_RANDOMK_H
--------------------------------------------------------------------------------
/byteps/server/queue.h:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | // =============================================================================
15 | 
16 | #ifndef BYTEPS_SERVER_QUEUE_H
17 | #define BYTEPS_SERVER_QUEUE_H
18 | 
19 | #include <algorithm>
20 | #include <condition_variable>
21 | #include <mutex>
22 | #include <unordered_map>
23 | #include <vector>
24 | 
25 | namespace byteps {
26 | namespace server {
27 | 
28 | /**
29 |  * \brief thread-safe queue allowing push and waited pop
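30 |  *
31 |  * When scheduling is enabled, Push() keeps the backing vector as a heap
32 |  * ordered by ComparePriority(): messages whose key has been pushed fewer
33 |  * times pop first, and the smaller id wins ties. A usage sketch
34 |  * (illustrative; make_message() is a hypothetical producer, and fields of
35 |  * BytePSEngineMessage other than key/id are omitted):
36 |  *
37 |  *   PriorityQueue q(true);            // true enables priority scheduling
38 |  *   BytePSEngineMessage msg = make_message();
39 |  *   q.Push(msg);                      // called by producer threads
40 |  *   BytePSEngineMessage out;
41 |  *   q.WaitAndPop(&out);               // consumer blocks until a message is ready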
tensorflow.keras.models import Sequential 5 | from tensorflow.keras.layers import Dense, Dropout, Flatten 6 | from tensorflow.keras.layers import Conv2D, MaxPooling2D 7 | from tensorflow.keras import backend as K 8 | import math 9 | import tensorflow as tf 10 | import byteps.keras as bps 11 | 12 | # BytePS: initialize BytePS. 13 | bps.init() 14 | 15 | # BytePS: pin GPU to be used to process local rank (one GPU per process) 16 | config = tf.ConfigProto() 17 | config.gpu_options.allow_growth = True 18 | config.gpu_options.visible_device_list = str(bps.local_rank()) 19 | K.set_session(tf.Session(config=config)) 20 | 21 | batch_size = 128 22 | num_classes = 10 23 | 24 | # BytePS: adjust number of epochs based on number of GPUs. 25 | epochs = int(math.ceil(12.0 / bps.size())) 26 | 27 | # Input image dimensions 28 | img_rows, img_cols = 28, 28 29 | 30 | # The data, shuffled and split between train and test sets 31 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 32 | 33 | if K.image_data_format() == 'channels_first': 34 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) 35 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) 36 | input_shape = (1, img_rows, img_cols) 37 | else: 38 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 39 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 40 | input_shape = (img_rows, img_cols, 1) 41 | 42 | x_train = x_train.astype('float32') 43 | x_test = x_test.astype('float32') 44 | x_train /= 255 45 | x_test /= 255 46 | print('x_train shape:', x_train.shape) 47 | print(x_train.shape[0], 'train samples') 48 | print(x_test.shape[0], 'test samples') 49 | 50 | # Convert class vectors to binary class matrices 51 | y_train = keras.utils.to_categorical(y_train, num_classes) 52 | y_test = keras.utils.to_categorical(y_test, num_classes) 53 | 54 | model = Sequential() 55 | model.add(Conv2D(32, kernel_size=(3, 3), 56 | activation='relu', 57 | input_shape=input_shape)) 58 | model.add(Conv2D(64, (3, 3), activation='relu')) 59 | model.add(MaxPooling2D(pool_size=(2, 2))) 60 | model.add(Dropout(0.25)) 61 | model.add(Flatten()) 62 | model.add(Dense(128, activation='relu')) 63 | model.add(Dropout(0.5)) 64 | model.add(Dense(num_classes, activation='softmax')) 65 | 66 | # BytePS: adjust learning rate based on number of GPUs. 67 | opt = keras.optimizers.Adadelta(1.0 * bps.size()) 68 | 69 | # BytePS: add BytePS Distributed Optimizer. 70 | opt = bps.DistributedOptimizer(opt) 71 | 72 | model.compile(loss=keras.losses.categorical_crossentropy, 73 | optimizer=opt, 74 | metrics=['accuracy']) 75 | 76 | callbacks = [ 77 | # BytePS: broadcast initial variable states from rank 0 to all other processes. 78 | # This is necessary to ensure consistent initialization of all workers when 79 | # training is started with random weights or restored from a checkpoint. 80 | bps.callbacks.BroadcastGlobalVariablesCallback(0), 81 | ] 82 | 83 | # BytePS: save checkpoints only on worker 0 to prevent other workers from corrupting them. 
84 | if bps.rank() == 0: 85 | callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) 86 | 87 | model.fit(x_train, y_train, 88 | batch_size=batch_size, 89 | callbacks=callbacks, 90 | epochs=epochs, 91 | verbose=1, 92 | validation_data=(x_test, y_test)) 93 | score = model.evaluate(x_test, y_test, verbose=0) 94 | print('Test loss:', score[0]) 95 | print('Test accuracy:', score[1]) 96 | -------------------------------------------------------------------------------- /byteps/common/nccl_manager.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_NCCL_MANAGER_H 17 | #define BYTEPS_NCCL_MANAGER_H 18 | 19 | #include <memory> 20 | #include <queue> 21 | #include <vector> 22 | #include "common.h" 23 | #include "communicator.h" 24 | #include "scheduled_queue.h" 25 | 26 | namespace byteps { 27 | namespace common { 28 | 29 | class NcclGroupEntry { 30 | public: 31 | void RecordEvents(); 32 | void SynchronizeEvents(); 33 | void DestroyEvents(); 34 | 35 | std::vector<std::shared_ptr<TensorTableEntry>> tasks; 36 | std::vector<BytePSScheduledQueue*> queues; 37 | 38 | private: 39 | std::vector<cudaEvent_t> _events; 40 | }; 41 | 42 | class NcclManager { 43 | public: 44 | NcclManager(std::shared_ptr<BytePSComm> comm); 45 | ~NcclManager() { 46 | if (_nccl_stream) { 47 | CUDA_CALL(cudaStreamDestroy(*_nccl_stream)); 48 | } 49 | if (_nccl_id) { 50 | free(_nccl_id); 51 | } 52 | if (_nccl_comm) { 53 | free(_nccl_comm); 54 | } 55 | if (_signal_comm) { 56 | _signal_comm.reset(); 57 | } 58 | if (_global_comm) { 59 | _global_comm.reset(); 60 | } 61 | while (!_nccl_pipeline.empty()) _nccl_pipeline.pop(); 62 | 63 | BPS_LOG(DEBUG) << "Clear NcclManager"; 64 | } 65 | 66 | int GetGroupSize() { return _nccl_group_size; } 67 | void EnqueueGroup(std::shared_ptr<NcclGroupEntry> e); 68 | std::shared_ptr<NcclGroupEntry> DequeueGroup(); 69 | 70 | virtual cudaStream_t GetStream(uint64_t key, QueueType op); 71 | virtual ncclComm_t GetComm(uint64_t key, QueueType op); 72 | virtual int GetRoot(uint64_t key, QueueType op); 73 | virtual int GetRank(uint64_t key, QueueType op); 74 | 75 | int GetSize() { return _nccl_size; } 76 | std::shared_ptr<BytePSComm> GetSignalComm() { return _signal_comm; } 77 | bool IsSignalRoot(); 78 | 79 | protected: 80 | void InitGlobalEnv(); 81 | virtual void ConstructRings(); 82 | 83 | cudaStream_t* _nccl_stream; 84 | ncclUniqueId* _nccl_id; 85 | ncclComm_t* _nccl_comm; 86 | 87 | // global user-defined env 88 | size_t _nccl_group_size; 89 | size_t _nccl_pcie_size; 90 | size_t _nccl_pcie_num; 91 | size_t _nccl_num_rings; 92 | 93 | int _nccl_size; 94 | 95 | // for pipelining nccl 96 | std::mutex _nccl_mutex; 97 | std::queue<std::shared_ptr<NcclGroupEntry>> _nccl_pipeline; 98 | 99 | std::shared_ptr<BytePSComm> _signal_comm; 100 | std::shared_ptr<BytePSComm> _global_comm; 101 | }; 102 | 103 | class NcclManagerExpr : public NcclManager { 104 | public: 105 | cudaStream_t GetStream(uint64_t 
key, QueueType op); 106 | ncclComm_t GetComm(uint64_t key, QueueType op); 107 | int GetRoot(uint64_t key, QueueType op); 108 | int GetRank(uint64_t key, QueueType op); 109 | 110 | protected: 111 | void ConstructRings(); 112 | 113 | // for multi-ring 114 | std::vector<std::vector<int>> _rings; 115 | }; 116 | 117 | } // namespace common 118 | } // namespace byteps 119 | 120 | #endif // BYTEPS_NCCL_MANAGER_H 121 | -------------------------------------------------------------------------------- /byteps/torch/ready_event.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #if HAVE_CUDA 18 | #if TORCH_VERSION >= 1005000000 19 | #include <c10/cuda/CUDAException.h> 20 | #include <c10/cuda/CUDAStream.h> 21 | #else 22 | #include <THC/THC.h> 23 | #endif 24 | #include <cassert> 25 | #include <mutex> 26 | #include <queue> 27 | #include <unordered_map> 28 | #endif 29 | 30 | #include "cuda_util.h" 31 | #include "ready_event.h" 32 | 33 | #if TORCH_VERSION < 1005000000 34 | #if HAVE_CUDA 35 | extern THCState* state; 36 | #endif 37 | #endif 38 | 39 | namespace byteps { 40 | namespace torch { 41 | 42 | #if HAVE_CUDA 43 | struct ReadyEventRegistry { 44 | std::unordered_map<int, std::queue<cudaEvent_t>> cuda_events; 45 | std::mutex mutex; 46 | }; 47 | 48 | static ReadyEventRegistry ready_event_registry; 49 | 50 | TorchReadyEvent::TorchReadyEvent(int device) : device_(device) { 51 | assert(device_ != CPU_DEVICE_ID); 52 | 53 | with_device device_context(device_); 54 | { 55 | std::lock_guard<std::mutex> guard(ready_event_registry.mutex); 56 | auto& queue = ready_event_registry.cuda_events[device_]; 57 | if (!queue.empty()) { 58 | cuda_event_ = queue.front(); 59 | queue.pop(); 60 | } else { 61 | #if TORCH_VERSION >= 1005000000 62 | C10_CUDA_CHECK(cudaEventCreateWithFlags( 63 | &cuda_event_, cudaEventBlockingSync | cudaEventDisableTiming)); 64 | #else 65 | THCudaCheck(cudaEventCreateWithFlags( 66 | &cuda_event_, cudaEventBlockingSync | cudaEventDisableTiming)); 67 | #endif 68 | } 69 | } 70 | #if TORCH_VERSION >= 1005000000 71 | auto stream = c10::cuda::getCurrentCUDAStream(device_); 72 | C10_CUDA_CHECK(cudaEventRecord(cuda_event_, stream)); 73 | #else 74 | auto stream = THCState_getCurrentStreamOnDevice(state, device_); 75 | THCudaCheck(cudaEventRecord(cuda_event_, stream)); 76 | #endif 77 | } 78 | 79 | TorchReadyEvent::~TorchReadyEvent() { 80 | { 81 | std::lock_guard<std::mutex> guard(ready_event_registry.mutex); 82 | auto& queue = ready_event_registry.cuda_events[device_]; 83 | queue.push(cuda_event_); 84 | } 85 | } 86 | 87 | bool TorchReadyEvent::Ready() const { 88 | auto status = cudaEventQuery(cuda_event_); 89 | if (status == cudaErrorNotReady) { 90 | return false; 91 | } 92 | #if TORCH_VERSION >= 1005000000 93 | C10_CUDA_CHECK(status); 94 | #else 95 | THCudaCheck(status); 96 | #endif 97 | return true; 98 | } 99 | #endif 100 | 101 | // 
On GPU this event will signal that GPU computations are done and data is 102 | // ready. 103 | std::shared_ptr<ReadyEvent> RecordReadyEvent(int device) { 104 | if (device == CPU_DEVICE_ID) { 105 | return std::shared_ptr<ReadyEvent>(); 106 | } else { 107 | #if HAVE_CUDA 108 | return std::make_shared<TorchReadyEvent>(device); 109 | #else 110 | throw std::logic_error( 111 | "Internal error. Requested ReadyEvent " 112 | "with GPU device but not compiled with CUDA."); 113 | #endif 114 | } 115 | } 116 | 117 | } // namespace torch 118 | } // namespace byteps -------------------------------------------------------------------------------- /example/mxnet/symbols/vgg.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """References: 19 | 20 | Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for 21 | large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014). 22 | """ 23 | 24 | import mxnet as mx 25 | import numpy as np 26 | 27 | def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs): 28 | for i, num in enumerate(layers): 29 | for j in range(num): 30 | internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1)) 31 | if batch_norm: 32 | internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1)) 33 | internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1)) 34 | internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1)) 35 | return internel_layer 36 | 37 | def get_classifier(input_data, num_classes, **kwargs): 38 | flatten = mx.sym.Flatten(data=input_data, name="flatten") 39 | fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6") 40 | relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6") 41 | drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6") 42 | fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7") 43 | relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7") 44 | drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7") 45 | fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8") 46 | return fc8 47 | 48 | def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs): 49 | """ 50 | Parameters 51 | ---------- 52 | num_classes : int, default 1000 53 | Number of classification classes. 54 | num_layers : int 55 | Number of layers for the variant of VGG. Options are 11, 13, 16, 19. 56 | batch_norm : bool, default False 57 | Use batch normalization. 
58 | dtype: str, float32 or float16 59 | Data precision. 60 | """ 61 | vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]), 62 | 13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]), 63 | 16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), 64 | 19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])} 65 | if num_layers not in vgg_spec: 66 | raise ValueError("Invalid num_layers {}. Possible choices are 11, 13, 16, 19.".format(num_layers)) 67 | layers, filters = vgg_spec[num_layers] 68 | data = mx.sym.Variable(name="data") 69 | if dtype == 'float16': 70 | data = mx.sym.Cast(data=data, dtype=np.float16) 71 | feature = get_feature(data, layers, filters, batch_norm) 72 | classifier = get_classifier(feature, num_classes) 73 | if dtype == 'float16': 74 | classifier = mx.sym.Cast(data=classifier, dtype=np.float32) 75 | symbol = mx.sym.SoftmaxOutput(data=classifier, name='softmax') 76 | return symbol 77 | -------------------------------------------------------------------------------- /docs/best-practice.md: -------------------------------------------------------------------------------- 1 | # BytePS Best Practice 2 | 3 | ## Single machine (non-distributed mode) 4 | 5 | When `DMLC_NUM_WORKER=1`, BytePS will not use the parameter servers or schedulers at all. In this case, BytePS runs in non-distributed mode. You do not even need to run a server or scheduler. 6 | 7 | In non-distributed mode, BytePS is basically doing NCCL allreduce, so it will not outperform Horovod/NCCL much. BytePS implements priority-based scheduling, which may improve the training speed by 0%~15%, depending on your training task. 8 | 9 | The only thing you can tune is `BYTEPS_PCIE_SWITCH_SIZE`. If you know your hardware topology, e.g., say you have 8 GPUs in total, with 4 GPUs connected to one PCI-e switch and the other 4 GPUs connected to another PCI-e switch, then you should set `BYTEPS_PCIE_SWITCH_SIZE=4`. In this case, you may see a 20%~30% performance improvement compared with Horovod/NCCL. 10 | 11 | If you have NVLinks, leave `BYTEPS_PCIE_SWITCH_SIZE` unmodified. If you don't know your hardware topology, leave `BYTEPS_PCIE_SWITCH_SIZE` unmodified. 12 | 13 | 14 | ## Multi-machine (distributed mode) 15 | 16 | ### With additional CPU servers 17 | 18 | This mode requires at least **4** physical machines. Two of the machines should have GPUs and run as workers. The other two run as CPU servers and do not need GPUs. The scheduler can run on any machine. 19 | 20 | The key here is to make sure the following: 21 | * Servers must be on different physical machines from workers. 22 | * The total bandwidth of the servers must be equal to or larger than the total bandwidth of the workers. 23 | 24 | If you are using RDMA, this should be sufficient. However, with TCP and >=25Gbps networks, it's possible that BytePS cannot fully utilize the bandwidth, because a single TCP connection usually cannot run up to 25Gbps. 25 | 26 | To address this, you can try running more BytePS server instances on the server machines. For example, you can try running two server instances per server machine. This effectively doubles the number of TCP connections and should be sufficient for 25Gbps networks. For 40Gbps/50Gbps networks, you need three server instances per server machine, and so on. 27 | 28 | ### No additional CPU servers 29 | 30 | When you don't have additional CPU servers, then on each physical machine you should launch both a worker and a server process. We call this *co-locate* mode, and the resource consumption is the same as Horovod (no additional servers); a minimal launch sketch follows below. 
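Below is a minimal sketch of what a per-machine launch script for co-locate mode might look like. The scheduler IP/port, worker id, and `train.py` entry point are placeholders, and the environment variables follow the usual `bpslaunch` conventions from the running guide; treat this as an illustration under those assumptions rather than the official launcher.

```python
# Hypothetical co-locate launch sketch: one worker plus one server per machine.
# The scheduler IP/port, DMLC_WORKER_ID, and train.py are placeholder values.
import os
import subprocess

common = {
    "DMLC_NUM_WORKER": "2",          # two physical machines in this example
    "DMLC_NUM_SERVER": "2",          # one co-located server per machine
    "DMLC_PS_ROOT_URI": "10.0.0.1",  # scheduler IP (placeholder)
    "DMLC_PS_ROOT_PORT": "1234",     # scheduler port (placeholder)
}

# Server process: with DMLC_ROLE=server, bpslaunch needs no training command.
server_env = dict(os.environ, **common, DMLC_ROLE="server")
subprocess.Popen(["bpslaunch"], env=server_env)

# Worker process on the same machine; DMLC_WORKER_ID identifies this machine.
worker_env = dict(os.environ, **common, DMLC_ROLE="worker", DMLC_WORKER_ID="0")
subprocess.Popen(["bpslaunch", "python3", "train.py"], env=worker_env)
```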
31 | 32 | If you are using TCP, you will probably get near-identical performance to Horovod over TCP. However, if you are using RDMA, you can set `BYTEPS_ENABLE_IPC=1` to enable IPC communication between the co-located worker and server, which should eventually give you higher end-to-end performance than Horovod. 33 | 34 | ## The expected performance 35 | 36 | In the single machine case, if you leave `BYTEPS_PCIE_SWITCH_SIZE` unmodified, BytePS performance should never be lower than Horovod/NCCL. 37 | 38 | In the multi-machine case, if the deployment satisfies the two requirements above, you should see that BytePS is at least as fast as Horovod or the native PS of TF and MXNet. If each of your workers has two or more GPUs, you should see a significant improvement, e.g., 40%-100% over other existing solutions. 39 | 40 | If you have to deploy server instances on the same physical machines as workers, the performance will be similar to Horovod/NCCL. 41 | 42 | If you have fewer servers than workers, the performance will be proportionally lower. For example, if you have only 1 server and 2 workers, you'll only get half the performance of 2 servers + 2 workers. 43 | 44 | ## How to compare with other solutions 45 | 46 | Comparing with Horovod is simple: install Horovod, and change `bps` back to `hvd`. 47 | 48 | To compare with other PS architectures, make sure that you use the same hardware setup. Most existing PS implementations cannot run as fast as Horovod/NCCL, so usually you just need to compare with Horovod/NCCL. 49 | -------------------------------------------------------------------------------- /tests/test_tensorflow_keras.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Tests for byteps.keras.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | import numpy as np 24 | import warnings 25 | 26 | from distutils.version import LooseVersion 27 | from tensorflow import keras 28 | from tensorflow.python.keras import backend as K 29 | 30 | import byteps.tensorflow.keras as bps 31 | 32 | class TfKerasTests(tf.test.TestCase):  # test_session() requires tf.test.TestCase 33 | """ 34 | Tests for ops in byteps.keras. 
35 | """ 36 | 37 | def __init__(self, *args, **kwargs): 38 | super(TfKerasTests, self).__init__(*args, **kwargs) 39 | warnings.simplefilter('module') 40 | bps.init() 41 | 42 | self.config = tf.ConfigProto() 43 | self.config.gpu_options.allow_growth = True 44 | self.config.gpu_options.visible_device_list = str(bps.local_rank()) 45 | 46 | def test_train_model(self): 47 | with self.test_session(config=self.config) as sess: 48 | K.set_session(sess) 49 | 50 | opt = keras.optimizers.RMSprop(lr=0.0001) 51 | opt = bps.DistributedOptimizer(opt) 52 | 53 | model = keras.models.Sequential() 54 | model.add(keras.layers.Dense(2, input_shape=(3,))) 55 | model.add(keras.layers.RepeatVector(3)) 56 | model.add(keras.layers.ThresholdedReLU(0.5)) 57 | model.compile(loss=keras.losses.mean_squared_error, 58 | optimizer=opt, 59 | metrics=[keras.metrics.categorical_accuracy], 60 | sample_weight_mode='temporal') 61 | 62 | x = np.random.random((1, 3)) 63 | y = np.random.random((1, 3, 3)) 64 | 65 | def generator(): 66 | while 1: 67 | yield (x, y) 68 | 69 | print('x is:', x) 70 | print('y is:', y) 71 | # No assertions, we just need to verify that it doesn't hang 72 | callbacks = [bps.callbacks.BroadcastGlobalVariablesCallback(0)] 73 | model.fit_generator(generator(), 74 | steps_per_epoch=10, 75 | callbacks=callbacks, 76 | epochs=0, 77 | verbose=0, 78 | workers=4, 79 | initial_epoch=1) 80 | print('x-trained is:', x) 81 | print('y-trained is:', y) 82 | 83 | def test_sparse_as_dense(self): 84 | with self.test_session(config=self.config) as sess: 85 | K.set_session(sess) 86 | 87 | opt = keras.optimizers.RMSprop(lr=0.0001) 88 | opt = bps.DistributedOptimizer(opt, sparse_as_dense=True) 89 | 90 | model = keras.models.Sequential() 91 | model.add(keras.layers.Embedding(1000, 64, input_length=10)) 92 | model.compile(loss=keras.losses.mean_squared_error, 93 | optimizer=opt) 94 | 95 | x = np.random.randint(1000, size=(32, 10)) 96 | y = np.random.random((32, 10, 64)) 97 | # No assertions, we just need to verify that it doesn't hang 98 | model.train_on_batch(x, y) 99 | 100 | 101 | if __name__ == '__main__': 102 | keras_test = TfKerasTests('test_train_model') 103 | keras_test.test_train_model() 104 | -------------------------------------------------------------------------------- /example/tensorflow/tensorflow2_keras_mnist.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Uber Technologies, Inc. All Rights Reserved. 2 | # Copyright 2019 Uber Technologies, Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | from __future__ import absolute_import, division, print_function 17 | 18 | import tensorflow as tf 19 | import byteps.tensorflow.keras as bps 20 | 21 | # tf.compat.v1.disable_eager_execution() 22 | 23 | # byteps: initialize byteps. 
24 | bps.init() 25 | 26 | # byteps: pin GPU to be used to process local rank (one GPU per process) 27 | gpus = tf.config.experimental.list_physical_devices('GPU') 28 | for gpu in gpus: 29 | tf.config.experimental.set_memory_growth(gpu, True) 30 | if gpus: 31 | tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU') 32 | 33 | (mnist_images, mnist_labels), _ = \ 34 | tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % bps.rank()) 35 | 36 | dataset = tf.data.Dataset.from_tensor_slices( 37 | (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), 38 | tf.cast(mnist_labels, tf.int64)) 39 | ) 40 | dataset = dataset.repeat().shuffle(10000).batch(128) 41 | 42 | mnist_model = tf.keras.Sequential([ 43 | tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), 44 | tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), 45 | tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), 46 | tf.keras.layers.Dropout(0.25), 47 | tf.keras.layers.Flatten(), 48 | tf.keras.layers.Dense(128, activation='relu'), 49 | tf.keras.layers.Dropout(0.5), 50 | tf.keras.layers.Dense(10, activation='softmax') 51 | ]) 52 | 53 | # byteps: adjust learning rate based on number of GPUs. 54 | scaled_lr = 0.001 * bps.size() 55 | opt = tf.optimizers.Adam(scaled_lr) 56 | 57 | # byteps: add byteps DistributedOptimizer. 58 | opt = bps.DistributedOptimizer(opt) 59 | 60 | # byteps: Specify `experimental_run_tf_function=False` to ensure TensorFlow 61 | # uses bps.DistributedOptimizer() to compute gradients. 62 | mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(), 63 | optimizer=opt, 64 | metrics=['accuracy'], 65 | experimental_run_tf_function=False) 66 | 67 | callbacks = [ 68 | # byteps: broadcast initial variable states from rank 0 to all other processes. 69 | # This is necessary to ensure consistent initialization of all workers when 70 | # training is started with random weights or restored from a checkpoint. 71 | bps.callbacks.BroadcastGlobalVariablesCallback(0, device="GPU:0"), 72 | 73 | # byteps: average metrics among workers at the end of every epoch. 74 | # 75 | # Note: This callback must be in the list before the ReduceLROnPlateau, 76 | # TensorBoard or other metrics-based callbacks. 77 | bps.callbacks.MetricAverageCallback(device="GPU:0"), 78 | 79 | # byteps: using `lr = 1.0 * bps.size()` from the very beginning leads to worse final 80 | # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * bps.size()` during 81 | # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. 82 | bps.callbacks.LearningRateWarmupCallback(warmup_epochs=3, initial_lr=scaled_lr, verbose=1), 83 | ] 84 | 85 | # byteps: save checkpoints only on worker 0 to prevent other workers from corrupting them. 86 | if bps.rank() == 0: 87 | callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) 88 | 89 | # byteps: write logs on worker 0. 90 | verbose = 1 if bps.rank() == 0 else 0 91 | 92 | # Train the model. 93 | # byteps: adjust number of steps based on number of GPUs. 94 | mnist_model.fit(dataset, steps_per_epoch=500 // bps.size(), callbacks=callbacks, epochs=24, verbose=verbose) 95 | -------------------------------------------------------------------------------- /byteps/common/logging.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_LOGGING_H 18 | #define BYTEPS_LOGGING_H 19 | 20 | #include <sstream> 21 | #include <string> 22 | 23 | namespace byteps { 24 | namespace common { 25 | 26 | enum class LogLevel { TRACE, DEBUG, INFO, WARNING, ERROR, FATAL }; 27 | 28 | #define LOG_LEVELS "TDIWEF" 29 | 30 | // Always-on checking 31 | #define BPS_CHECK(x) \ 32 | if (!(x)) \ 33 | common::LogMessageFatal(__FILE__, __LINE__) << "Check failed: " #x << ' ' 34 | 35 | #define BPS_CHECK_LT(x, y) BPS_CHECK((x) < (y)) 36 | #define BPS_CHECK_GT(x, y) BPS_CHECK((x) > (y)) 37 | #define BPS_CHECK_LE(x, y) BPS_CHECK((x) <= (y)) 38 | #define BPS_CHECK_GE(x, y) BPS_CHECK((x) >= (y)) 39 | #define BPS_CHECK_EQ(x, y) BPS_CHECK((x) == (y)) 40 | #define BPS_CHECK_NE(x, y) BPS_CHECK((x) != (y)) 41 | #define BPS_CHECK_NOTNULL(x) \ 42 | ((x) == NULL ? common::LogMessageFatal(__FILE__, __LINE__) \ 43 | << "Check notnull: " #x << ' ', \ 44 | (x) : (x)) // NOLINT(*) 45 | 46 | /*! 47 | * \brief Protected CUDA call. 48 | * \param func Expression to call. 49 | * 50 | * It checks for CUDA errors after invocation of the expression. 51 | */ 52 | #define CUDA_CALL(func) \ 53 | { \ 54 | cudaError_t e = (func); \ 55 | BPS_CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ 56 | << "CUDA: " << cudaGetErrorString(e); \ 57 | } 58 | 59 | /* 60 | * \brief Protected NCCL call. 61 | */ 62 | #define NCCLCHECK(cmd) \ 63 | { \ 64 | ncclResult_t r = (cmd); \ 65 | BPS_CHECK(r == ncclSuccess) << "NCCL error: " << ncclGetErrorString(r); \ 66 | } 67 | 68 | class LogMessage : public std::basic_ostringstream<char> { 69 | public: 70 | LogMessage(const char* fname, int line, LogLevel severity); 71 | ~LogMessage(); 72 | 73 | protected: 74 | void GenerateLogMessage(bool log_time); 75 | 76 | private: 77 | const char* fname_; 78 | int line_; 79 | LogLevel severity_; 80 | }; 81 | 82 | // LogMessageFatal ensures the process will exit in failure after 83 | // logging this message. 84 | class LogMessageFatal : public LogMessage { 85 | public: 86 | LogMessageFatal(const char* file, int line); 87 | ~LogMessageFatal(); 88 | }; 89 | 90 | #define _BPS_LOG_TRACE LogMessage(__FILE__, __LINE__, LogLevel::TRACE) 91 | #define _BPS_LOG_DEBUG LogMessage(__FILE__, __LINE__, LogLevel::DEBUG) 92 | #define _BPS_LOG_INFO LogMessage(__FILE__, __LINE__, LogLevel::INFO) 93 | #define _BPS_LOG_WARNING LogMessage(__FILE__, __LINE__, LogLevel::WARNING) 94 | #define _BPS_LOG_ERROR LogMessage(__FILE__, __LINE__, LogLevel::ERROR) 95 | #define _BPS_LOG_FATAL LogMessageFatal(__FILE__, __LINE__) 96 | 97 | #define _LOG(severity) _BPS_LOG_##severity 98 | 99 | #define _LOG_RANK(severity, rank) _BPS_LOG_##severity << "[" << rank << "]: " 100 | 101 | #define GET_LOG(_1, _2, NAME, ...) NAME 102 | #define BPS_LOG(...) 
GET_LOG(__VA_ARGS__, _LOG_RANK, _LOG)(__VA_ARGS__) 103 | 104 | LogLevel MinLogLevelFromEnv(); 105 | bool LogTimeFromEnv(); 106 | 107 | } // namespace common 108 | } // namespace byteps 109 | 110 | #endif // BYTEPS_LOGGING_H 111 | -------------------------------------------------------------------------------- /tests/test_topk.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import itertools 17 | import random 18 | import unittest 19 | 20 | import byteps.mxnet as bps 21 | import mxnet as mx 22 | import mxnet.ndarray as nd 23 | import numpy as np 24 | from gluoncv.model_zoo import get_model 25 | from mxnet import autograd, gluon 26 | from parameterized import parameterized 27 | from tqdm import tqdm 28 | 29 | from meta_test import MetaTest 30 | from utils import fake_data 31 | 32 | 33 | def topk(x, k): 34 | y = x.flatten() 35 | indices = np.argsort(np.abs(y))[-k:][::-1] 36 | vals = y[indices] 37 | y.fill(0) 38 | for idx, val in zip(indices, vals): 39 | y[idx] = val 40 | return y.reshape(x.shape) 41 | 42 | 43 | class TopkTestCase(unittest.TestCase, metaclass=MetaTest): 44 | @parameterized.expand(itertools.product([1, 3, 5])) 45 | def test_topk(self, k): 46 | ctx = mx.gpu(0) 47 | net = get_model("resnet18_v2") 48 | net.initialize(mx.init.Xavier(), ctx=ctx) 49 | net.summary(nd.ones((1, 3, 224, 224), ctx=ctx)) 50 | 51 | # hyper-params 52 | batch_size = 32 53 | optimizer_params = {'momentum': 0, 'wd': 0, 54 | 'learning_rate': 0.01} 55 | 56 | compression_params = { 57 | "compressor": "topk", 58 | "k": k, 59 | } 60 | 61 | trainer = bps.DistributedTrainer(net.collect_params( 62 | ), "sgd", optimizer_params, compression_params=compression_params) 63 | 64 | loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() 65 | 66 | train_data = fake_data(batch_size=batch_size) 67 | 68 | params = {} 69 | 70 | for i, param in enumerate(trainer._params): 71 | if param.grad_req != 'null': 72 | params[i] = param._data[0].asnumpy() 73 | 74 | for it, batch in tqdm(enumerate(train_data)): 75 | data = batch[0].as_in_context(ctx) 76 | label = batch[1].as_in_context(ctx) 77 | 78 | with autograd.record(): 79 | output = net(data) 80 | loss = loss_fn(output, label) 81 | 82 | loss.backward() 83 | 84 | gs = {} 85 | xs = {} 86 | 87 | for i, param in enumerate(trainer._params): 88 | if param.grad_req != 'null': 89 | gs[i] = param._grad[0].asnumpy() 90 | xs[i] = param._data[0].asnumpy() 91 | 92 | trainer.step(batch_size) 93 | 94 | for i, param in enumerate(trainer._params): 95 | if param.grad_req != "null": 96 | g = gs[i] / (batch_size * bps.size()) 97 | c = topk(g, k) 98 | 99 | cs = topk(c, k) 100 | c = cs 101 | 102 | params[i] -= optimizer_params["learning_rate"] * c 103 | 104 | cnt = 0 105 | tot = 0 106 | for i, param in enumerate(trainer._params): 107 | if 
param.grad_req != "null": 108 | x = param._data[0].asnumpy() 109 | tot += len(x.flatten()) 110 | if not np.allclose(params[i], x, atol=np.finfo(np.float32).eps): 111 | diff = np.abs(x.flatten() - params[i].flatten()) 112 | idx = np.where(diff > np.finfo(np.float32).eps) 113 | cnt += len(idx[0]) 114 | 115 | assert cnt == 0, "false/tot=%d/%d=%f" % (cnt, tot, cnt/tot) 116 | 117 | 118 | if __name__ == '__main__': 119 | unittest.main() 120 | -------------------------------------------------------------------------------- /docs/timeline.md: -------------------------------------------------------------------------------- 1 | # Performance Analysis of BytePS 2 | 3 | You can analyze the fine-grained performance of BytePS with the profiling tool. 4 | 5 | ## For Communication Operations 6 | 7 | ### Usage 8 | 9 | Use the following environment variables to enable profiling the communication operations: 10 | 11 | ``` python 12 | "BYTEPS_TRACE_ON" = "1" 13 | "BYTEPS_TRACE_END_STEP" = "20" 14 | "BYTEPS_TRACE_START_STEP"="10" 15 | "BYTEPS_TRACE_DIR"= "./traces" 16 | ``` 17 | First, set `BYTEPS_TRACE_ON` to `1` to enable profiling of communication traces. `BYTEPS_TRACE_START_STEP` and `BYTEPS_TRACE_END_STEP` define the step interval to profile: traces from step `BYTEPS_TRACE_START_STEP` to step `BYTEPS_TRACE_END_STEP` are collected automatically, and the resulting traces are output in the Chrome trace format. `BYTEPS_TRACE_DIR` denotes the path where you want to store the traces. 18 | 19 | The result directory is organized as follows. 20 | ``` 21 | traces/ 22 | ├── 0 23 | │   └── comm.json 24 | │  25 | └── 1 26 | └── comm.json 27 | ``` 28 | 29 | Here, `traces/` is the trace directory we defined using `BYTEPS_TRACE_DIR`. `traces/` contains several sub-directories, each of which corresponds to one GPU and is named with the local rank of that GPU, e.g., the path `./traces/0/` stores the trace results of the GPU whose local rank is `0`. Each sub-directory contains the following file: 30 | * `comm.json`: the final trace file, which contains the communication traces of all gradients. 31 | 32 | ### Trace Format 33 | Let's look deep into the traces. 34 | ``` json 35 | { 36 | "ph": "X", 37 | "args": { 38 | "name": "Comm.byteps.gradient_0" 39 | }, 40 | "pid": "Comm.byteps.gradient_0", 41 | "name": "Comm.byteps.gradient_0", 42 | "ts": 1574685989504865, 43 | "dur": 24026, 44 | "tid": "total" 45 | }, 46 | { 47 | "ph": "X", 48 | "args": { 49 | "name": "Comm.byteps.gradient_0" 50 | }, 51 | "pid": "Comm.byteps.gradient_0", 52 | "name": "Comm.byteps.gradient_0.BROADCAST", 53 | "ts": 1574685984662375, 54 | "dur": 1074, 55 | "tid": "26148864" 56 | } 57 | ``` 58 | Basically, the trace event format is the same as the standard [Trace Event Format](https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit). Here, `name` is the name of one event, which can be shown on `chrome://tracing/`. BytePS divides each gradient into multiple partitions if necessary, and each partition goes through several of the following operations, namely `QueueType`s. 59 | ``` 60 | "COORDINATE_REDUCE", 61 | "REDUCE", 62 | "COPYD2H", 63 | "PCIE_REDUCE", 64 | "COORDINATE_PUSH", 65 | "PUSH", 66 | "PULL", 67 | "COPYH2D", 68 | "COORDINATE_BROADCAST", 69 | "BROADCAST" 70 | ``` 71 | So there are two types of events: 72 | 1. If `tid` is `total`, the event records the entire interval to synchronize one gradient, including the queue time. In this case, `name` ends with the gradient index. 73 | 2. 
If `tid` is a number, the event records the interval for each `QueueType` of each partition of one gradient. In this case, `name` ends with the gradient index and the corresponding `QueueType`, and `tid` denotes the partition id. 74 | 75 | Note that with multiple GPUs on one worker, only the root GPU is responsible for synchronizing with the servers, and the GPUs on one worker update parameters through local all-reduce. Therefore, you can observe `PUSH` and `PULL` operations only in the traces of the root GPU. By default, the root GPU is the one with the largest local rank. 76 | 77 | Below is a visualization example of `comm.json`. 78 | 79 | 80 | ### Overhead 81 | Below is the latency when running the [`bert_12_768_12`](https://github.com/joapolarbear/gluon-nlp/tree/bert-byteprofile/scripts/bert) model with 2 workers, each containing 2 V100 GPUs with 16GB of memory. BytePS Timeline collects traces from step 10 to step 20; after step 20 it asynchronously outputs the trace results, which may also cause extra overhead. Ignoring the warm-up phase (the first 10 steps), the overhead induced by BytePS Timeline is small. 82 | 83 | 84 | -------------------------------------------------------------------------------- /byteps/common/logging.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #include "logging.h" 18 | #include <chrono> 19 | #include <cstdlib> 20 | #include <ctime> 21 | #include <iostream> 22 | 23 | namespace byteps { 24 | namespace common { 25 | 26 | LogMessage::LogMessage(const char* fname, int line, LogLevel severity) 27 | : fname_(fname), line_(line), severity_(severity) {} 28 | 29 | void LogMessage::GenerateLogMessage(bool log_time) { 30 | bool use_cout = 31 | static_cast<int>(severity_) <= static_cast<int>(LogLevel::INFO); 32 | std::ostream& os = use_cout ? std::cout : std::cerr; 33 | if (log_time) { 34 | auto now = std::chrono::system_clock::now(); 35 | auto as_time_t = std::chrono::system_clock::to_time_t(now); 36 | 37 | auto duration = now.time_since_epoch(); 38 | auto seconds = std::chrono::duration_cast<std::chrono::seconds>(duration); 39 | auto micros_remainder = 40 | std::chrono::duration_cast<std::chrono::microseconds>(duration - 41 | seconds); 42 | 43 | const size_t time_buffer_size = 30; 44 | char time_buffer[time_buffer_size]; 45 | strftime(time_buffer, time_buffer_size, "%Y-%m-%d %H:%M:%S", 46 | localtime(&as_time_t)); 47 | os << "[" << time_buffer << "." 
<< std::setw(6) << micros_remainder.count() 48 | << ": " << LOG_LEVELS[static_cast<int>(severity_)] << " " << fname_ 49 | << ":" << line_ << "] " << str() << std::endl; 50 | } else { 51 | os << "[" << LOG_LEVELS[static_cast<int>(severity_)] << " " << fname_ << ":" 52 | << line_ << "] " << str() << std::endl; 53 | } 54 | } 55 | 56 | LogMessage::~LogMessage() { 57 | static LogLevel min_log_level = MinLogLevelFromEnv(); 58 | static bool log_time = LogTimeFromEnv(); 59 | if (severity_ >= min_log_level) { 60 | GenerateLogMessage(log_time); 61 | } 62 | } 63 | 64 | LogMessageFatal::LogMessageFatal(const char* file, int line) 65 | : LogMessage(file, line, LogLevel::FATAL) {} 66 | 67 | LogMessageFatal::~LogMessageFatal() { 68 | static bool log_time = LogTimeFromEnv(); 69 | GenerateLogMessage(log_time); 70 | abort(); 71 | } 72 | 73 | LogLevel ParseLogLevelStr(const char* env_var_val) { 74 | std::string min_log_level(env_var_val); 75 | std::transform(min_log_level.begin(), min_log_level.end(), 76 | min_log_level.begin(), ::tolower); 77 | if (min_log_level == "trace") { 78 | return LogLevel::TRACE; 79 | } else if (min_log_level == "debug") { 80 | return LogLevel::DEBUG; 81 | } else if (min_log_level == "info") { 82 | return LogLevel::INFO; 83 | } else if (min_log_level == "warning") { 84 | return LogLevel::WARNING; 85 | } else if (min_log_level == "error") { 86 | return LogLevel::ERROR; 87 | } else if (min_log_level == "fatal") { 88 | return LogLevel::FATAL; 89 | } else { 90 | return LogLevel::WARNING; 91 | } 92 | } 93 | 94 | LogLevel MinLogLevelFromEnv() { 95 | const char* env_var_val = getenv("BYTEPS_LOG_LEVEL"); 96 | if (env_var_val == nullptr) { 97 | // default to WARNING 98 | return LogLevel::WARNING; 99 | } 100 | return ParseLogLevelStr(env_var_val); 101 | } 102 | 103 | bool LogTimeFromEnv() { 104 | const char* env_var_val = getenv("BYTEPS_LOG_HIDE_TIME"); 105 | if (env_var_val != nullptr && std::strtol(env_var_val, nullptr, 10) > 0) { 106 | return false; 107 | } else { 108 | return true; 109 | } 110 | } 111 | 112 | } // namespace common 113 | } // namespace byteps 114 | -------------------------------------------------------------------------------- /example/pytorch/mnist-distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | import argparse 4 | import torch.multiprocessing as mp 5 | import torchvision 6 | import torchvision.transforms as transforms 7 | import torch 8 | import torch.nn as nn 9 | import torch.distributed as dist 10 | from torch.nn.parallel import DistributedDataParallel as DDP 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-n', '--nodes', default=4, type=int, metavar='N', 16 | help='number of nodes (default: 4)') 17 | parser.add_argument('-g', '--gpus', default=1, type=int, 18 | help='number of gpus per node') 19 | parser.add_argument('-nr', '--nr', default=0, type=int, 20 | help='ranking within the nodes') 21 | parser.add_argument('--epochs', default=2, type=int, metavar='N', 22 | help='number of total epochs to run') 23 | args = parser.parse_args() 24 | args.world_size = args.gpus * args.nodes 25 | os.environ['MASTER_ADDR'] = '10.57.23.164' 26 | os.environ['MASTER_PORT'] = '8888' 27 | mp.spawn(train, nprocs=args.gpus, args=(args,)) 28 | 29 | 30 | class ConvNet(nn.Module): 31 | def __init__(self, num_classes=10): 32 | super(ConvNet, self).__init__() 33 | self.layer1 = nn.Sequential( 34 | nn.Conv2d(1, 16, kernel_size=5, stride=1, 
padding=2), 35 | nn.BatchNorm2d(16), 36 | nn.ReLU(), 37 | nn.MaxPool2d(kernel_size=2, stride=2)) 38 | self.layer2 = nn.Sequential( 39 | nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2), 40 | nn.BatchNorm2d(32), 41 | nn.ReLU(), 42 | nn.MaxPool2d(kernel_size=2, stride=2)) 43 | self.fc = nn.Linear(7*7*32, num_classes) 44 | 45 | def forward(self, x): 46 | out = self.layer1(x) 47 | out = self.layer2(out) 48 | out = out.reshape(out.size(0), -1) 49 | out = self.fc(out) 50 | return out 51 | 52 | 53 | def train(gpu, args): 54 | rank = args.nr * args.gpus + gpu 55 | dist.init_process_group( 56 | backend='nccl', 57 | init_method='env://', 58 | world_size=args.world_size, 59 | rank=rank) 60 | torch.manual_seed(0) 61 | model = ConvNet() 62 | torch.cuda.set_device(gpu) 63 | model.cuda(gpu) 64 | batch_size = 100 65 | # define loss function (criterion) and optimizer 66 | criterion = nn.CrossEntropyLoss().cuda(gpu) 67 | optimizer = torch.optim.SGD(model.parameters(), 1e-4) 68 | # Wrap the model 69 | 70 | model = DDP(model, device_ids=[gpu]) 71 | # Data loading code 72 | train_dataset = torchvision.datasets.MNIST( 73 | root='./data', 74 | train=True, 75 | transform=transforms.ToTensor(), 76 | download=True 77 | ) 78 | train_sampler = torch.utils.data.distributed.DistributedSampler( 79 | train_dataset, 80 | num_replicas=args.world_size, 81 | rank=rank) 82 | train_loader = torch.utils.data.DataLoader( 83 | dataset=train_dataset, 84 | batch_size=batch_size, 85 | shuffle=False, 86 | num_workers=0, 87 | pin_memory=True, 88 | sampler=train_sampler 89 | ) 90 | 91 | start = datetime.now() 92 | total_step = len(train_loader) 93 | for epoch in range(args.epochs): 94 | for i, (images, labels) in enumerate(train_loader): 95 | images = images.cuda(non_blocking=True) 96 | labels = labels.cuda(non_blocking=True) 97 | # Forward pass 98 | outputs = model(images) 99 | loss = criterion(outputs, labels) 100 | 101 | # Backward and optimize 102 | optimizer.zero_grad() 103 | loss.backward() 104 | optimizer.step() 105 | if (i + 1) % 100 == 0 and gpu == 0: 106 | print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step, 107 | loss.item())) 108 | if gpu == 0: 109 | print("Training complete in: " + str(datetime.now() - start)) 110 | 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ about PS vs Allreduce 2 | 3 | Below we summarize a list of questions and *incorrect* statements that often confuse users. 4 | 5 | ### **BytePS has a better performance because of some data path details, e.g., less copying?** 6 | 7 | Not really. BytePS uses shared memory in a similar way to NCCL, and BytePS copies the data as many times as NCCL does. In addition, both BytePS and NCCL use tensor partitioning/segmentation internally, which hides most of the copying delay. 8 | 9 | Both BytePS and NCCL are close to the theoretical optimum of their respective communication patterns. 10 | 11 | ### **BytePS has a better performance because it has a hierarchical strategy, i.e., local reduce followed by inter-machine transfers?** 12 | 13 | The hierarchical strategy does help a bit. However, it is not as fundamental as the PS communication pattern. Consider the following example: 14 | 15 | You have N worker machines connected to the same network switch, and each worker has only 1 GPU. 
In this case, the topology is flat -- no hierarchy at all. You'll find that the analysis in [rationale.md](/docs/rationale.md) still applies: the PS communication pattern has less traffic volume *from the workers' standpoint*. 16 | 17 | ### **Allreduce is equivalent to PS, as long as you find the correct allreduce strategy?** 18 | 19 | Not really. Consider the above flat N-worker example again. No matter which allreduce strategy you use, the conclusion of PS vs. allreduce does not change. 20 | 21 | ### **Okay, I get that PS has less traffic from workers. But PS Push and Pull are not duplex, and waste half the bandwidth?** 22 | 23 | BytePS does not have this problem. It can fully utilize bi-directional network bandwidth. The key ideas are tensor partitioning and pipelining (see the sketch at the end of this FAQ). For example, suppose you have a 100MB tensor to be pushed and pulled. Inside BytePS, we partition the tensor into small pieces. After pushing the first piece, we start pulling the first piece and, at the same time, start pushing the second piece, and so on. For most of the time, except for the first and the last piece, the bi-directional bandwidth is fully utilized. 24 | 25 | ### **Since the bottleneck is the NIC of GPU machines, why not add more NICs?** 26 | 27 | There are indeed [specialized physical server designs](https://images.nvidia.com/content/pdf/dgx1-v100-system-architecture-whitepaper.pdf) doing that. Unfortunately, cloud or shared clusters usually prefer not to do this. This is because, as a matter of fact, many training jobs are not distributed. For these jobs, users want the GPUs to be deployed as densely as possible, and the network bandwidth requirement is low. 28 | 29 | If you are building your own cluster for a *single* dedicated *distributed* training job, of course you can go the HPC route: carefully calculate the best ratio between GPUs and NICs, build a homogeneous cluster, and use allreduce. However, please realize that cloud and shared clusters are not HPC. This is the whole point of BytePS. 30 | 31 | ### **Does the PS architecture impose heavier cross-rack traffic, and can it be impacted by the physical network over-subscription ratio?** 32 | 33 | This is true. For a large job whose workers and PS cannot fit inside a rack, PS does have more cross-rack traffic. 34 | 35 | However, the comparison with allreduce in real life is more complicated. It depends on how well you can control the physical job placement and the allreduce rings. If you don't have full control of placement, or your MPI/NCCL rank assignment is not aware of the physical network topology, allreduce faces exactly the same problem. NCCL and most MPIs today are unaware of the physical network topology, unless specifically designed for a given HPC. 36 | 37 | Don't be scared of the over-subscription ratio. It exists for a reason -- usually, not all servers in a rack are simultaneously busy on networking. Multiple studies from major cloud providers show that the average bandwidth utilization is low. Remember, this is a shared cluster; not everyone is running distributed training. 38 | 39 | ### **Final remarks** 40 | 41 | With BytePS, we want to share two key insights -- 42 | 43 | * Cloud, either public or private, is different from HPC. Using ideas from HPC is a shortcut, but not optimal. 44 | * In a (public or private) cloud, the PS architecture is theoretically better than allreduce, with minimal additional costs. 45 | 46 | BytePS is a realization of these ideas. 
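To make the push/pull pipelining answer above concrete, here is a toy sketch of the idea. It is not BytePS code: `async_push` and `async_pull` are hypothetical asynchronous primitives that return waitable handles, and the sketch only illustrates how pulls of earlier partitions overlap with pushes of later ones.

```python
# Toy illustration of push/pull pipelining over tensor partitions.
# async_push/async_pull are hypothetical async primitives (not BytePS APIs);
# each returns a handle whose wait() blocks until the transfer finishes.
def sync_gradient(partitions, async_push, async_pull):
    # Issue all pushes up front; they proceed in order on the uplink.
    push_handles = [async_push(p) for p in partitions]
    pull_handles = []
    for i, handle in enumerate(push_handles):
        handle.wait()  # partition i has been pushed...
        # ...so its pull can start now, overlapping with the push of
        # partition i+1, keeping both directions busy except at the two ends.
        pull_handles.append(async_pull(partitions[i]))
    return [h.wait() for h in pull_handles]
```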
47 | -------------------------------------------------------------------------------- /tests/test_onebit.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import itertools 17 | import unittest 18 | 19 | import byteps.mxnet as bps 20 | import mxnet as mx 21 | import mxnet.ndarray as nd 22 | import numpy as np 23 | from gluoncv.model_zoo import get_model 24 | from mxnet import autograd, gluon 25 | from parameterized import parameterized 26 | from tqdm import tqdm 27 | 28 | from meta_test import MetaTest 29 | from utils import fake_data 30 | 31 | 32 | def onebit(x, scaling): 33 | if scaling: 34 | l1 = np.linalg.norm(x.flatten(), 1) 35 | sign = x < 0 36 | sign = -((sign << 1) - 1) 37 | if scaling: 38 | return l1 / len(x.flatten()) * sign 39 | else: 40 | return sign 41 | 42 | 43 | class OnebitTestCase(unittest.TestCase, metaclass=MetaTest): 44 | @parameterized.expand(itertools.product([True, False])) 45 | def test_onebit(self, scaling): 46 | bps.init() 47 | ctx = mx.gpu(0) 48 | net = get_model("resnet18_v2") 49 | net.initialize(mx.init.Xavier(), ctx=ctx) 50 | net.summary(nd.ones((1, 3, 224, 224), ctx=ctx)) 51 | 52 | # hyper-params 53 | batch_size = 32 54 | optimizer_params = {'momentum': 0, 'wd': 0, 55 | 'learning_rate': 0.01} 56 | 57 | compression_params = { 58 | "compressor": "onebit", 59 | "scaling": scaling, 60 | } 61 | 62 | trainer = bps.DistributedTrainer(net.collect_params( 63 | ), "sgd", optimizer_params, compression_params=compression_params) 64 | 65 | loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() 66 | 67 | train_data = fake_data(batch_size=batch_size) 68 | 69 | params = {} 70 | 71 | for i, param in enumerate(trainer._params): 72 | if param.grad_req != 'null': 73 | params[i] = param._data[0].asnumpy() 74 | 75 | for it, batch in tqdm(enumerate(train_data)): 76 | data = batch[0].as_in_context(ctx) 77 | label = batch[1].as_in_context(ctx) 78 | 79 | with autograd.record(): 80 | output = net(data) 81 | loss = loss_fn(output, label) 82 | 83 | loss.backward() 84 | 85 | gs = {} 86 | xs = {} 87 | 88 | for i, param in enumerate(trainer._params): 89 | if param.grad_req != 'null': 90 | gs[i] = param._grad[0].asnumpy() 91 | xs[i] = param._data[0].asnumpy() 92 | 93 | trainer.step(batch_size) 94 | 95 | for i, param in enumerate(trainer._params): 96 | if param.grad_req != "null": 97 | g = gs[i] / (batch_size * bps.size()) 98 | c = onebit(g, scaling) 99 | 100 | cs = onebit(c, scaling) 101 | c = cs 102 | 103 | params[i] -= optimizer_params["learning_rate"] * c 104 | 105 | cnt = 0 106 | tot = 0 107 | for i, param in enumerate(trainer._params): 108 | if param.grad_req != "null": 109 | x = param._data[0].asnumpy() 110 | tot += len(x.flatten()) 111 | if not np.allclose(params[i], x, atol=np.finfo(np.float32).eps): 112 | diff = 
np.abs(x.flatten() - params[i].flatten()) 113 | idx = np.where(diff > np.finfo(np.float32).eps) 114 | cnt += len(idx[0]) 115 | 116 | assert cnt == 0, "false/tot=%d/%d=%f" % (cnt, tot, cnt/tot) 117 | 118 | 119 | if __name__ == '__main__': 120 | unittest.main() 121 | --------------------------------------------------------------------------------
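As a quick sanity check of the `onebit()` reference implementation in the test above, the following sketch works out the arithmetic for a tiny gradient by hand. It assumes only `numpy` and the `onebit()` defined in `tests/test_onebit.py` being in scope; the values are illustrative.

```python
import numpy as np

# Sanity-check onebit() from tests/test_onebit.py on a tiny "gradient".
g = np.array([0.5, -2.0, 1.5], dtype=np.float32)

sign = onebit(g, scaling=False)   # negative entries -> -1, others -> +1
scaled = onebit(g, scaling=True)  # scaled by l1/n = (0.5 + 2.0 + 1.5)/3 = 4/3

assert np.array_equal(sign, np.array([1, -1, 1]))
assert np.allclose(scaled, (4.0 / 3.0) * sign)
```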