├── .clang-format ├── .gitignore ├── CMakeLists.txt ├── README.md ├── _archived ├── basic_test.py ├── check_env.py ├── check_env_remote.py ├── cluster-config │ ├── cluster-new.yaml │ ├── cluster.yaml │ ├── gpu.yaml │ ├── large-cluster.yaml │ ├── large_cluster.yaml │ ├── new_cluster.yaml │ ├── single-nc.yaml │ ├── single-new.yaml │ ├── single.yaml │ └── siyuan-old.yaml ├── compare_bcast.c ├── exit_test.py ├── fault_tolerance_tests │ ├── README.md │ ├── enable_failure.patch │ ├── run_test_fault_tolerance.sh │ └── test_wrapper_fault_tolerance.sh ├── hoplite_microbenchmarks.py ├── init_env.py ├── launch_test.py ├── mpi_compare_bcast.sh ├── notification_test.sh ├── parameter-server │ ├── compare_hoplite_mpi.sh │ ├── mpi_parameter_server.py │ ├── mpi_test.py │ ├── parse_results.py │ ├── run_gloo_allreduce.sh │ ├── run_mpi_allreduce.sh │ ├── run_mpi_ps.sh │ └── run_ps_tests.sh ├── parse_ray_result.py ├── restart_all_workers.sh ├── restart_ray.sh ├── script │ ├── README.md │ ├── find_missing_tests.py │ └── timeline.py ├── speed_test.sh └── sync_time.sh ├── app ├── parameter-server │ ├── README.md │ ├── analyze_fault_tolerance.py │ ├── cluster-asgd-fault-tolerance.yaml │ ├── cluster-config-access-results-only │ │ ├── README.md │ │ └── example.yaml │ ├── cluster-config-with-ami │ │ ├── README.md │ │ └── example.yaml │ ├── gloo_all_reduce.py │ ├── hoplite_all_reduce.py │ ├── hoplite_asgd_fault_tolerance.py │ ├── mpi_all_reduce.py │ ├── parameter_server.py │ ├── plot_async_ps_results.py │ ├── ps_helper.py │ ├── ray_asgd_fault_tolerance.py │ ├── ray_parameter_server_baseline.py │ ├── result_parser │ │ ├── parse_async_ps_hoplite.py │ │ ├── parse_gloo.py │ │ ├── parse_hoplite.py │ │ ├── parse_mpi.py │ │ └── parse_ray.py │ ├── run_allreduce_tests.sh │ ├── run_async_ps_fault_tolerance.sh │ └── run_async_ps_tests.sh ├── ray_serve │ ├── README.md │ ├── analyze_fault_tolerance.py │ ├── cluster-config-access-results-only │ │ ├── README.md │ │ └── example.yaml │ ├── cluster-config-with-ami │ │ ├── README.md │ │ └── example.yaml │ ├── cluster-config │ │ ├── README.md │ │ ├── cluster.yaml │ │ ├── example.yaml │ │ └── initial.yaml │ ├── hoplite_model_ensembling.py │ ├── hoplite_model_ensembling_fault_tolerance.py │ ├── model_ensembling.py │ └── model_ensembling_fault_tolerance.py └── rllib │ ├── README-with-ami.md │ ├── README.md │ ├── cluster.yaml │ ├── example.yaml │ └── initial.yaml ├── cmake └── FindGRPC.cmake ├── format.sh ├── fornode ├── install_dependencies.sh ├── microbenchmarks ├── README.md ├── cluster-config-access-results-only │ ├── README.md │ └── example.yaml ├── cluster-config-with-ami │ ├── README.md │ └── example.yaml ├── cluster-config │ ├── README.md │ ├── cluster.yaml │ ├── example.yaml │ └── initial.yaml ├── dask-python │ ├── auto_dask_benchmark.py │ ├── auto_test.sh │ ├── cleanup_dask.sh │ ├── dask_benchmark.py │ ├── dask_roundtrip.py │ ├── dask_roundtrip.sh │ ├── parse_result.py │ └── run_dask.sh ├── draw_collective_communication.py ├── gloo-cpp │ ├── .gitignore │ ├── README.md │ ├── auto_test.sh │ ├── install_gloo.sh │ ├── parse_result.py │ ├── run_test.sh │ └── test_wrapper.sh ├── hoplite-cpp │ ├── README.md │ ├── auto_test.sh │ ├── coverage_test.sh │ ├── parse_result.py │ ├── pressure_test.sh │ ├── run_test.sh │ └── test_wrapper.sh ├── hoplite-python │ ├── README.md │ ├── auto_test.sh │ ├── coverage_test.sh │ ├── hoplite_microbenchmarks.py │ ├── parse_result.py │ ├── parse_roundtrip_result.py │ ├── pressure_test.sh │ ├── run_test.sh │ └── test_wrapper.sh ├── mpi-cpp │ ├── .gitignore │ ├── 
Makefile │ ├── README.md │ ├── allgather.c │ ├── allreduce.c │ ├── auto_test.sh │ ├── coverage_test.sh │ ├── gather.c │ ├── multicast.c │ ├── parse_result.py │ ├── parse_roundtrip_result.py │ ├── reduce.c │ ├── roundtrip.c │ ├── run_test.sh │ └── test_wrapper.sh ├── plot_rtt.py └── ray-python │ ├── Makefile │ ├── README.md │ ├── auto_test.sh │ ├── ray_microbenchmarks.py │ ├── ray_roundtrip.py │ └── run_tests.py ├── python ├── hoplite │ ├── __init__.py │ ├── _hoplite_client.pxd │ └── _hoplite_client.pyx ├── setup.py └── setup.sh ├── src ├── client │ ├── distributed_object_store.cc │ ├── distributed_object_store.h │ ├── global_control_store.cc │ ├── global_control_store.h │ ├── local_store_client.cc │ ├── local_store_client.h │ ├── notification_listener.cc │ ├── notification_listener.h │ ├── object_sender.cc │ ├── object_sender.h │ ├── object_store_state.cc │ ├── object_store_state.h │ ├── receiver.cc │ └── receiver.h ├── common │ ├── buffer.cc │ ├── buffer.h │ ├── config.h │ ├── id.cc │ ├── id.h │ ├── status.cc │ └── status.h ├── object_directory │ ├── dependency.cc │ ├── dependency.h │ ├── notification.cc │ ├── notification.h │ ├── reduce_dependency.cc │ └── reduce_dependency.h ├── protocol │ └── object_store.proto ├── tests │ ├── allgather_test.cc │ ├── allreduce_test.cc │ ├── gather_test.cc │ ├── multicast_test.cc │ ├── notification_server_test.cc │ ├── reduce_dependency_test.cc │ ├── reduce_test.cc │ └── subset_reduce_test.cc └── util │ ├── ctpl_stl.h │ ├── hash.cc │ ├── hash.h │ ├── logging.cc │ ├── logging.h │ ├── protobuf_utils.h │ ├── socket_utils.cc │ ├── socket_utils.h │ └── test_utils.h └── test_utils ├── get_worker_ips.py ├── load_cluster_env.sh ├── mpirun_pernode.sh └── result_parser_utils.py /.clang-format: -------------------------------------------------------------------------------- 1 | ColumnLimit: 120 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | .vscode/ 35 | .cquery_cache/ 36 | __pycache__/ 37 | # binaries, databases, protobuf sockets 38 | dump.rdb 39 | multicast_test 40 | reduce_test 41 | allreduce_test 42 | gather_test 43 | allgather_test 44 | *.pb.h 45 | notification 46 | 47 | python/object_store_pb2*.py 48 | 49 | # logs 50 | log/ 51 | mpi_log/ 52 | gloo_log/ 53 | 54 | python/*.cpp 55 | .DS_Store 56 | 57 | *.csv 58 | *.json 59 | cmake-build-debug/ 60 | .idea/ 61 | 62 | # generated plots 63 | *.pdf 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hoplite: Efficient and Fault-Tolerant Collective Communication for Task-Based Distributed Systems 2 | 3 | This is the repository for the artifact evaluation of the SIGCOMM 2021 paper: _Hoplite: Efficient and Fault-Tolerant Collective Communication for Task-Based Distributed Systems_. For any questions or related issues, please feel free to contact Siyuan Zhuang (s.z@berkeley.edu) and Zhuohan Li (zhuohan@berkeley.edu).
4 | 5 | ## Setup AWS Cluster & Hoplite 6 | 7 | All the experiments in the paper are evaluated on AWS. We use the [Ray cluster launcher](https://docs.ray.io/en/latest/cluster/launcher.html) to launch the clusters for all the experiments in the paper. We highly recommend using the Ray cluster launcher, as it automatically sets up the execution environment required by the experiments. 8 | 9 | For every experiment, we include detailed instructions for setting up a cluster and reproducing the results in the paper. 10 | 11 | ## Microbenchmarks (Section 5.1) 12 | 13 | Please see [microbenchmarks/](microbenchmarks) to reproduce the microbenchmark experiments in the paper. 14 | 15 | ## Asynchronous SGD (Section 5.2) 16 | 17 | Please see [app/parameter-server/](app/parameter-server) to reproduce the Asynchronous SGD experiments in the paper. 18 | 19 | ## Reinforcement Learning (Section 5.3) 20 | 21 | Please see [app/rllib/](app/rllib/) to reproduce the RLlib experiments in the paper. 22 | 23 | ## ML Model Serving Experiments (Section 5.4) 24 | 25 | Please see [app/ray_serve/](app/ray_serve) to reproduce the Ray Serve experiments and the Ray Serve fault tolerance experiments (Section 5.5, Figure 12a) in the paper. 26 | -------------------------------------------------------------------------------- /_archived/basic_test.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import numpy as np 4 | import py_distributed_object_store as store_lib 5 | 6 | from py_distributed_object_store import Buffer 7 | arr = np.random.rand(2,3,4) 8 | buf = Buffer.from_buffer(arr) 9 | print(buf.size(), arr.nbytes, hash(buf)) 10 | gc.collect() 11 | print(buf.size(), hash(buf)) 12 | 13 | arr2 = np.frombuffer(buf).reshape(arr.shape) 14 | assert np.array_equal(arr, arr2) 15 | -------------------------------------------------------------------------------- /_archived/check_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import ray 4 | 5 | ray.init(address='auto') 6 | @ray.remote(resources={'node': 1}) 7 | def check_env(): 8 | import socket 9 | import sys 10 | print(socket.gethostbyname(socket.gethostname()), sys.path) 11 | tasks = [] 12 | 13 | for _ in ray.nodes(): 14 | tasks.append(check_env.remote()) 15 | 16 | ray.get(tasks) 17 | -------------------------------------------------------------------------------- /_archived/check_env_remote.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | @ray.remote(resources={'node': 1}, max_calls=1) 4 | def check_env(): 5 | import socket 6 | import sys 7 | print(socket.gethostbyname(socket.gethostname()), sys.path) -------------------------------------------------------------------------------- /_archived/cluster-config/cluster-new.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon.
13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0f1ec76edd85bfd5d # hoplite-artifact-new-3 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-f55048b4" 26 | Placement: 27 | GroupName: hoplite-group 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0f1ec76edd85bfd5d # hoplite-artifact-new-3 32 | KeyName: shared_key 33 | SecurityGroupIds: 34 | - "sg-f55048b4" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | - sudo mount -t efs fs-07737200:/ efs 40 | - sudo chmod 777 efs 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: 44 | - ray stop 45 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 51 | -------------------------------------------------------------------------------- /_archived/cluster-config/cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0875c13495964ad3e # hoplite-artifact-2 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-f55048b4" 26 | Placement: 27 | GroupName: hoplite-group 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0875c13495964ad3e # hoplite-artifact-2 32 | KeyName: shared_key 33 | SecurityGroupIds: 34 | - "sg-f55048b4" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | - sudo mount -t efs fs-6dad81c6:/ efs 40 | - sudo chmod 777 efs 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: 44 | - ray stop 45 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 51 | -------------------------------------------------------------------------------- /_archived/cluster-config/gpu.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-gpu 2 | 3 | min_workers: 7 4 | max_workers: 7 5 | initial_workers: 7 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 
11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: ami-01859c084acd14dc9 # hoplite-nsdi-11 22 | KeyName: shared_key 23 | SecurityGroupIds: 24 | - "sg-f55048b4" 25 | Placement: 26 | GroupName: hoplite-group 27 | 28 | worker_nodes: 29 | InstanceType: p3.2xlarge 30 | ImageId: ami-01859c084acd14dc9 # hoplite-nsdi-11 31 | KeyName: shared_key 32 | SecurityGroupIds: 33 | - "sg-f55048b4" 34 | Placement: 35 | GroupName: hoplite-group 36 | 37 | setup_commands: 38 | - sudo mount -t efs fs-6dad81c6:/ efs 39 | - sudo chmod 777 efs 40 | 41 | # Command to start ray on the head node. You don't need to change this. 42 | head_start_ray_commands: 43 | - ray stop 44 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 45 | 46 | # Command to start ray on worker nodes. You don't need to change this. 47 | worker_start_ray_commands: 48 | - ray stop 49 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 50 | -------------------------------------------------------------------------------- /_archived/cluster-config/large-cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 63 4 | max_workers: 63 5 | initial_workers: 63 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-01859c084acd14dc9 # hoplite-nsdi-11 22 | KeyName: shared_key 23 | SecurityGroupIds: 24 | - "sg-f55048b4" 25 | Placement: 26 | GroupName: hoplite-group 27 | 28 | worker_nodes: 29 | InstanceType: m5.4xlarge 30 | ImageId: ami-01859c084acd14dc9 # hoplite-nsdi-11 31 | KeyName: shared_key 32 | SecurityGroupIds: 33 | - "sg-f55048b4" 34 | Placement: 35 | GroupName: hoplite-group 36 | 37 | setup_commands: 38 | - sudo mount -t efs fs-6dad81c6:/ efs 39 | - sudo chmod 777 efs 40 | 41 | # Command to start ray on the head node. You don't need to change this. 42 | head_start_ray_commands: 43 | - ray stop 44 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 45 | 46 | # Command to start ray on worker nodes. You don't need to change this. 
47 | worker_start_ray_commands: 48 | - ray stop 49 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 50 | -------------------------------------------------------------------------------- /_archived/cluster-config/large_cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-large 2 | 3 | min_workers: 258 4 | max_workers: 258 5 | initial_workers: 258 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | ssh_private_key: /Users/siyuan/.ssh/siyuan-aws.pem 18 | 19 | head_node: 20 | InstanceType: m5.8xlarge 21 | ImageId: ami-087095f2ce112c29d # latest_dlami # hoplite-nsdi-5 22 | # InstanceMarketOptions: 23 | # MarketType: spot 24 | # SpotOptions: 25 | # MaxPrice: "1.5" # Max Hourly Price MAX_HOURLY_PRICE 26 | KeyName: siyuan-aws 27 | SecurityGroupIds: 28 | - "sg-50656710" 29 | Placement: 30 | GroupName: hoplite-group 31 | 32 | worker_nodes: 33 | InstanceType: c5.2xlarge 34 | ImageId: ami-087095f2ce112c29d # latest_dlami # hoplite-nsdi-5 35 | InstanceMarketOptions: 36 | MarketType: spot 37 | SpotOptions: 38 | MaxPrice: "1.2" # Max Hourly Price MAX_HOURLY_PRICE 39 | KeyName: siyuan-aws 40 | SecurityGroupIds: 41 | - "sg-50656710" 42 | Placement: 43 | GroupName: hoplite-group 44 | 45 | setup_commands: 46 | # - pip install ray==0.8.6 47 | - mkdir -p ~/efs 48 | - sudo mount -t efs fs-a692810d:/ ~/efs 49 | - sudo chmod 777 ~/efs 50 | 51 | # Command to start ray on the head node. You don't need to change this. 52 | head_start_ray_commands: 53 | - ray stop 54 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 55 | 56 | # Command to start ray on worker nodes. You don't need to change this. 57 | worker_start_ray_commands: 58 | - ray stop 59 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 60 | -------------------------------------------------------------------------------- /_archived/cluster-config/new_cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-benchmark 2 | 3 | min_workers: 7 4 | max_workers: 7 5 | initial_workers: 7 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-west-2a 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | ssh_private_key: /Users/siyuan/.ssh/siyuan-aws.pem 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-0a53ac9d916d997ae # hoplite-sigcomm21 22 | InstanceMarketOptions: 23 | MarketType: spot 24 | KeyName: siyuan-aws 25 | SecurityGroupIds: 26 | - "sg-50656710" 27 | Placement: 28 | GroupName: hoplite-group 29 | 30 | worker_nodes: 31 | InstanceType: m5.4xlarge 32 | ImageId: ami-0a53ac9d916d997ae # hoplite-sigcomm21 33 | InstanceMarketOptions: 34 | MarketType: spot 35 | KeyName: siyuan-aws 36 | SecurityGroupIds: 37 | - "sg-50656710" 38 | Placement: 39 | GroupName: hoplite-group 40 | 41 | setup_commands: 42 | # - pip install ray==1.1 43 | - mkdir -p ~/efs 44 | - sudo mount -t efs fs-a692810d:/ ~/efs 45 | - sudo chmod 777 ~/efs 46 | 47 | # Command to start ray on the head node. You don't need to change this. 48 | head_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 51 | 52 | # Command to start ray on worker nodes. You don't need to change this. 53 | worker_start_ray_commands: 54 | - ray stop 55 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 56 | -------------------------------------------------------------------------------- /_archived/cluster-config/single-nc.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-west-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-1b 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0445e6fea66b74ae5 # rllib-all 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-878331f8" 26 | 27 | worker_nodes: 28 | InstanceType: m5.4xlarge 29 | ImageId: ami-0445e6fea66b74ae5 # rllib-all 30 | KeyName: shared_key 31 | SecurityGroupIds: 32 | - "sg-878331f8" 33 | 34 | setup_commands: 35 | - sudo mount -t efs fs-760d746f:/ efs 36 | - sudo chmod 777 efs 37 | 38 | # Command to start ray on the head node. You don't need to change this. 39 | head_start_ray_commands: 40 | - ray stop 41 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 42 | 43 | # Command to start ray on worker nodes. You don't need to change this. 44 | worker_start_ray_commands: 45 | - ray stop 46 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 47 | -------------------------------------------------------------------------------- /_archived/cluster-config/single-new.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single-initial 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 
11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0f1ec76edd85bfd5d # hoplite-artifact-new-3 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-f55048b4" 26 | Placement: 27 | GroupName: hoplite-group 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0f1ec76edd85bfd5d # hoplite-artifact-new-3 32 | KeyName: shared_key 33 | SecurityGroupIds: 34 | - "sg-f55048b4" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | - sudo mount -t efs fs-07737200:/ ~/efs 40 | - sudo chmod 777 ~/efs 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: [] 44 | 45 | # Command to start ray on worker nodes. You don't need to change this. 46 | worker_start_ray_commands: [] 47 | -------------------------------------------------------------------------------- /_archived/cluster-config/single.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0875c13495964ad3e # hoplite-artifact-2 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-f55048b4" 26 | Placement: 27 | GroupName: hoplite-group 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0875c13495964ad3e # hoplite-artifact-2 32 | KeyName: shared_key 33 | SecurityGroupIds: 34 | - "sg-f55048b4" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | - sudo mount -t efs fs-6dad81c6:/ efs 40 | - sudo chmod 777 efs 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: 44 | - ray stop 45 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 51 | -------------------------------------------------------------------------------- /_archived/cluster-config/siyuan-old.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-asgd 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | 18 | head_node: 19 | InstanceType: m5.4xlarge 20 | ImageId: ami-0947593b62663ba38 # hoplite-sigcomm21-2 21 | InstanceMarketOptions: 22 | MarketType: spot 23 | SecurityGroupIds: 24 | - "sg-3463e565" 25 | Placement: 26 | GroupName: hoplite-group 27 | 28 | worker_nodes: 29 | InstanceType: m5.4xlarge 30 | ImageId: ami-0947593b62663ba38 # hoplite-sigcomm21-2 31 | InstanceMarketOptions: 32 | MarketType: spot 33 | SecurityGroupIds: 34 | - "sg-3463e565" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | # - pip install ray==1.1 40 | - mkdir -p ~/efs 41 | - sudo mount -t efs fs-d416cc55:/ ~/efs 42 | - sudo chmod 777 ~/efs 43 | 44 | # Command to start ray on the head node. You don't need to change this. 45 | head_start_ray_commands: 46 | - ray stop 47 | - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 48 | 49 | # Command to start ray on worker nodes. You don't need to change this. 50 | worker_start_ray_commands: 51 | - ray stop 52 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 53 | -------------------------------------------------------------------------------- /_archived/compare_bcast.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2011 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Comparison of MPI_Bcast with the my_bcast function 8 | // 9 | #include <assert.h> 10 | #include <mpi.h> 11 | #include <stdio.h> 12 | #include <stdlib.h> 13 | 14 | void my_bcast(void *data, int count, MPI_Datatype datatype, int root, MPI_Comm communicator) { 15 | int world_rank; 16 | MPI_Comm_rank(communicator, &world_rank); 17 | int world_size; 18 | MPI_Comm_size(communicator, &world_size); 19 | 20 | if (world_rank == root) { 21 | // If we are the root process, send our data to everyone 22 | int i; 23 | for (i = 0; i < world_size; i++) { 24 | if (i != world_rank) { 25 | MPI_Send(data, count, datatype, i, 0, communicator); 26 | } 27 | } 28 | } else { 29 | // If we are a receiver process, receive the data from the root 30 | MPI_Recv(data, count, datatype, root, 0, communicator, MPI_STATUS_IGNORE); 31 | } 32 | } 33 | 34 | int main(int argc, char **argv) { 35 | if (argc != 3) { 36 | fprintf(stderr, "Usage: compare_bcast num_elements num_trials\n"); 37 | exit(1); 38 | } 39 | 40 | int num_elements = atoi(argv[1]); 41 | int num_trials = atoi(argv[2]); 42 | 43 | MPI_Init(NULL, NULL); 44 | 45 | int world_rank; 46 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 47 | 48 | double total_my_bcast_time = 0.0; 49 | double total_mpi_bcast_time = 0.0; 50 | int i; 51 | int *data = (int *)malloc(sizeof(int) * num_elements); 52 | assert(data != NULL); 53 | 54 | for (i = 0; i < num_trials; i++) { 55 | // Time my_bcast 56 | // Synchronize before starting timing 57 | MPI_Barrier(MPI_COMM_WORLD); 58 | total_my_bcast_time -= MPI_Wtime(); 59 | my_bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD); 60 | // Synchronize again before obtaining final time 61 | MPI_Barrier(MPI_COMM_WORLD); 62 | total_my_bcast_time += MPI_Wtime(); 63 | 64 | // Time MPI_Bcast 65 | MPI_Barrier(MPI_COMM_WORLD); 66 | 
total_mpi_bcast_time -= MPI_Wtime(); 67 | MPI_Bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD); 68 | MPI_Barrier(MPI_COMM_WORLD); 69 | total_mpi_bcast_time += MPI_Wtime(); 70 | } 71 | 72 | // Print off timing information 73 | if (world_rank == 0) { 74 | printf("Data size = %d, Trials = %d\n", num_elements * (int)sizeof(int), num_trials); 75 | printf("Avg my_bcast time = %lf\n", total_my_bcast_time / num_trials); 76 | printf("Avg MPI_Bcast time = %lf\n", total_mpi_bcast_time / num_trials); 77 | } 78 | 79 | free(data); 80 | MPI_Finalize(); 81 | } 82 | -------------------------------------------------------------------------------- /_archived/exit_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import time 4 | 5 | import py_distributed_object_store as store_lib 6 | import utils 7 | 8 | parser = argparse.ArgumentParser() 9 | utils.add_arguments(parser) 10 | 11 | args = parser.parse_args() 12 | args_dict = utils.extract_dict_from_args(args) 13 | 14 | store = utils.create_store_using_dict(args_dict) 15 | time.sleep(5) 16 | print ("Exiting") 17 | 18 | -------------------------------------------------------------------------------- /_archived/fault_tolerance_tests/README.md: -------------------------------------------------------------------------------- 1 | # Fault tolerance tests (optional) 2 | 3 | To enable the fault tolerance tests, first apply the patch that introduces failures: 4 | 5 | ```bash 6 | patch -p1 --directory .. < enable_failure.patch 7 | ``` 8 | 9 | Then recompile the C++ project. 10 | 11 | ## Multicast fault tolerance test 12 | 13 | An intensive benchmark that exercises corner cases and demonstrates the reliability of our system. It can take several minutes to complete even on high-speed networks. 14 | 15 | Usage: 16 | 17 | 18 | ```bash 19 | ./run_test_fault_tolerance.sh multicast ${total_number_of_nodes} ${input_size_in_bytes} ${n_trials} 20 | ``` 21 | 22 | ## Subset reduction fault tolerance test 23 | 24 | This test shows that Hoplite is able to reduce only a subset of objects. For example, out of 8 candidate objects, we may want to reduce only the 4 objects that are created first. 25 | 26 | Usage: 27 | 28 | ```bash 29 | ./run_test_fault_tolerance.sh subset_reduce ${total_number_of_nodes} ${input_size_in_bytes} ${n_trials} 30 | ``` 31 | 32 | We suggest `total_number_of_nodes>=4`.
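For example, a concrete invocation on a 4-node cluster with 1 GB objects and 5 trials could look like the sketch below (the specific numbers are only an illustration):

```bash
./run_test_fault_tolerance.sh subset_reduce 4 $((1 << 30)) 5   # test name, #nodes, object size in bytes, #trials
```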
33 | -------------------------------------------------------------------------------- /_archived/fault_tolerance_tests/enable_failure.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/client/object_sender.cc b/src/client/object_sender.cc 2 | index 5569061..f11863e 100644 3 | --- a/src/client/object_sender.cc 4 | +++ b/src/client/object_sender.cc 5 | @@ -15,11 +15,14 @@ using objectstore::ObjectWriterRequest; 6 | using objectstore::ReceiveObjectRequest; 7 | using objectstore::ReceiveReducedObjectRequest; 8 | 9 | +int global_count = 0; 10 | + 11 | template <typename T> inline int stream_send(int conn_fd, T *stream, int64_t offset = 0) { 12 | TIMELINE("ObjectSender::stream_send()"); 13 | LOG(DEBUG) << "ObjectSender::stream_send(), offset=" << offset; 14 | const uint8_t *data_ptr = stream->Data(); 15 | const int64_t object_size = stream->Size(); 16 | + bool triggered = false; 17 | 18 | if (stream->IsFinished()) { 19 | int status = send_all(conn_fd, data_ptr + offset, object_size - offset); 20 | @@ -32,6 +35,16 @@ template <typename T> inline int stream_send(int conn_fd, T *stream, int64_t off 21 | int64_t cursor = offset; 22 | while (cursor < object_size) { 23 | int64_t current_progress = stream->progress; 24 | + if (current_progress > object_size / 2 && !triggered) { 25 | + triggered = true; 26 | + if (++global_count >= 3) { 27 | + int rank = std::stoi(getenv("OMPI_COMM_WORLD_RANK")); 28 | + if (rank == 2) { 29 | + //usleep(1000); 30 | + LOG(FATAL) << " failed intentionally"; 31 | + } 32 | + } 33 | + } 34 | if (cursor < current_progress) { 35 | int bytes_sent = send(conn_fd, data_ptr + cursor, current_progress - cursor, 0); 36 | if (bytes_sent < 0) { 37 | -------------------------------------------------------------------------------- /_archived/fault_tolerance_tests/run_test_fault_tolerance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 4 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test name, number of nodes, input size & n_trials required"; exit -1; fi 3 | if [ "$#" -gt 4 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | ## setup 6 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 7 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 8 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 9 | BINARIES_DIR=$(realpath -s $SCRIPT_DIR/../../build) 10 | TEST_BINARIES_DIR=$BINARIES_DIR/tests 11 | 12 | ## cleanup procs 13 | sudo fuser -k 6666/tcp -s &> /dev/null 14 | sudo fuser -k 50055/tcp -s &> /dev/null 15 | sudo fuser -k 20210/tcp -s &> /dev/null 16 | 17 | test_name=$1 18 | test_executable_abspath=$TEST_BINARIES_DIR/${test_name}_test 19 | world_size=$2 20 | object_size=$3 21 | n_trials=$4 22 | 23 | if [ ! 
-f $test_executable_abspath ]; then 24 | echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test executable not found: $test_executable_abspath" 25 | exit -2 26 | fi 27 | 28 | # get cluster info 29 | source $TEST_UNILS_DIR/load_cluster_env.sh 30 | OTHERS_IPADDR=(${OTHERS_IPADDR[@]:0:$(($world_size-1))}) 31 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) head_node: $MY_IPADDR; other_nodes: ${OTHERS_IPADDR[@]}" 32 | 33 | # prompt test info 34 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) Running test $(tput setaf 3)$(tput bold)$test_name$(tput sgr 0)" 35 | 36 | # create logging dir 37 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 38 | mkdir -p $log_dir 39 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 40 | 41 | export RAY_BACKEND_LOG_LEVEL=info 42 | 43 | pkill notification 44 | sleep 0.5 45 | ($BINARIES_DIR/notification 2>&1 | tee $log_dir/$MY_IPADDR.notification.log) & 46 | sleep 0.5 47 | 48 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 49 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 50 | 51 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 52 | -x HOPLITE_LOGGING_DIR=$log_dir \ 53 | -x RAY_BACKEND_LOG_LEVEL=$RAY_BACKEND_LOG_LEVEL \ 54 | $SCRIPT_DIR/test_wrapper_fault_tolerance.sh $test_executable_abspath $MY_IPADDR $object_size $n_trials 55 | 56 | sleep 1 57 | -------------------------------------------------------------------------------- /_archived/fault_tolerance_tests/test_wrapper_fault_tolerance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | trap 'echo delaying MPI shutdown...' INT TERM 3 | logging_file=$HOPLITE_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 4 | $@ 2>&1 | tee $logging_file 5 | sleep 10 6 | -------------------------------------------------------------------------------- /_archived/init_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import ray 5 | import sys 6 | 7 | root_directory = os.path.dirname(os.path.abspath(__file__)) 8 | print(root_directory) 9 | 10 | dirs = [ 11 | os.path.join(root_directory, 'python'), 12 | os.path.join(root_directory, 'app', 'parameter-server'), 13 | ] 14 | 15 | ray.init(address='auto', load_code_from_local=True) 16 | 17 | ray.worker.global_worker.run_function_on_all_workers( 18 | lambda worker_info: [sys.path.insert(1, d) for d in dirs]) 19 | -------------------------------------------------------------------------------- /_archived/mpi_compare_bcast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -z "$2" ]; then echo "ERROR: number of nodes & input size required"; exit; fi 3 | 4 | make compare_bcast > /dev/null 5 | 6 | ROOT_DIR=$(dirname $(realpath -s $0))/../ 7 | source $ROOT_DIR/load_cluster_env.sh 8 | 9 | all_nodes=(${ALL_IPADDR[@]:0:$1}) 10 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 11 | 12 | echo Number of nodes: $1 "(actually ${#all_nodes[@]})", data size: $2 13 | echo Nodes: ${all_nodes[@]} 14 | 15 | $ROOT_DIR/mpirun_pernode.sh $all_hosts $(realpath -s compare_bcast) $[$2/4] 1 16 | -------------------------------------------------------------------------------- /_archived/notification_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 3 | 4 | sudo fuser -k 6666/tcp -s &> /dev/null 5 | sudo fuser -k 50055/tcp -s &> /dev/null 6 | 7 | ## setup 8 | ROOT_DIR=$(dirname $(realpath 
-s $0)) 9 | source $ROOT_DIR/load_cluster_env.sh 10 | 11 | pkill '^notification$' 12 | pkill '^notification_server_test$' 13 | sleep 2 14 | ./notification $MY_IPADDR $MY_IPADDR & 15 | sleep 2 16 | ./notification_server_test $MY_IPADDR $MY_IPADDR & 17 | sleep 40 18 | -------------------------------------------------------------------------------- /_archived/parameter-server/compare_hoplite_mpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p ps-log-cmp/ 3 | 4 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 5 | source $ROOT_DIR/load_cluster_env.sh 6 | 7 | for n_nodes in 8; do 8 | 9 | echo "==========" sync-$n_nodes-hoplite "==========" 10 | pkill notification 11 | $ROOT_DIR/restart_all_workers.sh 12 | python parameter_server.py -n $(($n_nodes - 1)) --no-test | tee ps-log-cmp/sync-$n_nodes-hoplite.log 13 | 14 | echo "==========" sync-$n_nodes-mpi "==========" 15 | 16 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 17 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 18 | 19 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 20 | 21 | pkill notification 22 | $ROOT_DIR/restart_all_workers.sh 23 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_parameter_server.py) --no-test | tee ps-log-cmp/sync-$n_nodes-mpi.log 24 | 25 | done 26 | -------------------------------------------------------------------------------- /_archived/parameter-server/mpi_parameter_server.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import time 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | import numpy as np 10 | 11 | from ps_helper import ConvNet, get_data_loader, evaluate, criterion 12 | 13 | from mpi4py import MPI 14 | 15 | comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | 18 | class ParameterServer(object): 19 | def __init__(self, lr, model_type="custom"): 20 | self.model = ConvNet(model_type) 21 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr) 22 | 23 | def apply_gradients(self): 24 | new_parameters = [p.data.cpu().numpy() for p in self.model.parameters()] 25 | cont_p = np.concatenate([p.ravel() for p in new_parameters]) 26 | comm.Bcast(cont_p, root=0) 27 | zero_grad = np.zeros(self.model.n_param, dtype=np.float32) 28 | grad_buffer = np.empty(self.model.n_param, dtype=np.float32) 29 | comm.Reduce(zero_grad, grad_buffer, op=MPI.SUM, root=0) 30 | summed_gradients = self.model.buffer_to_tensors(grad_buffer.view(np.uint8)) 31 | self.optimizer.zero_grad() 32 | self.model.set_gradients(summed_gradients) 33 | self.optimizer.step() 34 | 35 | 36 | class DataWorker(object): 37 | def __init__(self, model_type="custom", device="cpu"): 38 | self.device = device 39 | self.model = ConvNet(model_type).to(device) 40 | 41 | def compute_gradients(self, batch_size=128): 42 | parameter_buffer = np.empty(self.model.n_param, dtype=np.float32) 43 | comm.Bcast(parameter_buffer, root=0) 44 | parameters = self.model.buffer_to_tensors(parameter_buffer.view(np.uint8)) 45 | self.model.set_parameters(parameters) 46 | data = torch.randn(batch_size, 3, 224, 224, device=self.device) 47 | self.model.zero_grad() 48 | output = self.model(data) 49 | loss = torch.mean(output) 50 | loss.backward() 51 | gradients = self.model.get_gradients() 52 | cont_grad = np.concatenate([p.ravel() for p in gradients]) 53 | grad_buffer = np.empty(self.model.n_param, dtype=np.float32) 54 | comm.Reduce(cont_grad, grad_buffer, op=MPI.SUM, root=0) 55 | 56 | 
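# Driver code below: rank 0 acts as the parameter server. It broadcasts the current
# parameters, then sum-reduces gradients from all workers (the server itself contributes
# an all-zero buffer, so the reduction sums only the workers' gradients). Every other
# rank runs a DataWorker that computes gradients on randomly generated batches.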
parser = argparse.ArgumentParser(description='parameter server') 57 | parser.add_argument('-m', '--model', type=str, default="custom", 58 | help='neural network model type') 59 | args = parser.parse_args() 60 | 61 | 62 | iterations = 50 63 | 64 | 65 | if rank == 0: 66 | print("rank == 0") 67 | ps = ParameterServer(1e-2, model_type=args.model) 68 | step_start = time.time() 69 | for i in range(iterations): 70 | ps.apply_gradients() 71 | now = time.time() 72 | print("step time:", now - step_start, flush=True) 73 | step_start = now 74 | 75 | else: 76 | print("rank > 0") 77 | worker = DataWorker(model_type=args.model, device='cuda') 78 | for i in range(iterations): 79 | worker.compute_gradients() 80 | -------------------------------------------------------------------------------- /_archived/parameter-server/mpi_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mpi4py import MPI 3 | import time 4 | 5 | comm = MPI.COMM_WORLD 6 | rank = comm.Get_rank() 7 | 8 | if rank == 0: 9 | data = np.ones(10, dtype=np.float32) 10 | data_sum = np.empty(10, dtype=np.float32) 11 | else: 12 | time.sleep((4 - rank) * 3) 13 | data = np.ones(10, dtype=np.float32) 14 | data_sum = np.empty(10, dtype=np.float32) 15 | comm.Bcast(data, root=0) 16 | # comm.Reduce(data, data_sum, op=MPI.SUM, root=0) 17 | print(rank, data, data_sum) -------------------------------------------------------------------------------- /_archived/parameter-server/parse_results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | log_dir = 'ps-log/' 4 | for log_file in sorted(os.listdir(log_dir)): 5 | with open(os.path.join(log_dir, log_file), "r") as f: 6 | all_time = [] 7 | for line in f: 8 | if "step time:" in line: 9 | all_time.append(float(line.split()[-1])) 10 | all_time = all_time[1:] 11 | all_time = np.array(all_time[3:-3]) 12 | if log_file.split('-')[0] == 'async': 13 | all_time = 8 * ((int(log_file.split('-')[1]) - 1) // 2) / all_time 14 | else: 15 | all_time = 8 * int(log_file.split('-')[1]) / all_time 16 | new_all_time = [] 17 | for i in range(0, len(all_time), 4): 18 | new_all_time.append(np.mean(all_time[i:i + 4])) 19 | print(log_file.ljust(20), np.mean(new_all_time), np.std(new_all_time), len(new_all_time), sep='\t') 20 | -------------------------------------------------------------------------------- /_archived/parameter-server/run_gloo_allreduce.sh: -------------------------------------------------------------------------------- 1 | MODEL=alexnet 2 | 3 | mkdir -p ps-log/ 4 | 5 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 6 | source $ROOT_DIR/load_cluster_env.sh 7 | 8 | for n_nodes in 8; do 9 | i=0 10 | for node in ${ALL_IPADDR[@]:0:$n_nodes}; do 11 | echo "=> $node" 12 | ssh -o StrictHostKeyChecking=no $node PATH=$PATH:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda3/condabin, \ 13 | python $ROOT_DIR/app/parameter-server/gloo_all_reduce.py \ 14 | --master_ip $MY_IPADDR \ 15 | --rank $i \ 16 | --size $n_nodes \ 17 | -m $MODEL & 18 | i=$((i+1)) 19 | done 20 | wait 21 | done 22 | -------------------------------------------------------------------------------- /_archived/parameter-server/run_mpi_allreduce.sh: -------------------------------------------------------------------------------- 1 | MODEL=alexnet 2 | 3 | mkdir -p ps-log/ 4 | 5 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 6 | source $ROOT_DIR/load_cluster_env.sh 7 | 8 | for n_nodes in 8; do 9 | echo "==========" sync-$n_nodes-mpi 
"==========" 10 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 11 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 12 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 13 | 14 | pkill notification 15 | # $ROOT_DIR/restart_all_workers.sh 16 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_all_reduce.py) -m $MODEL | tee ps-log/sync-$n_nodes-mpi-$MODEL.log 17 | done 18 | -------------------------------------------------------------------------------- /_archived/parameter-server/run_mpi_ps.sh: -------------------------------------------------------------------------------- 1 | MODEL=alexnet 2 | 3 | mkdir -p ps-log/ 4 | 5 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 6 | source $ROOT_DIR/load_cluster_env.sh 7 | 8 | for n_nodes in 8; do 9 | echo "==========" sync-$n_nodes-mpi "==========" 10 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 11 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 12 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 13 | 14 | pkill notification 15 | # $ROOT_DIR/restart_all_workers.sh 16 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_parameter_server.py) -m $MODEL | tee ps-log/sync-$n_nodes-mpi-$MODEL.log 17 | done 18 | -------------------------------------------------------------------------------- /_archived/parameter-server/run_ps_tests.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ps-log/ 2 | 3 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 4 | source $ROOT_DIR/load_cluster_env.sh 5 | 6 | for n_nodes in 8 16; do 7 | echo "==========" sync-$n_nodes-hoplite "==========" 8 | pkill notification 9 | $ROOT_DIR/restart_all_workers.sh 10 | python parameter_server.py -n $(($n_nodes - 1)) --no-test | tee ps-log/sync-$n_nodes-hoplite.log 11 | 12 | echo "==========" sync-$n_nodes-ray "==========" 13 | pkill notification 14 | $ROOT_DIR/restart_all_workers.sh 15 | python ray_parameter_server_baseline.py -n $(($n_nodes - 1)) --no-test | tee ps-log/sync-$n_nodes-ray.log 16 | 17 | echo "==========" sync-$n_nodes-mpi "==========" 18 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 19 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 20 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 21 | 22 | pkill notification 23 | $ROOT_DIR/restart_all_workers.sh 24 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_parameter_server.py) --no-test | tee ps-log/sync-$n_nodes-mpi.log 25 | 26 | echo "==========" async-$n_nodes-hoplite "==========" 27 | pkill notification 28 | $ROOT_DIR/restart_all_workers.sh 29 | python parameter_server.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) --no-test | tee ps-log/async-$n_nodes-hoplite.log 30 | 31 | echo "==========" async-$n_nodes-ray "==========" 32 | pkill notification 33 | $ROOT_DIR/restart_all_workers.sh 34 | python ray_parameter_server_baseline.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) --no-test | tee ps-log/async-$n_nodes-ray.log 35 | done 36 | -------------------------------------------------------------------------------- /_archived/restart_all_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROOT_DIR=$(dirname $(realpath -s $0)) 4 | 5 | # This script is only used when necessary to reboot the ray workers. 6 | # Workers may not be available until next task execution, so some errors could still occur. 
7 | if [ "$#" -eq 0 ]; then 8 | $ROOT_DIR/fornode $(realpath -s $0) restart 9 | else 10 | for pid in $(ps aux | grep 'default_worker.py' | grep -v 'object_manager_port' | grep -v grep | awk '{print $2}'); do 11 | kill -9 $pid 12 | done 13 | fi 14 | -------------------------------------------------------------------------------- /_archived/restart_ray.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROOT_DIR=$(dirname $(realpath -s $0)) 4 | source $ROOT_DIR/load_cluster_env.sh 5 | 6 | ./fornode ray stop 7 | 8 | ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{"node":1}' --object-store-memory=34359738368 9 | sleep 5 10 | for node in ${OTHERS_IPADDR[@]}; do 11 | echo "=> $node" 12 | ssh -o StrictHostKeyChecking=no $node PATH=$PATH:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda3/condabin, ray start --redis-address=$MY_IPADDR:6379 --object-manager-port=8076 --resources=\'{\"node\":1}\' --object-store-memory=34359738368 & 13 | done 14 | wait 15 | -------------------------------------------------------------------------------- /_archived/script/README.md: -------------------------------------------------------------------------------- 1 | # How to run the timeline script 2 | 3 | Use `reduce_test.sh` as an example: 4 | 1. Run 5 | ```bash 6 | bash reduce_test.sh 7 | ``` 8 | You will get a bunch of log files under `log/YYMMDD-HHMMSS-reduce/`. 9 | 2. Run the script 10 | ```bash 11 | python script/timeline.py log/YYMMDD-HHMMSS-reduce/ 12 | ``` 13 | The resulting JSON file will be written to `log/YYMMDD-HHMMSS-reduce/timeline.json`. 14 | 3. Open `chrome://tracing` in your Chrome browser, then load the JSON file above to view the timeline.
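If no log directory is passed, `timeline.py` falls back to `log/latest`, a symlink that (at least) the fault tolerance test script refreshes to point at the most recent run, so the shortcut below also works; this is just an illustration of the default:

```bash
python script/timeline.py   # parses log/latest/ by default
```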
15 | -------------------------------------------------------------------------------- /_archived/script/find_missing_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | def main(log_dir): 5 | files = os.listdir(log_dir) 6 | 7 | tasks = {'multicast', 'reduce', 'allreduce'} 8 | node_set = range(2, 18, 2) 9 | object_size_set = {2**i for i in range(20, 31)} 10 | 11 | print (node_set) 12 | print (object_size_set) 13 | 14 | for task_name in tasks: 15 | for number_of_nodes in node_set: 16 | for object_size in object_size_set: 17 | task = task_name + '-' + str(number_of_nodes) + '-' + str(object_size) 18 | found = False 19 | for filename in files: 20 | if task in filename: 21 | found = True 22 | break 23 | if not found: 24 | print (task) 25 | 26 | 27 | if __name__ == "__main__": 28 | assert len(sys.argv) == 2, "Usage: python parse_mpi_result.py LOG_DIR" 29 | log_dir = sys.argv[1] 30 | main(log_dir) 31 | -------------------------------------------------------------------------------- /_archived/script/timeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | 5 | ph_dict = { 6 | "[BEGIN]": "B", 7 | "[END]": "E" 8 | } 9 | 10 | def main(log_dir): 11 | print("Working dir", log_dir) 12 | all_logs = os.listdir(log_dir) 13 | json_output = { 14 | "traceEvents": [], 15 | "displayTimeUnit": "ms", 16 | "otherData": { 17 | "log_dir": log_dir 18 | } 19 | } 20 | 21 | for log_file in all_logs: 22 | with open(os.path.join(log_dir, log_file)) as f: 23 | for line in f.readlines(): 24 | elements = line.split() 25 | if len(elements) >= 6 and elements[5] == "[TIMELINE]": 26 | timestamp = int(elements[0]) 27 | ip_pid_tid = elements[1] 28 | ip, pid, tid = ip_pid_tid.split(":") 29 | filename_line = elements[2] 30 | function_name = elements[3] 31 | assert elements[4] == "]:" 32 | timeline_id = elements[6] 33 | timeline_tag = elements[7] 34 | message = " ".join(elements[8:]) 35 | event = { 36 | "name": function_name + "_" + timeline_id, 37 | "cat": "event", 38 | "ph": ph_dict[timeline_tag], 39 | "ts": str(timestamp // 1000) + "." 
+ str(timestamp % 1000), 40 | "pid": ip + ":" + pid, 41 | "tid": tid, 42 | "args": { 43 | "message": message 44 | } 45 | } 46 | json_output["traceEvents"].append(event) 47 | with open(os.path.join(log_dir, "timeline.json"), "w") as f: 48 | json.dump(json_output, f) 49 | 50 | 51 | if __name__ == "__main__": 52 | log_dir = sys.argv[1] if len(sys.argv) >= 2 else "log/latest" 53 | main(log_dir) 54 | -------------------------------------------------------------------------------- /_archived/speed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOT_DIR=$(dirname $(realpath -s $0)) 3 | source $ROOT_DIR/load_cluster_env.sh 4 | 5 | echo ${ALL_IPADDR[@]}, ${#ALL_IPADDR[@]} 6 | 7 | # start iperf server 8 | for s in ${ALL_IPADDR[@]} 9 | do 10 | ssh -o StrictHostKeyChecking=no $s pkill iperf 11 | ssh $s iperf -s &> /dev/null & 12 | done 13 | 14 | for s in ${ALL_IPADDR[@]} 15 | do 16 | for t in ${ALL_IPADDR[@]} 17 | do 18 | if [ "$s" == "$t" ] 19 | then continue 20 | fi 21 | echo $s, $t 22 | ssh $s iperf -c $t -t 5 | grep GBytes 23 | done 24 | break 25 | done 26 | 27 | # shutdown iperf server 28 | 29 | for s in ${ALL_IPADDR[@]} 30 | do 31 | ssh $s pkill iperf &> /dev/null & 32 | done 33 | -------------------------------------------------------------------------------- /_archived/sync_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt install -y chrony 4 | sudo sed -i 's/^# information about usuable directives./server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4\n/g' /etc/chrony/chrony.conf 5 | sudo /etc/init.d/chrony restart 6 | 7 | if [ "$#" -eq 0 ]; then 8 | ROOT_DIR=$(dirname $(realpath -s $0)) 9 | source $ROOT_DIR/load_cluster_env.sh 10 | for node in ${OTHERS_IPADDR[@]} 11 | do 12 | ssh -t -t $node "$(realpath -s $0) 0" & 13 | done 14 | wait 15 | fi 16 | -------------------------------------------------------------------------------- /app/parameter-server/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Hoplite Parameter Server Experiments on AWS 2 | 3 | _(About 55 min)_ 4 | 5 | ## Cluster Setup 6 | 7 | _(About 30 min)_ 8 | 9 | If you are provided with an AWS IAM account & pre-built binaries: 10 | * If you just want to review figures & raw experimental data, see [cluster-config-access-results-only](cluster-config-access-results-only). 11 | * If you also want to reproduce all results from the beginning, see [cluster-config-with-ami](cluster-config-with-ami) for setting up a cluster. 12 | 13 | If you are not provided with an AWS account or you want to build everything from scratch, see [cluster-config](../ray_serve/cluster-config). 14 | 15 | ## Asynchronous Parameter Server Experiments (Section 5.2, Figure 9) 16 | 17 | _(About 15 min)_ 18 | 19 | After logging in to the configured cluster, change into this directory (`app/parameter-server`) in the Hoplite repo. 20 | 21 | In the current directory, run 22 | 23 | ```bash 24 | ./run_async_ps_tests.sh 25 | ``` 26 | 27 | After the script completes, results are saved under `ps-log`. 28 | 29 | To visualize the results, run 30 | 31 | ```bash 32 | python plot_async_ps_results.py 33 | ``` 34 | 35 | This generates two PDF files: `async_training_8.pdf` corresponds to Figure 9(a), and `async_training_16.pdf` corresponds to Figure 9(b).
36 | 37 | You can download PDF files to your local machine using Ray cluster utils, for example: 38 | 39 | ```bash 40 | ray rsync-down cluster.yaml /home/ubuntu/efs/hoplite/app/parameter-server/async_training_8.pdf . 41 | ``` 42 | 43 | ## Asynchronous Parameter Server Fault Tolerance Experiments (Section 5.5, Figure 12b) 44 | 45 | _(About 10 min)_ 46 | 47 | After logging in to the configured cluster, *chdir* to the current directory in the hoplite repo. 48 | 49 | In the current directory, run 50 | 51 | ```bash 52 | ./run_async_ps_fault_tolerance.sh 53 | ``` 54 | 55 | The script generates `ray_asgd_fault_tolerance.json` and `hoplite_asgd_fault_tolerance.json` after running. 56 | 57 | Run `python analyze_fault_tolerance.py` to compare the failure detection latency (see section 5.5 in the paper). 58 | 59 | ## Notes 60 | 61 | The initial run will be extremely slow on AWS due to python generating caching files etc (about 5 min). This is totally normal. 62 | -------------------------------------------------------------------------------- /app/parameter-server/analyze_fault_tolerance.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | with open('ray_asgd_fault_tolerance.json', 'r') as f: 5 | ray_log = json.load(f) 6 | durations = [l['duration'] for l in ray_log if l['event'] == 'fail'] 7 | # we only fail once in the paper, so no calculating std here 8 | print(f"Baseline latency caused by failure: {np.mean(durations):.6f}s") 9 | 10 | with open('hoplite_asgd_fault_tolerance.json', 'r') as f: 11 | hoplite_log = json.load(f) 12 | durations = [l['duration'] for l in hoplite_log if l['event'] == 'fail'] 13 | # we only fail once in the paper, so no calculating std here 14 | print(f"Hoplite latency caused by failure: {np.mean(durations):.6f}s") 15 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-asgd-fault-tolerance.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-asgd 2 | 3 | min_workers: 7 4 | max_workers: 7 5 | initial_workers: 7 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | 18 | head_node: 19 | InstanceType: p3.2xlarge 20 | ImageId: ami-0947593b62663ba38 # hoplite-sigcomm21-2 21 | InstanceMarketOptions: 22 | MarketType: spot 23 | SecurityGroupIds: 24 | - "sg-3463e565" 25 | Placement: 26 | GroupName: hoplite-group 27 | 28 | worker_nodes: 29 | InstanceType: p3.2xlarge 30 | ImageId: ami-0947593b62663ba38 # hoplite-sigcomm21-2 31 | InstanceMarketOptions: 32 | MarketType: spot 33 | SecurityGroupIds: 34 | - "sg-3463e565" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | # - pip install ray==1.1 40 | - mkdir -p ~/efs 41 | - sudo mount -t efs fs-d416cc55:/ ~/efs 42 | - sudo chmod 777 ~/efs 43 | 44 | # Command to start ray on the head node. You don't need to change this. 45 | head_start_ray_commands: 46 | - ray stop 47 | - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 48 | 49 | # Command to start ray on worker nodes. You don't need to change this. 
50 | worker_start_ray_commands: 51 | - ray stop 52 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 53 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-config-access-results-only/README.md: -------------------------------------------------------------------------------- 1 | # Setup Hoplite Parameter Server experiments on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries and results via `cd ~/efs/hoplite-with-results/` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | 26 | ## Access results 27 | 28 | ### Asynchronous Parameter Server Experiments (Section 5.2, Figure 9) 29 | 30 | Raw results are stored in `~/efs/hoplite-with-results/app/parameter-server/ps-log/`. 31 | 32 | To download the figures: 33 | 34 | **Figure 9(a)** 35 | 36 | ```bash 37 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite/app/parameter-server/async_training_8.pdf . 38 | ``` 39 | 40 | **Figure 9(b)** 41 | 42 | ```bash 43 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite/app/parameter-server/async_training_16.pdf . 44 | ``` 45 | 46 | ### Asynchronous Parameter Server Fault Tolerance Experiments (Section 5.5, Figure 12b) 47 | 48 | After logging into the cluster, `cd ~/efs/hoplite-with-results/app/parameter-server`. `ray_asgd_fault_tolerance.json` and `hoplite_asgd_fault_tolerance.json` contain the raw trajectory during failure. 49 | 50 | Run `python analyze_fault_tolerance.py` to compare the failure detection latency (see section 5.5 in the paper). 51 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-config-access-results-only/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-config-with-ami/README.md: -------------------------------------------------------------------------------- 1 | # Setup Hoplite Parameter Server experiments on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries via `cd ~/efs/hoplite` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-config-with-ami/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-ml-serving 2 | 3 | min_workers: 16 4 | max_workers: 16 5 | initial_workers: 16 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: p3.2xlarge 31 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 56 | -------------------------------------------------------------------------------- /app/parameter-server/gloo_all_reduce.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import time 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.distributed as dist 9 | 10 | import numpy as np 11 | 12 | from ps_helper import ConvNet, get_data_loader, evaluate, criterion 13 | 14 | class DataWorker(object): 15 | def __init__(self, model_type="custom", device="cpu"): 16 | self.device = device 17 | self.model = ConvNet(model_type).to(device) 18 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.02) 19 | 20 | def compute_gradients(self, batch_size=128): 21 | data = torch.randn(batch_size, 3, 224, 224, device=self.device) 22 | self.model.zero_grad() 23 | output = self.model(data) 24 | loss = torch.mean(output) 25 | loss.backward() 26 | gradients = self.model.get_gradients() 27 | cont_grad = np.concatenate([p.ravel() for p in gradients]) 28 | t = torch.from_numpy(cont_grad) 29 | torch.distributed.all_reduce(t) 30 | summed_gradients = self.model.buffer_to_tensors(t.numpy().view(np.uint8)) 31 | self.optimizer.zero_grad() 32 | self.model.set_gradients(summed_gradients) 33 | self.optimizer.step() 34 | 35 | parser = argparse.ArgumentParser(description='parameter server') 36 | parser.add_argument('-m', '--model', type=str, default="custom", 37 | help='neural network model type') 38 | parser.add_argument('--rank', type=int) 39 | parser.add_argument('--size', type=int) 40 | parser.add_argument('--master_ip', type=str) 41 | args = parser.parse_args() 42 | 43 | dist.init_process_group('gloo', init_method=f"tcp://{args.master_ip}:12345", rank=args.rank, world_size=args.size) 44 | 45 | iterations = 50 46 | 47 | worker = DataWorker(model_type=args.model, device='cuda') 48 | step_start = time.time() 49 | for i in range(iterations): 50 | worker.compute_gradients() 51 | now = time.time() 52 | print("rank:", args.rank, "step time:", now - 
step_start, flush=True) 53 | step_start = now 54 | -------------------------------------------------------------------------------- /app/parameter-server/mpi_all_reduce.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import time 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | import numpy as np 10 | 11 | from ps_helper import ConvNet, get_data_loader, evaluate, criterion 12 | 13 | from mpi4py import MPI 14 | 15 | comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | 18 | class DataWorker(object): 19 | def __init__(self, model_type="custom", device="cpu"): 20 | self.device = device 21 | self.model = ConvNet(model_type).to(device) 22 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.02) 23 | 24 | def compute_gradients(self, batch_size=128): 25 | data = torch.randn(batch_size, 3, 224, 224, device=self.device) 26 | self.model.zero_grad() 27 | output = self.model(data) 28 | loss = torch.mean(output) 29 | loss.backward() 30 | gradients = self.model.get_gradients() 31 | cont_grad = np.concatenate([p.ravel() for p in gradients]) 32 | grad_buffer = np.empty(self.model.n_param, dtype=np.float32) 33 | comm.Allreduce(cont_grad, grad_buffer, op=MPI.SUM) 34 | summed_gradients = self.model.buffer_to_tensors(grad_buffer.view(np.uint8)) 35 | self.optimizer.zero_grad() 36 | self.model.set_gradients(summed_gradients) 37 | self.optimizer.step() 38 | 39 | parser = argparse.ArgumentParser(description='parameter server') 40 | parser.add_argument('-m', '--model', type=str, default="custom", 41 | help='neural network model type') 42 | args = parser.parse_args() 43 | 44 | 45 | iterations = 50 46 | 47 | worker = DataWorker(model_type=args.model, device='cuda') 48 | step_start = time.time() 49 | for i in range(iterations): 50 | worker.compute_gradients() 51 | now = time.time() 52 | print("rank:", rank, "step time:", now - step_start, flush=True) 53 | step_start = now 54 | -------------------------------------------------------------------------------- /app/parameter-server/plot_async_ps_results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | 4 | MODELS = ['alexnet', 'vgg16', 'resnet50'] 5 | BATCH_SIZE_PER_CLIENT = 128 6 | 7 | # async batch size of clients = (#nodes - 1) // 2 8 | 9 | def parse_ray(filename, n_nodes): 10 | batch_size_client = (n_nodes - 1) // 2 11 | all_step_time = [] 12 | with open(filename, 'r') as f: 13 | for line in f.readlines(): 14 | if f"step time:" in line: 15 | all_step_time.append(float(line.split(f"step time:")[1])) 16 | all_step_time = np.array(all_step_time[3:]) 17 | all_step_throughput = BATCH_SIZE_PER_CLIENT * batch_size_client / all_step_time 18 | return np.mean(all_step_throughput), np.std(all_step_throughput) 19 | 20 | 21 | def parse_hoplite(filename, n_nodes): 22 | batch_size_client = (n_nodes - 1) // 2 23 | all_step_time = [] 24 | with open(filename, 'r') as f: 25 | for line in f.readlines(): 26 | if f"step time:" in line: 27 | all_step_time.append(float(line.split(f"step time:")[1])) 28 | all_step_time = np.array(all_step_time[6:]) 29 | all_step_throughput = BATCH_SIZE_PER_CLIENT * batch_size_client / all_step_time 30 | all_step_throughput = (all_step_throughput[0::2] + all_step_throughput[1::2]) / 2 31 | return np.mean(all_step_throughput), np.std(all_step_throughput) 32 | 33 | 34 | def parse_data(n_nodes): 35 | ray_mean = [] 36 | ray_std = [] 37 
| hoplite_mean = [] 38 | hoplite_std = [] 39 | for model in MODELS: 40 | mean, std = parse_ray(f"ps-log/async-ps-{n_nodes}-{model}-ray.log", n_nodes) 41 | ray_mean.append(mean) 42 | ray_std.append(std) 43 | mean, std = parse_hoplite(f"ps-log/async-ps-{n_nodes}-{model}-hoplite.log", n_nodes) 44 | hoplite_mean.append(mean) 45 | hoplite_std.append(std) 46 | return ray_mean, ray_std, hoplite_mean, hoplite_std 47 | 48 | 49 | def draw_async_ps_results(n_nodes): 50 | ray_mean, ray_std, hoplite_mean, hoplite_std = parse_data(n_nodes) 51 | colors = ( 52 | plt.get_cmap('tab20c')(0 * 4 + 1), 53 | plt.get_cmap('tab20c')(1 * 4 + 2), 54 | plt.get_cmap('tab20')(11), 55 | plt.get_cmap('tab20c')(2 * 4 + 2), 56 | ) 57 | 58 | ind = np.array(range(3)) 59 | width = 0.3 60 | 61 | plt.bar(ind, hoplite_mean, width, label='Hoplite', color=colors[0]) 62 | plt.errorbar(ind, hoplite_mean, yerr=hoplite_std, linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 63 | 64 | plt.bar(ind + width, ray_mean, width, label='Ray', color=colors[3]) 65 | plt.errorbar(ind + width, ray_mean, yerr=ray_std, linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 66 | 67 | plt.xticks(ind + width/2, ["AlexNet", "VGG-16", "ResNet50"], fontsize=20) 68 | plt.yticks(fontsize=20) 69 | plt.ylabel('Throughput\n(samples/s)', fontsize=20) 70 | plt.ylim(0, 2000) 71 | plt.legend(fontsize=20) 72 | plt.tight_layout() 73 | plt.savefig(f'async_training_{n_nodes}.pdf') 74 | 75 | 76 | if __name__ == '__main__': 77 | plt.figure(0) 78 | draw_async_ps_results(16) 79 | plt.figure(1) 80 | draw_async_ps_results(8) 81 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_async_ps_hoplite.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def parse(filename): 4 | all_step_time = [] 5 | with open(filename, 'r') as f: 6 | for line in f.readlines(): 7 | if f"step time:" in line: 8 | all_step_time.append(float(line.split(f"step time:")[1])) 9 | all_step_time = np.array(all_step_time[6:]) 10 | all_step_throughput = 1.0 / all_step_time 11 | all_step_throughput = (all_step_throughput[0::2] + all_step_throughput[1::2]) / 2 12 | return np.mean(all_step_throughput), np.std(all_step_throughput) 13 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_gloo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | path = sys.argv[1] 6 | prefix = sys.argv[2] 7 | 8 | all_step_time = [] 9 | 10 | for filename in os.listdir(path): 11 | if filename.startswith(prefix): 12 | step_time_rank = [] 13 | with open(os.path.join(path, filename), 'r') as f: 14 | for line in f.readlines(): 15 | if "step time:" in line: 16 | step_time_rank.append(float(line.split("step time:")[1])) 17 | all_step_time.append(np.array(step_time_rank)) 18 | 19 | all_step_time = np.array(all_step_time) 20 | all_step_time = all_step_time[:, 5:] 21 | all_step_time = np.amax(all_step_time, axis=0) 22 | 23 | all_step_throughput = 1.0 / all_step_time 24 | 25 | print(np.mean(all_step_throughput), np.std(all_step_throughput)) 26 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_hoplite.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 
| filename = sys.argv[1] 6 | n_nodes = int(sys.argv[2]) 7 | 8 | all_step_time = [] 9 | min_len = 1e10 10 | 11 | for i in range(n_nodes): 12 | step_time_rank = [] 13 | with open(filename, 'r') as f: 14 | for line in f.readlines(): 15 | if f" {i} in actor time" in line: 16 | step_time_rank.append(float(line.split(f" {i} in actor time")[1])) 17 | print(len(step_time_rank)) 18 | min_len = min(min_len, len(step_time_rank)) 19 | all_step_time.append(np.array(step_time_rank)) 20 | 21 | all_step_time = np.array([a[:min_len] for a in all_step_time]) 22 | all_step_time = all_step_time[:, 5:] 23 | all_step_time = np.amax(all_step_time, axis=0) 24 | all_step_throughput = 1.0 / all_step_time 25 | print(np.mean(all_step_throughput), np.std(all_step_throughput)) 26 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_mpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | filename = sys.argv[1] 6 | n_nodes = int(sys.argv[2]) 7 | 8 | all_step_time = [] 9 | min_len = 1e10 10 | 11 | for i in range(n_nodes): 12 | step_time_rank = [] 13 | with open(filename, 'r') as f: 14 | for line in f.readlines(): 15 | if f" {i} step time:" in line: 16 | step_time_rank.append(float(line.split(f" {i} step time:")[1])) 17 | print(len(step_time_rank)) 18 | min_len = min(min_len, len(step_time_rank)) 19 | all_step_time.append(np.array(step_time_rank)) 20 | 21 | all_step_time = np.array([a[:min_len] for a in all_step_time]) 22 | all_step_time = all_step_time[:, 5:] 23 | all_step_time = np.amax(all_step_time, axis=0) 24 | all_step_throughput = 1.0 / all_step_time 25 | print(np.mean(all_step_throughput), np.std(all_step_throughput)) 26 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_ray.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def parse(filename): 4 | all_step_time = [] 5 | with open(filename, 'r') as f: 6 | for line in f.readlines(): 7 | if f"step time:" in line: 8 | all_step_time.append(float(line.split(f"step time:")[1])) 9 | all_step_time = np.array(all_step_time[3:]) 10 | all_step_throughput = 1.0 / all_step_time 11 | return np.mean(all_step_throughput), np.std(all_step_throughput) 12 | -------------------------------------------------------------------------------- /app/parameter-server/run_allreduce_tests.sh: -------------------------------------------------------------------------------- 1 | export RAY_BACKEND_LOG_LEVEL=info 2 | mkdir -p ps-log/ 3 | 4 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 5 | source $ROOT_DIR/load_cluster_env.sh 6 | 7 | for n_nodes in 16; do 8 | for model in alexnet vgg16 resnet50; do 9 | echo "==========" allreduce-$n_nodes-$model-mpi "==========" 10 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 11 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 12 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 13 | 14 | pkill notification 15 | # $ROOT_DIR/restart_all_workers.sh 16 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_all_reduce.py) -m $model \ 17 | | tee ps-log/allreduce-$n_nodes-$model-mpi.log 18 | sleep 0.5 19 | 20 | echo "==========" allreduce-$n_nodes-$model-gloo "==========" 21 | i=0 22 | for node in ${ALL_IPADDR[@]:0:$n_nodes}; do 23 | echo "=> $node" 24 | ssh -o StrictHostKeyChecking=no $node 
PATH=$PATH:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda3/condabin, \ 25 | python $ROOT_DIR/app/parameter-server/gloo_all_reduce.py \ 26 | --master_ip $MY_IPADDR \ 27 | --rank $i \ 28 | --size $n_nodes \ 29 | -m $model \ 30 | | tee ps-log/allreduce-$n_nodes-$model-gloo.$i.log & 31 | i=$((i+1)) 32 | done 33 | wait 34 | sleep 0.5 35 | 36 | echo "==========" allreduce-$n_nodes-$model-hoplite "==========" 37 | python hoplite_all_reduce.py -n $n_nodes -m $model | tee ps-log/allreduce-$n_nodes-$model-hoplite.log 38 | sleep 0.5 39 | 40 | echo "==========" allreduce-$n_nodes-$model-ray "==========" 41 | python ray_parameter_server_baseline.py -n $(($n_nodes - 1)) -m $model | tee ps-log/allreduce-$n_nodes-$model-ray.log 42 | sleep 0.5 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /app/parameter-server/run_async_ps_fault_tolerance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export RAY_BACKEND_LOG_LEVEL=info 3 | 4 | sudo fuser -k 6666/tcp -s &> /dev/null 5 | sudo fuser -k 50055/tcp -s &> /dev/null 6 | sudo fuser -k 20210/tcp -s &> /dev/null 7 | sleep 1 8 | 9 | n_nodes=7 10 | model=resnet50 11 | 12 | echo "==========" async-ps-$n_nodes-$model-hoplite "==========" 13 | python hoplite_asgd_fault_tolerance.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) -m $model --iterations 100 14 | sleep 1 15 | 16 | echo "==========" async-ps-$n_nodes-$model-ray "==========" 17 | python ray_asgd_fault_tolerance.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) -m $model --iterations 100 18 | -------------------------------------------------------------------------------- /app/parameter-server/run_async_ps_tests.sh: -------------------------------------------------------------------------------- 1 | export RAY_BACKEND_LOG_LEVEL=info 2 | mkdir -p ps-log/ 3 | 4 | for n_nodes in 8 16; do 5 | for model in alexnet vgg16 resnet50; do 6 | echo "==========" async-ps-$n_nodes-$model-hoplite "==========" 7 | python parameter_server.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) -m $model | tee ps-log/async-ps-$n_nodes-$model-hoplite.log 8 | sleep 0.5 9 | 10 | echo "==========" async-ps-$n_nodes-$model-ray "==========" 11 | python ray_parameter_server_baseline.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) -m $model | tee ps-log/async-ps-$n_nodes-$model-ray.log 12 | sleep 0.5 13 | done 14 | done 15 | -------------------------------------------------------------------------------- /app/ray_serve/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing ML Model Serving Experiments on AWS 2 | 3 | _(About 30 min)_ 4 | 5 | ## Setup 6 | 7 | _(About 15 min)_ 8 | 9 | If you are provided with an AWS IAM account & pre-built binaries 10 | * If you just want to review figures & raw experimental data, see [cluster-config-access-results-only](cluster-config-access-results-only). 11 | * If you also want to reproduce all results from the beginning, see [cluster-config-with-ami](cluster-config-with-ami) for setting up a cluster. 12 | 13 | If you are not provided with an AWS account or you want to build everything from scratch, see [cluster-config](cluster-config). 14 | 15 | ## ML model serving experiments (Figure 11) 16 | 17 | After logging in to the configured cluster, *chdir* to the current directory in the hoplite repo. 
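
For example, assuming the repository is mounted at `~/efs/hoplite` as in the cluster configs referenced above:

```bash
cd ~/efs/hoplite/app/ray_serve
```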
18 | 19 | Here is how you run the experiments: 20 | 21 | **Baseline** _(2-3 min)_: `python model_ensembling.py ${scale}` 22 | 23 | **Hoplite** _(1-2 min)_: `python hoplite_model_ensembling.py ${scale}` 24 | 25 | `${scale}` controls the cluster size. `scale=1` corresponds to 8 GPU nodes, `scale=2` corresponds to 16 GPU nodes in the figure. 26 | 27 | The script prints the mean and std of throughput (queries/s) at the end. 28 | 29 | ## ML Model Serving fault tolerance experiments (Figure 12a) 30 | 31 | Baseline + fault tolerance test _(About 2 min)_: `python model_ensembling_fault_tolerance.py 1` 32 | 33 | With Hoplite + fault tolerance test _(About 2 min)_: `python hoplite_model_ensembling_fault_tolerance.py 1` 34 | 35 | Run `python analyze_fault_tolerance.py` to compare the failure detection latency (see section 5.5 in the paper). 36 | 37 | ## Notes 38 | 39 | The initial run will be extremely slow on AWS because Python generates caches and other files on first use (about 4 min). This is normal. 40 | -------------------------------------------------------------------------------- /app/ray_serve/analyze_fault_tolerance.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | with open('ray_serve_log.json', 'r') as f: 5 | ray_log = json.load(f) 6 | durations = [l['duration'] for l in ray_log if l['event'] == 'fail'] 7 | print(f"Baseline latency caused by failure: {np.mean(durations):.6f} ± {np.std(durations):.6f}s") 8 | 9 | with open('hoplite_ray_serve_log.json', 'r') as f: 10 | hoplite_log = json.load(f) 11 | durations = [l['duration'] for l in hoplite_log if l['event'] == 'fail'] 12 | print(f"Hoplite latency caused by failure: {np.mean(durations):.6f} ± {np.std(durations):.6f}s") 13 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config-access-results-only/README.md: -------------------------------------------------------------------------------- 1 | # Setup Hoplite ML serving experiments on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Access Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries and results via `cd ~/efs/hoplite-with-results/` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | 26 | ## Access results 27 | 28 | ### ML model serving experiments (Figure 11) 29 | 30 | The results are collected dynamically, so if you want to get the result numbers, you need to run the experiments with [this cluster setup](../cluster-config-with-ami). 31 | 32 | ### ML Model Serving fault tolerance experiments (Figure 12a) 33 | 34 | After logging into the cluster, `cd ~/efs/hoplite-with-results/app/ray_serve`. `hoplite_ray_serve_log.json` and `ray_serve_log.json` contain the raw trajectory during failure. 35 | 36 | Run `python analyze_fault_tolerance.py` to compare the failure detection latency (see section 5.5 in the paper).
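
If you prefer to inspect the raw failure trajectories on your local machine, they can also be downloaded with the cluster launcher; a minimal sketch, assuming the remote paths from the directory layout above:

~~~bash
ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/app/ray_serve/ray_serve_log.json .
ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/app/ray_serve/hoplite_ray_serve_log.json .
~~~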
37 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config-access-results-only/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config-with-ami/README.md: -------------------------------------------------------------------------------- 1 | # Setup Hoplite ML serving experiments on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries via `cd ~/efs/hoplite` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 
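
If the experiments appear to hang right after `ray attach`, it is worth checking that all 16 workers have joined the cluster. A minimal sketch of an optional check (not part of the original workflow), using the standard Ray cluster-launcher commands:

~~~bash
# Runs `ray status` on the head node to show how many nodes have joined.
ray exec example.yaml 'ray status'
~~~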
25 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config-with-ami/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-ml-serving 2 | 3 | min_workers: 16 4 | max_workers: 16 5 | initial_workers: 16 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: p3.2xlarge 31 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 56 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config/cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-ml-serving 2 | 3 | min_workers: 16 4 | max_workers: 16 5 | initial_workers: 16 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: {image-id} 22 | SecurityGroupIds: 23 | - "{security group id created by inital.yaml}" 24 | Placement: 25 | GroupName: {group-name} 26 | 27 | worker_nodes: 28 | InstanceType: p3.2xlarge 29 | ImageId: {image-id} 30 | SecurityGroupIds: 31 | - "{security group id created by inital.yaml}" 32 | Placement: 33 | GroupName: {group-name} 34 | 35 | setup_commands: 36 | # This replaces the standard anaconda Ray installation 37 | - mkdir -p ~/efs 38 | - sudo mount -t efs {efs-id}:/ ~/efs 39 | - sudo chmod 777 ~/efs 40 | 41 | # Command to start ray on the head node. You don't need to change this. 
42 | head_start_ray_commands: 43 | - ray stop 44 | # we allocate 28 GB memory for Ray object store 45 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | # we allocate 28 GB memory for Ray object store 51 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 52 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-ml-serving 2 | 3 | min_workers: 16 4 | max_workers: 16 5 | initial_workers: 16 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: p3.2xlarge 31 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 56 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config/initial.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single-initial 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-04cd519d2f9578053 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 22 | 23 | worker_nodes: 24 | InstanceType: m5.4xlarge 25 | ImageId: ami-04cd519d2f9578053 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 26 | 27 | setup_commands: [] 28 | 29 | # Command to start ray on the head node. You don't need to change this. 30 | head_start_ray_commands: [] 31 | 32 | # Command to start ray on worker nodes. You don't need to change this. 33 | worker_start_ray_commands: [] 34 | -------------------------------------------------------------------------------- /app/rllib/README-with-ami.md: -------------------------------------------------------------------------------- 1 | # Reproducing RLLib experiments in Hoplite on AWS (with AMI). 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python 3 is installed. Then install Ray version `0.8.0` and boto with: 6 | ~~~bash 7 | pip install ray==0.8.0 boto3 8 | ~~~ 9 | 10 | ## Start the Cluster and Evaluate _(About 30 min)_ 11 | 12 | 1. Launch the cluster and log in: 13 | ~~~bash 14 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 15 | export AWS_SECRET_ACCESS_KEY="Your Secret Access Key" 16 | ray up example.yaml 17 | ray attach example.yaml 18 | ~~~ 19 | 2. Move to the running scripts directory: 20 | ~~~bash 21 | cd ~/hoplite-rllib/hoplite-scripts 22 | ~~~ 23 | 3. Generate the cluster configuration: 24 | ~~~bash 25 | python a3c_generate_config.py 26 | python impala_generate_config.py 27 | ~~~ 28 | 4. Test all configurations: 29 | ~~~bash 30 | ./test_all_generated.sh 31 | ~~~ 32 | 5. After all experiments have finished, we can get the results via: 33 | ~~~bash 34 | python a3c_parse_log.py 35 | python impala_parse_log.py 36 | ~~~ 37 | The results will be in the format of: 38 | ~~~ 39 | #nodes / - / Hoplite or Ray / Throughput (mean) / Throughput (std) 40 | ~~~ 41 | -------------------------------------------------------------------------------- /app/rllib/cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: {image-id} 22 | Placement: 23 | GroupName: {group-name} 24 | 25 | worker_nodes: 26 | InstanceType: m5.4xlarge 27 | ImageId: {image-id} 28 | Placement: 29 | GroupName: {group-name} 30 | 31 | setup_commands: [] 32 | 33 | # Command to start ray on the head node. You don't need to change this. 34 | head_start_ray_commands: 35 | - ray stop 36 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 37 | 38 | # Command to start ray on worker nodes. You don't need to change this.
39 | worker_start_ray_commands: 40 | - ray stop 41 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 42 | -------------------------------------------------------------------------------- /app/rllib/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-0e9f764f786728984 # hoplite-artifact-rllib-2 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0e9f764f786728984 # hoplite-artifact-rllib-2 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | 40 | setup_commands: [] 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: 44 | - ray stop 45 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 51 | -------------------------------------------------------------------------------- /app/rllib/initial.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single-initial 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-0f9543706892e0363 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 22 | 23 | worker_nodes: 24 | InstanceType: m5.4xlarge 25 | ImageId: ami-0f9543706892e0363 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 26 | 27 | setup_commands: [] 28 | 29 | # Command to start ray on the head node. You don't need to change this. 30 | head_start_ray_commands: [] 31 | 32 | # Command to start ray on worker nodes. You don't need to change this. 
33 | worker_start_ray_commands: [] 34 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | make clean 3 | clang-format -i *.cc src/*.cc src/*.h src/util/*.cc src/util/*.h mpi/*.c 4 | -------------------------------------------------------------------------------- /fornode: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script can run commands on all nodes on the cluster: ./fornode 4 | 5 | ROOT_DIR=$(dirname $(realpath -s $0)) 6 | source $ROOT_DIR/test_utils/load_cluster_env.sh 7 | 8 | for node in ${ALL_IPADDR[@]}; do 9 | echo "=> $node" 10 | ssh -o StrictHostKeyChecking=no $node PATH=$PATH:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda3/condabin, $@ & 11 | done 12 | wait 13 | -------------------------------------------------------------------------------- /install_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $HOME 4 | 5 | sudo apt update 6 | 7 | ## build grpc 8 | if [ ! -d grpc ]; then 9 | 10 | sudo apt-get install -y \ 11 | build-essential \ 12 | autoconf \ 13 | libtool \ 14 | pkg-config \ 15 | libgflags-dev \ 16 | libgtest-dev \ 17 | clang-5.0 \ 18 | libc++-dev 19 | 20 | git clone https://github.com/grpc/grpc.git 21 | 22 | pushd grpc 23 | # pin gRPC version to 1.31.0 24 | git checkout tags/v1.31.0 25 | git submodule update --init --recursive 26 | 27 | mkdir build && cd build 28 | cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local 29 | make -j8 && sudo make install 30 | popd 31 | 32 | pushd grpc/third_party/protobuf 33 | ./autogen.sh 34 | ./configure 35 | make -j8 && sudo make install 36 | popd 37 | fi 38 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config-access-results-only/README.md: -------------------------------------------------------------------------------- 1 | # Setup AWS Cluster for Hoplite Microbenchmarks on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries and results via `cd ~/efs/hoplite-with-results/` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | 26 | ## Access results 27 | 28 | You can download results from the cluster to your local machine by executing 29 | 30 | ~~~bash 31 | ray rsync-down example.yaml 32 | ~~~ 33 | 34 | Here is how you could download main results: 35 | 36 | ### Roundtrip Microbenchmarks (Figure 6 at Section 5.1) 37 | 38 | **Raw data for Figure 6** 39 | 40 | ~~~bash 41 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/roundtrip-results.csv . 42 | ~~~ 43 | 44 | **Figure 6 (a)** 45 | 46 | ~~~bash 47 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/RTT1K.pdf . 
48 | ~~~ 49 | 50 | **Figure 6 (b)** 51 | 52 | ~~~bash 53 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/RTT1M.pdf . 54 | ~~~ 55 | 56 | **Figure 6 (c)** 57 | 58 | ~~~bash 59 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/RTT1G.pdf . 60 | ~~~ 61 | 62 | ## Collective Communication Microbenchmarks (Figure 7 at Section 5.1, Figure 13 at Appendix A) 63 | 64 | **Raw data for Figure 7 & Figure 13** 65 | 66 | ~~~bash 67 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/mpi-cpp/mpi_results.csv . 68 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/hoplite-cpp/hoplite_results.csv . 69 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/gloo-cpp/gloo_results.csv . 70 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/ray-python/ray-microbenchmark.csv . 71 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/dask-python/dask_results.csv . 72 | ~~~ 73 | 74 | **Figure 7, Section 5.1** 75 | 76 | ~~~bash 77 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/microbenchmarks-large.pdf . 78 | ~~~ 79 | 80 | **Figure 13, Appendix A** 81 | 82 | ~~~bash 83 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/microbenchmarks-small.pdf . 84 | ~~~ 85 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config-access-results-only/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 
52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config-with-ami/README.md: -------------------------------------------------------------------------------- 1 | # Setup AWS Cluster for Hoplite Microbenchmarks on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries via `cd ~/efs/hoplite` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config-with-ami/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 
52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config/cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: {image-id} 22 | SecurityGroupIds: 23 | - "{security group id created by inital.yaml}" 24 | Placement: 25 | GroupName: {group-name} 26 | 27 | worker_nodes: 28 | InstanceType: m5.4xlarge 29 | ImageId: {image-id} 30 | SecurityGroupIds: 31 | - "{security group id created by inital.yaml}" 32 | Placement: 33 | GroupName: {group-name} 34 | 35 | setup_commands: 36 | # This replaces the standard anaconda Ray installation 37 | - mkdir -p ~/efs 38 | - sudo mount -t efs {efs-id}:/ ~/efs 39 | - sudo chmod 777 ~/efs 40 | 41 | # Command to start ray on the head node. You don't need to change this. 42 | head_start_ray_commands: 43 | - ray stop 44 | # we allocate 28 GB memory for Ray object store 45 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | # we allocate 28 GB memory for Ray object store 51 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 52 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config/initial.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single-initial 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-04cd519d2f9578053 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 22 | 23 | worker_nodes: 24 | InstanceType: m5.4xlarge 25 | ImageId: ami-04cd519d2f9578053 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 26 | 27 | setup_commands: [] 28 | 29 | # Command to start ray on the head node. You don't need to change this. 30 | head_start_ray_commands: [] 31 | 32 | # Command to start ray on worker nodes. You don't need to change this. 
33 | worker_start_ray_commands: [] 34 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/auto_dask_benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from subprocess import Popen, PIPE 3 | 4 | parser = argparse.ArgumentParser(description='Automatic Dask collective communication benchmark') 5 | parser.add_argument('rounds', type=int, help="How many rounds we would to run the benchmark.") 6 | 7 | args = parser.parse_args() 8 | 9 | for i in range(args.rounds): 10 | with open(f"result-{i+1}.csv", "w") as f: 11 | for algorithm in ('multicast', 'gather', 'reduce', 'allreduce'): 12 | for world_size in (4, 8, 12, 16): 13 | for object_size in (2 ** 10, 2 ** 15, 2 ** 20, 2 ** 25, 2 ** 30): 14 | process = Popen(["python", "dask_benchmark.py", 15 | algorithm, "-n", str(world_size), "-s", str(object_size)], stdout=PIPE) 16 | (output, err) = process.communicate() 17 | exit_code = process.wait() 18 | print(algorithm, world_size, object_size, float(output)) 19 | f.write(f"{algorithm},{world_size},{object_size},{float(output)}\n") 20 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT 4 | 5 | ./cleanup_dask.sh 6 | ./run_dask.sh 16 & 7 | python auto_dask_benchmark.py 5 8 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/cleanup_dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo pkill dask-scheduler 3 | ../../fornode sudo pkill dask-worker 4 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/dask_roundtrip.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from dask.distributed import Client 5 | 6 | 7 | def round_trip(obj): 8 | return obj 9 | 10 | 11 | def measure_round_trip(client, object_size): 12 | payload = np.empty(object_size, dtype=np.uint8) 13 | before = time.time() 14 | receiver = client.submit(round_trip, payload, workers=['Dask-1']) 15 | receiver.result() 16 | duration = time.time() - before 17 | return duration 18 | 19 | 20 | def main(): 21 | client = Client("127.0.0.1:8786") 22 | 23 | # warmup 24 | for size in (2**10, 2**20): 25 | for _ in range(5): 26 | measure_round_trip(client, size) 27 | 28 | with open(f"dask-roundtrip.csv", "w") as f: 29 | for size in (2**10, 2**20, 2**30): 30 | t = [] 31 | for _ in range(5): 32 | duration = measure_round_trip(client, size) 33 | t.append(duration) 34 | f.write(f"dask,{size},{np.mean(t)},{np.std(t)}\n") 35 | 36 | # # Accumulate time for more precision. 
37 | # duration = 0.0 38 | # for j in range(i + 1, i + 1 + 10): 39 | # duration += func(client, world_size, object_size, j) 40 | # duration /= 10 41 | # print(duration) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /microbenchmarks/dask-python/dask_roundtrip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT 4 | 5 | ./cleanup_dask.sh 6 | ./run_dask.sh 2 & 7 | python dask_roundtrip.py 8 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | 5 | parser = argparse.ArgumentParser(description='Hoplite (C++) benchmark results parser.') 6 | parser.add_argument('--verbose', action='store_true') 7 | args = parser.parse_args() 8 | 9 | tables = [] 10 | 11 | for i in range(1, 100): 12 | filename = f"result-{i}.csv" 13 | if os.path.exists(filename): 14 | tables.append(pd.read_csv(filename, header=None)) 15 | else: 16 | break 17 | 18 | df_avg = pd.concat(tables).groupby(by=[0, 1, 2]).mean() 19 | df_std = pd.concat(tables).groupby(by=[0, 1, 2]).std() 20 | df_cnt = pd.concat(tables).groupby(by=[0, 1, 2]).count() 21 | df_final = pd.concat([df_avg, df_std, df_cnt], axis=1) 22 | df_final.reset_index(inplace=True) 23 | columns = ['Benchmark Name', '#Nodes', 'Object Size (in bytes)', 24 | 'Average Time (s)', 'Std Time (s)', 'Repeated Times'] 25 | df_final.to_csv("dask_results.csv", header=columns, index=False) 26 | 27 | if args.verbose: 28 | print(df_final) 29 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/run_dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) number of nodes"; exit -1; fi 3 | if [ "$#" -gt 3 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 6 | 7 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 8 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 9 | world_size=$1 10 | 11 | if [ "$#" -eq 1 ]; then 12 | source $TEST_UNILS_DIR/load_cluster_env.sh 13 | OTHERS_IPADDR=(${OTHERS_IPADDR[@]:0:$(($world_size-1))}) 14 | 15 | dask-scheduler & 16 | sleep 1 17 | 18 | for index in ${!OTHERS_IPADDR[@]} 19 | do 20 | rank=$((index+1)) 21 | ssh -t -t ${OTHERS_IPADDR[$index]} "$(realpath -s $0) $MY_IPADDR $rank" & 22 | done 23 | 24 | dask-worker $MY_IPADDR:8786 --name Dask-0 25 | else 26 | master=$1 27 | index=$2 28 | source ~/anaconda3/etc/profile.d/conda.sh 29 | conda activate 30 | dask-worker $master:8786 --name Dask-$index 31 | fi 32 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/.gitignore: -------------------------------------------------------------------------------- 1 | gloo/ 2 | gloo_results.csv 3 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/README.md: -------------------------------------------------------------------------------- 1 | ## Gloo collective communication benchmarks (baseline) 2 | 3 | Usage: 4 | 5 | ```bash 6 | ./run_benchmark.sh 
${gloo_microbenchmark_name} ${total_number_of_nodes} ${input_size_in_bytes}` 7 | ``` 8 | 9 | `${gloo_microbenchmark_name}` includes allreduce_ring, allreduce_ring_chunked, allreduce_halving_doubling, allreduce_bcube, barrier_all_to_all, broadcast_one_to_all, pairwise_exchange. 10 | 11 | Note: Sometimes Gloo would be flaky and you might see error messages like 12 | 13 | ``` 14 | terminate called after throwing an instance of 'gloo::IoException' 15 | what(): [**/hoplite/microbenchmarks/gloo-cpp/gloo/gloo/transport/tcp/pair.cc:572] Connection closed by peer [172.31.48.113]:44461 16 | ``` 17 | 18 | when you use large payloads. We have taken that into consideration when writing our result parsing scripts, and you will get informed during parsing. You can manually rerun these tests if you want to increase the accuracy of the statistics. 19 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for num_nodes in 4 8 12 16; do 3 | for test_name in allreduce_ring_chunked allreduce_halving_doubling broadcast_one_to_all; do 4 | for sz in 10 15 20 25 30; do 5 | for i in `seq 5`; do 6 | obj_size=$((2**$sz)) 7 | ./run_test.sh $test_name $num_nodes $obj_size 8 | done 9 | done 10 | done 11 | done 12 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/install_gloo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # See https://github.com/facebookincubator/gloo 4 | 5 | sudo apt-get install -y libhiredis-dev redis-server 6 | 7 | if [ ! -d gloo ]; then 8 | git clone git@github.com:facebookincubator/gloo.git 9 | fi 10 | 11 | cd gloo 12 | # Pin gloo version to commit 881f7f0dcf06f7e49e134a45d3284860fb244fa9 13 | git checkout 881f7f0dcf06f7e49e134a45d3284860fb244fa9 14 | rm -rf build 15 | mkdir build 16 | cd build 17 | # Redis is required for the benchmark. 18 | cmake ../ -DBUILD_BENCHMARK=1 -DUSE_REDIS=ON 19 | make -j8 20 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | sys.path.insert(0, "../../test_utils") 5 | import result_parser_utils 6 | 7 | # Example output 8 | 9 | """ 10 | Device: tcp, pci=0000:00:05.0, iface=ens5, speed=-1, addr=[172.31.49.113] 11 | Algorithm: allreduce_ring_chunked 12 | Options: processes=4, inputs=1, threads=1 13 | 14 | elements min (us) p50 (us) p99 (us) max (us) avg (GB/s) samples 15 | 268435456 1443672 1443672 1443672 1443672 0.693 1 16 | """ 17 | 18 | def parse_file(task_name, log_dir, foldername): 19 | try: 20 | lines = result_parser_utils.read_rank0_lines(log_dir, foldername) 21 | # The unit of the original result is microsecond. We turn it into seconds. 
22 | return float(lines[5].split()[2]) / 1000 / 1000 23 | except Exception: 24 | return None 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser(description='Gloo (C++) benchmark results parser.') 29 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 30 | help='The logging directory of Gloo benchmarks') 31 | parser.add_argument('--verbose', action='store_true') 32 | args = parser.parse_args() 33 | df = result_parser_utils.parse(args.log_dir, parse_file) 34 | if args.verbose: 35 | print(df) 36 | df.to_csv('gloo_results.csv', index=False) 37 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 3 ]; then 4 | echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test name, number of nodes & input size required" 5 | echo "test name: allreduce_ring, allreduce_ring_chunked, allreduce_halving_doubling, " 6 | echo " allreduce_bcube, barrier_all_to_all, broadcast_one_to_all, pairwise_exchange" 7 | exit -1 8 | fi 9 | 10 | if [ "$#" -gt 3 ]; then 11 | echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#" 12 | exit -1 13 | fi 14 | 15 | # trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 16 | 17 | test_name=$1 18 | world_size=$2 19 | object_size=$3 20 | 21 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 22 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 23 | GLOO_DIR=$SCRIPT_DIR/gloo/ 24 | 25 | source $TEST_UNILS_DIR/load_cluster_env.sh 26 | 27 | # prepare logging directory 28 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 29 | mkdir -p $log_dir 30 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 31 | 32 | # gloo benchmarks requires Redis 33 | redis-server --port 7799 --protected-mode no &> /dev/null & 34 | REDIS_PID=$! 35 | sleep 1 36 | echo "IP address of this node: $MY_IPADDR" 37 | 38 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 39 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 40 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 41 | -x GLOO_DIR="$GLOO_DIR" \ 42 | -x GLOO_LOGGING_DIR="$log_dir" \ 43 | -x REDIS_HOST="$MY_IPADDR" \ 44 | -x test_name="$test_name" \ 45 | -x object_size="$object_size" \ 46 | test_wrapper.sh 47 | 48 | kill $REDIS_PID 49 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/test_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | logging_file=$GLOO_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 3 | $GLOO_DIR/build/gloo/benchmark/benchmark \ 4 | --size $OMPI_COMM_WORLD_SIZE \ 5 | --rank $OMPI_COMM_WORLD_RANK \ 6 | --redis-host $REDIS_HOST \ 7 | --redis-port 7799 \ 8 | --prefix benchmark-$test_name-$OMPI_COMM_WORLD_SIZE-$object_size \ 9 | --transport tcp \ 10 | --elements $(($object_size / 4)) \ 11 | --iteration-count 1 \ 12 | $test_name \ 13 | 2>&1 | tee $logging_file 14 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/README.md: -------------------------------------------------------------------------------- 1 | ## Hoplite C++ interface benchmarks 2 | 3 | Hoplite collective communication benchmarks with C++ binaries. 
4 | 5 | ```bash 6 | ./run_test.sh ${microbenchmark_name} ${total_number_of_nodes} ${input_size_in_bytes} ${number_of_trials} 7 | ``` 8 | 9 | `${microbenchmark_name}` includes the 5 most common collective communication operations: `multicast`, `reduce`, `gather`, `allreduce`, `allgather`. 10 | 11 | ### Pressure test (optional) 12 | 13 | Intensive benchmarks that exercise corner cases. This test demonstrates the reliability of our system. It can take several minutes to complete even on high-speed networks. 14 | 15 | Usage: `./pressure_test.sh ${total_number_of_nodes}` 16 | 17 | ### Subset reduction test (optional) 18 | 19 | This test shows that Hoplite is able to reduce only a subset of objects. For example, given 8 candidate objects, we can reduce just the 4 objects that are created first. 20 | 21 | Usage: 22 | 23 | ```bash 24 | ./run_test.sh subset_reduce ${total_number_of_nodes} ${input_size_in_bytes} ${number_of_trials} 25 | ``` 26 | 27 | We suggest `total_number_of_nodes>=4`. 28 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for test_name in multicast reduce gather allreduce; do 3 | for num_nodes in 4 8 12 16; do 4 | for sz in 10 15 20 25 30; do 5 | obj_size=$((2**$sz)) 6 | ./run_test.sh ${test_name} $num_nodes $obj_size 5 7 | sleep 1 8 | done 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/coverage_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "exit" INT 4 | 5 | num_nodes=8 6 | 7 | for test_name in multicast reduce gather allreduce allgather; do 8 | for i in 15 25; do 9 | obj_size=$((2**$i)) 10 | echo $test_name-$num_nodes-$obj_size 11 | ./run_test.sh $test_name $num_nodes $obj_size 3 12 | sleep 1 13 | done 14 | done 15 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | import sys 6 | 7 | sys.path.insert(0, "../../test_utils") 8 | import result_parser_utils 9 | 10 | 11 | WARMUP_ROUNDS = 2 12 | 13 | 14 | def get_durations(lines): 15 | durations = [] 16 | for line in lines: 17 | if 'duration = ' in line: 18 | tmp = line.split('duration = ')[1] 19 | durations.append(float(tmp)) 20 | return durations 21 | 22 | 23 | def parse_all_ranks(folder_path, with_rank0=True): 24 | files = os.listdir(folder_path) 25 | all_rank_durations = [] 26 | for filename in files: 27 | if 'rank' in filename and (with_rank0 or 'rank_0' not in filename): 28 | try: 29 | with open(os.path.join(folder_path, filename)) as f: 30 | durations = get_durations(f.readlines()) 31 | if not durations: 32 | raise ValueError("Bad file") 33 | all_rank_durations.append(durations) 34 | except Exception: 35 | print("Bad file", folder_path, filename) 36 | return None 37 | 38 | try: 39 | return np.max(all_rank_durations, axis=0) 40 | except Exception as e: 41 | print("Error: empty directory", folder_path, e) 42 | return None 43 | 44 | 45 | def parse_file(task_name, log_dir, foldername): 46 | path = os.path.join(log_dir, foldername) 47 | 48 | if task_name in ('allreduce', 'allgather'): 49 | return parse_all_ranks(path) 50 | elif task_name == 'multicast': 51 | return parse_all_ranks(path, with_rank0=False) 52 | elif task_name in ('reduce', 'gather', 
'subset_reduce'): 53 | return result_parser_utils.default_parse_file(task_name, log_dir, foldername) 54 | else: 55 | raise ValueError('Unknown task', task_name) 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser(description='Hoplite (C++) benchmark results parser.') 60 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 61 | help='The logging directory of Gloo benchmarks') 62 | parser.add_argument('--verbose', action='store_true') 63 | args = parser.parse_args() 64 | df = result_parser_utils.parse(args.log_dir, parse_file) 65 | if args.verbose: 66 | print(df) 67 | df.to_csv('hoplite_results.csv', index=False) 68 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/pressure_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) number of nodes required"; exit -1; fi 3 | if [ "$#" -gt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | trap "exit" INT 6 | 7 | ./run_test.sh multicast $1 $[2**10] 1000 8 | ./run_test.sh multicast $1 $[2**17] 1000 9 | ./run_test.sh multicast $1 $[2**30] 5 10 | 11 | ./run_test.sh reduce $1 $[2**10] 1000 12 | ./run_test.sh reduce $1 $[2**17] 1000 13 | ./run_test.sh reduce $1 $[2**30] 5 14 | 15 | ./run_test.sh allreduce $1 $[2**10] 1000 16 | ./run_test.sh allreduce $1 $[2**17] 1000 17 | ./run_test.sh allreduce $1 $[2**30] 5 18 | 19 | ./run_test.sh gather $1 $[2**10] 1000 20 | ./run_test.sh gather $1 $[2**17] 1000 21 | ./run_test.sh gather $1 $[2**30] 5 22 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 4 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test name, number of nodes, input size & n_trials required"; exit -1; fi 3 | if [ "$#" -gt 4 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | ## setup 6 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 7 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 8 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 9 | BINARIES_DIR=$(realpath -s $SCRIPT_DIR/../../build) 10 | TEST_BINARIES_DIR=$BINARIES_DIR/tests 11 | 12 | ## cleanup procs 13 | sudo fuser -k 6666/tcp -s &> /dev/null 14 | sudo fuser -k 50055/tcp -s &> /dev/null 15 | sudo fuser -k 20210/tcp -s &> /dev/null 16 | 17 | test_name=$1 18 | test_executable_abspath=$TEST_BINARIES_DIR/${test_name}_test 19 | world_size=$2 20 | object_size=$3 21 | n_trials=$4 22 | 23 | if [ ! 
-f $test_executable_abspath ]; then 24 | echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test executable not found: $test_executable_abspath" 25 | exit -2 26 | fi 27 | 28 | # get cluster info 29 | source $TEST_UNILS_DIR/load_cluster_env.sh 30 | OTHERS_IPADDR=(${OTHERS_IPADDR[@]:0:$(($world_size-1))}) 31 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) head_node: $MY_IPADDR; other_nodes: ${OTHERS_IPADDR[@]}" 32 | 33 | # prompt test info 34 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) Running test $(tput setaf 3)$(tput bold)$test_name$(tput sgr 0)" 35 | 36 | # create logging dir 37 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 38 | mkdir -p $log_dir 39 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 40 | 41 | export RAY_BACKEND_LOG_LEVEL=info 42 | 43 | pkill notification 44 | sleep 0.5 45 | ($BINARIES_DIR/notification 2>&1 | tee $log_dir/$MY_IPADDR.notification.log) & 46 | sleep 0.5 47 | 48 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 49 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 50 | 51 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 52 | -x HOPLITE_LOGGING_DIR=$log_dir \ 53 | -x RAY_BACKEND_LOG_LEVEL=$RAY_BACKEND_LOG_LEVEL \ 54 | $SCRIPT_DIR/test_wrapper.sh $test_executable_abspath $MY_IPADDR $object_size $n_trials 55 | 56 | sleep 1 57 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/test_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | logging_file=$HOPLITE_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 3 | $@ 2>&1 | tee $logging_file 4 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/README.md: -------------------------------------------------------------------------------- 1 | ## Hoplite Python interface benchmarks 2 | 3 | Hoplite collective communication benchmarks with Python. 4 | 5 | ```bash 6 | ./run_test.sh ${microbenchmark_name} ${total_number_of_nodes} ${input_size_in_bytes} 7 | ``` 8 | 9 | `${microbenchmark_name}` includes the 5 most common collective communication operations: `multicast`, `reduce`, `gather`, `allreduce`, `allgather`. 10 | 11 | ### Pressure test (optional) 12 | 13 | Intensive benchmarks that exercise corner cases. This test demonstrates the reliability of our system. It can take several minutes to complete even on high-speed networks. 14 | 15 | Usage: `./pressure_test.sh ${total_number_of_nodes}` 16 | 17 | ### Round-trip test (optional) 18 | 19 | This test shows that, when transferring data from the object store, Hoplite is able to overlap object copying with object transfer to achieve higher performance. 
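Below is a minimal, self-contained sketch of the general idea (an illustration only, not Hoplite's actual implementation): one thread hands off chunks as they "arrive", while a second thread copies finished chunks into the destination buffer, so copying and transfer proceed concurrently instead of back to back.

```python
# Illustrative sketch of overlapping chunk transfer with chunk copying.
# All names here are hypothetical; Hoplite's real pipeline is implemented in C++.
import queue
import threading

import numpy as np

CHUNK_SIZE = 1 << 20  # 1 MiB chunks (illustrative choice)


def receive_chunks(src: np.ndarray, chunks: queue.Queue) -> None:
    # Stand-in for the network receiver: emit one chunk at a time.
    for offset in range(0, src.size, CHUNK_SIZE):
        chunks.put((offset, src[offset:offset + CHUNK_SIZE].copy()))
    chunks.put(None)  # end-of-stream marker


def copy_chunks(dst: np.ndarray, chunks: queue.Queue) -> None:
    # Copy each chunk into place as soon as it is available,
    # instead of waiting for the whole object to arrive first.
    while True:
        item = chunks.get()
        if item is None:
            return
        offset, chunk = item
        dst[offset:offset + chunk.size] = chunk


src = np.ones(8 * CHUNK_SIZE, dtype=np.uint8)
dst = np.empty_like(src)
q = queue.Queue()
threads = [
    threading.Thread(target=receive_chunks, args=(src, q)),
    threading.Thread(target=copy_chunks, args=(dst, q)),
]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert np.array_equal(src, dst)
```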
20 | 21 | Usage: 22 | 23 | ```bash 24 | ./run_test.sh roundtrip 2 ${input_size_in_bytes} 25 | ``` 26 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for test_name in multicast reduce gather allreduce; do 4 | for num_nodes in 4 8 12 16; do 5 | for sz in 10 15 20 25 30; do 6 | for i in `seq 5`; do 7 | obj_size=$((2**$sz)) 8 | ./run_test.sh ${test_name} $num_nodes $obj_size 9 | done 10 | done 11 | done 12 | done 13 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/coverage_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "exit" INT 4 | 5 | for test_index in `seq 1 3`; do 6 | ./run_test.sh roundtrip 2 $[2**25] 7 | ./run_test.sh roundtrip 2 $[2**15] 8 | done 9 | 10 | num_nodes=8 11 | 12 | for test_name in multicast reduce gather allreduce allgather; do 13 | for i in 15 25; do 14 | for test_index in `seq 1 3`; do 15 | obj_size=$((2**$i)) 16 | echo $test_name-$num_nodes-$obj_size-$test_index 17 | ./run_test.sh ${test_name} $num_nodes $obj_size 18 | sleep 1 19 | done 20 | done 21 | 22 | done 23 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | import sys 6 | 7 | sys.path.insert(0, "../../test_utils") 8 | import result_parser_utils 9 | 10 | 11 | WARMUP_ROUNDS = 2 12 | 13 | 14 | def get_durations(lines): 15 | durations = [] 16 | for line in lines: 17 | if 'duration = ' in line: 18 | tmp = line.split('duration = ')[1] 19 | durations.append(float(tmp)) 20 | return durations 21 | 22 | 23 | def parse_all_ranks(folder_path, with_rank0=True): 24 | files = os.listdir(folder_path) 25 | all_rank_durations = [] 26 | for filename in files: 27 | if 'rank' in filename and (with_rank0 or 'rank_0' not in filename): 28 | try: 29 | with open(os.path.join(folder_path, filename)) as f: 30 | durations = get_durations(f.readlines()) 31 | if not durations: 32 | raise ValueError("Bad file") 33 | all_rank_durations.append(durations) 34 | except Exception: 35 | print("Bad file", folder_path, filename) 36 | return None 37 | 38 | try: 39 | return np.max(all_rank_durations, axis=0) 40 | except Exception as e: 41 | print("Error: empty directory", folder_path, e) 42 | return None 43 | 44 | 45 | def parse_file(task_name, log_dir, foldername): 46 | path = os.path.join(log_dir, foldername) 47 | 48 | if task_name in ('allreduce', 'allgather'): 49 | return parse_all_ranks(path) 50 | elif task_name == 'multicast': 51 | return parse_all_ranks(path, with_rank0=False) 52 | elif task_name in ('roundtrip', 'reduce', 'gather', 'subset_reduce'): 53 | return result_parser_utils.default_parse_file(task_name, log_dir, foldername) 54 | else: 55 | raise ValueError('Unknown task', task_name) 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser(description='Hoplite (Python) benchmark results parser.') 60 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 61 | help='The logging directory of Gloo benchmarks') 62 | parser.add_argument('--verbose', action='store_true') 63 | args = parser.parse_args() 64 | df = result_parser_utils.parse(args.log_dir, 
parse_file) 65 | if args.verbose: 66 | print(df) 67 | df.to_csv('hoplite_results.csv', index=False) 68 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/parse_roundtrip_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | import sys 6 | 7 | sys.path.insert(0, "../../test_utils") 8 | import result_parser_utils 9 | 10 | 11 | WARMUP_ROUNDS = 2 12 | 13 | 14 | def get_durations(lines): 15 | durations = [] 16 | for line in lines: 17 | if 'duration = ' in line: 18 | tmp = line.split('duration = ')[1] 19 | durations.append(float(tmp)) 20 | return durations 21 | 22 | 23 | def parse_all_ranks(folder_path, with_rank0=True): 24 | files = os.listdir(folder_path) 25 | all_rank_durations = [] 26 | for filename in files: 27 | if 'rank' in filename and (with_rank0 or 'rank_0' not in filename): 28 | try: 29 | with open(os.path.join(folder_path, filename)) as f: 30 | durations = get_durations(f.readlines()) 31 | if not durations: 32 | raise ValueError("Bad file") 33 | all_rank_durations.append(durations) 34 | except Exception: 35 | print("Bad file", folder_path, filename) 36 | return None 37 | 38 | try: 39 | return np.max(all_rank_durations, axis=0) 40 | except Exception as e: 41 | print("Error: empty directory", folder_path, e) 42 | return None 43 | 44 | 45 | def parse_file(task_name, log_dir, foldername): 46 | path = os.path.join(log_dir, foldername) 47 | 48 | if task_name in ('allreduce', 'allgather'): 49 | return parse_all_ranks(path) 50 | elif task_name == 'multicast': 51 | return parse_all_ranks(path, with_rank0=False) 52 | elif task_name in ('roundtrip', 'reduce', 'gather', 'subset_reduce'): 53 | return result_parser_utils.default_parse_file(task_name, log_dir, foldername) 54 | else: 55 | raise ValueError('Unknown task', task_name) 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser(description='Hoplite roundtrip results parser.') 60 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 61 | help='The logging directory of Gloo benchmarks') 62 | parser.add_argument('--verbose', action='store_true') 63 | args = parser.parse_args() 64 | df = result_parser_utils.parse(args.log_dir, parse_file) 65 | 66 | df = df[df['Benchmark Name'].str.contains('roundtrip')] 67 | sz = df['Object Size (in bytes)'].astype('int64') 68 | df = df[(sz == 2**10) | (sz == 2**20) | (sz == 2**30)] 69 | 70 | if args.verbose: 71 | print(df) 72 | 73 | rs = df[['Object Size (in bytes)', 'Average Time (s)', 'Std Time (s)', 'Repeated Times']].values 74 | with open('hoplite-roundtrip.csv', "w") as f: 75 | for r in rs: 76 | f.write(f"hoplite,{r[0]},{r[1]},{r[2]}\n") 77 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/pressure_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) number of nodes required"; exit -1; fi 3 | if [ "$#" -gt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | trap "exit" INT 6 | 7 | for i in `seq 1000`; do 8 | ./run_test.sh multicast $1 $[2**10] 9 | ./run_test.sh reduce $1 $[2**10] 10 | ./run_test.sh allreduce $1 $[2**10] 11 | ./run_test.sh gather $1 $[2**10] 12 | 13 | ./run_test.sh multicast $1 $[2**17] 14 | ./run_test.sh reduce $1 $[2**17] 15 | ./run_test.sh 
allreduce $1 $[2**17] 16 | ./run_test.sh gather $1 $[2**17] 17 | done 18 | 19 | for i in `seq 5`; do 20 | ./run_test.sh multicast $1 $[2**17] 21 | ./run_test.sh reduce $1 $[2**17] 22 | ./run_test.sh allreduce $1 $[2**17] 23 | ./run_test.sh gather $1 $[2**17] 24 | done 25 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 3 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test name, number of nodes, input size required"; exit -1; fi 3 | if [ "$#" -gt 3 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | ## setup 6 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 7 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 8 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 9 | BINARIES_DIR=$(realpath -s $SCRIPT_DIR/../../build) 10 | 11 | ## cleanup procs 12 | sudo fuser -k 6666/tcp -s &> /dev/null 13 | sudo fuser -k 50055/tcp -s &> /dev/null 14 | sudo fuser -k 20210/tcp -s &> /dev/null 15 | 16 | test_name=$1 17 | world_size=$2 18 | object_size=$3 19 | 20 | # get cluster info 21 | source $TEST_UNILS_DIR/load_cluster_env.sh 22 | OTHERS_IPADDR=(${OTHERS_IPADDR[@]:0:$(($world_size-1))}) 23 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) head_node: $MY_IPADDR; other_nodes: ${OTHERS_IPADDR[@]}" 24 | 25 | # prompt test info 26 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) Running test $(tput setaf 3)$(tput bold)$test_name$(tput sgr 0)" 27 | 28 | # create logging dir 29 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 30 | mkdir -p $log_dir 31 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 32 | 33 | export RAY_BACKEND_LOG_LEVEL=info 34 | 35 | # pkill notification 36 | # sleep 0.5 37 | # ($BINARIES_DIR/notification 2>&1 | tee $log_dir/$MY_IPADDR.notification.log) & 38 | # sleep 0.5 39 | 40 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 41 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 42 | 43 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 44 | -x HOPLITE_LOGGING_DIR=$log_dir \ 45 | -x RAY_BACKEND_LOG_LEVEL=$RAY_BACKEND_LOG_LEVEL \ 46 | test_wrapper.sh $test_name -s $object_size 47 | 48 | sleep 1 49 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/test_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | logging_file=$HOPLITE_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 3 | source ~/anaconda3/etc/profile.d/conda.sh 4 | conda activate 5 | python hoplite_microbenchmarks.py $@ 2>&1 | tee $logging_file 6 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/.gitignore: -------------------------------------------------------------------------------- 1 | reduce 2 | allgather 3 | allreduce 4 | gather 5 | multicast 6 | send_recv 7 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/Makefile: -------------------------------------------------------------------------------- 1 | EXECS=multicast reduce allreduce gather allgather roundtrip 2 | MPICC?=mpicc 3 | 4 | all: ${EXECS} 5 | 6 | multicast: multicast.c 7 | ${MPICC} -O2 -o multicast multicast.c 8 | 9 | reduce: reduce.c 10 | ${MPICC} -O2 -o reduce reduce.c 11 | 12 | allreduce: allreduce.c 13 | ${MPICC} -O2 -o allreduce allreduce.c 14 | 15 | gather: 
gather.c 16 | ${MPICC} -O2 -o gather gather.c 17 | 18 | allgather: allgather.c 19 | ${MPICC} -O2 -o allgather allgather.c 20 | 21 | roundtrip: roundtrip.c 22 | ${MPICC} -O2 -o roundtrip roundtrip.c 23 | 24 | clean: 25 | rm -f ${EXECS} 26 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/README.md: -------------------------------------------------------------------------------- 1 | ## MPI collective communication benchmarks (baseline) 2 | 3 | Usage: 4 | 5 | ```bash 6 | ./mpi_${microbenchmark_name}.sh ${total_number_of_nodes} ${input_size_in_bytes} 7 | ``` 8 | 9 | `${microbenchmark_name}` includes 5 most common collective communication operations: `multicast`, `reduce`, `gather`, `allreduce`, `allgather`. 10 | 11 | ### Roundtrip test 12 | 13 | Usage: 14 | 15 | ```bash 16 | ./mpi_sendrecv.sh ${input_size_in_bytes} 17 | ``` 18 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/allgather.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2013 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Program that computes the average of an array of elements in parallel using 8 | // MPI_Reduce. 9 | // 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Creates an array of random numbers. Each number has a value from 0 - 1 17 | float *create_rand_nums(int num_elements) { 18 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 19 | assert(rand_nums != NULL); 20 | int i; 21 | for (i = 0; i < num_elements; i++) { 22 | rand_nums[i] = (rand() / (float)RAND_MAX); 23 | } 24 | return rand_nums; 25 | } 26 | 27 | int main(int argc, char **argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "Usage: ./allgather num_elements\n"); 30 | exit(1); 31 | } 32 | 33 | int num_elements_per_proc = atoi(argv[1]); 34 | double time = 0; 35 | 36 | MPI_Init(NULL, NULL); 37 | 38 | int world_rank; 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | int world_size; 41 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 42 | 43 | // Create a random array of elements on all processes. 
44 | srand(world_rank); // Seed the random number generator to get different 45 | // results each time for each processor 46 | float *rand_nums = NULL; 47 | rand_nums = create_rand_nums(num_elements_per_proc); 48 | float *global_nums = (float *)malloc(sizeof(float) * num_elements_per_proc * world_size); 49 | 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | // Reduce all of the local sums into the global sum 52 | time -= MPI_Wtime(); 53 | MPI_Allgather(rand_nums, num_elements_per_proc, MPI_FLOAT, global_nums, num_elements_per_proc, MPI_FLOAT, 54 | MPI_COMM_WORLD); 55 | time += MPI_Wtime(); 56 | 57 | // Print the result 58 | if (world_rank == 0) { 59 | printf("MPI_Allgather duration = %lf\n", time); 60 | } 61 | 62 | // Clean up 63 | free(rand_nums); 64 | 65 | MPI_Barrier(MPI_COMM_WORLD); 66 | MPI_Finalize(); 67 | } 68 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/allreduce.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2013 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Program that computes the average of an array of elements in parallel using 8 | // MPI_Reduce. 9 | // 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Creates an array of random numbers. Each number has a value from 0 - 1 17 | float *create_rand_nums(int num_elements) { 18 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 19 | assert(rand_nums != NULL); 20 | int i; 21 | for (i = 0; i < num_elements; i++) { 22 | rand_nums[i] = (rand() / (float)RAND_MAX); 23 | } 24 | return rand_nums; 25 | } 26 | 27 | int main(int argc, char **argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "Usage: ./allreduce num_elements\n"); 30 | exit(1); 31 | } 32 | 33 | int num_elements_per_proc = atoi(argv[1]); 34 | double time = 0; 35 | 36 | MPI_Init(NULL, NULL); 37 | 38 | int world_rank; 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | int world_size; 41 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 42 | 43 | // Create a random array of elements on all processes. 
44 | srand(world_rank); // Seed the random number generator to get different 45 | // results each time for each processor 46 | float *rand_nums = NULL; 47 | rand_nums = create_rand_nums(num_elements_per_proc); 48 | float *global_nums = (float *)malloc(sizeof(float) * num_elements_per_proc); 49 | 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | // Reduce all of the local sums into the global sum 52 | time -= MPI_Wtime(); 53 | MPI_Allreduce(rand_nums, global_nums, num_elements_per_proc, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); 54 | time += MPI_Wtime(); 55 | 56 | // Print the result 57 | if (world_rank == 0) { 58 | printf("MPI_Allreduce duration = %lf\n", time); 59 | } 60 | 61 | // Clean up 62 | free(rand_nums); 63 | 64 | MPI_Barrier(MPI_COMM_WORLD); 65 | MPI_Finalize(); 66 | } 67 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for test_name in multicast reduce gather allreduce; do 4 | for num_nodes in 4 8 12 16; do 5 | for sz in 10 15 20 25 30; do 6 | for i in `seq 5`; do 7 | obj_size=$((2**$sz)) 8 | ./run_test.sh ${test_name} $num_nodes $obj_size 9 | done 10 | done 11 | done 12 | done 13 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/coverage_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for test_index in `seq 1 3`; do 4 | ./run_test.sh roundtrip 2 $[2**25] 5 | ./run_test.sh roundtrip 2 $[2**15] 6 | done 7 | 8 | num_nodes=8 9 | 10 | for test_name in multicast reduce gather allreduce allgather; do 11 | for i in 15 25; do 12 | for test_index in `seq 1 3`; do 13 | obj_size=$((2**$i)) 14 | echo $test_name-$num_nodes-$obj_size-$test_index 15 | ./run_test.sh ${test_name} $num_nodes $obj_size 16 | sleep 1 17 | done 18 | done 19 | 20 | done 21 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/gather.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2013 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Program that computes the average of an array of elements in parallel using 8 | // MPI_Reduce. 9 | // 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Creates an array of random numbers. Each number has a value from 0 - 1 17 | float *create_rand_nums(int num_elements) { 18 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 19 | assert(rand_nums != NULL); 20 | int i; 21 | for (i = 0; i < num_elements; i++) { 22 | rand_nums[i] = (rand() / (float)RAND_MAX); 23 | } 24 | return rand_nums; 25 | } 26 | 27 | int main(int argc, char **argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "Usage: ./gather num_elements\n"); 30 | exit(1); 31 | } 32 | 33 | int num_elements_per_proc = atoi(argv[1]); 34 | double time = 0; 35 | 36 | MPI_Init(NULL, NULL); 37 | 38 | int world_rank; 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | int world_size; 41 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 42 | 43 | // Create a random array of elements on all processes. 
44 | srand(world_rank); // Seed the random number generator to get different 45 | // results each time for each processor 46 | float *rand_nums = NULL; 47 | rand_nums = create_rand_nums(num_elements_per_proc); 48 | float *global_nums = NULL; 49 | 50 | if (world_rank == 0) { 51 | global_nums = (float *)malloc(sizeof(float) * num_elements_per_proc * world_size); 52 | } 53 | 54 | MPI_Barrier(MPI_COMM_WORLD); 55 | // Reduce all of the local sums into the global sum 56 | time -= MPI_Wtime(); 57 | MPI_Gather(rand_nums, num_elements_per_proc, MPI_FLOAT, global_nums, num_elements_per_proc, MPI_FLOAT, 0, 58 | MPI_COMM_WORLD); 59 | time += MPI_Wtime(); 60 | 61 | // Print the result 62 | if (world_rank == 0) { 63 | printf("MPI_Gather duration = %lf\n", time); 64 | } 65 | 66 | // Clean up 67 | free(rand_nums); 68 | 69 | MPI_Barrier(MPI_COMM_WORLD); 70 | MPI_Finalize(); 71 | } 72 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/multicast.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2011 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Comparison of MPI_Bcast with the my_bcast function 8 | // 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | int main(int argc, char **argv) { 15 | if (argc != 2) { 16 | fprintf(stderr, "Usage: ./multicast num_elements\n"); 17 | exit(1); 18 | } 19 | 20 | int num_elements = atoi(argv[1]); 21 | int num_trials = 1; 22 | 23 | MPI_Init(NULL, NULL); 24 | 25 | int world_rank; 26 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 27 | 28 | double total_mpi_bcast_time = 0.0; 29 | int *data = (int *)malloc(sizeof(int) * num_elements); 30 | assert(data != NULL); 31 | 32 | // Time MPI_Bcast 33 | MPI_Barrier(MPI_COMM_WORLD); 34 | total_mpi_bcast_time -= MPI_Wtime(); 35 | MPI_Bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD); 36 | MPI_Barrier(MPI_COMM_WORLD); 37 | total_mpi_bcast_time += MPI_Wtime(); 38 | 39 | // Print off timing information 40 | if (world_rank == 0) { 41 | printf("MPI_Bcast duration = %lf\n", total_mpi_bcast_time); 42 | } 43 | 44 | free(data); 45 | MPI_Finalize(); 46 | } 47 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | sys.path.insert(0, "../../test_utils") 5 | import result_parser_utils 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser(description='MPI (C++) benchmark results parser.') 10 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 11 | help='The logging directory of Gloo benchmarks') 12 | parser.add_argument('--verbose', action='store_true') 13 | args = parser.parse_args() 14 | df = result_parser_utils.parse(args.log_dir, result_parser_utils.default_parse_file) 15 | if args.verbose: 16 | print(df) 17 | df.to_csv('mpi_results.csv', index=False) 18 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/parse_roundtrip_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | sys.path.insert(0, 
"../../test_utils") 5 | import result_parser_utils 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser(description='MPI roundtrip benchmark results parser.') 10 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 11 | help='The logging directory of Gloo benchmarks') 12 | parser.add_argument('--verbose', action='store_true') 13 | args = parser.parse_args() 14 | df = result_parser_utils.parse(args.log_dir, result_parser_utils.default_parse_file) 15 | 16 | df = df[df['Benchmark Name'].str.contains('roundtrip')] 17 | sz = df['Object Size (in bytes)'].astype('int64') 18 | df = df[(sz == 2**10) | (sz == 2**20) | (sz == 2**30)] 19 | 20 | if args.verbose: 21 | print(df) 22 | 23 | rs = df[['Object Size (in bytes)', 'Average Time (s)', 'Std Time (s)', 'Repeated Times']].values 24 | with open('mpi-roundtrip.csv', "w") as f: 25 | for r in rs: 26 | f.write(f"mpi,{r[0]},{r[1]},{r[2]}\n") 27 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/reduce.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2013 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Program that computes the average of an array of elements in parallel using 8 | // MPI_Reduce. 9 | // 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Creates an array of random numbers. Each number has a value from 0 - 1 17 | float *create_rand_nums(int num_elements) { 18 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 19 | assert(rand_nums != NULL); 20 | int i; 21 | for (i = 0; i < num_elements; i++) { 22 | rand_nums[i] = (rand() / (float)RAND_MAX); 23 | } 24 | return rand_nums; 25 | } 26 | 27 | int main(int argc, char **argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "Usage: ./reduce num_elements\n"); 30 | exit(1); 31 | } 32 | 33 | int num_elements_per_proc = atoi(argv[1]); 34 | double time = 0; 35 | 36 | MPI_Init(NULL, NULL); 37 | 38 | int world_rank; 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | int world_size; 41 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 42 | 43 | // Create a random array of elements on all processes. 
44 | srand(world_rank); // Seed the random number generator to get different 45 | // results each time for each processor 46 | float *rand_nums = NULL; 47 | rand_nums = create_rand_nums(num_elements_per_proc); 48 | float *global_nums = (float *)malloc(sizeof(float) * num_elements_per_proc); 49 | 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | // Reduce all of the local sums into the global sum 52 | time -= MPI_Wtime(); 53 | MPI_Reduce(rand_nums, global_nums, num_elements_per_proc, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); 54 | time += MPI_Wtime(); 55 | 56 | // Print the result 57 | if (world_rank == 0) { 58 | printf("MPI_Reduce duration = %lf\n", time); 59 | } 60 | 61 | // Clean up 62 | free(rand_nums); 63 | 64 | MPI_Barrier(MPI_COMM_WORLD); 65 | MPI_Finalize(); 66 | } 67 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/roundtrip.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | float *create_rand_nums(int num_elements) { 7 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 8 | assert(rand_nums != NULL); 9 | int i; 10 | for (i = 0; i < num_elements; i++) { 11 | rand_nums[i] = (rand() / (float)RAND_MAX); 12 | } 13 | return rand_nums; 14 | } 15 | 16 | int main(int argc, char **argv) { 17 | // Initialize the MPI environment 18 | MPI_Init(NULL, NULL); 19 | // Find out rank, size 20 | int world_rank; 21 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 22 | int world_size; 23 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 24 | 25 | // We are assuming at least 2 processes for this task 26 | if (world_size < 2) { 27 | fprintf(stderr, "World size must be greater than 1 for %s\n", argv[0]); 28 | MPI_Abort(MPI_COMM_WORLD, 1); 29 | } 30 | 31 | if (argc != 2) { 32 | fprintf(stderr, "Usage: ./roundtrip num_elements\n"); 33 | exit(1); 34 | } 35 | 36 | int num_elements = atoi(argv[1]); 37 | double time = 0; 38 | float *numbers = create_rand_nums(num_elements); 39 | MPI_Barrier(MPI_COMM_WORLD); 40 | if (world_rank == 0) { 41 | time -= MPI_Wtime(); 42 | MPI_Send( 43 | /* data = */ numbers, 44 | /* count = */ num_elements, 45 | /* datatype = */ MPI_FLOAT, 46 | /* destination = */ 1, 47 | /* tag = */ 0, 48 | /* communicator = */ MPI_COMM_WORLD); 49 | 50 | MPI_Recv( 51 | /* data = */ numbers, 52 | /* count = */ num_elements, 53 | /* datatype = */ MPI_FLOAT, 54 | /* source = */ 1, 55 | /* tag = */ 0, 56 | /* communicator = */ MPI_COMM_WORLD, 57 | /* status = */ MPI_STATUS_IGNORE); 58 | time += MPI_Wtime(); 59 | printf("MPI_Recv (roundtrip) duration = %lf\n", time); 60 | 61 | } else if (world_rank == 1) { 62 | MPI_Recv( 63 | /* data = */ numbers, 64 | /* count = */ num_elements, 65 | /* datatype = */ MPI_FLOAT, 66 | /* source = */ 0, 67 | /* tag = */ 0, 68 | /* communicator = */ MPI_COMM_WORLD, 69 | /* status = */ MPI_STATUS_IGNORE); 70 | 71 | MPI_Send( 72 | /* data = */ numbers, 73 | /* count = */ num_elements, 74 | /* datatype = */ MPI_FLOAT, 75 | /* destination = */ 0, 76 | /* tag = */ 0, 77 | /* communicator = */ MPI_COMM_WORLD); 78 | } 79 | MPI_Finalize(); 80 | } 81 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$3" ]; then echo "ERROR: test name, node number and input size required"; exit; fi 4 | 5 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 6 | TEST_UNILS_DIR=$(realpath 
-s $SCRIPT_DIR/../../test_utils) 7 | source $TEST_UNILS_DIR/load_cluster_env.sh 8 | 9 | test_name=$1 # can be allgather/allreduce/gather/multicast/reduce 10 | make $test_name > /dev/null 11 | 12 | test_executable=$test_name 13 | test_executable_abspath=$(realpath -s $test_executable) 14 | world_size=$2 15 | object_size=$3 16 | 17 | # create logging dir 18 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 19 | mkdir -p $log_dir 20 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 21 | 22 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 23 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 24 | 25 | echo Number of nodes: $world_size "(actually ${#all_nodes[@]})", data size: $object_size 26 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 27 | 28 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 29 | -x MPI_LOGGING_DIR="$log_dir" \ 30 | test_wrapper.sh $test_executable_abspath $[$object_size/4] 31 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/test_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | logging_file=$MPI_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 3 | $@ 2>&1 | tee $logging_file 4 | -------------------------------------------------------------------------------- /microbenchmarks/plot_rtt.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | 4 | COLUMNS = ["Method", "Object Size (in bytes)", "Average Time (s)" ,"Std Time (s)"] 5 | LABELS = ['Optimal', 'Hoplite', 'OpenMPI', 'Ray', 'Dask'] 6 | COLORS = ( 7 | plt.get_cmap('tab20c')(4 * 4 + 2), 8 | plt.get_cmap('tab20c')(0 * 4 + 1), 9 | plt.get_cmap('tab20c')(1 * 4 + 2), 10 | plt.get_cmap('tab20c')(2 * 4 + 2), 11 | plt.get_cmap('tab20')(3 * 2 + 1), 12 | ) 13 | 14 | 15 | def draw_rtt_1K(results): 16 | SIZE = 1024 17 | results = results[results[COLUMNS[1]] == SIZE] 18 | Latency, STD = results[COLUMNS[2]], results[COLUMNS[3]] 19 | 20 | plt.figure(figsize=(4, 4)) 21 | ind = range(5) 22 | width = 0.8 23 | plt.bar(ind, Latency * 1000, width, label='usr', color=COLORS, linewidth=10) 24 | plt.errorbar(ind[1:], Latency[1:] * 1000, yerr=STD[1:] * 1000, linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 25 | plt.xticks(ind, LABELS, fontsize=18) 26 | for label in plt.gca().get_xmajorticklabels(): 27 | label.set_rotation(30) 28 | label.set_horizontalalignment("right") 29 | plt.yticks(fontsize=18) 30 | plt.ylabel('RTT (ms)', fontsize=18) 31 | plt.annotate("1.7 μs", (-0.55, 0.08), fontsize=15) 32 | plt.savefig('RTT1K.pdf', bbox_inches="tight") 33 | 34 | 35 | def draw_rtt_1M(results): 36 | SIZE = 2 ** 20 37 | results = results[results[COLUMNS[1]] == SIZE] 38 | Latency, STD = results[COLUMNS[2]], results[COLUMNS[3]] 39 | 40 | plt.figure(figsize=(4, 4)) 41 | ind = range(5) 42 | width = 0.8 43 | plt.bar(ind, Latency * 1000, width, label='usr', color=COLORS, linewidth=10) 44 | plt.errorbar(ind[1:], Latency[1:] * 1000, yerr=STD[1:] * 1000, linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 45 | plt.xticks(ind, LABELS, fontsize=18) 46 | for label in plt.gca().get_xmajorticklabels(): 47 | label.set_rotation(30) 48 | label.set_horizontalalignment("right") 49 | plt.yticks(fontsize=18) 50 | plt.ylabel('RTT (ms)', fontsize=18) 51 | plt.savefig('RTT1M.pdf', bbox_inches="tight") 52 | 53 | 54 | def draw_rtt_1G(results): 55 | SIZE = 2 ** 30 56 | results = 
results[results[COLUMNS[1]] == SIZE] 57 | Latency, STD = results[COLUMNS[2]], results[COLUMNS[3]] 58 | 59 | plt.figure(figsize=(4, 4)) 60 | ind = range(5) 61 | width = 0.8 62 | plt.bar(ind, Latency, width, label='usr', color=COLORS, linewidth=10) 63 | plt.errorbar(ind[1:], Latency[1:], yerr=STD[1:], linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 64 | plt.xticks(ind, LABELS, fontsize=18) 65 | for label in plt.gca().get_xmajorticklabels(): 66 | label.set_rotation(30) 67 | label.set_horizontalalignment("right") 68 | plt.yticks(fontsize=18) 69 | plt.ylabel('RTT (s)', fontsize=18) 70 | plt.savefig('RTT1G.pdf', bbox_inches="tight") 71 | 72 | 73 | if __name__ == '__main__': 74 | results = pd.read_csv('roundtrip-results.csv') 75 | results.loc[len(results.index)] = ['optimal', 1024, 0.000031690911909, 0.0] 76 | results.loc[len(results.index)] = ['optimal', 1048576, 0.001731493794326, 0.0] 77 | results.loc[len(results.index)] = ['optimal', 1073741824, 1.773049645390071, 0.0] 78 | 79 | cat_method_order = pd.CategoricalDtype( 80 | ['optimal', 'hoplite', 'mpi', 'ray', 'dask'], 81 | ordered=True 82 | ) 83 | results['Method'] = results['Method'].astype(cat_method_order) 84 | results = results.sort_values('Method') 85 | draw_rtt_1K(results) 86 | draw_rtt_1M(results) 87 | draw_rtt_1G(results) 88 | -------------------------------------------------------------------------------- /microbenchmarks/ray-python/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python -m pip install grpcio-tools 3 | python -m grpc_tools.protoc -I../../src/protocol/ --python_out=. --grpc_python_out=. object_store.proto 4 | 5 | clean: 6 | rm object_store_pb2*.py 7 | -------------------------------------------------------------------------------- /microbenchmarks/ray-python/README.md: -------------------------------------------------------------------------------- 1 | ## Ray collective communication benchmarks (baseline) 2 | 3 | Run `make` to compile necessary files. 4 | 5 | Usage: 6 | 7 | See python `run_tests.py -h`. 
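For example (flags as defined in `run_tests.py`; the values here are illustrative): `python run_tests.py allreduce -n 8 -s 1048576` runs the allreduce benchmark on 8 nodes with 1 MB objects, and `python run_tests.py auto` sweeps multicast, gather, reduce, and allreduce over the preset node counts and object sizes, writing the results to `ray-microbenchmark.csv`.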
8 | 9 | ### Auto test 10 | 11 | ./auto_test.sh 12 | -------------------------------------------------------------------------------- /microbenchmarks/ray-python/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run_tests.py auto 4 | -------------------------------------------------------------------------------- /microbenchmarks/ray-python/ray_roundtrip.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import ray 4 | 5 | 6 | @ray.remote(resources={'machine': 1}) 7 | class RayBenchmarkWorker: 8 | def __init__(self, object_size): 9 | self.object_size = object_size 10 | self.payload = np.ones(object_size//4, dtype=np.float32) 11 | 12 | def poll(self): 13 | pass 14 | 15 | def send(self): 16 | return ray.put(self.payload) 17 | 18 | def recv(self, x): 19 | return ray.get(x) 20 | 21 | def recv2(self, x): 22 | return None 23 | 24 | 25 | def ray_roundtrip(object_size): 26 | sender = RayBenchmarkWorker.remote(object_size) 27 | receiver = RayBenchmarkWorker.remote(object_size) 28 | ray.get([sender.poll.remote(), receiver.poll.remote()]) 29 | start = time.time() 30 | ray.get(sender.recv2.remote(receiver.recv.remote(sender.send.remote()))) 31 | return time.time() - start 32 | 33 | 34 | REPEAT_TIMES = 5 35 | 36 | def test_with_mean_std(object_size, repeat_times=REPEAT_TIMES): 37 | results = [] 38 | for _ in range(repeat_times): 39 | duration = ray_roundtrip(object_size) 40 | results.append(duration) 41 | return np.mean(results), np.std(results) 42 | 43 | 44 | if __name__ == "__main__": 45 | ray.init(address='auto') 46 | with open("ray-roundtrip.csv", "w") as f: 47 | for object_size in (2 ** 10, 2 ** 20, 2 ** 30): 48 | mean, std = test_with_mean_std(object_size) 49 | print(f"roundtrip: {object_size} {mean:.6f} ± {std:.6f}s") 50 | f.write(f"ray,{object_size},{mean},{std}\n") -------------------------------------------------------------------------------- /microbenchmarks/ray-python/run_tests.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import ray 4 | 5 | import hoplite 6 | import ray_microbenchmarks 7 | 8 | NUM_NODES = (4, 8, 12, 16) 9 | OBJECT_SIZES = (2 ** 10, 2 ** 15, 2 ** 20, 2 ** 25, 2 ** 30) 10 | REPEAT_TIMES = 5 11 | 12 | microbenchmark_names = ['multicast', 'reduce', 'allreduce', 'gather', 'allgather', 'auto'] 13 | parser = argparse.ArgumentParser(description='Ray microbenchmarks') 14 | parser.add_argument('test_name', type=str, choices=microbenchmark_names, help='Microbenchmark name') 15 | parser.add_argument('-n', '--world-size', type=int, required=False, 16 | help='Size of the collective processing group') 17 | parser.add_argument('-s', '--object-size', type=int, required=False, 18 | help='The size of the object') 19 | args = parser.parse_args() 20 | 21 | 22 | def test_with_mean_std(test_name, notification_address, world_size, object_size, 23 | repeat_times=REPEAT_TIMES): 24 | results = [] 25 | for _ in range(repeat_times): 26 | test_case = ray_microbenchmarks.__dict__[test_name] 27 | duration = test_case(notification_address, world_size, object_size) 28 | results.append(duration) 29 | return np.mean(results), np.std(results) 30 | 31 | 32 | if __name__ == "__main__": 33 | notification_address = hoplite.start_location_server() 34 | 35 | ray.init(address='auto') 36 | test_name = 'ray_' + args.test_name 37 | assert test_name in ray_microbenchmarks.__dict__ or 
args.test_name == 'auto' 38 | if args.test_name != 'auto': 39 | assert args.world_size is not None and args.object_size is not None 40 | mean, std = test_with_mean_std(test_name, notification_address, args.world_size, args.object_size, 5) 41 | print(f"{args.test_name},{args.world_size},{args.object_size},{mean},{std}") 42 | else: 43 | assert args.world_size is None and args.object_size is None 44 | with open("ray-microbenchmark.csv", "w") as f: 45 | for algorithm in ('ray_multicast', 'ray_gather', 'ray_reduce', 'ray_allreduce'): 46 | for world_size in NUM_NODES: 47 | for object_size in OBJECT_SIZES: 48 | mean, std = test_with_mean_std(algorithm, notification_address, world_size, object_size) 49 | print(f"{algorithm}, {world_size}, {object_size}, {mean}, {std}") 50 | f.write(f"{algorithm},{world_size},{object_size},{mean},{std}\n") 51 | -------------------------------------------------------------------------------- /python/hoplite/__init__.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import pathlib 3 | import subprocess 4 | import time 5 | 6 | from . import _hoplite_client as _hoplite_store 7 | 8 | HopliteClient = _hoplite_store.DistributedObjectStore 9 | Buffer = _hoplite_store.Buffer 10 | ObjectID = _hoplite_store.ObjectID 11 | ReduceOp = _hoplite_store.ReduceOp 12 | 13 | 14 | def get_my_address(): 15 | import socket 16 | return socket.gethostbyname(socket.gethostname()) 17 | 18 | 19 | def add_arguments(parser): 20 | parser.add_argument('--redis-address', type=str, default=get_my_address(), 21 | help='The IP address of the redis server') 22 | parser.add_argument('--redis-port', type=int, default=6380, 23 | help='The port of the redis server') 24 | parser.add_argument('--notification-port', type=int, default=7777, 25 | help='The port of the notification server') 26 | parser.add_argument('--notification-listening-port', type=int, default=8888, 27 | help='The listening port of the notification client') 28 | parser.add_argument('--plasma-socket', type=str, default="/tmp/multicast_plasma", 29 | help='The path of the unix domain socket') 30 | parser.add_argument('--object_writer_port', type=int, default=6666, 31 | help='The path of the unix domain socket') 32 | parser.add_argument('--grpc-port', type=int, default=50055, 33 | help='The path of the unix domain socket') 34 | 35 | 36 | def extract_dict_from_args(args): 37 | return {'redis_address': args.redis_address.encode()} 38 | 39 | 40 | def create_store_using_dict(args_dict): 41 | store = _hoplite_store.DistributedObjectStore(args_dict['redis_address']) 42 | return store 43 | 44 | 45 | def object_id_from_int(n): 46 | return _hoplite_store.ObjectID(int(str(n), 16).to_bytes(20, byteorder='big')) 47 | 48 | 49 | def random_object_id(): 50 | import random 51 | return object_id_from_int(random.randint(0, 1e20-1)) 52 | 53 | 54 | def _register_cleanup(processes): 55 | def _cleanup_processes(): 56 | print("Cleaning up process...") 57 | # wait clients to exit to suppress error messages 58 | time.sleep(0.5) 59 | for p in processes: 60 | p.terminate() 61 | atexit.register(_cleanup_processes) 62 | 63 | 64 | def start_location_server(): 65 | server_exec = pathlib.Path(__file__).resolve().parent.absolute() / 'notification' 66 | notification_p = subprocess.Popen([str(server_exec)]) 67 | _register_cleanup([notification_p]) 68 | time.sleep(2) 69 | return get_my_address() 70 | 71 | 72 | __all__ = ('start_location_server', 'random_object_id', 'object_id_from_int', 73 | 'create_store_using_dict', 
'extract_dict_from_args', 'add_arguments', 'get_my_address', 74 | 'Buffer', 'ObjectID', 'ReduceOp') 75 | -------------------------------------------------------------------------------- /python/hoplite/_hoplite_client.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level = 3 2 | 3 | from libcpp cimport bool as c_bool 4 | from libcpp.memory cimport shared_ptr, unique_ptr 5 | from libcpp.string cimport string as c_string 6 | 7 | from libc.stdint cimport uint8_t, int32_t, uint64_t, int64_t, uint32_t 8 | from libcpp.unordered_map cimport unordered_map 9 | from libcpp.unordered_set cimport unordered_set 10 | 11 | from libcpp.vector cimport vector as c_vector 12 | 13 | cdef extern from "util/logging.h" namespace "ray" nogil: 14 | cdef cppclass CRayLogLevel "hoplite::RayLogLevel": 15 | pass 16 | 17 | cdef cppclass CRayLog "hoplite::RayLog": 18 | @staticmethod 19 | void StartRayLog(const c_string &my_address, CRayLogLevel log_level) 20 | 21 | 22 | cdef extern from "util/logging.h" namespace "hoplite::RayLogLevel" nogil: 23 | cdef CRayLogLevel CRayLogDEBUG "hoplite::RayLogLevel::DEBUG" 24 | cdef CRayLogLevel CRayLogINFO "hoplite::RayLogLevel::INFO" 25 | cdef CRayLogLevel CRayLogWARNING "hoplite::RayLogLevel::WARNING" 26 | cdef CRayLogLevel CRayLogERROR "hoplite::RayLogLevel::ERROR" 27 | cdef CRayLogLevel CRayLogFATAL "hoplite::RayLogLevel::FATAL" 28 | 29 | 30 | cdef extern from "common/id.h" namespace "" nogil: 31 | cdef cppclass CObjectID "ObjectID": 32 | @staticmethod 33 | CObjectID FromBinary(const c_string& binary) 34 | @staticmethod 35 | CObjectID FromHex(const c_string& binary) 36 | c_string Binary() const 37 | 38 | 39 | cdef extern from "common/buffer.h" namespace "" nogil: 40 | cdef cppclass CBuffer "Buffer": 41 | CBuffer(int64_t size) 42 | CBuffer(uint8_t* data, int64_t size) 43 | const uint8_t* Data() 44 | uint8_t* MutableData() 45 | int64_t Size() 46 | uint64_t Hash() const 47 | 48 | 49 | cdef extern from "client/distributed_object_store.h" namespace "" nogil: 50 | cdef cppclass CDistributedObjectStore "DistributedObjectStore": 51 | CDistributedObjectStore(const c_string &object_directory_address) 52 | 53 | void Put(const shared_ptr[CBuffer] &buffer, const CObjectID &object_id) 54 | 55 | CObjectID Put(const shared_ptr[CBuffer] &buffer) 56 | 57 | void Reduce(const c_vector[CObjectID] &object_ids, 58 | CObjectID *created_reduction_id) 59 | 60 | void Reduce(const c_vector[CObjectID] &object_ids, 61 | const CObjectID &reduction_id) 62 | 63 | void Reduce(const c_vector[CObjectID] &object_ids, 64 | CObjectID *created_reduction_id, 65 | ssize_t num_reduce_objects) 66 | 67 | void Reduce(const c_vector[CObjectID] &object_ids, 68 | const CObjectID &reduction_id, 69 | ssize_t num_reduce_objects) 70 | 71 | unordered_set[CObjectID] GetReducedObjects(const CObjectID &reduction_id) 72 | 73 | void Get(const CObjectID &object_id, 74 | shared_ptr[CBuffer] *result) 75 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from setuptools import Extension, setup 4 | from Cython.Build import cythonize 5 | 6 | project_dir = pathlib.Path(__file__).parent.absolute().parent 7 | src_dir = project_dir / 'src' 8 | lib_dir = project_dir / 'build' 9 | 10 | ext_modules = [ 11 | Extension( 12 | "hoplite._hoplite_client", 13 | sources=["hoplite/_hoplite_client.pyx"], 14 | include_dirs=[str(src_dir), 
str(lib_dir)], # lib_dir contains "object_store.grpc.pb.h" 15 | library_dirs=[str(lib_dir)], 16 | libraries=["hoplite_client_lib"], 17 | # this is necessary for the dynamic linking of Linux to 18 | # be working in a distributed environment 19 | extra_link_args=['-Wl,-rpath=' + str(lib_dir)], 20 | ) 21 | ] 22 | 23 | setup(name='hoplite', 24 | zip_safe=False, 25 | packages=['hoplite'], 26 | ext_modules=cythonize(ext_modules)) 27 | -------------------------------------------------------------------------------- /python/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make hoplite importable 4 | script_dir=$(dirname "${BASH_SOURCE[0]}") 5 | site_packages=$(python -c 'import site; print(site.getsitepackages()[0])') 6 | echo $(realpath $script_dir) > $site_packages/easy-install.pth 7 | echo $(realpath $script_dir) > $site_packages/hoplite.egg-link 8 | -------------------------------------------------------------------------------- /src/client/distributed_object_store.h: -------------------------------------------------------------------------------- 1 | #ifndef DISTRIBUTED_OBJECT_STORE_H 2 | #define DISTRIBUTED_OBJECT_STORE_H 3 | 4 | #include 5 | #include 6 | #include 7 | // common headers 8 | #include "common/buffer.h" 9 | #include "common/id.h" 10 | // components headers 11 | #include "global_control_store.h" 12 | #include "local_store_client.h" 13 | #include "notification_listener.h" 14 | #include "object_sender.h" 15 | #include "object_store_state.h" 16 | #include "receiver.h" 17 | 18 | class DistributedObjectStore { 19 | public: 20 | explicit DistributedObjectStore(const std::string &object_directory_address); 21 | 22 | ~DistributedObjectStore(); 23 | 24 | void Put(const std::shared_ptr &buffer, const ObjectID &object_id); 25 | 26 | ObjectID Put(const std::shared_ptr &buffer); 27 | 28 | void Reduce(const std::vector &object_ids, ObjectID *created_reduction_id, ssize_t num_reduce_objects = -1); 29 | 30 | void Reduce(const std::vector &object_ids, const ObjectID &reduction_id, ssize_t num_reduce_objects = -1); 31 | 32 | void Get(const ObjectID &object_id, std::shared_ptr *result); 33 | 34 | bool IsLocalObject(const ObjectID &object_id, int64_t *size); 35 | 36 | std::unordered_set GetReducedObjects(const ObjectID &reduction_id); 37 | 38 | private: 39 | template void reduce_local_objects(const std::vector &object_ids, Buffer *output) { 40 | DCHECK(output->Size() % sizeof(T) == 0) << "Buffer size cannot be divide whole by the element size"; 41 | auto num_elements = output->Size() / sizeof(T); 42 | T *target = (T *)output->MutableData(); 43 | bool first = true; 44 | // TODO: implement parallel reducing 45 | for (const auto &object_id : object_ids) { 46 | // TODO: those object_ids could also be local streams. 
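// Each iteration below fetches one object's buffer from the local store and
// accumulates it element-wise into `target`; the first object is copied rather
// than added, so the output buffer does not need to be zero-initialized.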
47 | ObjectBuffer object_buffer; 48 | DCHECK(local_store_client_.ObjectExists(object_id)) << "ObjectID not in local store"; 49 | local_store_client_.Get(object_id, &object_buffer); 50 | std::shared_ptr buf = object_buffer.data; 51 | const T *data_ptr = (const T *)buf->Data(); 52 | if (!first) { 53 | for (int64_t i = 0; i < num_elements; i++) 54 | target[i] += data_ptr[i]; 55 | } else { 56 | for (int64_t i = 0; i < num_elements; i++) 57 | target[i] = data_ptr[i]; 58 | first = false; 59 | } 60 | } 61 | // TODO: try to pipeline this 62 | output->progress = output->Size(); 63 | } 64 | 65 | // order of fields should be kept for proper initialization order 66 | std::string my_address_; 67 | ObjectStoreState state_; 68 | GlobalControlStoreClient gcs_client_; 69 | LocalStoreClient local_store_client_; 70 | ObjectSender object_sender_; 71 | Receiver receiver_; 72 | NotificationListener notification_listener_; 73 | }; 74 | 75 | #endif // DISTRIBUTED_OBJECT_STORE_H 76 | -------------------------------------------------------------------------------- /src/client/global_control_store.h: -------------------------------------------------------------------------------- 1 | #ifndef GLOBAL_CONTROL_STORE_H 2 | #define GLOBAL_CONTROL_STORE_H 3 | 4 | #include "common/id.h" 5 | #include "object_store.grpc.pb.h" 6 | #include "util/ctpl_stl.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | constexpr int64_t inband_data_size_limit = 65536; 18 | 19 | struct SyncReply { 20 | std::string sender_ip; 21 | size_t object_size; 22 | std::string inband_data; 23 | }; 24 | 25 | class GlobalControlStoreClient { 26 | public: 27 | GlobalControlStoreClient(const std::string ¬ification_server_address, const std::string &my_address, 28 | int notification_server_port); 29 | 30 | void ConnectNotificationServer(); 31 | 32 | // Write object location to the notification server. 33 | void WriteLocation(const ObjectID &object_id, const std::string &my_address, bool finished, size_t object_size, 34 | const uint8_t *inband_data = nullptr, bool blocking = false); 35 | 36 | // Get object location from the notification server. 37 | SyncReply GetLocationSync(const ObjectID &object_id, bool occupying, const std::string &receiver_ip); 38 | 39 | bool HandlePullObjectFailure(const ObjectID &object_id, const std::string &receiver_ip, 40 | std::string *alternative_sender_ip); 41 | 42 | void HandleReceiveReducedObjectFailure(const ObjectID &reduction_id, const std::string &receiver_ip, 43 | const std::string &sender_ip); 44 | 45 | /// Create reduce task 46 | /// \param reduce_dst The IP address of the node that holds the final reduced object. 47 | void CreateReduceTask(const std::vector &objects_to_reduce, const ObjectID &reduction_id, 48 | int num_reduce_objects); 49 | 50 | /// Get the IDs of objects reduced for a reduction ID. 51 | /// \param[in] reduction_id The reduction ID represents the reduce event. 
52 | /// \return A set of reduced object IDs 53 | std::unordered_set GetReducedObjects(const ObjectID &reduction_id); 54 | 55 | private: 56 | const std::string ¬ification_server_address_; 57 | const std::string &my_address_; 58 | const int notification_server_port_; 59 | std::shared_ptr notification_channel_; 60 | std::unique_ptr notification_stub_; 61 | ctpl::thread_pool pool_; 62 | }; 63 | 64 | #endif // GLOBAL_CONTROL_STORE_H 65 | -------------------------------------------------------------------------------- /src/client/local_store_client.h: -------------------------------------------------------------------------------- 1 | #ifndef LOCAL_STORE_H 2 | #define LOCAL_STORE_H 3 | 4 | #include "common/buffer.h" 5 | #include "common/id.h" 6 | #include "common/status.h" 7 | #include 8 | #include 9 | #include 10 | 11 | class LocalStoreClient { 12 | public: 13 | LocalStoreClient(); 14 | 15 | Status Create(const ObjectID &object_id, int64_t data_size, std::shared_ptr *data); 16 | 17 | Status Seal(const ObjectID &object_id); 18 | 19 | // Check if an object exists in the store. 20 | // We assume this function will never fail. 21 | bool ObjectExists(const ObjectID &object_id, bool require_finished = true); 22 | 23 | Status Get(const std::vector &object_ids, std::vector *object_buffers); 24 | 25 | // Get single object from the store. 26 | Status Get(const ObjectID &object_id, ObjectBuffer *object_buffer); 27 | 28 | std::shared_ptr GetBufferNoExcept(const ObjectID &object_id); 29 | 30 | Status GetBufferOrCreate(const ObjectID &object_id, int64_t size, std::shared_ptr *data); 31 | 32 | Status Delete(const ObjectID &object_id); 33 | 34 | Status Wait(const ObjectID &object_id); 35 | 36 | private: 37 | Status create_internal(const ObjectID &object_id, int64_t data_size, std::shared_ptr *data); 38 | bool object_exists_unsafe(const ObjectID &object_id, bool require_finished); 39 | std::mutex local_store_mutex_; 40 | std::unordered_map> buffers_; 41 | size_t total_store_size_; 42 | const size_t lru_bound_size_ = (16LL << 30); 43 | std::queue lru_queue_; 44 | }; 45 | 46 | #endif // LOCAL_STORE_H 47 | -------------------------------------------------------------------------------- /src/client/notification_listener.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #include "common/id.h" 11 | #include "object_store_state.h" 12 | #include "receiver.h" 13 | 14 | class NotificationListenerImpl; 15 | 16 | class NotificationListener { 17 | public: 18 | NotificationListener(const std::string &my_address, int notification_listener_port, ObjectStoreState &state, 19 | Receiver &recevier, LocalStoreClient &local_store_client); 20 | 21 | void Run(); 22 | 23 | void Shutdown(); 24 | 25 | private: 26 | void worker_loop(); 27 | 28 | std::string my_address_; 29 | 30 | ObjectStoreState &state_; 31 | Receiver &recevier_; 32 | LocalStoreClient &local_store_client_; 33 | 34 | std::thread notification_listener_thread_; 35 | std::unique_ptr grpc_server_; 36 | std::shared_ptr service_; 37 | }; 38 | -------------------------------------------------------------------------------- /src/client/object_sender.h: -------------------------------------------------------------------------------- 1 | #ifndef OBJECT_SENDER_H 2 | #define OBJECT_SENDER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include // struct sockaddr_in 10 | 11 | #include "global_control_store.h" 12 | #include 
"local_store_client.h" 13 | #include "object_store.pb.h" 14 | #include "object_store_state.h" 15 | 16 | class ObjectSender { 17 | public: 18 | ObjectSender(ObjectStoreState &state, GlobalControlStoreClient &gcs_client, LocalStoreClient &local_store_client, 19 | const std::string &my_address); 20 | 21 | void Run(); 22 | 23 | void Shutdown(); 24 | 25 | private: 26 | void listener_loop(); 27 | 28 | int send_object(int conn_fd, const ObjectID &object_id, int64_t object_size, int64_t offset); 29 | 30 | int send_reduced_object(int conn_fd, const ObjectID &object_id, int64_t object_size, int64_t offset); 31 | 32 | GlobalControlStoreClient &gcs_client_; 33 | LocalStoreClient &local_store_client_; 34 | ObjectStoreState &state_; 35 | std::string my_address_; 36 | 37 | // for the TCP listener 38 | int server_fd_; 39 | std::thread server_thread_; 40 | struct sockaddr_in address_; 41 | // thread pool for launching tasks 42 | ctpl::thread_pool pool_; 43 | }; 44 | 45 | #endif // OBJECT_SENDER_H 46 | -------------------------------------------------------------------------------- /src/client/object_store_state.cc: -------------------------------------------------------------------------------- 1 | #include "object_store_state.h" 2 | #include "util/logging.h" 3 | 4 | std::shared_ptr ObjectStoreState::create_reduction_stream(const ObjectID &reduction_id, size_t size) { 5 | std::unique_lock l(reduction_stream_mutex_); 6 | DCHECK(reduction_stream_.find(reduction_id) == reduction_stream_.end()); 7 | auto stream = std::make_shared(size); 8 | reduction_stream_[reduction_id] = stream; 9 | l.unlock(); 10 | reduction_stream_cv_.notify_all(); 11 | return stream; 12 | } 13 | 14 | std::shared_ptr ObjectStoreState::get_reduction_stream(const ObjectID &reduction_id) { 15 | std::unique_lock l(reduction_stream_mutex_); 16 | reduction_stream_cv_.wait( 17 | l, [this, &reduction_id]() { return reduction_stream_.find(reduction_id) != reduction_stream_.end(); }); 18 | return reduction_stream_[reduction_id]; 19 | } 20 | 21 | std::shared_ptr ObjectStoreState::get_or_create_reduction_stream(const ObjectID &reduction_id, size_t size) { 22 | std::unique_lock l(reduction_stream_mutex_); 23 | auto search = reduction_stream_.find(reduction_id); 24 | if (search == reduction_stream_.end()) { 25 | auto stream = std::make_shared(size); 26 | reduction_stream_[reduction_id] = stream; 27 | l.unlock(); 28 | reduction_stream_cv_.notify_all(); 29 | return stream; 30 | } else { 31 | return search->second; 32 | } 33 | } 34 | 35 | void ObjectStoreState::release_reduction_stream(const ObjectID &reduction_id) { 36 | std::unique_lock l(reduction_stream_mutex_); 37 | // release the memory 38 | reduction_stream_.erase(reduction_id); 39 | } 40 | 41 | void ObjectStoreState::create_local_reduce_task(const ObjectID &reduction_id, 42 | const std::vector &local_objects) { 43 | DCHECK(local_objects.size() <= 1); 44 | auto t = std::make_shared(); 45 | if (!local_objects.empty()) { 46 | t->local_object = local_objects[0]; 47 | } 48 | { 49 | std::lock_guard lock(reduce_tasks_mutex_); 50 | reduce_tasks_[reduction_id] = t; 51 | } 52 | } 53 | 54 | std::shared_ptr ObjectStoreState::get_local_reduce_task(const ObjectID &reduction_id) { 55 | std::lock_guard lock(reduce_tasks_mutex_); 56 | DCHECK(reduce_tasks_.count(reduction_id)); 57 | return reduce_tasks_[reduction_id]; 58 | } 59 | 60 | void ObjectStoreState::remove_local_reduce_task(const ObjectID &reduction_id) { 61 | std::lock_guard lock(reduce_tasks_mutex_); 62 | DCHECK(reduce_tasks_.count(reduction_id)); 63 | 
reduce_tasks_.erase(reduction_id); 64 | } 65 | 66 | bool ObjectStoreState::local_reduce_task_exists(const ObjectID &reduction_id) { 67 | std::lock_guard lock(reduce_tasks_mutex_); 68 | return reduce_tasks_.count(reduction_id) > 0; 69 | } 70 | -------------------------------------------------------------------------------- /src/client/object_store_state.h: -------------------------------------------------------------------------------- 1 | #ifndef OBJECT_STORE_STATE_H 2 | #define OBJECT_STORE_STATE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "common/buffer.h" 12 | #include "common/id.h" 13 | 14 | class LocalReduceTask { 15 | public: 16 | LocalReduceTask() : is_finished_(false) {} 17 | 18 | ObjectID local_object; 19 | 20 | void Wait() { 21 | std::unique_lock l(notification_mutex_); 22 | notification_cv_.wait(l, [this]() { return is_finished_.load(); }); 23 | } 24 | 25 | void NotifyFinished() { 26 | std::unique_lock l(notification_mutex_); 27 | is_finished_ = true; 28 | notification_cv_.notify_all(); 29 | } 30 | 31 | private: 32 | std::atomic is_finished_; 33 | std::mutex notification_mutex_; 34 | std::condition_variable notification_cv_; 35 | }; 36 | 37 | class ObjectStoreState { 38 | 39 | public: 40 | std::shared_ptr create_reduction_stream(const ObjectID &reduction_id, size_t size); 41 | 42 | std::shared_ptr get_reduction_stream(const ObjectID &reduction_id); 43 | 44 | std::shared_ptr get_or_create_reduction_stream(const ObjectID &reduction_id, size_t size); 45 | 46 | void release_reduction_stream(const ObjectID &reduction_id); 47 | 48 | void create_local_reduce_task(const ObjectID &reduction_id, const std::vector &local_objects); 49 | 50 | std::shared_ptr get_local_reduce_task(const ObjectID &reduction_id); 51 | 52 | void remove_local_reduce_task(const ObjectID &reduction_id); 53 | 54 | bool local_reduce_task_exists(const ObjectID &reduction_id); 55 | 56 | private: 57 | std::mutex reduction_stream_mutex_; 58 | std::condition_variable reduction_stream_cv_; 59 | std::unordered_map> reduction_stream_; 60 | 61 | std::mutex reduce_tasks_mutex_; 62 | std::unordered_map> reduce_tasks_; 63 | }; 64 | 65 | #endif // OBJECT_STORE_STATE_H 66 | -------------------------------------------------------------------------------- /src/common/buffer.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "util/logging.h" 4 | #include "common/buffer.h" 5 | 6 | Buffer::Buffer(uint8_t* data_ptr, int64_t size): progress(size), data_ptr_(data_ptr), size_(size), is_data_owner_(false) {} 7 | 8 | Buffer::Buffer(int64_t size): progress(0), size_(size), is_data_owner_(true) { 9 | data_ptr_ = new uint8_t[size]; 10 | } 11 | 12 | uint8_t* Buffer::MutableData() { return data_ptr_; } 13 | const uint8_t* Buffer::Data() const { return data_ptr_; } 14 | int64_t Buffer::Size() const { return size_; } 15 | uint64_t Buffer::Hash() const { 16 | return MurmurHash64A(data_ptr_, size_, 0); 17 | } 18 | 19 | void Buffer::CopyFrom(const std::vector &data) { 20 | DCHECK(data.size() == size_) << "input size mismatch"; 21 | std::copy(data.begin(), data.end(), data_ptr_); 22 | Seal(); 23 | } 24 | 25 | void Buffer::CopyFrom(const uint8_t *data, size_t size) { 26 | DCHECK(size == size_) << "input size mismatch"; 27 | std::memcpy(data_ptr_, data, size); 28 | Seal(); 29 | } 30 | 31 | void Buffer::CopyFrom(const Buffer &buffer) { 32 | DCHECK(buffer.Size() == size_) << "input size mismatch"; 33 | std::memcpy(data_ptr_, 
buffer.Data(), buffer.Size()); 34 | Seal(); 35 | } 36 | 37 | void Buffer::CopyFrom(const std::string &data) { 38 | CopyFrom((const uint8_t *)data.data(), data.size()); 39 | Seal(); 40 | } 41 | 42 | void Buffer::StreamCopy(const Buffer &src) { 43 | DCHECK(src.IsFinished()) << "Copy from a unfinished buffer"; 44 | const uint8_t *data = src.Data(); 45 | int64_t size = src.Size(); 46 | DCHECK(size == Size()) << "Size mismatch for copying."; 47 | size_t copy_size = size / 1024; 48 | // trade off 'copy_size' between performance and latency 49 | if (copy_size < 4096) { 50 | copy_size = 4096; 51 | } else if (copy_size > 2 << 20) { 52 | copy_size = 2 << 20; 53 | } else { 54 | // align to 64 55 | copy_size = (copy_size >> 6) << 6; 56 | } 57 | uint8_t *dst = MutableData(); 58 | size_t cursor = 0; 59 | while (copy_size + cursor <= size) { 60 | memcpy(dst + cursor, data + cursor, copy_size); 61 | progress += copy_size; 62 | cursor += copy_size; 63 | } 64 | memcpy(dst + cursor, data + cursor, size - cursor); 65 | progress = cursor; 66 | } 67 | 68 | void Buffer::Wait() { 69 | std::unique_lock l(notification_mutex_); 70 | notification_cv_.wait(l, [this]() { return IsFinished(); }); 71 | } 72 | 73 | void Buffer::NotifyFinished() { 74 | std::unique_lock l(notification_mutex_); 75 | DCHECK(IsFinished()) << "The buffer has not been finished"; 76 | notification_cv_.notify_all(); 77 | } 78 | 79 | void Buffer::ShrinkForLRU() { 80 | delete[] data_ptr_; 81 | data_ptr_ = new uint8_t[4]; 82 | size_ = 4; 83 | } 84 | 85 | Buffer::~Buffer() { 86 | if (is_data_owner_) { 87 | delete[] data_ptr_; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/common/buffer.h: -------------------------------------------------------------------------------- 1 | #ifndef BUFFER_H 2 | #define BUFFER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "common/config.h" 11 | #include "util/hash.h" 12 | 13 | class Buffer { 14 | public: 15 | Buffer(uint8_t* data_ptr, int64_t size); 16 | explicit Buffer(int64_t size); 17 | 18 | void CopyFrom(const std::vector &data); 19 | void CopyFrom(const uint8_t *data, size_t size); 20 | void CopyFrom(const Buffer &buffer); 21 | void CopyFrom(const std::string &data); 22 | void StreamCopy(const Buffer &buffer); 23 | 24 | uint8_t* MutableData(); 25 | const uint8_t* Data() const; 26 | int64_t Size() const; 27 | uint64_t Hash() const; 28 | void ShrinkForLRU(); 29 | void Seal() { progress = size_; } 30 | bool IsFinished() const { return progress >= size_; } 31 | ~Buffer(); 32 | 33 | void Wait(); 34 | void NotifyFinished(); 35 | #ifdef HOPLITE_ENABLE_ATOMIC_BUFFER_PROGRESS 36 | std::atomic_int64_t progress; 37 | #else 38 | volatile int64_t progress; 39 | #endif 40 | volatile bool reset = false; 41 | private: 42 | uint8_t* data_ptr_; 43 | int64_t size_; 44 | bool is_data_owner_; 45 | std::mutex notification_mutex_; 46 | std::condition_variable notification_cv_; 47 | }; 48 | 49 | struct ObjectBuffer { 50 | std::shared_ptr data; 51 | uint8_t* metadata; 52 | int32_t device_num = 0; 53 | }; 54 | 55 | #endif // BUFFER_H -------------------------------------------------------------------------------- /src/common/config.h: -------------------------------------------------------------------------------- 1 | #ifndef _HOPLITE_COMMON_CONFIG_H_ 2 | #define _HOPLITE_COMMON_CONFIG_H_ 3 | 4 | // Enable non-blocking for the socket that receiving objects. 
5 | #define HOPLITE_ENABLE_NONBLOCKING_SOCKET_RECV 6 | 7 | constexpr int64_t STREAM_MAX_BLOCK_SIZE = 4 * (2 << 20); // 4MB 8 | 9 | // Enable ACK for sending/receiving buffers. Usually used for debugging. 10 | // FIXME(suquark): Disable ACK would cause numeric mismatch. 11 | #define HOPLITE_ENABLE_ACK 12 | 13 | // The constant for RPC latency (in seconds) 14 | #define HOPLITE_RPC_LATENCY (750 * 1e-6) 15 | 16 | // The constanf for bandwidth (in bytes/second) 17 | #define HOPLITE_BANDWIDTH (9.68 * (1 << 30) / 8) 18 | 19 | // Use atomic type for buffer progress. 20 | // #define HOPLITE_ENABLE_ATOMIC_BUFFER_PROGRESS 21 | 22 | // Maximum inflow concurrency for a node 23 | #define HOPLITE_MAX_INFLOW_CONCURRENCY 2 24 | 25 | // Maximum outflow concurrency for a node 26 | #define HOPLITE_MAX_OUTLOW_CONCURRENCY 2 27 | 28 | // The thread pool size for the distributed store to launch 29 | // RPCs like `InvokeReduceTo` and `InvokeRedirectReduce`. 30 | #define HOPLITE_THREADPOOL_SIZE_FOR_RPC 10 31 | 32 | #define HOPLITE_MULTITHREAD_REDUCE_SIZE (1 << 28) 33 | 34 | // Make the Put() call blocking on 'WriteLocation' 35 | #ifndef HOPLITE_PUT_BLOCKING 36 | #define HOPLITE_PUT_BLOCKING false 37 | #endif 38 | 39 | // NOTE: SO_ZEROCOPY & TCP_NODELAY is not working. 40 | 41 | // Default ports 42 | #define HOPLITE_SENDER_PORT 20210 43 | #define HOPLITE_RECEIVER_PORT 20211 44 | #define OBJECT_DIRECTORY_PORT 7777 45 | #define OBJECT_DIRECTORY_LISTENER_PORT 8888 46 | 47 | #endif // _HOPLITE_COMMON_CONFIG_H_ 48 | -------------------------------------------------------------------------------- /src/common/id.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "common/id.h" 3 | 4 | std::string ObjectID::ToString() const { 5 | return std::string("ObjectID(") + Hex() + ")"; 6 | } 7 | 8 | #define ID_OSTREAM_OPERATOR(id_type) \ 9 | std::ostream &operator<<(std::ostream &os, const id_type &id) { \ 10 | if (id.IsNil()) { \ 11 | os << "NIL_ID"; \ 12 | } else { \ 13 | os << id.Hex(); \ 14 | } \ 15 | return os; \ 16 | } 17 | 18 | ID_OSTREAM_OPERATOR(ObjectID); -------------------------------------------------------------------------------- /src/common/status.cc: -------------------------------------------------------------------------------- 1 | #include "common/status.h" 2 | 3 | #include 4 | 5 | Status::Status(StatusCode code, const std::string &msg) { 6 | assert(code != StatusCode::OK); 7 | state_ = new State; 8 | state_->code = code; 9 | state_->msg = msg; 10 | } 11 | 12 | void Status::CopyFrom(const State *state) { 13 | delete state_; 14 | if (state == nullptr) { 15 | state_ = nullptr; 16 | } else { 17 | state_ = new State(*state); 18 | } 19 | } 20 | 21 | std::string Status::CodeAsString() const { 22 | if (state_ == NULL) { 23 | return "OK"; 24 | } 25 | 26 | const char *type; 27 | switch (code()) { 28 | case StatusCode::OK: 29 | type = "OK"; 30 | break; 31 | case StatusCode::OutOfMemory: 32 | type = "Out of memory"; 33 | break; 34 | case StatusCode::KeyError: 35 | type = "Key error"; 36 | break; 37 | case StatusCode::TypeError: 38 | type = "Type error"; 39 | break; 40 | case StatusCode::Invalid: 41 | type = "Invalid"; 42 | break; 43 | case StatusCode::IOError: 44 | type = "IOError"; 45 | break; 46 | case StatusCode::ObjectExists: 47 | type = "ObjectExists"; 48 | break; 49 | case StatusCode::ObjectStoreFull: 50 | type = "ObjectStoreFull"; 51 | break; 52 | case StatusCode::UnknownError: 53 | type = "Unknown error"; 54 | break; 55 | case StatusCode::NotImplemented: 56 | 
type = "NotImplemented"; 57 | break; 58 | case StatusCode::RedisError: 59 | type = "RedisError"; 60 | break; 61 | case StatusCode::TimedOut: 62 | type = "TimedOut"; 63 | break; 64 | case StatusCode::Interrupted: 65 | type = "Interrupted"; 66 | break; 67 | default: 68 | type = "Unknown"; 69 | break; 70 | } 71 | return std::string(type); 72 | } 73 | 74 | std::string Status::ToString() const { 75 | std::string result(CodeAsString()); 76 | if (state_ == NULL) { 77 | return result; 78 | } 79 | result += ": "; 80 | result += state_->msg; 81 | return result; 82 | } 83 | -------------------------------------------------------------------------------- /src/object_directory/notification.h: -------------------------------------------------------------------------------- 1 | #ifndef NOTIFICATION_H 2 | #define NOTIFICATION_H 3 | 4 | #include "common/id.h" 5 | #include "util/logging.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | class NotificationServiceImpl; 12 | 13 | class NotificationServer { 14 | public: 15 | NotificationServer(const std::string &my_address, int notification_server_port, 16 | int notification_listener_port); 17 | 18 | std::thread Run() { 19 | std::thread notification_thread(&NotificationServer::worker_loop, this); 20 | return notification_thread; 21 | } 22 | 23 | private: 24 | void worker_loop(); 25 | 26 | const int notification_server_port_; 27 | const int notification_listener_port_; 28 | 29 | std::unique_ptr grpc_server_; 30 | std::shared_ptr service_; 31 | }; 32 | 33 | #endif // NOTIFICATION_H 34 | -------------------------------------------------------------------------------- /src/tests/allgather_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Make the Put() call blocking for more precise timing. 
8 | #define HOPLITE_PUT_BLOCKING true 9 | 10 | #include "distributed_object_store.h" 11 | #include "util/logging.h" 12 | #include "util/socket_utils.h" 13 | #include "util/test_utils.h" 14 | 15 | int main(int argc, char **argv) { 16 | // argv: *, object_directory_address, object_size, n_trials 17 | std::string object_directory_address = std::string(argv[1]); 18 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 19 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 20 | std::string my_address = get_host_ipaddress(); 21 | MPI_Init(NULL, NULL); 22 | int world_rank; 23 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 24 | int world_size; 25 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 26 | 27 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 28 | 29 | TIMELINE("main"); 30 | 31 | DistributedObjectStore store(object_directory_address); 32 | 33 | for (int trial = 0; trial < n_trials; trial++) { 34 | std::vector object_ids; 35 | float sum = 0; 36 | for (int i = 0; i < world_size; i++) { 37 | auto oid = object_id_from_integer(trial * 1000000 + i); 38 | object_ids.push_back(oid); 39 | auto rnum = get_uniform_random_float(oid.Hex()); 40 | sum += rnum; 41 | } 42 | DCHECK(object_size % sizeof(float) == 0); 43 | 44 | ObjectID rank_object_id = object_ids[world_rank]; 45 | std::unordered_map> gather_result; 46 | 47 | put_random_buffer(store, rank_object_id, object_size); 48 | 49 | MPI_Barrier(MPI_COMM_WORLD); 50 | 51 | auto start = std::chrono::system_clock::now(); 52 | for (auto &object_id : object_ids) { 53 | store.Get(object_id, &gather_result[object_id]); 54 | } 55 | auto end = std::chrono::system_clock::now(); 56 | std::chrono::duration duration = end - start; 57 | LOG(INFO) << "Allgather finished. duration = " << duration.count(); 58 | uint32_t sum_crc = 0; 59 | for (auto &object_id : object_ids) { 60 | sum_crc += gather_result[object_id]->Hash(); 61 | } 62 | LOG(INFO) << "Hash for objects is " << sum_crc; 63 | MPI_Barrier(MPI_COMM_WORLD); 64 | } 65 | MPI_Barrier(MPI_COMM_WORLD); 66 | MPI_Finalize(); 67 | return 0; 68 | } 69 | -------------------------------------------------------------------------------- /src/tests/allreduce_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Make the Put() call blocking for more precise timing. 
9 | #define HOPLITE_PUT_BLOCKING true 10 | 11 | #include "common/buffer.h" 12 | #include "common/id.h" 13 | #include "distributed_object_store.h" 14 | #include "util/logging.h" 15 | #include "util/socket_utils.h" 16 | #include "util/test_utils.h" 17 | 18 | int main(int argc, char **argv) { 19 | // argv: *, object_directory_address, object_size, n_trials 20 | std::string object_directory_address = std::string(argv[1]); 21 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 22 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 23 | std::string my_address = get_host_ipaddress(); 24 | MPI_Init(NULL, NULL); 25 | int world_rank; 26 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 27 | int world_size; 28 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 29 | 30 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 31 | 32 | TIMELINE("main"); 33 | 34 | DistributedObjectStore store(object_directory_address); 35 | 36 | for (int trial = 0; trial < n_trials; trial++) { 37 | ObjectID reduction_id = object_id_from_integer(trial * 1000000 + 99999); 38 | std::vector object_ids; 39 | float sum = 0; 40 | for (int i = 0; i < world_size; i++) { 41 | auto oid = object_id_from_integer(trial * 1000000 + i); 42 | object_ids.push_back(oid); 43 | auto rnum = get_uniform_random_float(oid.Hex()); 44 | sum += rnum; 45 | } 46 | DCHECK(object_size % sizeof(float) == 0); 47 | 48 | ObjectID rank_object_id = object_ids[world_rank]; 49 | std::shared_ptr reduction_result; 50 | 51 | put_random_buffer(store, rank_object_id, object_size); 52 | MPI_Barrier(MPI_COMM_WORLD); 53 | 54 | auto start = std::chrono::system_clock::now(); 55 | if (world_rank == 0) { 56 | store.Reduce(object_ids, reduction_id); 57 | } 58 | store.Get(reduction_id, &reduction_result); 59 | auto end = std::chrono::system_clock::now(); 60 | std::chrono::duration duration = end - start; 61 | LOG(INFO) << reduction_id.ToString() << " is reduced. duration = " << duration.count(); 62 | print_reduction_result(reduction_id, reduction_result, sum); 63 | MPI_Barrier(MPI_COMM_WORLD); 64 | } 65 | 66 | MPI_Barrier(MPI_COMM_WORLD); 67 | MPI_Finalize(); 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /src/tests/gather_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Make the Put() call blocking for more precise timing. 
8 | #define HOPLITE_PUT_BLOCKING true 9 | 10 | #include "distributed_object_store.h" 11 | #include "util/logging.h" 12 | #include "util/socket_utils.h" 13 | #include "util/test_utils.h" 14 | 15 | int main(int argc, char **argv) { 16 | // argv: *, object_directory_address, object_size, n_trials 17 | std::string object_directory_address = std::string(argv[1]); 18 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 19 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 20 | std::string my_address = get_host_ipaddress(); 21 | MPI_Init(NULL, NULL); 22 | int world_rank; 23 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 24 | int world_size; 25 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 26 | 27 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 28 | 29 | TIMELINE("main"); 30 | 31 | DistributedObjectStore store(object_directory_address); 32 | 33 | for (int trial = 0; trial < n_trials; trial++) { 34 | std::vector object_ids; 35 | float sum = 0; 36 | for (int i = 0; i < world_size; i++) { 37 | auto oid = object_id_from_integer(trial * 1000000 + i); 38 | object_ids.push_back(oid); 39 | auto rnum = get_uniform_random_float(oid.Hex()); 40 | sum += rnum; 41 | } 42 | DCHECK(object_size % sizeof(float) == 0); 43 | 44 | ObjectID rank_object_id = object_ids[world_rank]; 45 | std::unordered_map> gather_result; 46 | 47 | put_random_buffer(store, rank_object_id, object_size); 48 | 49 | MPI_Barrier(MPI_COMM_WORLD); 50 | 51 | if (world_rank == 0) { 52 | auto start = std::chrono::system_clock::now(); 53 | for (auto &object_id : object_ids) { 54 | store.Get(object_id, &gather_result[object_id]); 55 | } 56 | auto end = std::chrono::system_clock::now(); 57 | std::chrono::duration duration = end - start; 58 | LOG(INFO) << "Objects gathered. duration = " << duration.count(); 59 | 60 | uint32_t sum_crc = 0; 61 | for (auto &object_id : object_ids) { 62 | sum_crc += gather_result[object_id]->Hash(); 63 | } 64 | LOG(INFO) << "Hash for objects is " << sum_crc; 65 | } 66 | MPI_Barrier(MPI_COMM_WORLD); 67 | } 68 | 69 | MPI_Barrier(MPI_COMM_WORLD); 70 | MPI_Finalize(); 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /src/tests/multicast_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | // Make the Put() call blocking for more precise timing. 
7 | #define HOPLITE_PUT_BLOCKING true 8 | 9 | #include "common/buffer.h" 10 | #include "common/id.h" 11 | #include "distributed_object_store.h" 12 | #include "util/logging.h" 13 | #include "util/socket_utils.h" 14 | #include "util/test_utils.h" 15 | 16 | int main(int argc, char **argv) { 17 | // argv: *, object_directory_address, object_size, n_trials 18 | std::string object_directory_address = std::string(argv[1]); 19 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 20 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 21 | std::string my_address = get_host_ipaddress(); 22 | MPI_Init(NULL, NULL); 23 | int world_rank; 24 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 25 | int world_size; 26 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 27 | 28 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 29 | 30 | TIMELINE("main"); 31 | 32 | DistributedObjectStore store(object_directory_address); 33 | 34 | for (int trial = 0; trial < n_trials; trial++) { 35 | ObjectID object_id = object_id_from_integer(trial); 36 | std::shared_ptr result; 37 | 38 | if (world_rank == 0) { 39 | result = std::make_shared(object_size); 40 | uint8_t *buf = result->MutableData(); 41 | for (int64_t i = 0; i < object_size; i++) { 42 | buf[i] = i % 256; 43 | } 44 | result->Seal(); 45 | store.Put(result, object_id); 46 | 47 | LOG(INFO) << object_id.ToString() << " is created!" 48 | << " Hash = " << result->Hash(); 49 | 50 | LOG(INFO) << "entering barrier"; 51 | MPI_Barrier(MPI_COMM_WORLD); 52 | } else { 53 | 54 | LOG(INFO) << "entering barrier"; 55 | MPI_Barrier(MPI_COMM_WORLD); 56 | auto start = std::chrono::system_clock::now(); 57 | store.Get(object_id, &result); 58 | auto end = std::chrono::system_clock::now(); 59 | std::chrono::duration duration = end - start; 60 | 61 | LOG(INFO) << object_id.ToString() << " is retrieved. Hash = " << result->Hash(); 62 | LOG(INFO) << "Retrieving duration = " << duration.count(); 63 | } 64 | MPI_Barrier(MPI_COMM_WORLD); 65 | } 66 | MPI_Barrier(MPI_COMM_WORLD); 67 | MPI_Finalize(); 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /src/tests/reduce_dependency_test.cc: -------------------------------------------------------------------------------- 1 | #include "object_directory/reduce_dependency.h" 2 | #include 3 | 4 | int main() { 5 | std::cout << ReduceTreeChain(128, 8).DebugString(); 6 | std::cout << ReduceTreeChain(152, 24).DebugString(); 7 | std::cout << ReduceTreeChain(61, 2).DebugString(); 8 | std::cout << ReduceTreeChain(32, 44).DebugString(); 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /src/tests/reduce_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Make the Put() call blocking for more precise timing. 
8 | #define HOPLITE_PUT_BLOCKING true 9 | 10 | #include "distributed_object_store.h" 11 | #include "util/logging.h" 12 | #include "util/socket_utils.h" 13 | #include "util/test_utils.h" 14 | 15 | int main(int argc, char **argv) { 16 | // argv: *, object_directory_address, object_size, n_trials 17 | std::string object_directory_address = std::string(argv[1]); 18 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 19 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 20 | std::string my_address = get_host_ipaddress(); 21 | MPI_Init(NULL, NULL); 22 | int world_rank; 23 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 24 | int world_size; 25 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 26 | 27 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 28 | 29 | TIMELINE("main"); 30 | 31 | DistributedObjectStore store(object_directory_address); 32 | 33 | for (int trial = 0; trial < n_trials; trial++) { 34 | ObjectID reduction_id = object_id_from_integer(trial * 1000000 + 99999); 35 | std::vector object_ids; 36 | float sum = 0; 37 | for (int i = 0; i < world_size; i++) { 38 | auto oid = object_id_from_integer(trial * 1000000 + i); 39 | object_ids.push_back(oid); 40 | auto rnum = get_uniform_random_float(oid.Hex()); 41 | sum += rnum; 42 | } 43 | DCHECK(object_size % sizeof(float) == 0); 44 | 45 | ObjectID rank_object_id = object_ids[world_rank]; 46 | std::shared_ptr reduction_result; 47 | 48 | put_random_buffer(store, rank_object_id, object_size); 49 | 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | 52 | if (world_rank == 0) { 53 | auto start = std::chrono::system_clock::now(); 54 | store.Reduce(object_ids, reduction_id); 55 | store.Get(reduction_id, &reduction_result); 56 | auto end = std::chrono::system_clock::now(); 57 | std::chrono::duration duration = end - start; 58 | LOG(INFO) << reduction_id.ToString() << " is reduced. 
duration = " << duration.count(); 59 | print_reduction_result(reduction_id, reduction_result, sum); 60 | } 61 | MPI_Barrier(MPI_COMM_WORLD); 62 | } 63 | MPI_Barrier(MPI_COMM_WORLD); 64 | MPI_Finalize(); 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /src/tests/subset_reduce_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "distributed_object_store.h" 8 | #include "util/logging.h" 9 | #include "util/socket_utils.h" 10 | #include "util/test_utils.h" 11 | 12 | int main(int argc, char **argv) { 13 | // argv: *, object_directory_address, object_size, n_trials 14 | std::string object_directory_address = std::string(argv[1]); 15 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 16 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 17 | std::string my_address = get_host_ipaddress(); 18 | MPI_Init(NULL, NULL); 19 | int world_rank; 20 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 21 | int world_size; 22 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 23 | 24 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 25 | 26 | TIMELINE("main"); 27 | 28 | DistributedObjectStore store(object_directory_address); 29 | 30 | for (int trial = 0; trial < n_trials; trial++) { 31 | if (world_rank == 0) { 32 | LOG(INFO) << "\n\n\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Trail #" << trial << "/" << n_trials << "\n\n\n"; 33 | } 34 | ObjectID reduction_id = object_id_from_integer(trial * 1000000 + 99999); 35 | std::vector object_ids; 36 | for (int i = 0; i < world_size; i++) { 37 | auto oid = object_id_from_integer(trial * 1000000 + i); 38 | object_ids.push_back(oid); 39 | } 40 | DCHECK(object_size % sizeof(float) == 0); 41 | 42 | ObjectID rank_object_id = object_ids[world_rank]; 43 | std::shared_ptr reduction_result; 44 | int num_reduce_objects = world_size / 2; 45 | 46 | put_fixed_buffer(store, rank_object_id, object_size, (float)world_rank); 47 | 48 | MPI_Barrier(MPI_COMM_WORLD); 49 | 50 | if (world_rank == 0) { 51 | auto start = std::chrono::system_clock::now(); 52 | store.Reduce(object_ids, reduction_id, num_reduce_objects); 53 | store.Get(reduction_id, &reduction_result); 54 | auto end = std::chrono::system_clock::now(); 55 | std::chrono::duration duration = end - start; 56 | LOG(INFO) << "Reducing " << num_reduce_objects << " objects"; 57 | LOG(INFO) << reduction_id.ToString() << " is reduced. duration = " << duration.count(); 58 | std::unordered_set reduced_objects; 59 | reduced_objects = store.GetReducedObjects(reduction_id); 60 | for (const auto &reduced_object : reduced_objects) { 61 | LOG(INFO) << "Reduced object: " << reduced_object.ToString(); 62 | } 63 | print_reduction_result(reduction_id, reduction_result, 0.0); 64 | } 65 | MPI_Barrier(MPI_COMM_WORLD); 66 | } 67 | MPI_Barrier(MPI_COMM_WORLD); 68 | MPI_Finalize(); 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /src/util/hash.cc: -------------------------------------------------------------------------------- 1 | #include "util/hash.h" 2 | // This code is from https://sites.google.com/site/murmurhash/ 3 | // and is public domain. 
4 | uint64_t MurmurHash64A(const void *key, int len, unsigned int seed) { 5 | const uint64_t m = 0xc6a4a7935bd1e995; 6 | const int r = 47; 7 | 8 | uint64_t h = seed ^ (len * m); 9 | 10 | const auto *data = reinterpret_cast(key); 11 | const uint64_t *end = data + (len / 8); 12 | 13 | while (data != end) { 14 | uint64_t k = *data++; 15 | 16 | k *= m; 17 | k ^= k >> r; 18 | k *= m; 19 | 20 | h ^= k; 21 | h *= m; 22 | } 23 | 24 | const auto *data2 = reinterpret_cast(data); 25 | 26 | switch (len & 7) { 27 | case 7: 28 | h ^= uint64_t(data2[6]) << 48; 29 | case 6: 30 | h ^= uint64_t(data2[5]) << 40; 31 | case 5: 32 | h ^= uint64_t(data2[4]) << 32; 33 | case 4: 34 | h ^= uint64_t(data2[3]) << 24; 35 | case 3: 36 | h ^= uint64_t(data2[2]) << 16; 37 | case 2: 38 | h ^= uint64_t(data2[1]) << 8; 39 | case 1: 40 | h ^= uint64_t(data2[0]); 41 | h *= m; 42 | } 43 | 44 | h ^= h >> r; 45 | h *= m; 46 | h ^= h >> r; 47 | 48 | return h; 49 | } 50 | -------------------------------------------------------------------------------- /src/util/hash.h: -------------------------------------------------------------------------------- 1 | #include 2 | uint64_t MurmurHash64A(const void *key, int len, unsigned int seed); 3 | -------------------------------------------------------------------------------- /src/util/protobuf_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/logging.h" 4 | #include "util/socket_utils.h" 5 | 6 | template inline void SendProtobufMessage(int conn_fd, const T &message) { 7 | size_t message_size = message.ByteSizeLong(); 8 | auto status = send_all(conn_fd, (void *)&message_size, sizeof(message_size)); 9 | DCHECK(!status) << "socket send error: message_size"; 10 | 11 | std::vector message_buf(message_size); 12 | message.SerializeWithCachedSizesToArray(message_buf.data()); 13 | 14 | status = send_all(conn_fd, (void *)message_buf.data(), message_buf.size()); 15 | DCHECK(!status) << "socket send error: message"; 16 | } 17 | 18 | template inline void ReceiveProtobufMessage(int conn_fd, T *message) { 19 | size_t message_len; 20 | int status = recv_all(conn_fd, &message_len, sizeof(message_len)); 21 | DCHECK(!status) << "receive message_len failed"; 22 | 23 | std::vector message_buf(message_len); 24 | status = recv_all(conn_fd, message_buf.data(), message_len); 25 | DCHECK(!status) << "receive message failed"; 26 | 27 | message->ParseFromArray(message_buf.data(), message_buf.size()); 28 | } 29 | -------------------------------------------------------------------------------- /src/util/socket_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef SOCKET_UTILS_H 2 | #define SOCKET_UTILS_H 3 | 4 | #include 5 | 6 | int send_all(int conn_fd, const void *buf, const size_t size); 7 | 8 | int recv_all(int conn_fd, void *buf, const size_t size); 9 | 10 | int tcp_connect(const std::string &ip_address, int port, int *conn_fd); 11 | 12 | void tcp_bind_and_listen(int port, struct sockaddr_in *address, int *server_fd); 13 | 14 | void recv_ack(int fd); 15 | 16 | void send_ack(int fd); 17 | 18 | std::string get_host_ipaddress(); 19 | 20 | #endif // SOCKET_UTILS_H 21 | -------------------------------------------------------------------------------- /test_utils/get_worker_ips.py: -------------------------------------------------------------------------------- 1 | import ray 2 | import socket 3 | ray.init(address="auto") 4 | d = ray.cluster_resources() 5 | my_addr = 
--------------------------------------------------------------------------------
/src/util/protobuf_utils.h:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>
#include <vector>

#include "util/logging.h"
#include "util/socket_utils.h"

// Send a protobuf message as an 8-byte size header followed by the serialized body.
template <typename T> inline void SendProtobufMessage(int conn_fd, const T &message) {
  size_t message_size = message.ByteSizeLong();
  auto status = send_all(conn_fd, (void *)&message_size, sizeof(message_size));
  DCHECK(!status) << "socket send error: message_size";

  std::vector<uint8_t> message_buf(message_size);
  message.SerializeWithCachedSizesToArray(message_buf.data());

  status = send_all(conn_fd, (void *)message_buf.data(), message_buf.size());
  DCHECK(!status) << "socket send error: message";
}

// Receive the size header, then parse exactly that many bytes into the message.
template <typename T> inline void ReceiveProtobufMessage(int conn_fd, T *message) {
  size_t message_len;
  int status = recv_all(conn_fd, &message_len, sizeof(message_len));
  DCHECK(!status) << "receive message_len failed";

  std::vector<uint8_t> message_buf(message_len);
  status = recv_all(conn_fd, message_buf.data(), message_len);
  DCHECK(!status) << "receive message failed";

  message->ParseFromArray(message_buf.data(), message_buf.size());
}

--------------------------------------------------------------------------------
/src/util/socket_utils.h:
--------------------------------------------------------------------------------
#ifndef SOCKET_UTILS_H
#define SOCKET_UTILS_H

#include <string>

int send_all(int conn_fd, const void *buf, const size_t size);

int recv_all(int conn_fd, void *buf, const size_t size);

int tcp_connect(const std::string &ip_address, int port, int *conn_fd);

void tcp_bind_and_listen(int port, struct sockaddr_in *address, int *server_fd);

void recv_ack(int fd);

void send_ack(int fd);

std::string get_host_ipaddress();

#endif // SOCKET_UTILS_H
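
Note (not part of the repository): SendProtobufMessage and ReceiveProtobufMessage above frame every message as an 8-byte length followed by the serialized bytes. The sketch below shows the same framing for a raw byte payload, relying only on the send_all and recv_all declarations from socket_utils.h; the function names are illustrative.

#include <cstdint>
#include <vector>

#include "util/logging.h"
#include "util/socket_utils.h"

// Send a size header followed by the payload bytes.
inline void SendFramedBytes(int conn_fd, const std::vector<uint8_t> &payload) {
  size_t payload_size = payload.size();
  int status = send_all(conn_fd, &payload_size, sizeof(payload_size));
  DCHECK(!status) << "socket send error: payload_size";
  status = send_all(conn_fd, payload.data(), payload.size());
  DCHECK(!status) << "socket send error: payload";
}

// Receive the size header, then exactly that many payload bytes.
inline std::vector<uint8_t> ReceiveFramedBytes(int conn_fd) {
  size_t payload_size = 0;
  int status = recv_all(conn_fd, &payload_size, sizeof(payload_size));
  DCHECK(!status) << "receive payload_size failed";
  std::vector<uint8_t> payload(payload_size);
  status = recv_all(conn_fd, payload.data(), payload_size);
  DCHECK(!status) << "receive payload failed";
  return payload;
}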
--------------------------------------------------------------------------------
/test_utils/get_worker_ips.py:
--------------------------------------------------------------------------------
import ray
import socket

ray.init(address="auto")
d = ray.cluster_resources()
my_addr = socket.gethostbyname(socket.gethostname())
# Ray exposes one 'node:<ip>' resource per node; print every worker IP except ours.
for k in d:
    if k.startswith('node'):
        ip = k.split(':')[1]
        if ip != my_addr:
            print(ip)

--------------------------------------------------------------------------------
/test_utils/load_cluster_env.sh:
--------------------------------------------------------------------------------
# This file should only be sourced.

MY_IPADDR=$(hostname -i)
# OTHERS_IPADDR=()
# for s in $(ray get-worker-ips ~/ray_bootstrap_config.yaml); do
#   OTHERS_IPADDR+=($(ssh -o StrictHostKeyChecking=no $s hostname -i));
# done
SCRIPT_CURRENT_DIR=$(dirname $(realpath -s ${BASH_SOURCE[0]}))

OTHERS_IPADDR=($(python $SCRIPT_CURRENT_DIR/get_worker_ips.py 2>/dev/null))
ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
unset SCRIPT_CURRENT_DIR

--------------------------------------------------------------------------------
/test_utils/mpirun_pernode.sh:
--------------------------------------------------------------------------------
#!/bin/bash
all_hosts=$1
shift
# This syntax is for OpenMPI: run one process per node on the given hosts.
/opt/amazon/openmpi/bin/mpirun --mca btl_tcp_if_exclude lo,docker0 --map-by ppr:1:node -H $all_hosts $@

--------------------------------------------------------------------------------
/test_utils/result_parser_utils.py:
--------------------------------------------------------------------------------
import os

import numpy as np
import pandas as pd


def read_rank0_lines(log_dir, foldername):
    file_name = os.path.join(log_dir, foldername, "rank_0.log")
    with open(file_name) as f:
        return f.readlines()


def default_parse_file(task_name, log_dir, foldername):
    try:
        lines = read_rank0_lines(log_dir, foldername)
        results = []
        for line in lines:
            if 'duration = ' in line:
                tmp = line.split('duration = ')[1]
                results.append(float(tmp))
        return results
    except Exception:
        return None


def collect_log_folders(log_dir):
    tasks = {}

    for filename in os.listdir(log_dir):
        if filename == "latest":
            continue
        # log name format: $date-$time-$test_name-$world_size-$object_size
        splited = filename.split('-')
        if len(splited) != 5:
            raise Exception(f"Unexpected log name {filename}.")
        task_name, number_of_nodes, object_size = splited[2:5]
        task = (task_name, number_of_nodes, object_size)
        if task not in tasks:
            tasks[task] = []
        tasks[task].append(filename)

    return tasks


def parse(log_dir, parse_file):
    tasks = collect_log_folders(log_dir)

    results = {}

    for task, folders in tasks.items():
        task_results = []
        for foldername in folders:
            result = parse_file(task[0], log_dir, foldername)
            if isinstance(result, (list, np.ndarray)):
                task_results += list(result)
            elif result is None or np.isnan(result):
                print(f"Error parsing {foldername}: cannot read out value.")
            else:
                task_results.append(result)
        results[task] = np.array(task_results)

    task_list = sorted(list(results.keys()), reverse=True)

    df = pd.DataFrame(columns=['Benchmark Name', '#Nodes', 'Object Size (in bytes)',
                               'Average Time (s)', 'Std Time (s)', 'Repeated Times'])

    for i, task in enumerate(task_list):
        task_name, number_of_nodes, object_size = task
        df.loc[i] = [task_name, number_of_nodes, object_size, np.mean(results[task]), np.std(results[task]),
                     len(results[task])]
    return df
--------------------------------------------------------------------------------