├── .clang-format ├── .gitignore ├── CMakeLists.txt ├── README.md ├── _archived ├── basic_test.py ├── check_env.py ├── check_env_remote.py ├── cluster-config │ ├── cluster-new.yaml │ ├── cluster.yaml │ ├── gpu.yaml │ ├── large-cluster.yaml │ ├── large_cluster.yaml │ ├── new_cluster.yaml │ ├── single-nc.yaml │ ├── single-new.yaml │ ├── single.yaml │ └── siyuan-old.yaml ├── compare_bcast.c ├── exit_test.py ├── fault_tolerance_tests │ ├── README.md │ ├── enable_failure.patch │ ├── run_test_fault_tolerance.sh │ └── test_wrapper_fault_tolerance.sh ├── hoplite_microbenchmarks.py ├── init_env.py ├── launch_test.py ├── mpi_compare_bcast.sh ├── notification_test.sh ├── parameter-server │ ├── compare_hoplite_mpi.sh │ ├── mpi_parameter_server.py │ ├── mpi_test.py │ ├── parse_results.py │ ├── run_gloo_allreduce.sh │ ├── run_mpi_allreduce.sh │ ├── run_mpi_ps.sh │ └── run_ps_tests.sh ├── parse_ray_result.py ├── restart_all_workers.sh ├── restart_ray.sh ├── script │ ├── README.md │ ├── find_missing_tests.py │ └── timeline.py ├── speed_test.sh └── sync_time.sh ├── app ├── parameter-server │ ├── README.md │ ├── analyze_fault_tolerance.py │ ├── cluster-asgd-fault-tolerance.yaml │ ├── cluster-config-access-results-only │ │ ├── README.md │ │ └── example.yaml │ ├── cluster-config-with-ami │ │ ├── README.md │ │ └── example.yaml │ ├── gloo_all_reduce.py │ ├── hoplite_all_reduce.py │ ├── hoplite_asgd_fault_tolerance.py │ ├── mpi_all_reduce.py │ ├── parameter_server.py │ ├── plot_async_ps_results.py │ ├── ps_helper.py │ ├── ray_asgd_fault_tolerance.py │ ├── ray_parameter_server_baseline.py │ ├── result_parser │ │ ├── parse_async_ps_hoplite.py │ │ ├── parse_gloo.py │ │ ├── parse_hoplite.py │ │ ├── parse_mpi.py │ │ └── parse_ray.py │ ├── run_allreduce_tests.sh │ ├── run_async_ps_fault_tolerance.sh │ └── run_async_ps_tests.sh ├── ray_serve │ ├── README.md │ ├── analyze_fault_tolerance.py │ ├── cluster-config-access-results-only │ │ ├── README.md │ │ └── example.yaml │ ├── cluster-config-with-ami │ │ ├── README.md │ │ └── example.yaml │ ├── cluster-config │ │ ├── README.md │ │ ├── cluster.yaml │ │ ├── example.yaml │ │ └── initial.yaml │ ├── hoplite_model_ensembling.py │ ├── hoplite_model_ensembling_fault_tolerance.py │ ├── model_ensembling.py │ └── model_ensembling_fault_tolerance.py └── rllib │ ├── README-with-ami.md │ ├── README.md │ ├── cluster.yaml │ ├── example.yaml │ └── initial.yaml ├── cmake └── FindGRPC.cmake ├── format.sh ├── fornode ├── install_dependencies.sh ├── microbenchmarks ├── README.md ├── cluster-config-access-results-only │ ├── README.md │ └── example.yaml ├── cluster-config-with-ami │ ├── README.md │ └── example.yaml ├── cluster-config │ ├── README.md │ ├── cluster.yaml │ ├── example.yaml │ └── initial.yaml ├── dask-python │ ├── auto_dask_benchmark.py │ ├── auto_test.sh │ ├── cleanup_dask.sh │ ├── dask_benchmark.py │ ├── dask_roundtrip.py │ ├── dask_roundtrip.sh │ ├── parse_result.py │ └── run_dask.sh ├── draw_collective_communication.py ├── gloo-cpp │ ├── .gitignore │ ├── README.md │ ├── auto_test.sh │ ├── install_gloo.sh │ ├── parse_result.py │ ├── run_test.sh │ └── test_wrapper.sh ├── hoplite-cpp │ ├── README.md │ ├── auto_test.sh │ ├── coverage_test.sh │ ├── parse_result.py │ ├── pressure_test.sh │ ├── run_test.sh │ └── test_wrapper.sh ├── hoplite-python │ ├── README.md │ ├── auto_test.sh │ ├── coverage_test.sh │ ├── hoplite_microbenchmarks.py │ ├── parse_result.py │ ├── parse_roundtrip_result.py │ ├── pressure_test.sh │ ├── run_test.sh │ └── test_wrapper.sh ├── mpi-cpp │ ├── .gitignore │ ├── 
Makefile │ ├── README.md │ ├── allgather.c │ ├── allreduce.c │ ├── auto_test.sh │ ├── coverage_test.sh │ ├── gather.c │ ├── multicast.c │ ├── parse_result.py │ ├── parse_roundtrip_result.py │ ├── reduce.c │ ├── roundtrip.c │ ├── run_test.sh │ └── test_wrapper.sh ├── plot_rtt.py └── ray-python │ ├── Makefile │ ├── README.md │ ├── auto_test.sh │ ├── ray_microbenchmarks.py │ ├── ray_roundtrip.py │ └── run_tests.py ├── python ├── hoplite │ ├── __init__.py │ ├── _hoplite_client.pxd │ └── _hoplite_client.pyx ├── setup.py └── setup.sh ├── src ├── client │ ├── distributed_object_store.cc │ ├── distributed_object_store.h │ ├── global_control_store.cc │ ├── global_control_store.h │ ├── local_store_client.cc │ ├── local_store_client.h │ ├── notification_listener.cc │ ├── notification_listener.h │ ├── object_sender.cc │ ├── object_sender.h │ ├── object_store_state.cc │ ├── object_store_state.h │ ├── receiver.cc │ └── receiver.h ├── common │ ├── buffer.cc │ ├── buffer.h │ ├── config.h │ ├── id.cc │ ├── id.h │ ├── status.cc │ └── status.h ├── object_directory │ ├── dependency.cc │ ├── dependency.h │ ├── notification.cc │ ├── notification.h │ ├── reduce_dependency.cc │ └── reduce_dependency.h ├── protocol │ └── object_store.proto ├── tests │ ├── allgather_test.cc │ ├── allreduce_test.cc │ ├── gather_test.cc │ ├── multicast_test.cc │ ├── notification_server_test.cc │ ├── reduce_dependency_test.cc │ ├── reduce_test.cc │ └── subset_reduce_test.cc └── util │ ├── ctpl_stl.h │ ├── hash.cc │ ├── hash.h │ ├── logging.cc │ ├── logging.h │ ├── protobuf_utils.h │ ├── socket_utils.cc │ ├── socket_utils.h │ └── test_utils.h └── test_utils ├── get_worker_ips.py ├── load_cluster_env.sh ├── mpirun_pernode.sh └── result_parser_utils.py /.clang-format: -------------------------------------------------------------------------------- 1 | ColumnLimit: 120 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | .vscode/ 35 | .cquery_cache/ 36 | __pycache__/ 37 | # binaries, databases, protobuf sockets 38 | dump.rdb 39 | multicast_test 40 | reduce_test 41 | allreduce_test 42 | gather_test 43 | allgather_test 44 | *.pb.h 45 | notification 46 | 47 | python/object_store_pb2*.py 48 | 49 | # logs 50 | log/ 51 | mpi_log/ 52 | gloo_log/ 53 | 54 | python/*.cpp 55 | .DS_Store 56 | 57 | *.csv 58 | *.json 59 | cmake-build-debug/ 60 | .idea/ 61 | 62 | # generated plots 63 | *.pdf 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hoplite: Efficient and Fault-Tolerant Collective Communication for Task-Based Distributed Systems 2 | 3 | This is the repository for the artifact evaluation of the SIGCOMM 2021 paper: _Hoplite: Efficient and Fault-Tolerant Collective Communication for Task-Based Distributed Systems_. For any questions or related issues, please feel free to contact Siyuan Zhuang (s.z@berkeley.edu) and Zhuohan Li (zhuohan@berkeley.edu).
4 | 5 | ## Setup AWS Cluster & Hoplite 6 | 7 | All the experiments in the paper are evaluated on AWS. We use the [Ray cluster launcher](https://docs.ray.io/en/latest/cluster/launcher.html) to launch the clusters for all the experiments in the paper. We highly recommend using the Ray cluster launcher, as it automatically sets up the execution environment required by the experiments. 8 | 9 | For every experiment, we include detailed instructions for setting up a cluster and reproducing the results in the paper. 10 | 11 | ## Microbenchmarks (Section 5.1) 12 | 13 | Please see [microbenchmarks/](microbenchmarks) to reproduce the microbenchmark experiments in the paper. 14 | 15 | ## Asynchronous SGD (Section 5.2) 16 | 17 | Please see [app/parameter-server/](app/parameter-server) to reproduce the Asynchronous SGD experiments in the paper. 18 | 19 | ## Reinforcement Learning (Section 5.3) 20 | 21 | Please see [app/rllib/](app/rllib/) to reproduce the RLlib experiments in the paper. 22 | 23 | ## ML Model Serving Experiments (Section 5.4) 24 | 25 | Please see [app/ray_serve/](app/ray_serve) to reproduce the Ray Serve experiments and the Ray Serve fault tolerance experiments (Section 5.5, Figure 12a) in the paper. 26 | -------------------------------------------------------------------------------- /_archived/basic_test.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import numpy as np 4 | import py_distributed_object_store as store_lib 5 | 6 | from py_distributed_object_store import Buffer 7 | arr = np.random.rand(2,3,4) 8 | buf = Buffer.from_buffer(arr) 9 | print(buf.size(), arr.nbytes, hash(buf)) 10 | gc.collect() 11 | print(buf.size(), hash(buf)) 12 | 13 | arr2 = np.frombuffer(buf).reshape(arr.shape) 14 | assert np.array_equal(arr, arr2) 15 | -------------------------------------------------------------------------------- /_archived/check_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import ray 4 | 5 | ray.init(address='auto') 6 | @ray.remote(resources={'node': 1}) 7 | def check_env(): 8 | import socket 9 | import sys 10 | print(socket.gethostbyname(socket.gethostname()), sys.path) 11 | tasks = [] 12 | 13 | for _ in ray.nodes(): 14 | tasks.append(check_env.remote()) 15 | 16 | ray.get(tasks) 17 | -------------------------------------------------------------------------------- /_archived/check_env_remote.py: -------------------------------------------------------------------------------- 1 | import ray 2 | 3 | @ray.remote(resources={'node': 1}, max_calls=1) 4 | def check_env(): 5 | import socket 6 | import sys 7 | print(socket.gethostbyname(socket.gethostname()), sys.path) -------------------------------------------------------------------------------- /_archived/cluster-config/cluster-new.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon.
13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0f1ec76edd85bfd5d # hoplite-artifact-new-3 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-f55048b4" 26 | Placement: 27 | GroupName: hoplite-group 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0f1ec76edd85bfd5d # hoplite-artifact-new-3 32 | KeyName: shared_key 33 | SecurityGroupIds: 34 | - "sg-f55048b4" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | - sudo mount -t efs fs-07737200:/ efs 40 | - sudo chmod 777 efs 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: 44 | - ray stop 45 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 51 | -------------------------------------------------------------------------------- /_archived/cluster-config/cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0875c13495964ad3e # hoplite-artifact-2 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-f55048b4" 26 | Placement: 27 | GroupName: hoplite-group 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0875c13495964ad3e # hoplite-artifact-2 32 | KeyName: shared_key 33 | SecurityGroupIds: 34 | - "sg-f55048b4" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | - sudo mount -t efs fs-6dad81c6:/ efs 40 | - sudo chmod 777 efs 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: 44 | - ray stop 45 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 51 | -------------------------------------------------------------------------------- /_archived/cluster-config/gpu.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-gpu 2 | 3 | min_workers: 7 4 | max_workers: 7 5 | initial_workers: 7 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 
11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: ami-01859c084acd14dc9 # hoplite-nsdi-11 22 | KeyName: shared_key 23 | SecurityGroupIds: 24 | - "sg-f55048b4" 25 | Placement: 26 | GroupName: hoplite-group 27 | 28 | worker_nodes: 29 | InstanceType: p3.2xlarge 30 | ImageId: ami-01859c084acd14dc9 # hoplite-nsdi-11 31 | KeyName: shared_key 32 | SecurityGroupIds: 33 | - "sg-f55048b4" 34 | Placement: 35 | GroupName: hoplite-group 36 | 37 | setup_commands: 38 | - sudo mount -t efs fs-6dad81c6:/ efs 39 | - sudo chmod 777 efs 40 | 41 | # Command to start ray on the head node. You don't need to change this. 42 | head_start_ray_commands: 43 | - ray stop 44 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 45 | 46 | # Command to start ray on worker nodes. You don't need to change this. 47 | worker_start_ray_commands: 48 | - ray stop 49 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 50 | -------------------------------------------------------------------------------- /_archived/cluster-config/large-cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 63 4 | max_workers: 63 5 | initial_workers: 63 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-01859c084acd14dc9 # hoplite-nsdi-11 22 | KeyName: shared_key 23 | SecurityGroupIds: 24 | - "sg-f55048b4" 25 | Placement: 26 | GroupName: hoplite-group 27 | 28 | worker_nodes: 29 | InstanceType: m5.4xlarge 30 | ImageId: ami-01859c084acd14dc9 # hoplite-nsdi-11 31 | KeyName: shared_key 32 | SecurityGroupIds: 33 | - "sg-f55048b4" 34 | Placement: 35 | GroupName: hoplite-group 36 | 37 | setup_commands: 38 | - sudo mount -t efs fs-6dad81c6:/ efs 39 | - sudo chmod 777 efs 40 | 41 | # Command to start ray on the head node. You don't need to change this. 42 | head_start_ray_commands: 43 | - ray stop 44 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 45 | 46 | # Command to start ray on worker nodes. You don't need to change this. 
47 | worker_start_ray_commands: 48 | - ray stop 49 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 50 | -------------------------------------------------------------------------------- /_archived/cluster-config/large_cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-large 2 | 3 | min_workers: 258 4 | max_workers: 258 5 | initial_workers: 258 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | ssh_private_key: /Users/siyuan/.ssh/siyuan-aws.pem 18 | 19 | head_node: 20 | InstanceType: m5.8xlarge 21 | ImageId: ami-087095f2ce112c29d # latest_dlami # hoplite-nsdi-5 22 | # InstanceMarketOptions: 23 | # MarketType: spot 24 | # SpotOptions: 25 | # MaxPrice: "1.5" # Max Hourly Price MAX_HOURLY_PRICE 26 | KeyName: siyuan-aws 27 | SecurityGroupIds: 28 | - "sg-50656710" 29 | Placement: 30 | GroupName: hoplite-group 31 | 32 | worker_nodes: 33 | InstanceType: c5.2xlarge 34 | ImageId: ami-087095f2ce112c29d # latest_dlami # hoplite-nsdi-5 35 | InstanceMarketOptions: 36 | MarketType: spot 37 | SpotOptions: 38 | MaxPrice: "1.2" # Max Hourly Price MAX_HOURLY_PRICE 39 | KeyName: siyuan-aws 40 | SecurityGroupIds: 41 | - "sg-50656710" 42 | Placement: 43 | GroupName: hoplite-group 44 | 45 | setup_commands: 46 | # - pip install ray==0.8.6 47 | - mkdir -p ~/efs 48 | - sudo mount -t efs fs-a692810d:/ ~/efs 49 | - sudo chmod 777 ~/efs 50 | 51 | # Command to start ray on the head node. You don't need to change this. 52 | head_start_ray_commands: 53 | - ray stop 54 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 55 | 56 | # Command to start ray on worker nodes. You don't need to change this. 57 | worker_start_ray_commands: 58 | - ray stop 59 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 60 | -------------------------------------------------------------------------------- /_archived/cluster-config/new_cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-benchmark 2 | 3 | min_workers: 7 4 | max_workers: 7 5 | initial_workers: 7 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-west-2a 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | ssh_private_key: /Users/siyuan/.ssh/siyuan-aws.pem 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-0a53ac9d916d997ae # hoplite-sigcomm21 22 | InstanceMarketOptions: 23 | MarketType: spot 24 | KeyName: siyuan-aws 25 | SecurityGroupIds: 26 | - "sg-50656710" 27 | Placement: 28 | GroupName: hoplite-group 29 | 30 | worker_nodes: 31 | InstanceType: m5.4xlarge 32 | ImageId: ami-0a53ac9d916d997ae # hoplite-sigcomm21 33 | InstanceMarketOptions: 34 | MarketType: spot 35 | KeyName: siyuan-aws 36 | SecurityGroupIds: 37 | - "sg-50656710" 38 | Placement: 39 | GroupName: hoplite-group 40 | 41 | setup_commands: 42 | # - pip install ray==1.1 43 | - mkdir -p ~/efs 44 | - sudo mount -t efs fs-a692810d:/ ~/efs 45 | - sudo chmod 777 ~/efs 46 | 47 | # Command to start ray on the head node. You don't need to change this. 48 | head_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 51 | 52 | # Command to start ray on worker nodes. You don't need to change this. 53 | worker_start_ray_commands: 54 | - ray stop 55 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 56 | -------------------------------------------------------------------------------- /_archived/cluster-config/single-nc.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-west-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-1b 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0445e6fea66b74ae5 # rllib-all 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-878331f8" 26 | 27 | worker_nodes: 28 | InstanceType: m5.4xlarge 29 | ImageId: ami-0445e6fea66b74ae5 # rllib-all 30 | KeyName: shared_key 31 | SecurityGroupIds: 32 | - "sg-878331f8" 33 | 34 | setup_commands: 35 | - sudo mount -t efs fs-760d746f:/ efs 36 | - sudo chmod 777 efs 37 | 38 | # Command to start ray on the head node. You don't need to change this. 39 | head_start_ray_commands: 40 | - ray stop 41 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 42 | 43 | # Command to start ray on worker nodes. You don't need to change this. 44 | worker_start_ray_commands: 45 | - ray stop 46 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 47 | -------------------------------------------------------------------------------- /_archived/cluster-config/single-new.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single-initial 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 
11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0f1ec76edd85bfd5d # hoplite-artifact-new-3 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-f55048b4" 26 | Placement: 27 | GroupName: hoplite-group 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0f1ec76edd85bfd5d # hoplite-artifact-new-3 32 | KeyName: shared_key 33 | SecurityGroupIds: 34 | - "sg-f55048b4" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | - sudo mount -t efs fs-07737200:/ ~/efs 40 | - sudo chmod 777 ~/efs 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: [] 44 | 45 | # Command to start ray on worker nodes. You don't need to change this. 46 | worker_start_ray_commands: [] 47 | -------------------------------------------------------------------------------- /_archived/cluster-config/single.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | ssh_private_key: /Users/zhuohan123/.ssh/shared_key.pem 19 | 20 | head_node: 21 | InstanceType: m5.4xlarge 22 | ImageId: ami-0875c13495964ad3e # hoplite-artifact-2 23 | KeyName: shared_key 24 | SecurityGroupIds: 25 | - "sg-f55048b4" 26 | Placement: 27 | GroupName: hoplite-group 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0875c13495964ad3e # hoplite-artifact-2 32 | KeyName: shared_key 33 | SecurityGroupIds: 34 | - "sg-f55048b4" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | - sudo mount -t efs fs-6dad81c6:/ efs 40 | - sudo chmod 777 efs 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: 44 | - ray stop 45 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 51 | -------------------------------------------------------------------------------- /_archived/cluster-config/siyuan-old.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-asgd 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | 18 | head_node: 19 | InstanceType: m5.4xlarge 20 | ImageId: ami-0947593b62663ba38 # hoplite-sigcomm21-2 21 | InstanceMarketOptions: 22 | MarketType: spot 23 | SecurityGroupIds: 24 | - "sg-3463e565" 25 | Placement: 26 | GroupName: hoplite-group 27 | 28 | worker_nodes: 29 | InstanceType: m5.4xlarge 30 | ImageId: ami-0947593b62663ba38 # hoplite-sigcomm21-2 31 | InstanceMarketOptions: 32 | MarketType: spot 33 | SecurityGroupIds: 34 | - "sg-3463e565" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | # - pip install ray==1.1 40 | - mkdir -p ~/efs 41 | - sudo mount -t efs fs-d416cc55:/ ~/efs 42 | - sudo chmod 777 ~/efs 43 | 44 | # Command to start ray on the head node. You don't need to change this. 45 | head_start_ray_commands: 46 | - ray stop 47 | - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 48 | 49 | # Command to start ray on worker nodes. You don't need to change this. 50 | worker_start_ray_commands: 51 | - ray stop 52 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 53 | -------------------------------------------------------------------------------- /_archived/compare_bcast.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2011 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Comparison of MPI_Bcast with the my_bcast function 8 | // 9 | #include <assert.h> 10 | #include <mpi.h> 11 | #include <stdio.h> 12 | #include <stdlib.h> 13 | 14 | void my_bcast(void *data, int count, MPI_Datatype datatype, int root, MPI_Comm communicator) { 15 | int world_rank; 16 | MPI_Comm_rank(communicator, &world_rank); 17 | int world_size; 18 | MPI_Comm_size(communicator, &world_size); 19 | 20 | if (world_rank == root) { 21 | // If we are the root process, send our data to everyone 22 | int i; 23 | for (i = 0; i < world_size; i++) { 24 | if (i != world_rank) { 25 | MPI_Send(data, count, datatype, i, 0, communicator); 26 | } 27 | } 28 | } else { 29 | // If we are a receiver process, receive the data from the root 30 | MPI_Recv(data, count, datatype, root, 0, communicator, MPI_STATUS_IGNORE); 31 | } 32 | } 33 | 34 | int main(int argc, char **argv) { 35 | if (argc != 3) { 36 | fprintf(stderr, "Usage: compare_bcast num_elements num_trials\n"); 37 | exit(1); 38 | } 39 | 40 | int num_elements = atoi(argv[1]); 41 | int num_trials = atoi(argv[2]); 42 | 43 | MPI_Init(NULL, NULL); 44 | 45 | int world_rank; 46 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 47 | 48 | double total_my_bcast_time = 0.0; 49 | double total_mpi_bcast_time = 0.0; 50 | int i; 51 | int *data = (int *)malloc(sizeof(int) * num_elements); 52 | assert(data != NULL); 53 | 54 | for (i = 0; i < num_trials; i++) { 55 | // Time my_bcast 56 | // Synchronize before starting timing 57 | MPI_Barrier(MPI_COMM_WORLD); 58 | total_my_bcast_time -= MPI_Wtime(); 59 | my_bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD); 60 | // Synchronize again before obtaining final time 61 | MPI_Barrier(MPI_COMM_WORLD); 62 | total_my_bcast_time += MPI_Wtime(); 63 | 64 | // Time MPI_Bcast 65 | MPI_Barrier(MPI_COMM_WORLD); 66 | 
total_mpi_bcast_time -= MPI_Wtime(); 67 | MPI_Bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD); 68 | MPI_Barrier(MPI_COMM_WORLD); 69 | total_mpi_bcast_time += MPI_Wtime(); 70 | } 71 | 72 | // Print off timing information 73 | if (world_rank == 0) { 74 | printf("Data size = %d, Trials = %d\n", num_elements * (int)sizeof(int), num_trials); 75 | printf("Avg my_bcast time = %lf\n", total_my_bcast_time / num_trials); 76 | printf("Avg MPI_Bcast time = %lf\n", total_mpi_bcast_time / num_trials); 77 | } 78 | 79 | free(data); 80 | MPI_Finalize(); 81 | } 82 | -------------------------------------------------------------------------------- /_archived/exit_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import time 4 | 5 | import py_distributed_object_store as store_lib 6 | import utils 7 | 8 | parser = argparse.ArgumentParser() 9 | utils.add_arguments(parser) 10 | 11 | args = parser.parse_args() 12 | args_dict = utils.extract_dict_from_args(args) 13 | 14 | store = utils.create_store_using_dict(args_dict) 15 | time.sleep(5) 16 | print ("Exiting") 17 | 18 | -------------------------------------------------------------------------------- /_archived/fault_tolerance_tests/README.md: -------------------------------------------------------------------------------- 1 | # Fault tolerance tests (optional) 2 | 3 | To enable the fault tolerance tests, first apply the patch that introduces failures: 4 | 5 | ```bash 6 | patch -p1 --directory .. < enable_failure.patch 7 | ``` 8 | 9 | Then recompile the C++ project. 10 | 11 | ## Multicast fault tolerance test 12 | 13 | An intensive benchmark that exercises corner cases and demonstrates the reliability of our system. It can take several minutes to complete even on high-speed networks. 14 | 15 | Usage: 16 | 17 | 18 | ```bash 19 | ./run_test_fault_tolerance.sh multicast ${total_number_of_nodes} ${input_size_in_bytes} ${n_trials} 20 | ``` 21 | 22 | ## Subset reduction fault tolerance test 23 | 24 | This test shows that Hoplite is able to reduce only a subset of objects. For example, out of 8 candidate objects, we may want to reduce only the 4 objects that are created first. 25 | 26 | Usage: 27 | 28 | ```bash 29 | ./run_test_fault_tolerance.sh subset_reduce ${total_number_of_nodes} ${input_size_in_bytes} ${n_trials} 30 | ``` 31 | 32 | We suggest `total_number_of_nodes>=4`.
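For example, a concrete invocation on a 4-node cluster with 1 GB objects and 5 trials could look like the sketch below (the specific numbers are only an illustration):

```bash
./run_test_fault_tolerance.sh subset_reduce 4 $((1 << 30)) 5   # test name, #nodes, object size in bytes, #trials
```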
33 | -------------------------------------------------------------------------------- /_archived/fault_tolerance_tests/enable_failure.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/client/object_sender.cc b/src/client/object_sender.cc 2 | index 5569061..f11863e 100644 3 | --- a/src/client/object_sender.cc 4 | +++ b/src/client/object_sender.cc 5 | @@ -15,11 +15,14 @@ using objectstore::ObjectWriterRequest; 6 | using objectstore::ReceiveObjectRequest; 7 | using objectstore::ReceiveReducedObjectRequest; 8 | 9 | +int global_count = 0; 10 | + 11 | template <typename T> inline int stream_send(int conn_fd, T *stream, int64_t offset = 0) { 12 | TIMELINE("ObjectSender::stream_send()"); 13 | LOG(DEBUG) << "ObjectSender::stream_send(), offset=" << offset; 14 | const uint8_t *data_ptr = stream->Data(); 15 | const int64_t object_size = stream->Size(); 16 | + bool triggered = false; 17 | 18 | if (stream->IsFinished()) { 19 | int status = send_all(conn_fd, data_ptr + offset, object_size - offset); 20 | @@ -32,6 +35,16 @@ template <typename T> inline int stream_send(int conn_fd, T *stream, int64_t off 21 | int64_t cursor = offset; 22 | while (cursor < object_size) { 23 | int64_t current_progress = stream->progress; 24 | + if (current_progress > object_size / 2 && !triggered) { 25 | + triggered = true; 26 | + if (++global_count >= 3) { 27 | + int rank = std::stoi(getenv("OMPI_COMM_WORLD_RANK")); 28 | + if (rank == 2) { 29 | + //usleep(1000); 30 | + LOG(FATAL) << " failed intentionally"; 31 | + } 32 | + } 33 | + } 34 | if (cursor < current_progress) { 35 | int bytes_sent = send(conn_fd, data_ptr + cursor, current_progress - cursor, 0); 36 | if (bytes_sent < 0) { 37 | -------------------------------------------------------------------------------- /_archived/fault_tolerance_tests/run_test_fault_tolerance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 4 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test name, number of nodes, input size & n_trials required"; exit -1; fi 3 | if [ "$#" -gt 4 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | ## setup 6 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 7 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 8 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 9 | BINARIES_DIR=$(realpath -s $SCRIPT_DIR/../../build) 10 | TEST_BINARIES_DIR=$BINARIES_DIR/tests 11 | 12 | ## cleanup procs 13 | sudo fuser -k 6666/tcp -s &> /dev/null 14 | sudo fuser -k 50055/tcp -s &> /dev/null 15 | sudo fuser -k 20210/tcp -s &> /dev/null 16 | 17 | test_name=$1 18 | test_executable_abspath=$TEST_BINARIES_DIR/${test_name}_test 19 | world_size=$2 20 | object_size=$3 21 | n_trials=$4 22 | 23 | if [ ! 
-f $test_executable_abspath ]; then 24 | echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test executable not found: $test_executable_abspath" 25 | exit -2 26 | fi 27 | 28 | # get cluster info 29 | source $TEST_UNILS_DIR/load_cluster_env.sh 30 | OTHERS_IPADDR=(${OTHERS_IPADDR[@]:0:$(($world_size-1))}) 31 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) head_node: $MY_IPADDR; other_nodes: ${OTHERS_IPADDR[@]}" 32 | 33 | # prompt test info 34 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) Running test $(tput setaf 3)$(tput bold)$test_name$(tput sgr 0)" 35 | 36 | # create logging dir 37 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 38 | mkdir -p $log_dir 39 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 40 | 41 | export RAY_BACKEND_LOG_LEVEL=info 42 | 43 | pkill notification 44 | sleep 0.5 45 | ($BINARIES_DIR/notification 2>&1 | tee $log_dir/$MY_IPADDR.notification.log) & 46 | sleep 0.5 47 | 48 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 49 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 50 | 51 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 52 | -x HOPLITE_LOGGING_DIR=$log_dir \ 53 | -x RAY_BACKEND_LOG_LEVEL=$RAY_BACKEND_LOG_LEVEL \ 54 | $SCRIPT_DIR/test_wrapper_fault_tolerance.sh $test_executable_abspath $MY_IPADDR $object_size $n_trials 55 | 56 | sleep 1 57 | -------------------------------------------------------------------------------- /_archived/fault_tolerance_tests/test_wrapper_fault_tolerance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | trap 'echo delaying MPI shutdown...' INT TERM 3 | logging_file=$HOPLITE_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 4 | $@ 2>&1 | tee $logging_file 5 | sleep 10 6 | -------------------------------------------------------------------------------- /_archived/init_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import ray 5 | import sys 6 | 7 | root_directory = os.path.dirname(os.path.abspath(__file__)) 8 | print(root_directory) 9 | 10 | dirs = [ 11 | os.path.join(root_directory, 'python'), 12 | os.path.join(root_directory, 'app', 'parameter-server'), 13 | ] 14 | 15 | ray.init(address='auto', load_code_from_local=True) 16 | 17 | ray.worker.global_worker.run_function_on_all_workers( 18 | lambda worker_info: [sys.path.insert(1, d) for d in dirs]) 19 | -------------------------------------------------------------------------------- /_archived/mpi_compare_bcast.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -z "$2" ]; then echo "ERROR: number of nodes & input size required"; exit; fi 3 | 4 | make compare_bcast > /dev/null 5 | 6 | ROOT_DIR=$(dirname $(realpath -s $0))/../ 7 | source $ROOT_DIR/load_cluster_env.sh 8 | 9 | all_nodes=(${ALL_IPADDR[@]:0:$1}) 10 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 11 | 12 | echo Number of nodes: $1 "(actually ${#all_nodes[@]})", data size: $2 13 | echo Nodes: ${all_nodes[@]} 14 | 15 | $ROOT_DIR/mpirun_pernode.sh $all_hosts $(realpath -s compare_bcast) $[$2/4] 1 16 | -------------------------------------------------------------------------------- /_archived/notification_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 3 | 4 | sudo fuser -k 6666/tcp -s &> /dev/null 5 | sudo fuser -k 50055/tcp -s &> /dev/null 6 | 7 | ## setup 8 | ROOT_DIR=$(dirname $(realpath 
-s $0)) 9 | source $ROOT_DIR/load_cluster_env.sh 10 | 11 | pkill '^notification$' 12 | pkill '^notification_server_test$' 13 | sleep 2 14 | ./notification $MY_IPADDR $MY_IPADDR & 15 | sleep 2 16 | ./notification_server_test $MY_IPADDR $MY_IPADDR & 17 | sleep 40 18 | -------------------------------------------------------------------------------- /_archived/parameter-server/compare_hoplite_mpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir -p ps-log-cmp/ 3 | 4 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 5 | source $ROOT_DIR/load_cluster_env.sh 6 | 7 | for n_nodes in 8; do 8 | 9 | echo "==========" sync-$n_nodes-hoplite "==========" 10 | pkill notification 11 | $ROOT_DIR/restart_all_workers.sh 12 | python parameter_server.py -n $(($n_nodes - 1)) --no-test | tee ps-log-cmp/sync-$n_nodes-hoplite.log 13 | 14 | echo "==========" sync-$n_nodes-mpi "==========" 15 | 16 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 17 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 18 | 19 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 20 | 21 | pkill notification 22 | $ROOT_DIR/restart_all_workers.sh 23 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_parameter_server.py) --no-test | tee ps-log-cmp/sync-$n_nodes-mpi.log 24 | 25 | done 26 | -------------------------------------------------------------------------------- /_archived/parameter-server/mpi_parameter_server.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import time 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | import numpy as np 10 | 11 | from ps_helper import ConvNet, get_data_loader, evaluate, criterion 12 | 13 | from mpi4py import MPI 14 | 15 | comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | 18 | class ParameterServer(object): 19 | def __init__(self, lr, model_type="custom"): 20 | self.model = ConvNet(model_type) 21 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr) 22 | 23 | def apply_gradients(self): 24 | new_parameters = [p.data.cpu().numpy() for p in self.model.parameters()] 25 | cont_p = np.concatenate([p.ravel() for p in new_parameters]) 26 | comm.Bcast(cont_p, root=0) 27 | zero_grad = np.zeros(self.model.n_param, dtype=np.float32) 28 | grad_buffer = np.empty(self.model.n_param, dtype=np.float32) 29 | comm.Reduce(zero_grad, grad_buffer, op=MPI.SUM, root=0) 30 | summed_gradients = self.model.buffer_to_tensors(grad_buffer.view(np.uint8)) 31 | self.optimizer.zero_grad() 32 | self.model.set_gradients(summed_gradients) 33 | self.optimizer.step() 34 | 35 | 36 | class DataWorker(object): 37 | def __init__(self, model_type="custom", device="cpu"): 38 | self.device = device 39 | self.model = ConvNet(model_type).to(device) 40 | 41 | def compute_gradients(self, batch_size=128): 42 | parameter_buffer = np.empty(self.model.n_param, dtype=np.float32) 43 | comm.Bcast(parameter_buffer, root=0) 44 | parameters = self.model.buffer_to_tensors(parameter_buffer.view(np.uint8)) 45 | self.model.set_parameters(parameters) 46 | data = torch.randn(batch_size, 3, 224, 224, device=self.device) 47 | self.model.zero_grad() 48 | output = self.model(data) 49 | loss = torch.mean(output) 50 | loss.backward() 51 | gradients = self.model.get_gradients() 52 | cont_grad = np.concatenate([p.ravel() for p in gradients]) 53 | grad_buffer = np.empty(self.model.n_param, dtype=np.float32) 54 | comm.Reduce(cont_grad, grad_buffer, op=MPI.SUM, root=0) 55 | 56 | 
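# Driver code below: rank 0 acts as the parameter server. It broadcasts the current
# parameters, then sum-reduces gradients from all workers (the server itself contributes
# an all-zero buffer, so the reduction sums only the workers' gradients). Every other
# rank runs a DataWorker that computes gradients on randomly generated batches.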
parser = argparse.ArgumentParser(description='parameter server') 57 | parser.add_argument('-m', '--model', type=str, default="custom", 58 | help='neural network model type') 59 | args = parser.parse_args() 60 | 61 | 62 | iterations = 50 63 | 64 | 65 | if rank == 0: 66 | print("rank == 0") 67 | ps = ParameterServer(1e-2, model_type=args.model) 68 | step_start = time.time() 69 | for i in range(iterations): 70 | ps.apply_gradients() 71 | now = time.time() 72 | print("step time:", now - step_start, flush=True) 73 | step_start = now 74 | 75 | else: 76 | print("rank > 0") 77 | worker = DataWorker(model_type=args.model, device='cuda') 78 | for i in range(iterations): 79 | worker.compute_gradients() 80 | -------------------------------------------------------------------------------- /_archived/parameter-server/mpi_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mpi4py import MPI 3 | import time 4 | 5 | comm = MPI.COMM_WORLD 6 | rank = comm.Get_rank() 7 | 8 | if rank == 0: 9 | data = np.ones(10, dtype=np.float32) 10 | data_sum = np.empty(10, dtype=np.float32) 11 | else: 12 | time.sleep((4 - rank) * 3) 13 | data = np.ones(10, dtype=np.float32) 14 | data_sum = np.empty(10, dtype=np.float32) 15 | comm.Bcast(data, root=0) 16 | # comm.Reduce(data, data_sum, op=MPI.SUM, root=0) 17 | print(rank, data, data_sum) -------------------------------------------------------------------------------- /_archived/parameter-server/parse_results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | log_dir = 'ps-log/' 4 | for log_file in sorted(os.listdir(log_dir)): 5 | with open(os.path.join(log_dir, log_file), "r") as f: 6 | all_time = [] 7 | for line in f: 8 | if "step time:" in line: 9 | all_time.append(float(line.split()[-1])) 10 | all_time = all_time[1:] 11 | all_time = np.array(all_time[3:-3]) 12 | if log_file.split('-')[0] == 'async': 13 | all_time = 8 * ((int(log_file.split('-')[1]) - 1) // 2) / all_time 14 | else: 15 | all_time = 8 * int(log_file.split('-')[1]) / all_time 16 | new_all_time = [] 17 | for i in range(0, len(all_time), 4): 18 | new_all_time.append(np.mean(all_time[i:i + 4])) 19 | print(log_file.ljust(20), np.mean(new_all_time), np.std(new_all_time), len(new_all_time), sep='\t') 20 | -------------------------------------------------------------------------------- /_archived/parameter-server/run_gloo_allreduce.sh: -------------------------------------------------------------------------------- 1 | MODEL=alexnet 2 | 3 | mkdir -p ps-log/ 4 | 5 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 6 | source $ROOT_DIR/load_cluster_env.sh 7 | 8 | for n_nodes in 8; do 9 | i=0 10 | for node in ${ALL_IPADDR[@]:0:$n_nodes}; do 11 | echo "=> $node" 12 | ssh -o StrictHostKeyChecking=no $node PATH=$PATH:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda3/condabin, \ 13 | python $ROOT_DIR/app/parameter-server/gloo_all_reduce.py \ 14 | --master_ip $MY_IPADDR \ 15 | --rank $i \ 16 | --size $n_nodes \ 17 | -m $MODEL & 18 | i=$((i+1)) 19 | done 20 | wait 21 | done 22 | -------------------------------------------------------------------------------- /_archived/parameter-server/run_mpi_allreduce.sh: -------------------------------------------------------------------------------- 1 | MODEL=alexnet 2 | 3 | mkdir -p ps-log/ 4 | 5 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 6 | source $ROOT_DIR/load_cluster_env.sh 7 | 8 | for n_nodes in 8; do 9 | echo "==========" sync-$n_nodes-mpi 
"==========" 10 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 11 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 12 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 13 | 14 | pkill notification 15 | # $ROOT_DIR/restart_all_workers.sh 16 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_all_reduce.py) -m $MODEL | tee ps-log/sync-$n_nodes-mpi-$MODEL.log 17 | done 18 | -------------------------------------------------------------------------------- /_archived/parameter-server/run_mpi_ps.sh: -------------------------------------------------------------------------------- 1 | MODEL=alexnet 2 | 3 | mkdir -p ps-log/ 4 | 5 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 6 | source $ROOT_DIR/load_cluster_env.sh 7 | 8 | for n_nodes in 8; do 9 | echo "==========" sync-$n_nodes-mpi "==========" 10 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 11 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 12 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 13 | 14 | pkill notification 15 | # $ROOT_DIR/restart_all_workers.sh 16 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_parameter_server.py) -m $MODEL | tee ps-log/sync-$n_nodes-mpi-$MODEL.log 17 | done 18 | -------------------------------------------------------------------------------- /_archived/parameter-server/run_ps_tests.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ps-log/ 2 | 3 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 4 | source $ROOT_DIR/load_cluster_env.sh 5 | 6 | for n_nodes in 8 16; do 7 | echo "==========" sync-$n_nodes-hoplite "==========" 8 | pkill notification 9 | $ROOT_DIR/restart_all_workers.sh 10 | python parameter_server.py -n $(($n_nodes - 1)) --no-test | tee ps-log/sync-$n_nodes-hoplite.log 11 | 12 | echo "==========" sync-$n_nodes-ray "==========" 13 | pkill notification 14 | $ROOT_DIR/restart_all_workers.sh 15 | python ray_parameter_server_baseline.py -n $(($n_nodes - 1)) --no-test | tee ps-log/sync-$n_nodes-ray.log 16 | 17 | echo "==========" sync-$n_nodes-mpi "==========" 18 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 19 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 20 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 21 | 22 | pkill notification 23 | $ROOT_DIR/restart_all_workers.sh 24 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_parameter_server.py) --no-test | tee ps-log/sync-$n_nodes-mpi.log 25 | 26 | echo "==========" async-$n_nodes-hoplite "==========" 27 | pkill notification 28 | $ROOT_DIR/restart_all_workers.sh 29 | python parameter_server.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) --no-test | tee ps-log/async-$n_nodes-hoplite.log 30 | 31 | echo "==========" async-$n_nodes-ray "==========" 32 | pkill notification 33 | $ROOT_DIR/restart_all_workers.sh 34 | python ray_parameter_server_baseline.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) --no-test | tee ps-log/async-$n_nodes-ray.log 35 | done 36 | -------------------------------------------------------------------------------- /_archived/restart_all_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROOT_DIR=$(dirname $(realpath -s $0)) 4 | 5 | # This script is only used when necessary to reboot the ray workers. 6 | # Workers may not be available until next task execution, so some errors could still occur. 
7 | if [ "$#" -eq 0 ]; then 8 | $ROOT_DIR/fornode $(realpath -s $0) restart 9 | else 10 | for pid in $(ps aux | grep 'default_worker.py' | grep -v 'object_manager_port' | grep -v grep | awk '{print $2}'); do 11 | kill -9 $pid 12 | done 13 | fi 14 | -------------------------------------------------------------------------------- /_archived/restart_ray.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROOT_DIR=$(dirname $(realpath -s $0)) 4 | source $ROOT_DIR/load_cluster_env.sh 5 | 6 | ./fornode ray stop 7 | 8 | ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{"node":1}' --object-store-memory=34359738368 9 | sleep 5 10 | for node in ${OTHERS_IPADDR[@]}; do 11 | echo "=> $node" 12 | ssh -o StrictHostKeyChecking=no $node PATH=$PATH:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda3/condabin, ray start --redis-address=$MY_IPADDR:6379 --object-manager-port=8076 --resources=\'{\"node\":1}\' --object-store-memory=34359738368 & 13 | done 14 | wait 15 | -------------------------------------------------------------------------------- /_archived/script/README.md: -------------------------------------------------------------------------------- 1 | # How to run the timeline script 2 | 3 | Use `reduce_test.sh` as an example: 4 | 1. Run 5 | ```bash 6 | bash reduce_test.sh 7 | ``` 8 | You will get a bunch of log files under `log/YYMMDD-HHMMSS-reduce/`. 9 | 2. Run the script 10 | ```bash 11 | python script/timeline.py log/YYMMDD-HHMMSS-reduce/ 12 | ``` 13 | The resulting JSON file will be written to `log/YYMMDD-HHMMSS-reduce/timeline.json`. 14 | 3. Open `chrome://tracing` in your Chrome browser, then load the JSON file above to view the timeline.
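If no log directory is passed, `timeline.py` falls back to `log/latest`, a symlink that (at least) the fault tolerance test script refreshes to point at the most recent run, so the shortcut below also works; this is just an illustration of the default:

```bash
python script/timeline.py   # parses log/latest/ by default
```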
15 | -------------------------------------------------------------------------------- /_archived/script/find_missing_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | def main(log_dir): 5 | files = os.listdir(log_dir) 6 | 7 | tasks = {'multicast', 'reduce', 'allreduce'} 8 | node_set = range(2, 18, 2) 9 | object_size_set = {2**i for i in range(20, 31)} 10 | 11 | print (node_set) 12 | print (object_size_set) 13 | 14 | for task_name in tasks: 15 | for number_of_nodes in node_set: 16 | for object_size in object_size_set: 17 | task = task_name + '-' + str(number_of_nodes) + '-' + str(object_size) 18 | found = False 19 | for filename in files: 20 | if task in filename: 21 | found = True 22 | break 23 | if not found: 24 | print (task) 25 | 26 | 27 | if __name__ == "__main__": 28 | assert len(sys.argv) == 2, "Usage: python parse_mpi_result.py LOG_DIR" 29 | log_dir = sys.argv[1] 30 | main(log_dir) 31 | -------------------------------------------------------------------------------- /_archived/script/timeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | 5 | ph_dict = { 6 | "[BEGIN]": "B", 7 | "[END]": "E" 8 | } 9 | 10 | def main(log_dir): 11 | print("Working dir", log_dir) 12 | all_logs = os.listdir(log_dir) 13 | json_output = { 14 | "traceEvents": [], 15 | "displayTimeUnit": "ms", 16 | "otherData": { 17 | "log_dir": log_dir 18 | } 19 | } 20 | 21 | for log_file in all_logs: 22 | with open(os.path.join(log_dir, log_file)) as f: 23 | for line in f.readlines(): 24 | elements = line.split() 25 | if len(elements) >= 6 and elements[5] == "[TIMELINE]": 26 | timestamp = int(elements[0]) 27 | ip_pid_tid = elements[1] 28 | ip, pid, tid = ip_pid_tid.split(":") 29 | filename_line = elements[2] 30 | function_name = elements[3] 31 | assert elements[4] == "]:" 32 | timeline_id = elements[6] 33 | timeline_tag = elements[7] 34 | message = " ".join(elements[8:]) 35 | event = { 36 | "name": function_name + "_" + timeline_id, 37 | "cat": "event", 38 | "ph": ph_dict[timeline_tag], 39 | "ts": str(timestamp // 1000) + "." 
+ str(timestamp % 1000), 40 | "pid": ip + ":" + pid, 41 | "tid": tid, 42 | "args": { 43 | "message": message 44 | } 45 | } 46 | json_output["traceEvents"].append(event) 47 | with open(os.path.join(log_dir, "timeline.json"), "w") as f: 48 | json.dump(json_output, f) 49 | 50 | 51 | if __name__ == "__main__": 52 | log_dir = sys.argv[1] if len(sys.argv) >= 2 else "log/latest" 53 | main(log_dir) 54 | -------------------------------------------------------------------------------- /_archived/speed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOT_DIR=$(dirname $(realpath -s $0)) 3 | source $ROOT_DIR/load_cluster_env.sh 4 | 5 | echo ${ALL_IPADDR[@]}, ${#ALL_IPADDR[@]} 6 | 7 | # start iperf server 8 | for s in ${ALL_IPADDR[@]} 9 | do 10 | ssh -o StrictHostKeyChecking=no $s pkill iperf 11 | ssh $s iperf -s &> /dev/null & 12 | done 13 | 14 | for s in ${ALL_IPADDR[@]} 15 | do 16 | for t in ${ALL_IPADDR[@]} 17 | do 18 | if [ "$s" == "$t" ] 19 | then continue 20 | fi 21 | echo $s, $t 22 | ssh $s iperf -c $t -t 5 | grep GBytes 23 | done 24 | break 25 | done 26 | 27 | # shutdown iperf server 28 | 29 | for s in ${ALL_IPADDR[@]} 30 | do 31 | ssh $s pkill iperf &> /dev/null & 32 | done 33 | -------------------------------------------------------------------------------- /_archived/sync_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sudo apt install -y chrony 4 | sudo sed -i 's/^# information about usuable directives./server 169.254.169.123 prefer iburst minpoll 4 maxpoll 4\n/g' /etc/chrony/chrony.conf 5 | sudo /etc/init.d/chrony restart 6 | 7 | if [ "$#" -eq 0 ]; then 8 | ROOT_DIR=$(dirname $(realpath -s $0)) 9 | source $ROOT_DIR/load_cluster_env.sh 10 | for node in ${OTHERS_IPADDR[@]} 11 | do 12 | ssh -t -t $node "$(realpath -s $0) 0" & 13 | done 14 | wait 15 | fi 16 | -------------------------------------------------------------------------------- /app/parameter-server/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Hoplite Parameter Server Experiments on AWS 2 | 3 | _(About 55 min)_ 4 | 5 | ## Cluster Setup 6 | 7 | _(About 30 min)_ 8 | 9 | If you are provided with an AWS IAM account & pre-built binaries: 10 | * If you just want to review figures & raw experimental data, see [cluster-config-access-results-only](cluster-config-access-results-only). 11 | * If you also want to reproduce all results from the beginning, see [cluster-config-with-ami](cluster-config-with-ami) for setting up a cluster. 12 | 13 | If you are not provided with an AWS account or you want to build everything from scratch, see [cluster-config](../ray_serve/cluster-config). 14 | 15 | ## Asynchronous Parameter Server Experiments (Section 5.2, Figure 9) 16 | 17 | _(About 15 min)_ 18 | 19 | After logging in to the configured cluster, change into this directory (`app/parameter-server`) in the Hoplite repo. 20 | 21 | In the current directory, run 22 | 23 | ```bash 24 | ./run_async_ps_tests.sh 25 | ``` 26 | 27 | After the script completes, results are saved under `ps-log`. 28 | 29 | To visualize the results, run 30 | 31 | ```bash 32 | python plot_async_ps_results.py 33 | ``` 34 | 35 | This generates two PDF files: `async_training_8.pdf` corresponds to Figure 9(a), and `async_training_16.pdf` corresponds to Figure 9(b).
36 | 37 | You can download PDF files to your local machine using Ray cluster utils, for example: 38 | 39 | ```bash 40 | ray rsync-down cluster.yaml /home/ubuntu/efs/hoplite/app/parameter-server/async_training_8.pdf . 41 | ``` 42 | 43 | ## Asynchronous Parameter Server Fault Tolerance Experiments (Section 5.5, Figure 12b) 44 | 45 | _(About 10 min)_ 46 | 47 | After logging in to the configured cluster, *chdir* to the current directory in the hoplite repo. 48 | 49 | In the current directory, run 50 | 51 | ```bash 52 | ./run_async_ps_fault_tolerance.sh 53 | ``` 54 | 55 | The script generates `ray_asgd_fault_tolerance.json` and `hoplite_asgd_fault_tolerance.json` after running. 56 | 57 | Run `python analyze_fault_tolerance.py` to compare the failure detection latency (see section 5.5 in the paper). 58 | 59 | ## Notes 60 | 61 | The initial run will be extremely slow on AWS due to python generating caching files etc (about 5 min). This is totally normal. 62 | -------------------------------------------------------------------------------- /app/parameter-server/analyze_fault_tolerance.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | with open('ray_asgd_fault_tolerance.json', 'r') as f: 5 | ray_log = json.load(f) 6 | durations = [l['duration'] for l in ray_log if l['event'] == 'fail'] 7 | # we only fail once in the paper, so no calculating std here 8 | print(f"Baseline latency caused by failure: {np.mean(durations):.6f}s") 9 | 10 | with open('hoplite_asgd_fault_tolerance.json', 'r') as f: 11 | hoplite_log = json.load(f) 12 | durations = [l['duration'] for l in hoplite_log if l['event'] == 'fail'] 13 | # we only fail once in the paper, so no calculating std here 14 | print(f"Hoplite latency caused by failure: {np.mean(durations):.6f}s") 15 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-asgd-fault-tolerance.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-asgd 2 | 3 | min_workers: 7 4 | max_workers: 7 5 | initial_workers: 7 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | 15 | auth: 16 | ssh_user: ubuntu 17 | 18 | head_node: 19 | InstanceType: p3.2xlarge 20 | ImageId: ami-0947593b62663ba38 # hoplite-sigcomm21-2 21 | InstanceMarketOptions: 22 | MarketType: spot 23 | SecurityGroupIds: 24 | - "sg-3463e565" 25 | Placement: 26 | GroupName: hoplite-group 27 | 28 | worker_nodes: 29 | InstanceType: p3.2xlarge 30 | ImageId: ami-0947593b62663ba38 # hoplite-sigcomm21-2 31 | InstanceMarketOptions: 32 | MarketType: spot 33 | SecurityGroupIds: 34 | - "sg-3463e565" 35 | Placement: 36 | GroupName: hoplite-group 37 | 38 | setup_commands: 39 | # - pip install ray==1.1 40 | - mkdir -p ~/efs 41 | - sudo mount -t efs fs-d416cc55:/ ~/efs 42 | - sudo chmod 777 ~/efs 43 | 44 | # Command to start ray on the head node. You don't need to change this. 45 | head_start_ray_commands: 46 | - ray stop 47 | - "ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 48 | 49 | # Command to start ray on worker nodes. You don't need to change this. 
50 | worker_start_ray_commands: 51 | - ray stop 52 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 53 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-config-access-results-only/README.md: -------------------------------------------------------------------------------- 1 | # Setup Hoplite Parameter Server experiments on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries and results via `cd ~/efs/hoplite-with-results/` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | 26 | ## Access results 27 | 28 | ### Asynchronous Parameter Server Experiments (Section 5.2, Figure 9) 29 | 30 | Raw results are stored in `~/efs/hoplite-with-results/app/parameter-server/ps-log/`. 31 | 32 | To download the figures: 33 | 34 | **Figure 9(a)** 35 | 36 | ```bash 37 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite/app/parameter-server/async_training_8.pdf . 38 | ``` 39 | 40 | **Figure 9(b)** 41 | 42 | ```bash 43 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite/app/parameter-server/async_training_16.pdf . 44 | ``` 45 | 46 | ### Asynchronous Parameter Server Fault Tolerance Experiments (Section 5.5, Figure 12b) 47 | 48 | After logging into the cluster, `cd ~/efs/hoplite-with-results/app/parameter-server`. `ray_asgd_fault_tolerance.json` and `hoplite_asgd_fault_tolerance.json` contain the raw trajectory during failure. 49 | 50 | Run `python analyze_fault_tolerance.py` to compare the failure detection latency (see section 5.5 in the paper). 51 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-config-access-results-only/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-config-with-ami/README.md: -------------------------------------------------------------------------------- 1 | # Setup Hoplite Parameter Server experiments on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries via `cd ~/efs/hoplite` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | -------------------------------------------------------------------------------- /app/parameter-server/cluster-config-with-ami/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-ml-serving 2 | 3 | min_workers: 16 4 | max_workers: 16 5 | initial_workers: 16 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: p3.2xlarge 31 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 56 | -------------------------------------------------------------------------------- /app/parameter-server/gloo_all_reduce.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import time 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.distributed as dist 9 | 10 | import numpy as np 11 | 12 | from ps_helper import ConvNet, get_data_loader, evaluate, criterion 13 | 14 | class DataWorker(object): 15 | def __init__(self, model_type="custom", device="cpu"): 16 | self.device = device 17 | self.model = ConvNet(model_type).to(device) 18 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.02) 19 | 20 | def compute_gradients(self, batch_size=128): 21 | data = torch.randn(batch_size, 3, 224, 224, device=self.device) 22 | self.model.zero_grad() 23 | output = self.model(data) 24 | loss = torch.mean(output) 25 | loss.backward() 26 | gradients = self.model.get_gradients() 27 | cont_grad = np.concatenate([p.ravel() for p in gradients]) 28 | t = torch.from_numpy(cont_grad) 29 | torch.distributed.all_reduce(t) 30 | summed_gradients = self.model.buffer_to_tensors(t.numpy().view(np.uint8)) 31 | self.optimizer.zero_grad() 32 | self.model.set_gradients(summed_gradients) 33 | self.optimizer.step() 34 | 35 | parser = argparse.ArgumentParser(description='parameter server') 36 | parser.add_argument('-m', '--model', type=str, default="custom", 37 | help='neural network model type') 38 | parser.add_argument('--rank', type=int) 39 | parser.add_argument('--size', type=int) 40 | parser.add_argument('--master_ip', type=str) 41 | args = parser.parse_args() 42 | 43 | dist.init_process_group('gloo', init_method=f"tcp://{args.master_ip}:12345", rank=args.rank, world_size=args.size) 44 | 45 | iterations = 50 46 | 47 | worker = DataWorker(model_type=args.model, device='cuda') 48 | step_start = time.time() 49 | for i in range(iterations): 50 | worker.compute_gradients() 51 | now = time.time() 52 | print("rank:", args.rank, "step time:", now - 
step_start, flush=True) 53 | step_start = now 54 | -------------------------------------------------------------------------------- /app/parameter-server/mpi_all_reduce.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | import time 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | import numpy as np 10 | 11 | from ps_helper import ConvNet, get_data_loader, evaluate, criterion 12 | 13 | from mpi4py import MPI 14 | 15 | comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | 18 | class DataWorker(object): 19 | def __init__(self, model_type="custom", device="cpu"): 20 | self.device = device 21 | self.model = ConvNet(model_type).to(device) 22 | self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.02) 23 | 24 | def compute_gradients(self, batch_size=128): 25 | data = torch.randn(batch_size, 3, 224, 224, device=self.device) 26 | self.model.zero_grad() 27 | output = self.model(data) 28 | loss = torch.mean(output) 29 | loss.backward() 30 | gradients = self.model.get_gradients() 31 | cont_grad = np.concatenate([p.ravel() for p in gradients]) 32 | grad_buffer = np.empty(self.model.n_param, dtype=np.float32) 33 | comm.Allreduce(cont_grad, grad_buffer, op=MPI.SUM) 34 | summed_gradients = self.model.buffer_to_tensors(grad_buffer.view(np.uint8)) 35 | self.optimizer.zero_grad() 36 | self.model.set_gradients(summed_gradients) 37 | self.optimizer.step() 38 | 39 | parser = argparse.ArgumentParser(description='parameter server') 40 | parser.add_argument('-m', '--model', type=str, default="custom", 41 | help='neural network model type') 42 | args = parser.parse_args() 43 | 44 | 45 | iterations = 50 46 | 47 | worker = DataWorker(model_type=args.model, device='cuda') 48 | step_start = time.time() 49 | for i in range(iterations): 50 | worker.compute_gradients() 51 | now = time.time() 52 | print("rank:", rank, "step time:", now - step_start, flush=True) 53 | step_start = now 54 | -------------------------------------------------------------------------------- /app/parameter-server/plot_async_ps_results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | 4 | MODELS = ['alexnet', 'vgg16', 'resnet50'] 5 | BATCH_SIZE_PER_CLIENT = 128 6 | 7 | # async batch size of clients = (#nodes - 1) // 2 8 | 9 | def parse_ray(filename, n_nodes): 10 | batch_size_client = (n_nodes - 1) // 2 11 | all_step_time = [] 12 | with open(filename, 'r') as f: 13 | for line in f.readlines(): 14 | if f"step time:" in line: 15 | all_step_time.append(float(line.split(f"step time:")[1])) 16 | all_step_time = np.array(all_step_time[3:]) 17 | all_step_throughput = BATCH_SIZE_PER_CLIENT * batch_size_client / all_step_time 18 | return np.mean(all_step_throughput), np.std(all_step_throughput) 19 | 20 | 21 | def parse_hoplite(filename, n_nodes): 22 | batch_size_client = (n_nodes - 1) // 2 23 | all_step_time = [] 24 | with open(filename, 'r') as f: 25 | for line in f.readlines(): 26 | if f"step time:" in line: 27 | all_step_time.append(float(line.split(f"step time:")[1])) 28 | all_step_time = np.array(all_step_time[6:]) 29 | all_step_throughput = BATCH_SIZE_PER_CLIENT * batch_size_client / all_step_time 30 | all_step_throughput = (all_step_throughput[0::2] + all_step_throughput[1::2]) / 2 31 | return np.mean(all_step_throughput), np.std(all_step_throughput) 32 | 33 | 34 | def parse_data(n_nodes): 35 | ray_mean = [] 36 | ray_std = [] 37 
| hoplite_mean = [] 38 | hoplite_std = [] 39 | for model in MODELS: 40 | mean, std = parse_ray(f"ps-log/async-ps-{n_nodes}-{model}-ray.log", n_nodes) 41 | ray_mean.append(mean) 42 | ray_std.append(std) 43 | mean, std = parse_hoplite(f"ps-log/async-ps-{n_nodes}-{model}-hoplite.log", n_nodes) 44 | hoplite_mean.append(mean) 45 | hoplite_std.append(std) 46 | return ray_mean, ray_std, hoplite_mean, hoplite_std 47 | 48 | 49 | def draw_async_ps_results(n_nodes): 50 | ray_mean, ray_std, hoplite_mean, hoplite_std = parse_data(n_nodes) 51 | colors = ( 52 | plt.get_cmap('tab20c')(0 * 4 + 1), 53 | plt.get_cmap('tab20c')(1 * 4 + 2), 54 | plt.get_cmap('tab20')(11), 55 | plt.get_cmap('tab20c')(2 * 4 + 2), 56 | ) 57 | 58 | ind = np.array(range(3)) 59 | width = 0.3 60 | 61 | plt.bar(ind, hoplite_mean, width, label='Hoplite', color=colors[0]) 62 | plt.errorbar(ind, hoplite_mean, yerr=hoplite_std, linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 63 | 64 | plt.bar(ind + width, ray_mean, width, label='Ray', color=colors[3]) 65 | plt.errorbar(ind + width, ray_mean, yerr=ray_std, linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 66 | 67 | plt.xticks(ind + width/2, ["AlexNet", "VGG-16", "ResNet50"], fontsize=20) 68 | plt.yticks(fontsize=20) 69 | plt.ylabel('Throughput\n(samples/s)', fontsize=20) 70 | plt.ylim(0, 2000) 71 | plt.legend(fontsize=20) 72 | plt.tight_layout() 73 | plt.savefig(f'async_training_{n_nodes}.pdf') 74 | 75 | 76 | if __name__ == '__main__': 77 | plt.figure(0) 78 | draw_async_ps_results(16) 79 | plt.figure(1) 80 | draw_async_ps_results(8) 81 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_async_ps_hoplite.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def parse(filename): 4 | all_step_time = [] 5 | with open(filename, 'r') as f: 6 | for line in f.readlines(): 7 | if f"step time:" in line: 8 | all_step_time.append(float(line.split(f"step time:")[1])) 9 | all_step_time = np.array(all_step_time[6:]) 10 | all_step_throughput = 1.0 / all_step_time 11 | all_step_throughput = (all_step_throughput[0::2] + all_step_throughput[1::2]) / 2 12 | return np.mean(all_step_throughput), np.std(all_step_throughput) 13 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_gloo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | path = sys.argv[1] 6 | prefix = sys.argv[2] 7 | 8 | all_step_time = [] 9 | 10 | for filename in os.listdir(path): 11 | if filename.startswith(prefix): 12 | step_time_rank = [] 13 | with open(os.path.join(path, filename), 'r') as f: 14 | for line in f.readlines(): 15 | if "step time:" in line: 16 | step_time_rank.append(float(line.split("step time:")[1])) 17 | all_step_time.append(np.array(step_time_rank)) 18 | 19 | all_step_time = np.array(all_step_time) 20 | all_step_time = all_step_time[:, 5:] 21 | all_step_time = np.amax(all_step_time, axis=0) 22 | 23 | all_step_throughput = 1.0 / all_step_time 24 | 25 | print(np.mean(all_step_throughput), np.std(all_step_throughput)) 26 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_hoplite.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 
| filename = sys.argv[1] 6 | n_nodes = int(sys.argv[2]) 7 | 8 | all_step_time = [] 9 | min_len = 1e10 10 | 11 | for i in range(n_nodes): 12 | step_time_rank = [] 13 | with open(filename, 'r') as f: 14 | for line in f.readlines(): 15 | if f" {i} in actor time" in line: 16 | step_time_rank.append(float(line.split(f" {i} in actor time")[1])) 17 | print(len(step_time_rank)) 18 | min_len = min(min_len, len(step_time_rank)) 19 | all_step_time.append(np.array(step_time_rank)) 20 | 21 | all_step_time = np.array([a[:min_len] for a in all_step_time]) 22 | all_step_time = all_step_time[:, 5:] 23 | all_step_time = np.amax(all_step_time, axis=0) 24 | all_step_throughput = 1.0 / all_step_time 25 | print(np.mean(all_step_throughput), np.std(all_step_throughput)) 26 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_mpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | 5 | filename = sys.argv[1] 6 | n_nodes = int(sys.argv[2]) 7 | 8 | all_step_time = [] 9 | min_len = 1e10 10 | 11 | for i in range(n_nodes): 12 | step_time_rank = [] 13 | with open(filename, 'r') as f: 14 | for line in f.readlines(): 15 | if f" {i} step time:" in line: 16 | step_time_rank.append(float(line.split(f" {i} step time:")[1])) 17 | print(len(step_time_rank)) 18 | min_len = min(min_len, len(step_time_rank)) 19 | all_step_time.append(np.array(step_time_rank)) 20 | 21 | all_step_time = np.array([a[:min_len] for a in all_step_time]) 22 | all_step_time = all_step_time[:, 5:] 23 | all_step_time = np.amax(all_step_time, axis=0) 24 | all_step_throughput = 1.0 / all_step_time 25 | print(np.mean(all_step_throughput), np.std(all_step_throughput)) 26 | -------------------------------------------------------------------------------- /app/parameter-server/result_parser/parse_ray.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def parse(filename): 4 | all_step_time = [] 5 | with open(filename, 'r') as f: 6 | for line in f.readlines(): 7 | if f"step time:" in line: 8 | all_step_time.append(float(line.split(f"step time:")[1])) 9 | all_step_time = np.array(all_step_time[3:]) 10 | all_step_throughput = 1.0 / all_step_time 11 | return np.mean(all_step_throughput), np.std(all_step_throughput) 12 | -------------------------------------------------------------------------------- /app/parameter-server/run_allreduce_tests.sh: -------------------------------------------------------------------------------- 1 | export RAY_BACKEND_LOG_LEVEL=info 2 | mkdir -p ps-log/ 3 | 4 | ROOT_DIR=$(dirname $(realpath -s $0))/../../ 5 | source $ROOT_DIR/load_cluster_env.sh 6 | 7 | for n_nodes in 16; do 8 | for model in alexnet vgg16 resnet50; do 9 | echo "==========" allreduce-$n_nodes-$model-mpi "==========" 10 | all_nodes=(${ALL_IPADDR[@]:0:$n_nodes}) 11 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 12 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 13 | 14 | pkill notification 15 | # $ROOT_DIR/restart_all_workers.sh 16 | $ROOT_DIR/mpirun_pernode.sh $all_hosts python $(realpath -s mpi_all_reduce.py) -m $model \ 17 | | tee ps-log/allreduce-$n_nodes-$model-mpi.log 18 | sleep 0.5 19 | 20 | echo "==========" allreduce-$n_nodes-$model-gloo "==========" 21 | i=0 22 | for node in ${ALL_IPADDR[@]:0:$n_nodes}; do 23 | echo "=> $node" 24 | ssh -o StrictHostKeyChecking=no $node 
PATH=$PATH:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda3/condabin, \ 25 | python $ROOT_DIR/app/parameter-server/gloo_all_reduce.py \ 26 | --master_ip $MY_IPADDR \ 27 | --rank $i \ 28 | --size $n_nodes \ 29 | -m $model \ 30 | | tee ps-log/allreduce-$n_nodes-$model-gloo.$i.log & 31 | i=$((i+1)) 32 | done 33 | wait 34 | sleep 0.5 35 | 36 | echo "==========" allreduce-$n_nodes-$model-hoplite "==========" 37 | python hoplite_all_reduce.py -n $n_nodes -m $model | tee ps-log/allreduce-$n_nodes-$model-hoplite.log 38 | sleep 0.5 39 | 40 | echo "==========" allreduce-$n_nodes-$model-ray "==========" 41 | python ray_parameter_server_baseline.py -n $(($n_nodes - 1)) -m $model | tee ps-log/allreduce-$n_nodes-$model-ray.log 42 | sleep 0.5 43 | done 44 | done 45 | -------------------------------------------------------------------------------- /app/parameter-server/run_async_ps_fault_tolerance.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export RAY_BACKEND_LOG_LEVEL=info 3 | 4 | sudo fuser -k 6666/tcp -s &> /dev/null 5 | sudo fuser -k 50055/tcp -s &> /dev/null 6 | sudo fuser -k 20210/tcp -s &> /dev/null 7 | sleep 1 8 | 9 | n_nodes=7 10 | model=resnet50 11 | 12 | echo "==========" async-ps-$n_nodes-$model-hoplite "==========" 13 | python hoplite_asgd_fault_tolerance.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) -m $model --iterations 100 14 | sleep 1 15 | 16 | echo "==========" async-ps-$n_nodes-$model-ray "==========" 17 | python ray_asgd_fault_tolerance.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) -m $model --iterations 100 18 | -------------------------------------------------------------------------------- /app/parameter-server/run_async_ps_tests.sh: -------------------------------------------------------------------------------- 1 | export RAY_BACKEND_LOG_LEVEL=info 2 | mkdir -p ps-log/ 3 | 4 | for n_nodes in 8 16; do 5 | for model in alexnet vgg16 resnet50; do 6 | echo "==========" async-ps-$n_nodes-$model-hoplite "==========" 7 | python parameter_server.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) -m $model | tee ps-log/async-ps-$n_nodes-$model-hoplite.log 8 | sleep 0.5 9 | 10 | echo "==========" async-ps-$n_nodes-$model-ray "==========" 11 | python ray_parameter_server_baseline.py -n $(($n_nodes - 1)) -a $((($n_nodes - 1) / 2)) -m $model | tee ps-log/async-ps-$n_nodes-$model-ray.log 12 | sleep 0.5 13 | done 14 | done 15 | -------------------------------------------------------------------------------- /app/ray_serve/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing ML Model Serving Experiments on AWS 2 | 3 | _(About 30 min)_ 4 | 5 | ## Setup 6 | 7 | _(About 15 min)_ 8 | 9 | If you are provided with an AWS IAM account & pre-built binaries 10 | * If you just want to review figures & raw experimental data, see [cluster-config-access-results-only](cluster-config-access-results-only). 11 | * If you also want to reproduce all results from the beginning, see [cluster-config-with-ami](cluster-config-with-ami) for setting up a cluster. 12 | 13 | If you are not provided with an AWS account or you want to build everything from scratch, see [cluster-config](cluster-config). 14 | 15 | ## ML model serving experiments (Figure 11) 16 | 17 | After logging in to the configured cluster, *chdir* to the current directory in the hoplite repo. 
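
For example, assuming the repository is mounted at `~/efs/hoplite` as in the cluster configs referenced above:

```bash
cd ~/efs/hoplite/app/ray_serve
```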
18 | 19 | Here is how you run the experiments: 20 | 21 | **Baseline** _(2-3 min)_: `python model_ensembling.py ${scale}` 22 | 23 | **Hoplite** _(1-2 min)_: `python hoplite_model_ensembling.py ${scale}` 24 | 25 | `${scale}` controls the cluster size. `scale=1` corresponds to 8 GPU nodes, `scale=2` corresponds to 16 GPU nodes in the figure. 26 | 27 | The script prints the mean and std of throughput (queries/s) at the end. 28 | 29 | ## ML Model Serving fault tolerance experiments (Figure 12a) 30 | 31 | Baseline + fault tolerance test _(About 2 min)_: `python model_ensembling_fault_tolerance.py 1` 32 | 33 | With Hoplite + fault tolerance test _(About 2 min)_: `python hoplite_model_ensembling_fault_tolerance.py 1` 34 | 35 | Run `python analyze_fault_tolerance.py` to compare the failure detection latency (see section 5.5 in the paper). 36 | 37 | ## Notes 38 | 39 | The initial run will be extremely slow on AWS because Python generates caches and other files on first use (about 4 min). This is normal. 40 | -------------------------------------------------------------------------------- /app/ray_serve/analyze_fault_tolerance.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | 4 | with open('ray_serve_log.json', 'r') as f: 5 | ray_log = json.load(f) 6 | durations = [l['duration'] for l in ray_log if l['event'] == 'fail'] 7 | print(f"Baseline latency caused by failure: {np.mean(durations):.6f} ± {np.std(durations):.6f}s") 8 | 9 | with open('hoplite_ray_serve_log.json', 'r') as f: 10 | hoplite_log = json.load(f) 11 | durations = [l['duration'] for l in hoplite_log if l['event'] == 'fail'] 12 | print(f"Hoplite latency caused by failure: {np.mean(durations):.6f} ± {np.std(durations):.6f}s") 13 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config-access-results-only/README.md: -------------------------------------------------------------------------------- 1 | # Setup Hoplite ML serving experiments on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Access Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries and results via `cd ~/efs/hoplite-with-results/` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | 26 | ## Access results 27 | 28 | ### ML model serving experiments (Figure 11) 29 | 30 | The results are collected dynamically, so if you want to get the result numbers, you need to run the experiments with [this cluster setup](../cluster-config-with-ami). 31 | 32 | ### ML Model Serving fault tolerance experiments (Figure 12a) 33 | 34 | After logging into the cluster, `cd ~/efs/hoplite-with-results/app/ray_serve`. `hoplite_ray_serve_log.json` and `ray_serve_log.json` contain the raw trajectory during failure. 35 | 36 | Run `python analyze_fault_tolerance.py` to compare the failure detection latency (see section 5.5 in the paper).
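
If you prefer to inspect the raw failure trajectories on your local machine, they can also be downloaded with the cluster launcher; a minimal sketch, assuming the remote paths from the directory layout above:

~~~bash
ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/app/ray_serve/ray_serve_log.json .
ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/app/ray_serve/hoplite_ray_serve_log.json .
~~~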
37 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config-access-results-only/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config-with-ami/README.md: -------------------------------------------------------------------------------- 1 | # Setup Hoplite ML serving experiments on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries via `cd ~/efs/hoplite` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 
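
If the experiments appear to hang right after `ray attach`, it is worth checking that all 16 workers have joined the cluster. A minimal sketch of an optional check (not part of the original workflow), using the standard Ray cluster-launcher commands:

~~~bash
# Runs `ray status` on the head node to show how many nodes have joined.
ray exec example.yaml 'ray status'
~~~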
25 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config-with-ami/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-ml-serving 2 | 3 | min_workers: 16 4 | max_workers: 16 5 | initial_workers: 16 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: p3.2xlarge 31 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 56 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config/cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-ml-serving 2 | 3 | min_workers: 16 4 | max_workers: 16 5 | initial_workers: 16 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: {image-id} 22 | SecurityGroupIds: 23 | - "{security group id created by inital.yaml}" 24 | Placement: 25 | GroupName: {group-name} 26 | 27 | worker_nodes: 28 | InstanceType: p3.2xlarge 29 | ImageId: {image-id} 30 | SecurityGroupIds: 31 | - "{security group id created by inital.yaml}" 32 | Placement: 33 | GroupName: {group-name} 34 | 35 | setup_commands: 36 | # This replaces the standard anaconda Ray installation 37 | - mkdir -p ~/efs 38 | - sudo mount -t efs {efs-id}:/ ~/efs 39 | - sudo chmod 777 ~/efs 40 | 41 | # Command to start ray on the head node. You don't need to change this. 
42 | head_start_ray_commands: 43 | - ray stop 44 | # we allocate 28 GB memory for Ray object store 45 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | # we allocate 28 GB memory for Ray object store 51 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 52 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-ml-serving 2 | 3 | min_workers: 16 4 | max_workers: 16 5 | initial_workers: 16 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: p3.2xlarge 21 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: p3.2xlarge 31 | ImageId: ami-02f718c4a5c79a4ad # hoplite-sigcomm21-image 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 56 | -------------------------------------------------------------------------------- /app/ray_serve/cluster-config/initial.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single-initial 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-04cd519d2f9578053 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 22 | 23 | worker_nodes: 24 | InstanceType: m5.4xlarge 25 | ImageId: ami-04cd519d2f9578053 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 26 | 27 | setup_commands: [] 28 | 29 | # Command to start ray on the head node. You don't need to change this. 30 | head_start_ray_commands: [] 31 | 32 | # Command to start ray on worker nodes. You don't need to change this. 33 | worker_start_ray_commands: [] 34 | -------------------------------------------------------------------------------- /app/rllib/README-with-ami.md: -------------------------------------------------------------------------------- 1 | # Reproducing RLLib experiments in Hoplite on AWS (with AMI). 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python 3 is installed. Then install Ray version `0.8.0` and boto with: 6 | ~~~bash 7 | pip install ray==0.8.0 boto3 8 | ~~~ 9 | 10 | ## Start the Cluster and Evaluate _(About 30 min)_ 11 | 12 | 1. Launch the cluster and log in: 13 | ~~~bash 14 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 15 | export AWS_SECRET_ACCESS_KEY="Your Secret Access Key" 16 | ray up example.yaml 17 | ray attach example.yaml 18 | ~~~ 19 | 2. Move to the running scripts directory: 20 | ~~~bash 21 | cd ~/hoplite-rllib/hoplite-scripts 22 | ~~~ 23 | 3. Generate the cluster configuration: 24 | ~~~bash 25 | python a3c_generate_config.py 26 | python impala_generate_config.py 27 | ~~~ 28 | 4. Test all configurations: 29 | ~~~bash 30 | ./test_all_generated.sh 31 | ~~~ 32 | 5. After all experiments have finished, we can get the results via: 33 | ~~~bash 34 | python a3c_parse_log.py 35 | python impala_parse_log.py 36 | ~~~ 37 | The results will be in the format of: 38 | ~~~ 39 | #nodes / - / Hoplite or Ray / Throughput (mean) / Throughput (std) 40 | ~~~ 41 | -------------------------------------------------------------------------------- /app/rllib/cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: {image-id} 22 | Placement: 23 | GroupName: {group-name} 24 | 25 | worker_nodes: 26 | InstanceType: m5.4xlarge 27 | ImageId: {image-id} 28 | Placement: 29 | GroupName: {group-name} 30 | 31 | setup_commands: [] 32 | 33 | # Command to start ray on the head node. You don't need to change this. 34 | head_start_ray_commands: 35 | - ray stop 36 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 37 | 38 | # Command to start ray on worker nodes. You don't need to change this.
39 | worker_start_ray_commands: 40 | - ray stop 41 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 42 | -------------------------------------------------------------------------------- /app/rllib/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-0e9f764f786728984 # hoplite-artifact-rllib-2 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-0e9f764f786728984 # hoplite-artifact-rllib-2 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | 40 | setup_commands: [] 41 | 42 | # Command to start ray on the head node. You don't need to change this. 43 | head_start_ray_commands: 44 | - ray stop 45 | - "ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | - "ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}'" 51 | -------------------------------------------------------------------------------- /app/rllib/initial.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single-initial 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-west-2 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-west-2a 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-0f9543706892e0363 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 22 | 23 | worker_nodes: 24 | InstanceType: m5.4xlarge 25 | ImageId: ami-0f9543706892e0363 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 26 | 27 | setup_commands: [] 28 | 29 | # Command to start ray on the head node. You don't need to change this. 30 | head_start_ray_commands: [] 31 | 32 | # Command to start ray on worker nodes. You don't need to change this. 
33 | worker_start_ray_commands: [] 34 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | make clean 3 | clang-format -i *.cc src/*.cc src/*.h src/util/*.cc src/util/*.h mpi/*.c 4 | -------------------------------------------------------------------------------- /fornode: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script can run commands on all nodes on the cluster: ./fornode 4 | 5 | ROOT_DIR=$(dirname $(realpath -s $0)) 6 | source $ROOT_DIR/test_utils/load_cluster_env.sh 7 | 8 | for node in ${ALL_IPADDR[@]}; do 9 | echo "=> $node" 10 | ssh -o StrictHostKeyChecking=no $node PATH=$PATH:/home/ubuntu/anaconda3/bin:/home/ubuntu/anaconda3/condabin, $@ & 11 | done 12 | wait 13 | -------------------------------------------------------------------------------- /install_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd $HOME 4 | 5 | sudo apt update 6 | 7 | ## build grpc 8 | if [ ! -d grpc ]; then 9 | 10 | sudo apt-get install -y \ 11 | build-essential \ 12 | autoconf \ 13 | libtool \ 14 | pkg-config \ 15 | libgflags-dev \ 16 | libgtest-dev \ 17 | clang-5.0 \ 18 | libc++-dev 19 | 20 | git clone https://github.com/grpc/grpc.git 21 | 22 | pushd grpc 23 | # pin gRPC version to 1.31.0 24 | git checkout tags/v1.31.0 25 | git submodule update --init --recursive 26 | 27 | mkdir build && cd build 28 | cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local 29 | make -j8 && sudo make install 30 | popd 31 | 32 | pushd grpc/third_party/protobuf 33 | ./autogen.sh 34 | ./configure 35 | make -j8 && sudo make install 36 | popd 37 | fi 38 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config-access-results-only/README.md: -------------------------------------------------------------------------------- 1 | # Setup AWS Cluster for Hoplite Microbenchmarks on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries and results via `cd ~/efs/hoplite-with-results/` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | 26 | ## Access results 27 | 28 | You can download results from the cluster to your local machine by executing 29 | 30 | ~~~bash 31 | ray rsync-down example.yaml 32 | ~~~ 33 | 34 | Here is how you could download main results: 35 | 36 | ### Roundtrip Microbenchmarks (Figure 6 at Section 5.1) 37 | 38 | **Raw data for Figure 6** 39 | 40 | ~~~bash 41 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/roundtrip-results.csv . 42 | ~~~ 43 | 44 | **Figure 6 (a)** 45 | 46 | ~~~bash 47 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/RTT1K.pdf . 
48 | ~~~ 49 | 50 | **Figure 6 (b)** 51 | 52 | ~~~bash 53 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/RTT1M.pdf . 54 | ~~~ 55 | 56 | **Figure 6 (c)** 57 | 58 | ~~~bash 59 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/RTT1G.pdf . 60 | ~~~ 61 | 62 | ## Collective Communication Microbenchmarks (Figure 7 at Section 5.1, Figure 13 at Appendix A) 63 | 64 | **Raw data for Figure 7 & Figure 13** 65 | 66 | ~~~bash 67 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/mpi-cpp/mpi_results.csv . 68 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/hoplite-cpp/hoplite_results.csv . 69 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/gloo-cpp/gloo_results.csv . 70 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/ray-python/ray-microbenchmark.csv . 71 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/dask-python/dask_results.csv . 72 | ~~~ 73 | 74 | **Figure 7, Section 5.1** 75 | 76 | ~~~bash 77 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/microbenchmarks-large.pdf . 78 | ~~~ 79 | 80 | **Figure 13, Appendix A** 81 | 82 | ~~~bash 83 | ray rsync-down example.yaml /home/ubuntu/efs/hoplite-with-results/microbenchmarks/microbenchmarks-small.pdf . 84 | ~~~ 85 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config-access-results-only/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 
52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config-with-ami/README.md: -------------------------------------------------------------------------------- 1 | # Setup AWS Cluster for Hoplite Microbenchmarks on AWS. 2 | 3 | ## Setup Local Environment _(About 2 min)_ 4 | 5 | On your local machine, make sure Python (>=3.6) is installed on the local machine. Then install Ray version `1.3` and boto with: 6 | 7 | ~~~bash 8 | pip install ray==1.3 boto3 # if failed, use "pip -V" to check if you are using python3 9 | ~~~ 10 | 11 | ## Start the Cluster _(About 3 min)_ 12 | 13 | Start the cluster and connect to the head node via: 14 | 15 | ~~~bash 16 | export AWS_ACCESS_KEY_ID="Your Access Key ID" 17 | export AWS_SECRET_ACCESS_KEY="Your Secret Acess Key" 18 | ray up example.yaml 19 | ray attach example.yaml 20 | ~~~ 21 | 22 | Visit the directory with pre-built binaries via `cd ~/efs/hoplite` 23 | 24 | Remember to take down the cluster using `ray down example.yaml` on the local machine after evaluation. 25 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config-with-ami/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 
52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config/cluster.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: {image-id} 22 | SecurityGroupIds: 23 | - "{security group id created by inital.yaml}" 24 | Placement: 25 | GroupName: {group-name} 26 | 27 | worker_nodes: 28 | InstanceType: m5.4xlarge 29 | ImageId: {image-id} 30 | SecurityGroupIds: 31 | - "{security group id created by inital.yaml}" 32 | Placement: 33 | GroupName: {group-name} 34 | 35 | setup_commands: 36 | # This replaces the standard anaconda Ray installation 37 | - mkdir -p ~/efs 38 | - sudo mount -t efs {efs-id}:/ ~/efs 39 | - sudo chmod 777 ~/efs 40 | 41 | # Command to start ray on the head node. You don't need to change this. 42 | head_start_ray_commands: 43 | - ray stop 44 | # we allocate 28 GB memory for Ray object store 45 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 46 | 47 | # Command to start ray on worker nodes. You don't need to change this. 48 | worker_start_ray_commands: 49 | - ray stop 50 | # we allocate 28 GB memory for Ray object store 51 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 52 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config/example.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite 2 | 3 | min_workers: 15 4 | max_workers: 15 5 | initial_workers: 15 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 
13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 22 | SecurityGroupIds: 23 | - "sg-092b10044bcf1f37e" 24 | Placement: 25 | GroupName: hoplite-group 26 | InstanceMarketOptions: 27 | MarketType: spot 28 | 29 | worker_nodes: 30 | InstanceType: m5.4xlarge 31 | ImageId: ami-00df0f081db89b1f4 # hoplite-sigcomm21-image-3 32 | SecurityGroupIds: 33 | - "sg-092b10044bcf1f37e" 34 | Placement: 35 | GroupName: hoplite-group 36 | InstanceMarketOptions: 37 | MarketType: spot 38 | 39 | setup_commands: 40 | # This replaces the standard anaconda Ray installation 41 | - mkdir -p ~/efs 42 | - sudo mount -t efs fs-d416cc55:/ ~/efs 43 | - sudo chmod 777 ~/efs 44 | 45 | # Command to start ray on the head node. You don't need to change this. 46 | head_start_ray_commands: 47 | - ray stop 48 | # we allocate 28 GB memory for Ray object store 49 | - "sudo ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --resources='{\"machine\": 1}' --object-store-memory 30064771072 --system-config '{\"num_heartbeats_timeout\": 10000}'" 50 | 51 | # Command to start ray on worker nodes. You don't need to change this. 52 | worker_start_ray_commands: 53 | - ray stop 54 | # we allocate 28 GB memory for Ray object store 55 | - "sudo ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --resources='{\"machine\": 1}' --object-store-memory 30064771072" 56 | -------------------------------------------------------------------------------- /microbenchmarks/cluster-config/initial.yaml: -------------------------------------------------------------------------------- 1 | cluster_name: hoplite-single-initial 2 | 3 | min_workers: 0 4 | max_workers: 0 5 | initial_workers: 0 6 | 7 | provider: 8 | type: aws 9 | region: us-east-1 10 | # Availability zone(s), comma-separated, that nodes may be launched in. 11 | # Nodes are currently spread between zones by a round-robin approach, 12 | # however this implementation detail should not be relied upon. 13 | availability_zone: us-east-1f 14 | cache_stopped_nodes: False 15 | 16 | auth: 17 | ssh_user: ubuntu 18 | 19 | head_node: 20 | InstanceType: m5.4xlarge 21 | ImageId: ami-04cd519d2f9578053 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 22 | 23 | worker_nodes: 24 | InstanceType: m5.4xlarge 25 | ImageId: ami-04cd519d2f9578053 # Deep Learning AMI (Ubuntu 18.04) Version 43.0 26 | 27 | setup_commands: [] 28 | 29 | # Command to start ray on the head node. You don't need to change this. 30 | head_start_ray_commands: [] 31 | 32 | # Command to start ray on worker nodes. You don't need to change this. 
33 | worker_start_ray_commands: [] 34 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/auto_dask_benchmark.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from subprocess import Popen, PIPE 3 | 4 | parser = argparse.ArgumentParser(description='Automatic Dask collective communication benchmark') 5 | parser.add_argument('rounds', type=int, help="How many rounds we would to run the benchmark.") 6 | 7 | args = parser.parse_args() 8 | 9 | for i in range(args.rounds): 10 | with open(f"result-{i+1}.csv", "w") as f: 11 | for algorithm in ('multicast', 'gather', 'reduce', 'allreduce'): 12 | for world_size in (4, 8, 12, 16): 13 | for object_size in (2 ** 10, 2 ** 15, 2 ** 20, 2 ** 25, 2 ** 30): 14 | process = Popen(["python", "dask_benchmark.py", 15 | algorithm, "-n", str(world_size), "-s", str(object_size)], stdout=PIPE) 16 | (output, err) = process.communicate() 17 | exit_code = process.wait() 18 | print(algorithm, world_size, object_size, float(output)) 19 | f.write(f"{algorithm},{world_size},{object_size},{float(output)}\n") 20 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT 4 | 5 | ./cleanup_dask.sh 6 | ./run_dask.sh 16 & 7 | python auto_dask_benchmark.py 5 8 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/cleanup_dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo pkill dask-scheduler 3 | ../../fornode sudo pkill dask-worker 4 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/dask_roundtrip.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from dask.distributed import Client 5 | 6 | 7 | def round_trip(obj): 8 | return obj 9 | 10 | 11 | def measure_round_trip(client, object_size): 12 | payload = np.empty(object_size, dtype=np.uint8) 13 | before = time.time() 14 | receiver = client.submit(round_trip, payload, workers=['Dask-1']) 15 | receiver.result() 16 | duration = time.time() - before 17 | return duration 18 | 19 | 20 | def main(): 21 | client = Client("127.0.0.1:8786") 22 | 23 | # warmup 24 | for size in (2**10, 2**20): 25 | for _ in range(5): 26 | measure_round_trip(client, size) 27 | 28 | with open(f"dask-roundtrip.csv", "w") as f: 29 | for size in (2**10, 2**20, 2**30): 30 | t = [] 31 | for _ in range(5): 32 | duration = measure_round_trip(client, size) 33 | t.append(duration) 34 | f.write(f"dask,{size},{np.mean(t)},{np.std(t)}\n") 35 | 36 | # # Accumulate time for more precision. 
37 | # duration = 0.0 38 | # for j in range(i + 1, i + 1 + 10): 39 | # duration += func(client, world_size, object_size, j) 40 | # duration /= 10 41 | # print(duration) 42 | 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /microbenchmarks/dask-python/dask_roundtrip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT 4 | 5 | ./cleanup_dask.sh 6 | ./run_dask.sh 2 & 7 | python dask_roundtrip.py 8 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import pandas as pd 4 | 5 | parser = argparse.ArgumentParser(description='Hoplite (C++) benchmark results parser.') 6 | parser.add_argument('--verbose', action='store_true') 7 | args = parser.parse_args() 8 | 9 | tables = [] 10 | 11 | for i in range(1, 100): 12 | filename = f"result-{i}.csv" 13 | if os.path.exists(filename): 14 | tables.append(pd.read_csv(filename, header=None)) 15 | else: 16 | break 17 | 18 | df_avg = pd.concat(tables).groupby(by=[0, 1, 2]).mean() 19 | df_std = pd.concat(tables).groupby(by=[0, 1, 2]).std() 20 | df_cnt = pd.concat(tables).groupby(by=[0, 1, 2]).count() 21 | df_final = pd.concat([df_avg, df_std, df_cnt], axis=1) 22 | df_final.reset_index(inplace=True) 23 | columns = ['Benchmark Name', '#Nodes', 'Object Size (in bytes)', 24 | 'Average Time (s)', 'Std Time (s)', 'Repeated Times'] 25 | df_final.to_csv("dask_results.csv", header=columns, index=False) 26 | 27 | if args.verbose: 28 | print(df_final) 29 | -------------------------------------------------------------------------------- /microbenchmarks/dask-python/run_dask.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) number of nodes"; exit -1; fi 3 | if [ "$#" -gt 3 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 6 | 7 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 8 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 9 | world_size=$1 10 | 11 | if [ "$#" -eq 1 ]; then 12 | source $TEST_UNILS_DIR/load_cluster_env.sh 13 | OTHERS_IPADDR=(${OTHERS_IPADDR[@]:0:$(($world_size-1))}) 14 | 15 | dask-scheduler & 16 | sleep 1 17 | 18 | for index in ${!OTHERS_IPADDR[@]} 19 | do 20 | rank=$((index+1)) 21 | ssh -t -t ${OTHERS_IPADDR[$index]} "$(realpath -s $0) $MY_IPADDR $rank" & 22 | done 23 | 24 | dask-worker $MY_IPADDR:8786 --name Dask-0 25 | else 26 | master=$1 27 | index=$2 28 | source ~/anaconda3/etc/profile.d/conda.sh 29 | conda activate 30 | dask-worker $master:8786 --name Dask-$index 31 | fi 32 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/.gitignore: -------------------------------------------------------------------------------- 1 | gloo/ 2 | gloo_results.csv 3 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/README.md: -------------------------------------------------------------------------------- 1 | ## Gloo collective communication benchmarks (baseline) 2 | 3 | Usage: 4 | 5 | ```bash 6 | ./run_benchmark.sh 
${gloo_microbenchmark_name} ${total_number_of_nodes} ${input_size_in_bytes}` 7 | ``` 8 | 9 | `${gloo_microbenchmark_name}` includes allreduce_ring, allreduce_ring_chunked, allreduce_halving_doubling, allreduce_bcube, barrier_all_to_all, broadcast_one_to_all, pairwise_exchange. 10 | 11 | Note: Sometimes Gloo would be flaky and you might see error messages like 12 | 13 | ``` 14 | terminate called after throwing an instance of 'gloo::IoException' 15 | what(): [**/hoplite/microbenchmarks/gloo-cpp/gloo/gloo/transport/tcp/pair.cc:572] Connection closed by peer [172.31.48.113]:44461 16 | ``` 17 | 18 | when you use large payloads. We have taken that into consideration when writing our result parsing scripts, and you will get informed during parsing. You can manually rerun these tests if you want to increase the accuracy of the statistics. 19 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for num_nodes in 4 8 12 16; do 3 | for test_name in allreduce_ring_chunked allreduce_halving_doubling broadcast_one_to_all; do 4 | for sz in 10 15 20 25 30; do 5 | for i in `seq 5`; do 6 | obj_size=$((2**$sz)) 7 | ./run_test.sh $test_name $num_nodes $obj_size 8 | done 9 | done 10 | done 11 | done 12 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/install_gloo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # See https://github.com/facebookincubator/gloo 4 | 5 | sudo apt-get install -y libhiredis-dev redis-server 6 | 7 | if [ ! -d gloo ]; then 8 | git clone git@github.com:facebookincubator/gloo.git 9 | fi 10 | 11 | cd gloo 12 | # Pin gloo version to commit 881f7f0dcf06f7e49e134a45d3284860fb244fa9 13 | git checkout 881f7f0dcf06f7e49e134a45d3284860fb244fa9 14 | rm -rf build 15 | mkdir build 16 | cd build 17 | # Redis is required for the benchmark. 18 | cmake ../ -DBUILD_BENCHMARK=1 -DUSE_REDIS=ON 19 | make -j8 20 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | sys.path.insert(0, "../../test_utils") 5 | import result_parser_utils 6 | 7 | # Example output 8 | 9 | """ 10 | Device: tcp, pci=0000:00:05.0, iface=ens5, speed=-1, addr=[172.31.49.113] 11 | Algorithm: allreduce_ring_chunked 12 | Options: processes=4, inputs=1, threads=1 13 | 14 | elements min (us) p50 (us) p99 (us) max (us) avg (GB/s) samples 15 | 268435456 1443672 1443672 1443672 1443672 0.693 1 16 | """ 17 | 18 | def parse_file(task_name, log_dir, foldername): 19 | try: 20 | lines = result_parser_utils.read_rank0_lines(log_dir, foldername) 21 | # The unit of the original result is microsecond. We turn it into seconds. 
22 | return float(lines[5].split()[2]) / 1000 / 1000 23 | except Exception: 24 | return None 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser(description='Gloo (C++) benchmark results parser.') 29 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 30 | help='The logging directory of Gloo benchmarks') 31 | parser.add_argument('--verbose', action='store_true') 32 | args = parser.parse_args() 33 | df = result_parser_utils.parse(args.log_dir, parse_file) 34 | if args.verbose: 35 | print(df) 36 | df.to_csv('gloo_results.csv', index=False) 37 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 3 ]; then 4 | echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test name, number of nodes & input size required" 5 | echo "test name: allreduce_ring, allreduce_ring_chunked, allreduce_halving_doubling, " 6 | echo " allreduce_bcube, barrier_all_to_all, broadcast_one_to_all, pairwise_exchange" 7 | exit -1 8 | fi 9 | 10 | if [ "$#" -gt 3 ]; then 11 | echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#" 12 | exit -1 13 | fi 14 | 15 | # trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 16 | 17 | test_name=$1 18 | world_size=$2 19 | object_size=$3 20 | 21 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 22 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 23 | GLOO_DIR=$SCRIPT_DIR/gloo/ 24 | 25 | source $TEST_UNILS_DIR/load_cluster_env.sh 26 | 27 | # prepare logging directory 28 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 29 | mkdir -p $log_dir 30 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 31 | 32 | # gloo benchmarks requires Redis 33 | redis-server --port 7799 --protected-mode no &> /dev/null & 34 | REDIS_PID=$! 35 | sleep 1 36 | echo "IP address of this node: $MY_IPADDR" 37 | 38 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 39 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 40 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 41 | -x GLOO_DIR="$GLOO_DIR" \ 42 | -x GLOO_LOGGING_DIR="$log_dir" \ 43 | -x REDIS_HOST="$MY_IPADDR" \ 44 | -x test_name="$test_name" \ 45 | -x object_size="$object_size" \ 46 | test_wrapper.sh 47 | 48 | kill $REDIS_PID 49 | -------------------------------------------------------------------------------- /microbenchmarks/gloo-cpp/test_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | logging_file=$GLOO_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 3 | $GLOO_DIR/build/gloo/benchmark/benchmark \ 4 | --size $OMPI_COMM_WORLD_SIZE \ 5 | --rank $OMPI_COMM_WORLD_RANK \ 6 | --redis-host $REDIS_HOST \ 7 | --redis-port 7799 \ 8 | --prefix benchmark-$test_name-$OMPI_COMM_WORLD_SIZE-$object_size \ 9 | --transport tcp \ 10 | --elements $(($object_size / 4)) \ 11 | --iteration-count 1 \ 12 | $test_name \ 13 | 2>&1 | tee $logging_file 14 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/README.md: -------------------------------------------------------------------------------- 1 | ## Hoplite C++ interface benchmarks 2 | 3 | Hoplite collective communication benchmarks with C++ binaries. 
4 | 5 | ```bash 6 | ./run_test.sh ${microbenchmark_name} ${total_number_of_nodes} ${input_size_in_bytes} ${number_of_trials} 7 | ``` 8 | 9 | `${microbenchmark_name}` includes the 5 most common collective communication operations: `multicast`, `reduce`, `gather`, `allreduce`, `allgather`. 10 | 11 | ### Pressure test (optional) 12 | 13 | Intensive benchmarks that exercise corner cases. This test demonstrates the reliability of our system. It can take several minutes to complete even on high-speed networks. 14 | 15 | Usage: `./pressure_test.sh ${total_number_of_nodes}` 16 | 17 | ### Subset reduction test (optional) 18 | 19 | This test shows that Hoplite is able to reduce only a subset of objects. For example, given 8 candidate objects, we can reduce just the 4 objects that are created first. 20 | 21 | Usage: 22 | 23 | ```bash 24 | ./run_test.sh subset_reduce ${total_number_of_nodes} ${input_size_in_bytes} ${number_of_trials} 25 | ``` 26 | 27 | We suggest `total_number_of_nodes>=4`. 28 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for test_name in multicast reduce gather allreduce; do 3 | for num_nodes in 4 8 12 16; do 4 | for sz in 10 15 20 25 30; do 5 | obj_size=$((2**$sz)) 6 | ./run_test.sh ${test_name} $num_nodes $obj_size 5 7 | sleep 1 8 | done 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/coverage_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "exit" INT 4 | 5 | num_nodes=8 6 | 7 | for test_name in multicast reduce gather allreduce allgather; do 8 | for i in 15 25; do 9 | obj_size=$((2**$i)) 10 | echo $test_name-$num_nodes-$obj_size 11 | ./run_test.sh $test_name $num_nodes $obj_size 3 12 | sleep 1 13 | done 14 | done 15 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | import sys 6 | 7 | sys.path.insert(0, "../../test_utils") 8 | import result_parser_utils 9 | 10 | 11 | WARMUP_ROUNDS = 2 12 | 13 | 14 | def get_durations(lines): 15 | durations = [] 16 | for line in lines: 17 | if 'duration = ' in line: 18 | tmp = line.split('duration = ')[1] 19 | durations.append(float(tmp)) 20 | return durations 21 | 22 | 23 | def parse_all_ranks(folder_path, with_rank0=True): 24 | files = os.listdir(folder_path) 25 | all_rank_durations = [] 26 | for filename in files: 27 | if 'rank' in filename and (with_rank0 or 'rank_0' not in filename): 28 | try: 29 | with open(os.path.join(folder_path, filename)) as f: 30 | durations = get_durations(f.readlines()) 31 | if not durations: 32 | raise ValueError("Bad file") 33 | all_rank_durations.append(durations) 34 | except Exception: 35 | print("Bad file", folder_path, filename) 36 | return None 37 | 38 | try: 39 | return np.max(all_rank_durations, axis=0) 40 | except Exception as e: 41 | print("Error: empty directory", folder_path, e) 42 | return None 43 | 44 | 45 | def parse_file(task_name, log_dir, foldername): 46 | path = os.path.join(log_dir, foldername) 47 | 48 | if task_name in ('allreduce', 'allgather'): 49 | return parse_all_ranks(path) 50 | elif task_name == 'multicast': 51 | return parse_all_ranks(path, with_rank0=False) 52 | elif task_name in ('reduce', 'gather', 
'subset_reduce'): 53 | return result_parser_utils.default_parse_file(task_name, log_dir, foldername) 54 | else: 55 | raise ValueError('Unknown task', task_name) 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser(description='Hoplite (C++) benchmark results parser.') 60 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 61 | help='The logging directory of Gloo benchmarks') 62 | parser.add_argument('--verbose', action='store_true') 63 | args = parser.parse_args() 64 | df = result_parser_utils.parse(args.log_dir, parse_file) 65 | if args.verbose: 66 | print(df) 67 | df.to_csv('hoplite_results.csv', index=False) 68 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/pressure_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) number of nodes required"; exit -1; fi 3 | if [ "$#" -gt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | trap "exit" INT 6 | 7 | ./run_test.sh multicast $1 $[2**10] 1000 8 | ./run_test.sh multicast $1 $[2**17] 1000 9 | ./run_test.sh multicast $1 $[2**30] 5 10 | 11 | ./run_test.sh reduce $1 $[2**10] 1000 12 | ./run_test.sh reduce $1 $[2**17] 1000 13 | ./run_test.sh reduce $1 $[2**30] 5 14 | 15 | ./run_test.sh allreduce $1 $[2**10] 1000 16 | ./run_test.sh allreduce $1 $[2**17] 1000 17 | ./run_test.sh allreduce $1 $[2**30] 5 18 | 19 | ./run_test.sh gather $1 $[2**10] 1000 20 | ./run_test.sh gather $1 $[2**17] 1000 21 | ./run_test.sh gather $1 $[2**30] 5 22 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 4 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test name, number of nodes, input size & n_trials required"; exit -1; fi 3 | if [ "$#" -gt 4 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | ## setup 6 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 7 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 8 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 9 | BINARIES_DIR=$(realpath -s $SCRIPT_DIR/../../build) 10 | TEST_BINARIES_DIR=$BINARIES_DIR/tests 11 | 12 | ## cleanup procs 13 | sudo fuser -k 6666/tcp -s &> /dev/null 14 | sudo fuser -k 50055/tcp -s &> /dev/null 15 | sudo fuser -k 20210/tcp -s &> /dev/null 16 | 17 | test_name=$1 18 | test_executable_abspath=$TEST_BINARIES_DIR/${test_name}_test 19 | world_size=$2 20 | object_size=$3 21 | n_trials=$4 22 | 23 | if [ ! 
-f $test_executable_abspath ]; then 24 | echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test executable not found: $test_executable_abspath" 25 | exit -2 26 | fi 27 | 28 | # get cluster info 29 | source $TEST_UNILS_DIR/load_cluster_env.sh 30 | OTHERS_IPADDR=(${OTHERS_IPADDR[@]:0:$(($world_size-1))}) 31 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) head_node: $MY_IPADDR; other_nodes: ${OTHERS_IPADDR[@]}" 32 | 33 | # prompt test info 34 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) Running test $(tput setaf 3)$(tput bold)$test_name$(tput sgr 0)" 35 | 36 | # create logging dir 37 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 38 | mkdir -p $log_dir 39 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 40 | 41 | export RAY_BACKEND_LOG_LEVEL=info 42 | 43 | pkill notification 44 | sleep 0.5 45 | ($BINARIES_DIR/notification 2>&1 | tee $log_dir/$MY_IPADDR.notification.log) & 46 | sleep 0.5 47 | 48 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 49 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 50 | 51 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 52 | -x HOPLITE_LOGGING_DIR=$log_dir \ 53 | -x RAY_BACKEND_LOG_LEVEL=$RAY_BACKEND_LOG_LEVEL \ 54 | $SCRIPT_DIR/test_wrapper.sh $test_executable_abspath $MY_IPADDR $object_size $n_trials 55 | 56 | sleep 1 57 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-cpp/test_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | logging_file=$HOPLITE_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 3 | $@ 2>&1 | tee $logging_file 4 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/README.md: -------------------------------------------------------------------------------- 1 | ## Hoplite Python interface benchmarks 2 | 3 | Hoplite collective communication benchmarks with Python. 4 | 5 | ```bash 6 | ./run_test.sh ${microbenchmark_name} ${total_number_of_nodes} ${input_size_in_bytes} 7 | ``` 8 | 9 | `${microbenchmark_name}` includes the 5 most common collective communication operations: `multicast`, `reduce`, `gather`, `allreduce`, `allgather`. 10 | 11 | ### Pressure test (optional) 12 | 13 | Intensive benchmarks that exercise corner cases. This test demonstrates the reliability of our system. It can take several minutes to complete even on high-speed networks. 14 | 15 | Usage: `./pressure_test.sh ${total_number_of_nodes}` 16 | 17 | ### Round-trip test (optional) 18 | 19 | This test shows that, when transferring data from the object store, Hoplite is able to overlap object copying with object transfer to achieve higher performance. 
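Below is a minimal, self-contained sketch of the general idea (an illustration only, not Hoplite's actual implementation): one thread hands off chunks as they "arrive", while a second thread copies finished chunks into the destination buffer, so copying and transfer proceed concurrently instead of back to back.

```python
# Illustrative sketch of overlapping chunk transfer with chunk copying.
# All names here are hypothetical; Hoplite's real pipeline is implemented in C++.
import queue
import threading

import numpy as np

CHUNK_SIZE = 1 << 20  # 1 MiB chunks (illustrative choice)


def receive_chunks(src: np.ndarray, chunks: queue.Queue) -> None:
    # Stand-in for the network receiver: emit one chunk at a time.
    for offset in range(0, src.size, CHUNK_SIZE):
        chunks.put((offset, src[offset:offset + CHUNK_SIZE].copy()))
    chunks.put(None)  # end-of-stream marker


def copy_chunks(dst: np.ndarray, chunks: queue.Queue) -> None:
    # Copy each chunk into place as soon as it is available,
    # instead of waiting for the whole object to arrive first.
    while True:
        item = chunks.get()
        if item is None:
            return
        offset, chunk = item
        dst[offset:offset + chunk.size] = chunk


src = np.ones(8 * CHUNK_SIZE, dtype=np.uint8)
dst = np.empty_like(src)
q = queue.Queue()
threads = [
    threading.Thread(target=receive_chunks, args=(src, q)),
    threading.Thread(target=copy_chunks, args=(dst, q)),
]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert np.array_equal(src, dst)
```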
20 | 21 | Usage: 22 | 23 | ```bash 24 | ./run_test.sh roundtrip 2 ${input_size_in_bytes} 25 | ``` 26 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for test_name in multicast reduce gather allreduce; do 4 | for num_nodes in 4 8 12 16; do 5 | for sz in 10 15 20 25 30; do 6 | for i in `seq 5`; do 7 | obj_size=$((2**$sz)) 8 | ./run_test.sh ${test_name} $num_nodes $obj_size 9 | done 10 | done 11 | done 12 | done 13 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/coverage_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "exit" INT 4 | 5 | for test_index in `seq 1 3`; do 6 | ./run_test.sh roundtrip 2 $[2**25] 7 | ./run_test.sh roundtrip 2 $[2**15] 8 | done 9 | 10 | num_nodes=8 11 | 12 | for test_name in multicast reduce gather allreduce allgather; do 13 | for i in 15 25; do 14 | for test_index in `seq 1 3`; do 15 | obj_size=$((2**$i)) 16 | echo $test_name-$num_nodes-$obj_size-$test_index 17 | ./run_test.sh ${test_name} $num_nodes $obj_size 18 | sleep 1 19 | done 20 | done 21 | 22 | done 23 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | import sys 6 | 7 | sys.path.insert(0, "../../test_utils") 8 | import result_parser_utils 9 | 10 | 11 | WARMUP_ROUNDS = 2 12 | 13 | 14 | def get_durations(lines): 15 | durations = [] 16 | for line in lines: 17 | if 'duration = ' in line: 18 | tmp = line.split('duration = ')[1] 19 | durations.append(float(tmp)) 20 | return durations 21 | 22 | 23 | def parse_all_ranks(folder_path, with_rank0=True): 24 | files = os.listdir(folder_path) 25 | all_rank_durations = [] 26 | for filename in files: 27 | if 'rank' in filename and (with_rank0 or 'rank_0' not in filename): 28 | try: 29 | with open(os.path.join(folder_path, filename)) as f: 30 | durations = get_durations(f.readlines()) 31 | if not durations: 32 | raise ValueError("Bad file") 33 | all_rank_durations.append(durations) 34 | except Exception: 35 | print("Bad file", folder_path, filename) 36 | return None 37 | 38 | try: 39 | return np.max(all_rank_durations, axis=0) 40 | except Exception as e: 41 | print("Error: empty directory", folder_path, e) 42 | return None 43 | 44 | 45 | def parse_file(task_name, log_dir, foldername): 46 | path = os.path.join(log_dir, foldername) 47 | 48 | if task_name in ('allreduce', 'allgather'): 49 | return parse_all_ranks(path) 50 | elif task_name == 'multicast': 51 | return parse_all_ranks(path, with_rank0=False) 52 | elif task_name in ('roundtrip', 'reduce', 'gather', 'subset_reduce'): 53 | return result_parser_utils.default_parse_file(task_name, log_dir, foldername) 54 | else: 55 | raise ValueError('Unknown task', task_name) 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser(description='Hoplite (Python) benchmark results parser.') 60 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 61 | help='The logging directory of Gloo benchmarks') 62 | parser.add_argument('--verbose', action='store_true') 63 | args = parser.parse_args() 64 | df = result_parser_utils.parse(args.log_dir, 
parse_file) 65 | if args.verbose: 66 | print(df) 67 | df.to_csv('hoplite_results.csv', index=False) 68 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/parse_roundtrip_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | import sys 6 | 7 | sys.path.insert(0, "../../test_utils") 8 | import result_parser_utils 9 | 10 | 11 | WARMUP_ROUNDS = 2 12 | 13 | 14 | def get_durations(lines): 15 | durations = [] 16 | for line in lines: 17 | if 'duration = ' in line: 18 | tmp = line.split('duration = ')[1] 19 | durations.append(float(tmp)) 20 | return durations 21 | 22 | 23 | def parse_all_ranks(folder_path, with_rank0=True): 24 | files = os.listdir(folder_path) 25 | all_rank_durations = [] 26 | for filename in files: 27 | if 'rank' in filename and (with_rank0 or 'rank_0' not in filename): 28 | try: 29 | with open(os.path.join(folder_path, filename)) as f: 30 | durations = get_durations(f.readlines()) 31 | if not durations: 32 | raise ValueError("Bad file") 33 | all_rank_durations.append(durations) 34 | except Exception: 35 | print("Bad file", folder_path, filename) 36 | return None 37 | 38 | try: 39 | return np.max(all_rank_durations, axis=0) 40 | except Exception as e: 41 | print("Error: empty directory", folder_path, e) 42 | return None 43 | 44 | 45 | def parse_file(task_name, log_dir, foldername): 46 | path = os.path.join(log_dir, foldername) 47 | 48 | if task_name in ('allreduce', 'allgather'): 49 | return parse_all_ranks(path) 50 | elif task_name == 'multicast': 51 | return parse_all_ranks(path, with_rank0=False) 52 | elif task_name in ('roundtrip', 'reduce', 'gather', 'subset_reduce'): 53 | return result_parser_utils.default_parse_file(task_name, log_dir, foldername) 54 | else: 55 | raise ValueError('Unknown task', task_name) 56 | 57 | 58 | if __name__ == "__main__": 59 | parser = argparse.ArgumentParser(description='Hoplite roundtrip results parser.') 60 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 61 | help='The logging directory of Gloo benchmarks') 62 | parser.add_argument('--verbose', action='store_true') 63 | args = parser.parse_args() 64 | df = result_parser_utils.parse(args.log_dir, parse_file) 65 | 66 | df = df[df['Benchmark Name'].str.contains('roundtrip')] 67 | sz = df['Object Size (in bytes)'].astype('int64') 68 | df = df[(sz == 2**10) | (sz == 2**20) | (sz == 2**30)] 69 | 70 | if args.verbose: 71 | print(df) 72 | 73 | rs = df[['Object Size (in bytes)', 'Average Time (s)', 'Std Time (s)', 'Repeated Times']].values 74 | with open('hoplite-roundtrip.csv', "w") as f: 75 | for r in rs: 76 | f.write(f"hoplite,{r[0]},{r[1]},{r[2]}\n") 77 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/pressure_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) number of nodes required"; exit -1; fi 3 | if [ "$#" -gt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | trap "exit" INT 6 | 7 | for i in `seq 1000`; do 8 | ./run_test.sh multicast $1 $[2**10] 9 | ./run_test.sh reduce $1 $[2**10] 10 | ./run_test.sh allreduce $1 $[2**10] 11 | ./run_test.sh gather $1 $[2**10] 12 | 13 | ./run_test.sh multicast $1 $[2**17] 14 | ./run_test.sh reduce $1 $[2**17] 15 | ./run_test.sh 
allreduce $1 $[2**17] 16 | ./run_test.sh gather $1 $[2**17] 17 | done 18 | 19 | for i in `seq 5`; do 20 | ./run_test.sh multicast $1 $[2**17] 21 | ./run_test.sh reduce $1 $[2**17] 22 | ./run_test.sh allreduce $1 $[2**17] 23 | ./run_test.sh gather $1 $[2**17] 24 | done 25 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 3 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) test name, number of nodes, input size required"; exit -1; fi 3 | if [ "$#" -gt 3 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | ## setup 6 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 7 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 8 | TEST_UNILS_DIR=$(realpath -s $SCRIPT_DIR/../../test_utils) 9 | BINARIES_DIR=$(realpath -s $SCRIPT_DIR/../../build) 10 | 11 | ## cleanup procs 12 | sudo fuser -k 6666/tcp -s &> /dev/null 13 | sudo fuser -k 50055/tcp -s &> /dev/null 14 | sudo fuser -k 20210/tcp -s &> /dev/null 15 | 16 | test_name=$1 17 | world_size=$2 18 | object_size=$3 19 | 20 | # get cluster info 21 | source $TEST_UNILS_DIR/load_cluster_env.sh 22 | OTHERS_IPADDR=(${OTHERS_IPADDR[@]:0:$(($world_size-1))}) 23 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) head_node: $MY_IPADDR; other_nodes: ${OTHERS_IPADDR[@]}" 24 | 25 | # prompt test info 26 | echo "$(tput setaf 2)[INFO]$(tput sgr 0) Running test $(tput setaf 3)$(tput bold)$test_name$(tput sgr 0)" 27 | 28 | # create logging dir 29 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 30 | mkdir -p $log_dir 31 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 32 | 33 | export RAY_BACKEND_LOG_LEVEL=info 34 | 35 | # pkill notification 36 | # sleep 0.5 37 | # ($BINARIES_DIR/notification 2>&1 | tee $log_dir/$MY_IPADDR.notification.log) & 38 | # sleep 0.5 39 | 40 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 41 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 42 | 43 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 44 | -x HOPLITE_LOGGING_DIR=$log_dir \ 45 | -x RAY_BACKEND_LOG_LEVEL=$RAY_BACKEND_LOG_LEVEL \ 46 | test_wrapper.sh $test_name -s $object_size 47 | 48 | sleep 1 49 | -------------------------------------------------------------------------------- /microbenchmarks/hoplite-python/test_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | logging_file=$HOPLITE_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 3 | source ~/anaconda3/etc/profile.d/conda.sh 4 | conda activate 5 | python hoplite_microbenchmarks.py $@ 2>&1 | tee $logging_file 6 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/.gitignore: -------------------------------------------------------------------------------- 1 | reduce 2 | allgather 3 | allreduce 4 | gather 5 | multicast 6 | send_recv 7 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/Makefile: -------------------------------------------------------------------------------- 1 | EXECS=multicast reduce allreduce gather allgather roundtrip 2 | MPICC?=mpicc 3 | 4 | all: ${EXECS} 5 | 6 | multicast: multicast.c 7 | ${MPICC} -O2 -o multicast multicast.c 8 | 9 | reduce: reduce.c 10 | ${MPICC} -O2 -o reduce reduce.c 11 | 12 | allreduce: allreduce.c 13 | ${MPICC} -O2 -o allreduce allreduce.c 14 | 15 | gather: 
gather.c 16 | ${MPICC} -O2 -o gather gather.c 17 | 18 | allgather: allgather.c 19 | ${MPICC} -O2 -o allgather allgather.c 20 | 21 | roundtrip: roundtrip.c 22 | ${MPICC} -O2 -o roundtrip roundtrip.c 23 | 24 | clean: 25 | rm -f ${EXECS} 26 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/README.md: -------------------------------------------------------------------------------- 1 | ## MPI collective communication benchmarks (baseline) 2 | 3 | Usage: 4 | 5 | ```bash 6 | ./mpi_${microbenchmark_name}.sh ${total_number_of_nodes} ${input_size_in_bytes} 7 | ``` 8 | 9 | `${microbenchmark_name}` includes 5 most common collective communication operations: `multicast`, `reduce`, `gather`, `allreduce`, `allgather`. 10 | 11 | ### Roundtrip test 12 | 13 | Usage: 14 | 15 | ```bash 16 | ./mpi_sendrecv.sh ${input_size_in_bytes} 17 | ``` 18 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/allgather.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2013 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Program that computes the average of an array of elements in parallel using 8 | // MPI_Reduce. 9 | // 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Creates an array of random numbers. Each number has a value from 0 - 1 17 | float *create_rand_nums(int num_elements) { 18 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 19 | assert(rand_nums != NULL); 20 | int i; 21 | for (i = 0; i < num_elements; i++) { 22 | rand_nums[i] = (rand() / (float)RAND_MAX); 23 | } 24 | return rand_nums; 25 | } 26 | 27 | int main(int argc, char **argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "Usage: ./allgather num_elements\n"); 30 | exit(1); 31 | } 32 | 33 | int num_elements_per_proc = atoi(argv[1]); 34 | double time = 0; 35 | 36 | MPI_Init(NULL, NULL); 37 | 38 | int world_rank; 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | int world_size; 41 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 42 | 43 | // Create a random array of elements on all processes. 
44 | srand(world_rank); // Seed the random number generator to get different 45 | // results each time for each processor 46 | float *rand_nums = NULL; 47 | rand_nums = create_rand_nums(num_elements_per_proc); 48 | float *global_nums = (float *)malloc(sizeof(float) * num_elements_per_proc * world_size); 49 | 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | // Reduce all of the local sums into the global sum 52 | time -= MPI_Wtime(); 53 | MPI_Allgather(rand_nums, num_elements_per_proc, MPI_FLOAT, global_nums, num_elements_per_proc, MPI_FLOAT, 54 | MPI_COMM_WORLD); 55 | time += MPI_Wtime(); 56 | 57 | // Print the result 58 | if (world_rank == 0) { 59 | printf("MPI_Allgather duration = %lf\n", time); 60 | } 61 | 62 | // Clean up 63 | free(rand_nums); 64 | 65 | MPI_Barrier(MPI_COMM_WORLD); 66 | MPI_Finalize(); 67 | } 68 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/allreduce.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2013 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Program that computes the average of an array of elements in parallel using 8 | // MPI_Reduce. 9 | // 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Creates an array of random numbers. Each number has a value from 0 - 1 17 | float *create_rand_nums(int num_elements) { 18 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 19 | assert(rand_nums != NULL); 20 | int i; 21 | for (i = 0; i < num_elements; i++) { 22 | rand_nums[i] = (rand() / (float)RAND_MAX); 23 | } 24 | return rand_nums; 25 | } 26 | 27 | int main(int argc, char **argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "Usage: ./allreduce num_elements\n"); 30 | exit(1); 31 | } 32 | 33 | int num_elements_per_proc = atoi(argv[1]); 34 | double time = 0; 35 | 36 | MPI_Init(NULL, NULL); 37 | 38 | int world_rank; 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | int world_size; 41 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 42 | 43 | // Create a random array of elements on all processes. 
44 | srand(world_rank); // Seed the random number generator to get different 45 | // results each time for each processor 46 | float *rand_nums = NULL; 47 | rand_nums = create_rand_nums(num_elements_per_proc); 48 | float *global_nums = (float *)malloc(sizeof(float) * num_elements_per_proc); 49 | 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | // Reduce all of the local sums into the global sum 52 | time -= MPI_Wtime(); 53 | MPI_Allreduce(rand_nums, global_nums, num_elements_per_proc, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); 54 | time += MPI_Wtime(); 55 | 56 | // Print the result 57 | if (world_rank == 0) { 58 | printf("MPI_Allreduce duration = %lf\n", time); 59 | } 60 | 61 | // Clean up 62 | free(rand_nums); 63 | 64 | MPI_Barrier(MPI_COMM_WORLD); 65 | MPI_Finalize(); 66 | } 67 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for test_name in multicast reduce gather allreduce; do 4 | for num_nodes in 4 8 12 16; do 5 | for sz in 10 15 20 25 30; do 6 | for i in `seq 5`; do 7 | obj_size=$((2**$sz)) 8 | ./run_test.sh ${test_name} $num_nodes $obj_size 9 | done 10 | done 11 | done 12 | done 13 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/coverage_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for test_index in `seq 1 3`; do 4 | ./run_test.sh roundtrip 2 $[2**25] 5 | ./run_test.sh roundtrip 2 $[2**15] 6 | done 7 | 8 | num_nodes=8 9 | 10 | for test_name in multicast reduce gather allreduce allgather; do 11 | for i in 15 25; do 12 | for test_index in `seq 1 3`; do 13 | obj_size=$((2**$i)) 14 | echo $test_name-$num_nodes-$obj_size-$test_index 15 | ./run_test.sh ${test_name} $num_nodes $obj_size 16 | sleep 1 17 | done 18 | done 19 | 20 | done 21 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/gather.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2013 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Program that computes the average of an array of elements in parallel using 8 | // MPI_Reduce. 9 | // 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Creates an array of random numbers. Each number has a value from 0 - 1 17 | float *create_rand_nums(int num_elements) { 18 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 19 | assert(rand_nums != NULL); 20 | int i; 21 | for (i = 0; i < num_elements; i++) { 22 | rand_nums[i] = (rand() / (float)RAND_MAX); 23 | } 24 | return rand_nums; 25 | } 26 | 27 | int main(int argc, char **argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "Usage: ./gather num_elements\n"); 30 | exit(1); 31 | } 32 | 33 | int num_elements_per_proc = atoi(argv[1]); 34 | double time = 0; 35 | 36 | MPI_Init(NULL, NULL); 37 | 38 | int world_rank; 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | int world_size; 41 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 42 | 43 | // Create a random array of elements on all processes. 
44 | srand(world_rank); // Seed the random number generator to get different 45 | // results each time for each processor 46 | float *rand_nums = NULL; 47 | rand_nums = create_rand_nums(num_elements_per_proc); 48 | float *global_nums = NULL; 49 | 50 | if (world_rank == 0) { 51 | global_nums = (float *)malloc(sizeof(float) * num_elements_per_proc * world_size); 52 | } 53 | 54 | MPI_Barrier(MPI_COMM_WORLD); 55 | // Reduce all of the local sums into the global sum 56 | time -= MPI_Wtime(); 57 | MPI_Gather(rand_nums, num_elements_per_proc, MPI_FLOAT, global_nums, num_elements_per_proc, MPI_FLOAT, 0, 58 | MPI_COMM_WORLD); 59 | time += MPI_Wtime(); 60 | 61 | // Print the result 62 | if (world_rank == 0) { 63 | printf("MPI_Gather duration = %lf\n", time); 64 | } 65 | 66 | // Clean up 67 | free(rand_nums); 68 | 69 | MPI_Barrier(MPI_COMM_WORLD); 70 | MPI_Finalize(); 71 | } 72 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/multicast.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2011 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Comparison of MPI_Bcast with the my_bcast function 8 | // 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | int main(int argc, char **argv) { 15 | if (argc != 2) { 16 | fprintf(stderr, "Usage: ./multicast num_elements\n"); 17 | exit(1); 18 | } 19 | 20 | int num_elements = atoi(argv[1]); 21 | int num_trials = 1; 22 | 23 | MPI_Init(NULL, NULL); 24 | 25 | int world_rank; 26 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 27 | 28 | double total_mpi_bcast_time = 0.0; 29 | int *data = (int *)malloc(sizeof(int) * num_elements); 30 | assert(data != NULL); 31 | 32 | // Time MPI_Bcast 33 | MPI_Barrier(MPI_COMM_WORLD); 34 | total_mpi_bcast_time -= MPI_Wtime(); 35 | MPI_Bcast(data, num_elements, MPI_INT, 0, MPI_COMM_WORLD); 36 | MPI_Barrier(MPI_COMM_WORLD); 37 | total_mpi_bcast_time += MPI_Wtime(); 38 | 39 | // Print off timing information 40 | if (world_rank == 0) { 41 | printf("MPI_Bcast duration = %lf\n", total_mpi_bcast_time); 42 | } 43 | 44 | free(data); 45 | MPI_Finalize(); 46 | } 47 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/parse_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | sys.path.insert(0, "../../test_utils") 5 | import result_parser_utils 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser(description='MPI (C++) benchmark results parser.') 10 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 11 | help='The logging directory of Gloo benchmarks') 12 | parser.add_argument('--verbose', action='store_true') 13 | args = parser.parse_args() 14 | df = result_parser_utils.parse(args.log_dir, result_parser_utils.default_parse_file) 15 | if args.verbose: 16 | print(df) 17 | df.to_csv('mpi_results.csv', index=False) 18 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/parse_roundtrip_result.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | sys.path.insert(0, 
"../../test_utils") 5 | import result_parser_utils 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser(description='MPI roundtrip benchmark results parser.') 10 | parser.add_argument('log_dir', metavar='PATH', nargs='?', type=str, default='log', 11 | help='The logging directory of Gloo benchmarks') 12 | parser.add_argument('--verbose', action='store_true') 13 | args = parser.parse_args() 14 | df = result_parser_utils.parse(args.log_dir, result_parser_utils.default_parse_file) 15 | 16 | df = df[df['Benchmark Name'].str.contains('roundtrip')] 17 | sz = df['Object Size (in bytes)'].astype('int64') 18 | df = df[(sz == 2**10) | (sz == 2**20) | (sz == 2**30)] 19 | 20 | if args.verbose: 21 | print(df) 22 | 23 | rs = df[['Object Size (in bytes)', 'Average Time (s)', 'Std Time (s)', 'Repeated Times']].values 24 | with open('mpi-roundtrip.csv', "w") as f: 25 | for r in rs: 26 | f.write(f"mpi,{r[0]},{r[1]},{r[2]}\n") 27 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/reduce.c: -------------------------------------------------------------------------------- 1 | // Author: Wes Kendall 2 | // Copyright 2013 www.mpitutorial.com 3 | // This code is provided freely with the tutorials on mpitutorial.com. Feel 4 | // free to modify it for your own use. Any distribution of the code must 5 | // either provide a link to www.mpitutorial.com or keep this header intact. 6 | // 7 | // Program that computes the average of an array of elements in parallel using 8 | // MPI_Reduce. 9 | // 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Creates an array of random numbers. Each number has a value from 0 - 1 17 | float *create_rand_nums(int num_elements) { 18 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 19 | assert(rand_nums != NULL); 20 | int i; 21 | for (i = 0; i < num_elements; i++) { 22 | rand_nums[i] = (rand() / (float)RAND_MAX); 23 | } 24 | return rand_nums; 25 | } 26 | 27 | int main(int argc, char **argv) { 28 | if (argc != 2) { 29 | fprintf(stderr, "Usage: ./reduce num_elements\n"); 30 | exit(1); 31 | } 32 | 33 | int num_elements_per_proc = atoi(argv[1]); 34 | double time = 0; 35 | 36 | MPI_Init(NULL, NULL); 37 | 38 | int world_rank; 39 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 40 | int world_size; 41 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 42 | 43 | // Create a random array of elements on all processes. 
44 | srand(world_rank); // Seed the random number generator to get different 45 | // results each time for each processor 46 | float *rand_nums = NULL; 47 | rand_nums = create_rand_nums(num_elements_per_proc); 48 | float *global_nums = (float *)malloc(sizeof(float) * num_elements_per_proc); 49 | 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | // Reduce all of the local sums into the global sum 52 | time -= MPI_Wtime(); 53 | MPI_Reduce(rand_nums, global_nums, num_elements_per_proc, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD); 54 | time += MPI_Wtime(); 55 | 56 | // Print the result 57 | if (world_rank == 0) { 58 | printf("MPI_Reduce duration = %lf\n", time); 59 | } 60 | 61 | // Clean up 62 | free(rand_nums); 63 | 64 | MPI_Barrier(MPI_COMM_WORLD); 65 | MPI_Finalize(); 66 | } 67 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/roundtrip.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | float *create_rand_nums(int num_elements) { 7 | float *rand_nums = (float *)malloc(sizeof(float) * num_elements); 8 | assert(rand_nums != NULL); 9 | int i; 10 | for (i = 0; i < num_elements; i++) { 11 | rand_nums[i] = (rand() / (float)RAND_MAX); 12 | } 13 | return rand_nums; 14 | } 15 | 16 | int main(int argc, char **argv) { 17 | // Initialize the MPI environment 18 | MPI_Init(NULL, NULL); 19 | // Find out rank, size 20 | int world_rank; 21 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 22 | int world_size; 23 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 24 | 25 | // We are assuming at least 2 processes for this task 26 | if (world_size < 2) { 27 | fprintf(stderr, "World size must be greater than 1 for %s\n", argv[0]); 28 | MPI_Abort(MPI_COMM_WORLD, 1); 29 | } 30 | 31 | if (argc != 2) { 32 | fprintf(stderr, "Usage: ./roundtrip num_elements\n"); 33 | exit(1); 34 | } 35 | 36 | int num_elements = atoi(argv[1]); 37 | double time = 0; 38 | float *numbers = create_rand_nums(num_elements); 39 | MPI_Barrier(MPI_COMM_WORLD); 40 | if (world_rank == 0) { 41 | time -= MPI_Wtime(); 42 | MPI_Send( 43 | /* data = */ numbers, 44 | /* count = */ num_elements, 45 | /* datatype = */ MPI_FLOAT, 46 | /* destination = */ 1, 47 | /* tag = */ 0, 48 | /* communicator = */ MPI_COMM_WORLD); 49 | 50 | MPI_Recv( 51 | /* data = */ numbers, 52 | /* count = */ num_elements, 53 | /* datatype = */ MPI_FLOAT, 54 | /* source = */ 1, 55 | /* tag = */ 0, 56 | /* communicator = */ MPI_COMM_WORLD, 57 | /* status = */ MPI_STATUS_IGNORE); 58 | time += MPI_Wtime(); 59 | printf("MPI_Recv (roundtrip) duration = %lf\n", time); 60 | 61 | } else if (world_rank == 1) { 62 | MPI_Recv( 63 | /* data = */ numbers, 64 | /* count = */ num_elements, 65 | /* datatype = */ MPI_FLOAT, 66 | /* source = */ 0, 67 | /* tag = */ 0, 68 | /* communicator = */ MPI_COMM_WORLD, 69 | /* status = */ MPI_STATUS_IGNORE); 70 | 71 | MPI_Send( 72 | /* data = */ numbers, 73 | /* count = */ num_elements, 74 | /* datatype = */ MPI_FLOAT, 75 | /* destination = */ 0, 76 | /* tag = */ 0, 77 | /* communicator = */ MPI_COMM_WORLD); 78 | } 79 | MPI_Finalize(); 80 | } 81 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$3" ]; then echo "ERROR: test name, node number and input size required"; exit; fi 4 | 5 | SCRIPT_DIR=$(dirname $(realpath -s $0)) 6 | TEST_UNILS_DIR=$(realpath 
-s $SCRIPT_DIR/../../test_utils) 7 | source $TEST_UNILS_DIR/load_cluster_env.sh 8 | 9 | test_name=$1 # can be allgather/allreduce/gather/multicast/reduce 10 | make $test_name > /dev/null 11 | 12 | test_executable=$test_name 13 | test_executable_abspath=$(realpath -s $test_executable) 14 | world_size=$2 15 | object_size=$3 16 | 17 | # create logging dir 18 | log_dir=$SCRIPT_DIR/log/$(date +"%Y%m%d-%H%M%S.%N")-$test_name-$world_size-$object_size 19 | mkdir -p $log_dir 20 | ln -sfn $log_dir/ $SCRIPT_DIR/log/latest 21 | 22 | all_nodes=(${ALL_IPADDR[@]:0:$world_size}) 23 | all_hosts=$(echo ${all_nodes[@]} | sed 's/ /,/g') 24 | 25 | echo Number of nodes: $world_size "(actually ${#all_nodes[@]})", data size: $object_size 26 | echo Nodes: ${all_nodes[@]} "("${#all_nodes[@]}")" 27 | 28 | $TEST_UNILS_DIR/mpirun_pernode.sh $all_hosts \ 29 | -x MPI_LOGGING_DIR="$log_dir" \ 30 | test_wrapper.sh $test_executable_abspath $[$object_size/4] 31 | -------------------------------------------------------------------------------- /microbenchmarks/mpi-cpp/test_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | logging_file=$MPI_LOGGING_DIR/rank_$OMPI_COMM_WORLD_RANK.log 3 | $@ 2>&1 | tee $logging_file 4 | -------------------------------------------------------------------------------- /microbenchmarks/plot_rtt.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | 4 | COLUMNS = ["Method", "Object Size (in bytes)", "Average Time (s)" ,"Std Time (s)"] 5 | LABELS = ['Optimal', 'Hoplite', 'OpenMPI', 'Ray', 'Dask'] 6 | COLORS = ( 7 | plt.get_cmap('tab20c')(4 * 4 + 2), 8 | plt.get_cmap('tab20c')(0 * 4 + 1), 9 | plt.get_cmap('tab20c')(1 * 4 + 2), 10 | plt.get_cmap('tab20c')(2 * 4 + 2), 11 | plt.get_cmap('tab20')(3 * 2 + 1), 12 | ) 13 | 14 | 15 | def draw_rtt_1K(results): 16 | SIZE = 1024 17 | results = results[results[COLUMNS[1]] == SIZE] 18 | Latency, STD = results[COLUMNS[2]], results[COLUMNS[3]] 19 | 20 | plt.figure(figsize=(4, 4)) 21 | ind = range(5) 22 | width = 0.8 23 | plt.bar(ind, Latency * 1000, width, label='usr', color=COLORS, linewidth=10) 24 | plt.errorbar(ind[1:], Latency[1:] * 1000, yerr=STD[1:] * 1000, linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 25 | plt.xticks(ind, LABELS, fontsize=18) 26 | for label in plt.gca().get_xmajorticklabels(): 27 | label.set_rotation(30) 28 | label.set_horizontalalignment("right") 29 | plt.yticks(fontsize=18) 30 | plt.ylabel('RTT (ms)', fontsize=18) 31 | plt.annotate("1.7 μs", (-0.55, 0.08), fontsize=15) 32 | plt.savefig('RTT1K.pdf', bbox_inches="tight") 33 | 34 | 35 | def draw_rtt_1M(results): 36 | SIZE = 2 ** 20 37 | results = results[results[COLUMNS[1]] == SIZE] 38 | Latency, STD = results[COLUMNS[2]], results[COLUMNS[3]] 39 | 40 | plt.figure(figsize=(4, 4)) 41 | ind = range(5) 42 | width = 0.8 43 | plt.bar(ind, Latency * 1000, width, label='usr', color=COLORS, linewidth=10) 44 | plt.errorbar(ind[1:], Latency[1:] * 1000, yerr=STD[1:] * 1000, linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 45 | plt.xticks(ind, LABELS, fontsize=18) 46 | for label in plt.gca().get_xmajorticklabels(): 47 | label.set_rotation(30) 48 | label.set_horizontalalignment("right") 49 | plt.yticks(fontsize=18) 50 | plt.ylabel('RTT (ms)', fontsize=18) 51 | plt.savefig('RTT1M.pdf', bbox_inches="tight") 52 | 53 | 54 | def draw_rtt_1G(results): 55 | SIZE = 2 ** 30 56 | results = 
results[results[COLUMNS[1]] == SIZE] 57 | Latency, STD = results[COLUMNS[2]], results[COLUMNS[3]] 58 | 59 | plt.figure(figsize=(4, 4)) 60 | ind = range(5) 61 | width = 0.8 62 | plt.bar(ind, Latency, width, label='usr', color=COLORS, linewidth=10) 63 | plt.errorbar(ind[1:], Latency[1:], yerr=STD[1:], linewidth=0, elinewidth=1.5, color='#444444', capthick=1.5, capsize=6) 64 | plt.xticks(ind, LABELS, fontsize=18) 65 | for label in plt.gca().get_xmajorticklabels(): 66 | label.set_rotation(30) 67 | label.set_horizontalalignment("right") 68 | plt.yticks(fontsize=18) 69 | plt.ylabel('RTT (s)', fontsize=18) 70 | plt.savefig('RTT1G.pdf', bbox_inches="tight") 71 | 72 | 73 | if __name__ == '__main__': 74 | results = pd.read_csv('roundtrip-results.csv') 75 | results.loc[len(results.index)] = ['optimal', 1024, 0.000031690911909, 0.0] 76 | results.loc[len(results.index)] = ['optimal', 1048576, 0.001731493794326, 0.0] 77 | results.loc[len(results.index)] = ['optimal', 1073741824, 1.773049645390071, 0.0] 78 | 79 | cat_method_order = pd.CategoricalDtype( 80 | ['optimal', 'hoplite', 'mpi', 'ray', 'dask'], 81 | ordered=True 82 | ) 83 | results['Method'] = results['Method'].astype(cat_method_order) 84 | results = results.sort_values('Method') 85 | draw_rtt_1K(results) 86 | draw_rtt_1M(results) 87 | draw_rtt_1G(results) 88 | -------------------------------------------------------------------------------- /microbenchmarks/ray-python/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python -m pip install grpcio-tools 3 | python -m grpc_tools.protoc -I../../src/protocol/ --python_out=. --grpc_python_out=. object_store.proto 4 | 5 | clean: 6 | rm object_store_pb2*.py 7 | -------------------------------------------------------------------------------- /microbenchmarks/ray-python/README.md: -------------------------------------------------------------------------------- 1 | ## Ray collective communication benchmarks (baseline) 2 | 3 | Run `make` to compile necessary files. 4 | 5 | Usage: 6 | 7 | See python `run_tests.py -h`. 
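For example (flags as defined in `run_tests.py`; the values here are illustrative): `python run_tests.py allreduce -n 8 -s 1048576` runs the allreduce benchmark on 8 nodes with 1 MB objects, and `python run_tests.py auto` sweeps multicast, gather, reduce, and allreduce over the preset node counts and object sizes, writing the results to `ray-microbenchmark.csv`.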
8 | 9 | ### Auto test 10 | 11 | ./auto_test.sh 12 | -------------------------------------------------------------------------------- /microbenchmarks/ray-python/auto_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run_tests.py auto 4 | -------------------------------------------------------------------------------- /microbenchmarks/ray-python/ray_roundtrip.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import ray 4 | 5 | 6 | @ray.remote(resources={'machine': 1}) 7 | class RayBenchmarkWorker: 8 | def __init__(self, object_size): 9 | self.object_size = object_size 10 | self.payload = np.ones(object_size//4, dtype=np.float32) 11 | 12 | def poll(self): 13 | pass 14 | 15 | def send(self): 16 | return ray.put(self.payload) 17 | 18 | def recv(self, x): 19 | return ray.get(x) 20 | 21 | def recv2(self, x): 22 | return None 23 | 24 | 25 | def ray_roundtrip(object_size): 26 | sender = RayBenchmarkWorker.remote(object_size) 27 | receiver = RayBenchmarkWorker.remote(object_size) 28 | ray.get([sender.poll.remote(), receiver.poll.remote()]) 29 | start = time.time() 30 | ray.get(sender.recv2.remote(receiver.recv.remote(sender.send.remote()))) 31 | return time.time() - start 32 | 33 | 34 | REPEAT_TIMES = 5 35 | 36 | def test_with_mean_std(object_size, repeat_times=REPEAT_TIMES): 37 | results = [] 38 | for _ in range(repeat_times): 39 | duration = ray_roundtrip(object_size) 40 | results.append(duration) 41 | return np.mean(results), np.std(results) 42 | 43 | 44 | if __name__ == "__main__": 45 | ray.init(address='auto') 46 | with open("ray-roundtrip.csv", "w") as f: 47 | for object_size in (2 ** 10, 2 ** 20, 2 ** 30): 48 | mean, std = test_with_mean_std(object_size) 49 | print(f"roundtrip: {object_size} {mean:.6f} ± {std:.6f}s") 50 | f.write(f"ray,{object_size},{mean},{std}\n") -------------------------------------------------------------------------------- /microbenchmarks/ray-python/run_tests.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import ray 4 | 5 | import hoplite 6 | import ray_microbenchmarks 7 | 8 | NUM_NODES = (4, 8, 12, 16) 9 | OBJECT_SIZES = (2 ** 10, 2 ** 15, 2 ** 20, 2 ** 25, 2 ** 30) 10 | REPEAT_TIMES = 5 11 | 12 | microbenchmark_names = ['multicast', 'reduce', 'allreduce', 'gather', 'allgather', 'auto'] 13 | parser = argparse.ArgumentParser(description='Ray microbenchmarks') 14 | parser.add_argument('test_name', type=str, choices=microbenchmark_names, help='Microbenchmark name') 15 | parser.add_argument('-n', '--world-size', type=int, required=False, 16 | help='Size of the collective processing group') 17 | parser.add_argument('-s', '--object-size', type=int, required=False, 18 | help='The size of the object') 19 | args = parser.parse_args() 20 | 21 | 22 | def test_with_mean_std(test_name, notification_address, world_size, object_size, 23 | repeat_times=REPEAT_TIMES): 24 | results = [] 25 | for _ in range(repeat_times): 26 | test_case = ray_microbenchmarks.__dict__[test_name] 27 | duration = test_case(notification_address, world_size, object_size) 28 | results.append(duration) 29 | return np.mean(results), np.std(results) 30 | 31 | 32 | if __name__ == "__main__": 33 | notification_address = hoplite.start_location_server() 34 | 35 | ray.init(address='auto') 36 | test_name = 'ray_' + args.test_name 37 | assert test_name in ray_microbenchmarks.__dict__ or 
args.test_name == 'auto' 38 | if args.test_name != 'auto': 39 | assert args.world_size is not None and args.object_size is not None 40 | mean, std = test_with_mean_std(test_name, notification_address, args.world_size, args.object_size, 5) 41 | print(f"{args.test_name},{args.world_size},{args.object_size},{mean},{std}") 42 | else: 43 | assert args.world_size is None and args.object_size is None 44 | with open("ray-microbenchmark.csv", "w") as f: 45 | for algorithm in ('ray_multicast', 'ray_gather', 'ray_reduce', 'ray_allreduce'): 46 | for world_size in NUM_NODES: 47 | for object_size in OBJECT_SIZES: 48 | mean, std = test_with_mean_std(algorithm, notification_address, world_size, object_size) 49 | print(f"{algorithm}, {world_size}, {object_size}, {mean}, {std}") 50 | f.write(f"{algorithm},{world_size},{object_size},{mean},{std}\n") 51 | -------------------------------------------------------------------------------- /python/hoplite/__init__.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import pathlib 3 | import subprocess 4 | import time 5 | 6 | from . import _hoplite_client as _hoplite_store 7 | 8 | HopliteClient = _hoplite_store.DistributedObjectStore 9 | Buffer = _hoplite_store.Buffer 10 | ObjectID = _hoplite_store.ObjectID 11 | ReduceOp = _hoplite_store.ReduceOp 12 | 13 | 14 | def get_my_address(): 15 | import socket 16 | return socket.gethostbyname(socket.gethostname()) 17 | 18 | 19 | def add_arguments(parser): 20 | parser.add_argument('--redis-address', type=str, default=get_my_address(), 21 | help='The IP address of the redis server') 22 | parser.add_argument('--redis-port', type=int, default=6380, 23 | help='The port of the redis server') 24 | parser.add_argument('--notification-port', type=int, default=7777, 25 | help='The port of the notification server') 26 | parser.add_argument('--notification-listening-port', type=int, default=8888, 27 | help='The listening port of the notification client') 28 | parser.add_argument('--plasma-socket', type=str, default="/tmp/multicast_plasma", 29 | help='The path of the unix domain socket') 30 | parser.add_argument('--object_writer_port', type=int, default=6666, 31 | help='The path of the unix domain socket') 32 | parser.add_argument('--grpc-port', type=int, default=50055, 33 | help='The path of the unix domain socket') 34 | 35 | 36 | def extract_dict_from_args(args): 37 | return {'redis_address': args.redis_address.encode()} 38 | 39 | 40 | def create_store_using_dict(args_dict): 41 | store = _hoplite_store.DistributedObjectStore(args_dict['redis_address']) 42 | return store 43 | 44 | 45 | def object_id_from_int(n): 46 | return _hoplite_store.ObjectID(int(str(n), 16).to_bytes(20, byteorder='big')) 47 | 48 | 49 | def random_object_id(): 50 | import random 51 | return object_id_from_int(random.randint(0, 1e20-1)) 52 | 53 | 54 | def _register_cleanup(processes): 55 | def _cleanup_processes(): 56 | print("Cleaning up process...") 57 | # wait clients to exit to suppress error messages 58 | time.sleep(0.5) 59 | for p in processes: 60 | p.terminate() 61 | atexit.register(_cleanup_processes) 62 | 63 | 64 | def start_location_server(): 65 | server_exec = pathlib.Path(__file__).resolve().parent.absolute() / 'notification' 66 | notification_p = subprocess.Popen([str(server_exec)]) 67 | _register_cleanup([notification_p]) 68 | time.sleep(2) 69 | return get_my_address() 70 | 71 | 72 | __all__ = ('start_location_server', 'random_object_id', 'object_id_from_int', 73 | 'create_store_using_dict', 
'extract_dict_from_args', 'add_arguments', 'get_my_address', 74 | 'Buffer', 'ObjectID', 'ReduceOp') 75 | -------------------------------------------------------------------------------- /python/hoplite/_hoplite_client.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level = 3 2 | 3 | from libcpp cimport bool as c_bool 4 | from libcpp.memory cimport shared_ptr, unique_ptr 5 | from libcpp.string cimport string as c_string 6 | 7 | from libc.stdint cimport uint8_t, int32_t, uint64_t, int64_t, uint32_t 8 | from libcpp.unordered_map cimport unordered_map 9 | from libcpp.unordered_set cimport unordered_set 10 | 11 | from libcpp.vector cimport vector as c_vector 12 | 13 | cdef extern from "util/logging.h" namespace "ray" nogil: 14 | cdef cppclass CRayLogLevel "hoplite::RayLogLevel": 15 | pass 16 | 17 | cdef cppclass CRayLog "hoplite::RayLog": 18 | @staticmethod 19 | void StartRayLog(const c_string &my_address, CRayLogLevel log_level) 20 | 21 | 22 | cdef extern from "util/logging.h" namespace "hoplite::RayLogLevel" nogil: 23 | cdef CRayLogLevel CRayLogDEBUG "hoplite::RayLogLevel::DEBUG" 24 | cdef CRayLogLevel CRayLogINFO "hoplite::RayLogLevel::INFO" 25 | cdef CRayLogLevel CRayLogWARNING "hoplite::RayLogLevel::WARNING" 26 | cdef CRayLogLevel CRayLogERROR "hoplite::RayLogLevel::ERROR" 27 | cdef CRayLogLevel CRayLogFATAL "hoplite::RayLogLevel::FATAL" 28 | 29 | 30 | cdef extern from "common/id.h" namespace "" nogil: 31 | cdef cppclass CObjectID "ObjectID": 32 | @staticmethod 33 | CObjectID FromBinary(const c_string& binary) 34 | @staticmethod 35 | CObjectID FromHex(const c_string& binary) 36 | c_string Binary() const 37 | 38 | 39 | cdef extern from "common/buffer.h" namespace "" nogil: 40 | cdef cppclass CBuffer "Buffer": 41 | CBuffer(int64_t size) 42 | CBuffer(uint8_t* data, int64_t size) 43 | const uint8_t* Data() 44 | uint8_t* MutableData() 45 | int64_t Size() 46 | uint64_t Hash() const 47 | 48 | 49 | cdef extern from "client/distributed_object_store.h" namespace "" nogil: 50 | cdef cppclass CDistributedObjectStore "DistributedObjectStore": 51 | CDistributedObjectStore(const c_string &object_directory_address) 52 | 53 | void Put(const shared_ptr[CBuffer] &buffer, const CObjectID &object_id) 54 | 55 | CObjectID Put(const shared_ptr[CBuffer] &buffer) 56 | 57 | void Reduce(const c_vector[CObjectID] &object_ids, 58 | CObjectID *created_reduction_id) 59 | 60 | void Reduce(const c_vector[CObjectID] &object_ids, 61 | const CObjectID &reduction_id) 62 | 63 | void Reduce(const c_vector[CObjectID] &object_ids, 64 | CObjectID *created_reduction_id, 65 | ssize_t num_reduce_objects) 66 | 67 | void Reduce(const c_vector[CObjectID] &object_ids, 68 | const CObjectID &reduction_id, 69 | ssize_t num_reduce_objects) 70 | 71 | unordered_set[CObjectID] GetReducedObjects(const CObjectID &reduction_id) 72 | 73 | void Get(const CObjectID &object_id, 74 | shared_ptr[CBuffer] *result) 75 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from setuptools import Extension, setup 4 | from Cython.Build import cythonize 5 | 6 | project_dir = pathlib.Path(__file__).parent.absolute().parent 7 | src_dir = project_dir / 'src' 8 | lib_dir = project_dir / 'build' 9 | 10 | ext_modules = [ 11 | Extension( 12 | "hoplite._hoplite_client", 13 | sources=["hoplite/_hoplite_client.pyx"], 14 | include_dirs=[str(src_dir), 
str(lib_dir)], # lib_dir contains "object_store.grpc.pb.h" 15 | library_dirs=[str(lib_dir)], 16 | libraries=["hoplite_client_lib"], 17 | # this is necessary for the dynamic linking of Linux to 18 | # be working in a distributed environment 19 | extra_link_args=['-Wl,-rpath=' + str(lib_dir)], 20 | ) 21 | ] 22 | 23 | setup(name='hoplite', 24 | zip_safe=False, 25 | packages=['hoplite'], 26 | ext_modules=cythonize(ext_modules)) 27 | -------------------------------------------------------------------------------- /python/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # make hoplite importable 4 | script_dir=$(dirname "${BASH_SOURCE[0]}") 5 | site_packages=$(python -c 'import site; print(site.getsitepackages()[0])') 6 | echo $(realpath $script_dir) > $site_packages/easy-install.pth 7 | echo $(realpath $script_dir) > $site_packages/hoplite.egg-link 8 | -------------------------------------------------------------------------------- /src/client/distributed_object_store.h: -------------------------------------------------------------------------------- 1 | #ifndef DISTRIBUTED_OBJECT_STORE_H 2 | #define DISTRIBUTED_OBJECT_STORE_H 3 | 4 | #include 5 | #include 6 | #include 7 | // common headers 8 | #include "common/buffer.h" 9 | #include "common/id.h" 10 | // components headers 11 | #include "global_control_store.h" 12 | #include "local_store_client.h" 13 | #include "notification_listener.h" 14 | #include "object_sender.h" 15 | #include "object_store_state.h" 16 | #include "receiver.h" 17 | 18 | class DistributedObjectStore { 19 | public: 20 | explicit DistributedObjectStore(const std::string &object_directory_address); 21 | 22 | ~DistributedObjectStore(); 23 | 24 | void Put(const std::shared_ptr &buffer, const ObjectID &object_id); 25 | 26 | ObjectID Put(const std::shared_ptr &buffer); 27 | 28 | void Reduce(const std::vector &object_ids, ObjectID *created_reduction_id, ssize_t num_reduce_objects = -1); 29 | 30 | void Reduce(const std::vector &object_ids, const ObjectID &reduction_id, ssize_t num_reduce_objects = -1); 31 | 32 | void Get(const ObjectID &object_id, std::shared_ptr *result); 33 | 34 | bool IsLocalObject(const ObjectID &object_id, int64_t *size); 35 | 36 | std::unordered_set GetReducedObjects(const ObjectID &reduction_id); 37 | 38 | private: 39 | template void reduce_local_objects(const std::vector &object_ids, Buffer *output) { 40 | DCHECK(output->Size() % sizeof(T) == 0) << "Buffer size cannot be divide whole by the element size"; 41 | auto num_elements = output->Size() / sizeof(T); 42 | T *target = (T *)output->MutableData(); 43 | bool first = true; 44 | // TODO: implement parallel reducing 45 | for (const auto &object_id : object_ids) { 46 | // TODO: those object_ids could also be local streams. 
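// Each iteration below fetches one object's buffer from the local store and
// accumulates it element-wise into `target`; the first object is copied rather
// than added, so the output buffer does not need to be zero-initialized.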
47 | ObjectBuffer object_buffer; 48 | DCHECK(local_store_client_.ObjectExists(object_id)) << "ObjectID not in local store"; 49 | local_store_client_.Get(object_id, &object_buffer); 50 | std::shared_ptr buf = object_buffer.data; 51 | const T *data_ptr = (const T *)buf->Data(); 52 | if (!first) { 53 | for (int64_t i = 0; i < num_elements; i++) 54 | target[i] += data_ptr[i]; 55 | } else { 56 | for (int64_t i = 0; i < num_elements; i++) 57 | target[i] = data_ptr[i]; 58 | first = false; 59 | } 60 | } 61 | // TODO: try to pipeline this 62 | output->progress = output->Size(); 63 | } 64 | 65 | // order of fields should be kept for proper initialization order 66 | std::string my_address_; 67 | ObjectStoreState state_; 68 | GlobalControlStoreClient gcs_client_; 69 | LocalStoreClient local_store_client_; 70 | ObjectSender object_sender_; 71 | Receiver receiver_; 72 | NotificationListener notification_listener_; 73 | }; 74 | 75 | #endif // DISTRIBUTED_OBJECT_STORE_H 76 | -------------------------------------------------------------------------------- /src/client/global_control_store.h: -------------------------------------------------------------------------------- 1 | #ifndef GLOBAL_CONTROL_STORE_H 2 | #define GLOBAL_CONTROL_STORE_H 3 | 4 | #include "common/id.h" 5 | #include "object_store.grpc.pb.h" 6 | #include "util/ctpl_stl.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | constexpr int64_t inband_data_size_limit = 65536; 18 | 19 | struct SyncReply { 20 | std::string sender_ip; 21 | size_t object_size; 22 | std::string inband_data; 23 | }; 24 | 25 | class GlobalControlStoreClient { 26 | public: 27 | GlobalControlStoreClient(const std::string ¬ification_server_address, const std::string &my_address, 28 | int notification_server_port); 29 | 30 | void ConnectNotificationServer(); 31 | 32 | // Write object location to the notification server. 33 | void WriteLocation(const ObjectID &object_id, const std::string &my_address, bool finished, size_t object_size, 34 | const uint8_t *inband_data = nullptr, bool blocking = false); 35 | 36 | // Get object location from the notification server. 37 | SyncReply GetLocationSync(const ObjectID &object_id, bool occupying, const std::string &receiver_ip); 38 | 39 | bool HandlePullObjectFailure(const ObjectID &object_id, const std::string &receiver_ip, 40 | std::string *alternative_sender_ip); 41 | 42 | void HandleReceiveReducedObjectFailure(const ObjectID &reduction_id, const std::string &receiver_ip, 43 | const std::string &sender_ip); 44 | 45 | /// Create reduce task 46 | /// \param reduce_dst The IP address of the node that holds the final reduced object. 47 | void CreateReduceTask(const std::vector &objects_to_reduce, const ObjectID &reduction_id, 48 | int num_reduce_objects); 49 | 50 | /// Get the IDs of objects reduced for a reduction ID. 51 | /// \param[in] reduction_id The reduction ID represents the reduce event. 
52 | /// \return A set of reduced object IDs 53 | std::unordered_set GetReducedObjects(const ObjectID &reduction_id); 54 | 55 | private: 56 | const std::string ¬ification_server_address_; 57 | const std::string &my_address_; 58 | const int notification_server_port_; 59 | std::shared_ptr notification_channel_; 60 | std::unique_ptr notification_stub_; 61 | ctpl::thread_pool pool_; 62 | }; 63 | 64 | #endif // GLOBAL_CONTROL_STORE_H 65 | -------------------------------------------------------------------------------- /src/client/local_store_client.h: -------------------------------------------------------------------------------- 1 | #ifndef LOCAL_STORE_H 2 | #define LOCAL_STORE_H 3 | 4 | #include "common/buffer.h" 5 | #include "common/id.h" 6 | #include "common/status.h" 7 | #include 8 | #include 9 | #include 10 | 11 | class LocalStoreClient { 12 | public: 13 | LocalStoreClient(); 14 | 15 | Status Create(const ObjectID &object_id, int64_t data_size, std::shared_ptr *data); 16 | 17 | Status Seal(const ObjectID &object_id); 18 | 19 | // Check if an object exists in the store. 20 | // We assume this function will never fail. 21 | bool ObjectExists(const ObjectID &object_id, bool require_finished = true); 22 | 23 | Status Get(const std::vector &object_ids, std::vector *object_buffers); 24 | 25 | // Get single object from the store. 26 | Status Get(const ObjectID &object_id, ObjectBuffer *object_buffer); 27 | 28 | std::shared_ptr GetBufferNoExcept(const ObjectID &object_id); 29 | 30 | Status GetBufferOrCreate(const ObjectID &object_id, int64_t size, std::shared_ptr *data); 31 | 32 | Status Delete(const ObjectID &object_id); 33 | 34 | Status Wait(const ObjectID &object_id); 35 | 36 | private: 37 | Status create_internal(const ObjectID &object_id, int64_t data_size, std::shared_ptr *data); 38 | bool object_exists_unsafe(const ObjectID &object_id, bool require_finished); 39 | std::mutex local_store_mutex_; 40 | std::unordered_map> buffers_; 41 | size_t total_store_size_; 42 | const size_t lru_bound_size_ = (16LL << 30); 43 | std::queue lru_queue_; 44 | }; 45 | 46 | #endif // LOCAL_STORE_H 47 | -------------------------------------------------------------------------------- /src/client/notification_listener.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | #include "common/id.h" 11 | #include "object_store_state.h" 12 | #include "receiver.h" 13 | 14 | class NotificationListenerImpl; 15 | 16 | class NotificationListener { 17 | public: 18 | NotificationListener(const std::string &my_address, int notification_listener_port, ObjectStoreState &state, 19 | Receiver &recevier, LocalStoreClient &local_store_client); 20 | 21 | void Run(); 22 | 23 | void Shutdown(); 24 | 25 | private: 26 | void worker_loop(); 27 | 28 | std::string my_address_; 29 | 30 | ObjectStoreState &state_; 31 | Receiver &recevier_; 32 | LocalStoreClient &local_store_client_; 33 | 34 | std::thread notification_listener_thread_; 35 | std::unique_ptr grpc_server_; 36 | std::shared_ptr service_; 37 | }; 38 | -------------------------------------------------------------------------------- /src/client/object_sender.h: -------------------------------------------------------------------------------- 1 | #ifndef OBJECT_SENDER_H 2 | #define OBJECT_SENDER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include // struct sockaddr_in 10 | 11 | #include "global_control_store.h" 12 | #include 
"local_store_client.h" 13 | #include "object_store.pb.h" 14 | #include "object_store_state.h" 15 | 16 | class ObjectSender { 17 | public: 18 | ObjectSender(ObjectStoreState &state, GlobalControlStoreClient &gcs_client, LocalStoreClient &local_store_client, 19 | const std::string &my_address); 20 | 21 | void Run(); 22 | 23 | void Shutdown(); 24 | 25 | private: 26 | void listener_loop(); 27 | 28 | int send_object(int conn_fd, const ObjectID &object_id, int64_t object_size, int64_t offset); 29 | 30 | int send_reduced_object(int conn_fd, const ObjectID &object_id, int64_t object_size, int64_t offset); 31 | 32 | GlobalControlStoreClient &gcs_client_; 33 | LocalStoreClient &local_store_client_; 34 | ObjectStoreState &state_; 35 | std::string my_address_; 36 | 37 | // for the TCP listener 38 | int server_fd_; 39 | std::thread server_thread_; 40 | struct sockaddr_in address_; 41 | // thread pool for launching tasks 42 | ctpl::thread_pool pool_; 43 | }; 44 | 45 | #endif // OBJECT_SENDER_H 46 | -------------------------------------------------------------------------------- /src/client/object_store_state.cc: -------------------------------------------------------------------------------- 1 | #include "object_store_state.h" 2 | #include "util/logging.h" 3 | 4 | std::shared_ptr ObjectStoreState::create_reduction_stream(const ObjectID &reduction_id, size_t size) { 5 | std::unique_lock l(reduction_stream_mutex_); 6 | DCHECK(reduction_stream_.find(reduction_id) == reduction_stream_.end()); 7 | auto stream = std::make_shared(size); 8 | reduction_stream_[reduction_id] = stream; 9 | l.unlock(); 10 | reduction_stream_cv_.notify_all(); 11 | return stream; 12 | } 13 | 14 | std::shared_ptr ObjectStoreState::get_reduction_stream(const ObjectID &reduction_id) { 15 | std::unique_lock l(reduction_stream_mutex_); 16 | reduction_stream_cv_.wait( 17 | l, [this, &reduction_id]() { return reduction_stream_.find(reduction_id) != reduction_stream_.end(); }); 18 | return reduction_stream_[reduction_id]; 19 | } 20 | 21 | std::shared_ptr ObjectStoreState::get_or_create_reduction_stream(const ObjectID &reduction_id, size_t size) { 22 | std::unique_lock l(reduction_stream_mutex_); 23 | auto search = reduction_stream_.find(reduction_id); 24 | if (search == reduction_stream_.end()) { 25 | auto stream = std::make_shared(size); 26 | reduction_stream_[reduction_id] = stream; 27 | l.unlock(); 28 | reduction_stream_cv_.notify_all(); 29 | return stream; 30 | } else { 31 | return search->second; 32 | } 33 | } 34 | 35 | void ObjectStoreState::release_reduction_stream(const ObjectID &reduction_id) { 36 | std::unique_lock l(reduction_stream_mutex_); 37 | // release the memory 38 | reduction_stream_.erase(reduction_id); 39 | } 40 | 41 | void ObjectStoreState::create_local_reduce_task(const ObjectID &reduction_id, 42 | const std::vector &local_objects) { 43 | DCHECK(local_objects.size() <= 1); 44 | auto t = std::make_shared(); 45 | if (!local_objects.empty()) { 46 | t->local_object = local_objects[0]; 47 | } 48 | { 49 | std::lock_guard lock(reduce_tasks_mutex_); 50 | reduce_tasks_[reduction_id] = t; 51 | } 52 | } 53 | 54 | std::shared_ptr ObjectStoreState::get_local_reduce_task(const ObjectID &reduction_id) { 55 | std::lock_guard lock(reduce_tasks_mutex_); 56 | DCHECK(reduce_tasks_.count(reduction_id)); 57 | return reduce_tasks_[reduction_id]; 58 | } 59 | 60 | void ObjectStoreState::remove_local_reduce_task(const ObjectID &reduction_id) { 61 | std::lock_guard lock(reduce_tasks_mutex_); 62 | DCHECK(reduce_tasks_.count(reduction_id)); 63 | 
reduce_tasks_.erase(reduction_id); 64 | } 65 | 66 | bool ObjectStoreState::local_reduce_task_exists(const ObjectID &reduction_id) { 67 | std::lock_guard lock(reduce_tasks_mutex_); 68 | return reduce_tasks_.count(reduction_id) > 0; 69 | } 70 | -------------------------------------------------------------------------------- /src/client/object_store_state.h: -------------------------------------------------------------------------------- 1 | #ifndef OBJECT_STORE_STATE_H 2 | #define OBJECT_STORE_STATE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include "common/buffer.h" 12 | #include "common/id.h" 13 | 14 | class LocalReduceTask { 15 | public: 16 | LocalReduceTask() : is_finished_(false) {} 17 | 18 | ObjectID local_object; 19 | 20 | void Wait() { 21 | std::unique_lock l(notification_mutex_); 22 | notification_cv_.wait(l, [this]() { return is_finished_.load(); }); 23 | } 24 | 25 | void NotifyFinished() { 26 | std::unique_lock l(notification_mutex_); 27 | is_finished_ = true; 28 | notification_cv_.notify_all(); 29 | } 30 | 31 | private: 32 | std::atomic is_finished_; 33 | std::mutex notification_mutex_; 34 | std::condition_variable notification_cv_; 35 | }; 36 | 37 | class ObjectStoreState { 38 | 39 | public: 40 | std::shared_ptr create_reduction_stream(const ObjectID &reduction_id, size_t size); 41 | 42 | std::shared_ptr get_reduction_stream(const ObjectID &reduction_id); 43 | 44 | std::shared_ptr get_or_create_reduction_stream(const ObjectID &reduction_id, size_t size); 45 | 46 | void release_reduction_stream(const ObjectID &reduction_id); 47 | 48 | void create_local_reduce_task(const ObjectID &reduction_id, const std::vector &local_objects); 49 | 50 | std::shared_ptr get_local_reduce_task(const ObjectID &reduction_id); 51 | 52 | void remove_local_reduce_task(const ObjectID &reduction_id); 53 | 54 | bool local_reduce_task_exists(const ObjectID &reduction_id); 55 | 56 | private: 57 | std::mutex reduction_stream_mutex_; 58 | std::condition_variable reduction_stream_cv_; 59 | std::unordered_map> reduction_stream_; 60 | 61 | std::mutex reduce_tasks_mutex_; 62 | std::unordered_map> reduce_tasks_; 63 | }; 64 | 65 | #endif // OBJECT_STORE_STATE_H 66 | -------------------------------------------------------------------------------- /src/common/buffer.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "util/logging.h" 4 | #include "common/buffer.h" 5 | 6 | Buffer::Buffer(uint8_t* data_ptr, int64_t size): progress(size), data_ptr_(data_ptr), size_(size), is_data_owner_(false) {} 7 | 8 | Buffer::Buffer(int64_t size): progress(0), size_(size), is_data_owner_(true) { 9 | data_ptr_ = new uint8_t[size]; 10 | } 11 | 12 | uint8_t* Buffer::MutableData() { return data_ptr_; } 13 | const uint8_t* Buffer::Data() const { return data_ptr_; } 14 | int64_t Buffer::Size() const { return size_; } 15 | uint64_t Buffer::Hash() const { 16 | return MurmurHash64A(data_ptr_, size_, 0); 17 | } 18 | 19 | void Buffer::CopyFrom(const std::vector &data) { 20 | DCHECK(data.size() == size_) << "input size mismatch"; 21 | std::copy(data.begin(), data.end(), data_ptr_); 22 | Seal(); 23 | } 24 | 25 | void Buffer::CopyFrom(const uint8_t *data, size_t size) { 26 | DCHECK(size == size_) << "input size mismatch"; 27 | std::memcpy(data_ptr_, data, size); 28 | Seal(); 29 | } 30 | 31 | void Buffer::CopyFrom(const Buffer &buffer) { 32 | DCHECK(buffer.Size() == size_) << "input size mismatch"; 33 | std::memcpy(data_ptr_, 
buffer.Data(), buffer.Size()); 34 | Seal(); 35 | } 36 | 37 | void Buffer::CopyFrom(const std::string &data) { 38 | CopyFrom((const uint8_t *)data.data(), data.size()); 39 | Seal(); 40 | } 41 | 42 | void Buffer::StreamCopy(const Buffer &src) { 43 | DCHECK(src.IsFinished()) << "Copy from a unfinished buffer"; 44 | const uint8_t *data = src.Data(); 45 | int64_t size = src.Size(); 46 | DCHECK(size == Size()) << "Size mismatch for copying."; 47 | size_t copy_size = size / 1024; 48 | // trade off 'copy_size' between performance and latency 49 | if (copy_size < 4096) { 50 | copy_size = 4096; 51 | } else if (copy_size > 2 << 20) { 52 | copy_size = 2 << 20; 53 | } else { 54 | // align to 64 55 | copy_size = (copy_size >> 6) << 6; 56 | } 57 | uint8_t *dst = MutableData(); 58 | size_t cursor = 0; 59 | while (copy_size + cursor <= size) { 60 | memcpy(dst + cursor, data + cursor, copy_size); 61 | progress += copy_size; 62 | cursor += copy_size; 63 | } 64 | memcpy(dst + cursor, data + cursor, size - cursor); 65 | progress = cursor; 66 | } 67 | 68 | void Buffer::Wait() { 69 | std::unique_lock l(notification_mutex_); 70 | notification_cv_.wait(l, [this]() { return IsFinished(); }); 71 | } 72 | 73 | void Buffer::NotifyFinished() { 74 | std::unique_lock l(notification_mutex_); 75 | DCHECK(IsFinished()) << "The buffer has not been finished"; 76 | notification_cv_.notify_all(); 77 | } 78 | 79 | void Buffer::ShrinkForLRU() { 80 | delete[] data_ptr_; 81 | data_ptr_ = new uint8_t[4]; 82 | size_ = 4; 83 | } 84 | 85 | Buffer::~Buffer() { 86 | if (is_data_owner_) { 87 | delete[] data_ptr_; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/common/buffer.h: -------------------------------------------------------------------------------- 1 | #ifndef BUFFER_H 2 | #define BUFFER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "common/config.h" 11 | #include "util/hash.h" 12 | 13 | class Buffer { 14 | public: 15 | Buffer(uint8_t* data_ptr, int64_t size); 16 | explicit Buffer(int64_t size); 17 | 18 | void CopyFrom(const std::vector &data); 19 | void CopyFrom(const uint8_t *data, size_t size); 20 | void CopyFrom(const Buffer &buffer); 21 | void CopyFrom(const std::string &data); 22 | void StreamCopy(const Buffer &buffer); 23 | 24 | uint8_t* MutableData(); 25 | const uint8_t* Data() const; 26 | int64_t Size() const; 27 | uint64_t Hash() const; 28 | void ShrinkForLRU(); 29 | void Seal() { progress = size_; } 30 | bool IsFinished() const { return progress >= size_; } 31 | ~Buffer(); 32 | 33 | void Wait(); 34 | void NotifyFinished(); 35 | #ifdef HOPLITE_ENABLE_ATOMIC_BUFFER_PROGRESS 36 | std::atomic_int64_t progress; 37 | #else 38 | volatile int64_t progress; 39 | #endif 40 | volatile bool reset = false; 41 | private: 42 | uint8_t* data_ptr_; 43 | int64_t size_; 44 | bool is_data_owner_; 45 | std::mutex notification_mutex_; 46 | std::condition_variable notification_cv_; 47 | }; 48 | 49 | struct ObjectBuffer { 50 | std::shared_ptr data; 51 | uint8_t* metadata; 52 | int32_t device_num = 0; 53 | }; 54 | 55 | #endif // BUFFER_H -------------------------------------------------------------------------------- /src/common/config.h: -------------------------------------------------------------------------------- 1 | #ifndef _HOPLITE_COMMON_CONFIG_H_ 2 | #define _HOPLITE_COMMON_CONFIG_H_ 3 | 4 | // Enable non-blocking for the socket that receiving objects. 
5 | #define HOPLITE_ENABLE_NONBLOCKING_SOCKET_RECV 6 | 7 | constexpr int64_t STREAM_MAX_BLOCK_SIZE = 4 * (2 << 20); // 4MB 8 | 9 | // Enable ACK for sending/receiving buffers. Usually used for debugging. 10 | // FIXME(suquark): Disable ACK would cause numeric mismatch. 11 | #define HOPLITE_ENABLE_ACK 12 | 13 | // The constant for RPC latency (in seconds) 14 | #define HOPLITE_RPC_LATENCY (750 * 1e-6) 15 | 16 | // The constanf for bandwidth (in bytes/second) 17 | #define HOPLITE_BANDWIDTH (9.68 * (1 << 30) / 8) 18 | 19 | // Use atomic type for buffer progress. 20 | // #define HOPLITE_ENABLE_ATOMIC_BUFFER_PROGRESS 21 | 22 | // Maximum inflow concurrency for a node 23 | #define HOPLITE_MAX_INFLOW_CONCURRENCY 2 24 | 25 | // Maximum outflow concurrency for a node 26 | #define HOPLITE_MAX_OUTLOW_CONCURRENCY 2 27 | 28 | // The thread pool size for the distributed store to launch 29 | // RPCs like `InvokeReduceTo` and `InvokeRedirectReduce`. 30 | #define HOPLITE_THREADPOOL_SIZE_FOR_RPC 10 31 | 32 | #define HOPLITE_MULTITHREAD_REDUCE_SIZE (1 << 28) 33 | 34 | // Make the Put() call blocking on 'WriteLocation' 35 | #ifndef HOPLITE_PUT_BLOCKING 36 | #define HOPLITE_PUT_BLOCKING false 37 | #endif 38 | 39 | // NOTE: SO_ZEROCOPY & TCP_NODELAY is not working. 40 | 41 | // Default ports 42 | #define HOPLITE_SENDER_PORT 20210 43 | #define HOPLITE_RECEIVER_PORT 20211 44 | #define OBJECT_DIRECTORY_PORT 7777 45 | #define OBJECT_DIRECTORY_LISTENER_PORT 8888 46 | 47 | #endif // _HOPLITE_COMMON_CONFIG_H_ 48 | -------------------------------------------------------------------------------- /src/common/id.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "common/id.h" 3 | 4 | std::string ObjectID::ToString() const { 5 | return std::string("ObjectID(") + Hex() + ")"; 6 | } 7 | 8 | #define ID_OSTREAM_OPERATOR(id_type) \ 9 | std::ostream &operator<<(std::ostream &os, const id_type &id) { \ 10 | if (id.IsNil()) { \ 11 | os << "NIL_ID"; \ 12 | } else { \ 13 | os << id.Hex(); \ 14 | } \ 15 | return os; \ 16 | } 17 | 18 | ID_OSTREAM_OPERATOR(ObjectID); -------------------------------------------------------------------------------- /src/common/status.cc: -------------------------------------------------------------------------------- 1 | #include "common/status.h" 2 | 3 | #include 4 | 5 | Status::Status(StatusCode code, const std::string &msg) { 6 | assert(code != StatusCode::OK); 7 | state_ = new State; 8 | state_->code = code; 9 | state_->msg = msg; 10 | } 11 | 12 | void Status::CopyFrom(const State *state) { 13 | delete state_; 14 | if (state == nullptr) { 15 | state_ = nullptr; 16 | } else { 17 | state_ = new State(*state); 18 | } 19 | } 20 | 21 | std::string Status::CodeAsString() const { 22 | if (state_ == NULL) { 23 | return "OK"; 24 | } 25 | 26 | const char *type; 27 | switch (code()) { 28 | case StatusCode::OK: 29 | type = "OK"; 30 | break; 31 | case StatusCode::OutOfMemory: 32 | type = "Out of memory"; 33 | break; 34 | case StatusCode::KeyError: 35 | type = "Key error"; 36 | break; 37 | case StatusCode::TypeError: 38 | type = "Type error"; 39 | break; 40 | case StatusCode::Invalid: 41 | type = "Invalid"; 42 | break; 43 | case StatusCode::IOError: 44 | type = "IOError"; 45 | break; 46 | case StatusCode::ObjectExists: 47 | type = "ObjectExists"; 48 | break; 49 | case StatusCode::ObjectStoreFull: 50 | type = "ObjectStoreFull"; 51 | break; 52 | case StatusCode::UnknownError: 53 | type = "Unknown error"; 54 | break; 55 | case StatusCode::NotImplemented: 56 | 
type = "NotImplemented"; 57 | break; 58 | case StatusCode::RedisError: 59 | type = "RedisError"; 60 | break; 61 | case StatusCode::TimedOut: 62 | type = "TimedOut"; 63 | break; 64 | case StatusCode::Interrupted: 65 | type = "Interrupted"; 66 | break; 67 | default: 68 | type = "Unknown"; 69 | break; 70 | } 71 | return std::string(type); 72 | } 73 | 74 | std::string Status::ToString() const { 75 | std::string result(CodeAsString()); 76 | if (state_ == NULL) { 77 | return result; 78 | } 79 | result += ": "; 80 | result += state_->msg; 81 | return result; 82 | } 83 | -------------------------------------------------------------------------------- /src/object_directory/notification.h: -------------------------------------------------------------------------------- 1 | #ifndef NOTIFICATION_H 2 | #define NOTIFICATION_H 3 | 4 | #include "common/id.h" 5 | #include "util/logging.h" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | class NotificationServiceImpl; 12 | 13 | class NotificationServer { 14 | public: 15 | NotificationServer(const std::string &my_address, int notification_server_port, 16 | int notification_listener_port); 17 | 18 | std::thread Run() { 19 | std::thread notification_thread(&NotificationServer::worker_loop, this); 20 | return notification_thread; 21 | } 22 | 23 | private: 24 | void worker_loop(); 25 | 26 | const int notification_server_port_; 27 | const int notification_listener_port_; 28 | 29 | std::unique_ptr grpc_server_; 30 | std::shared_ptr service_; 31 | }; 32 | 33 | #endif // NOTIFICATION_H 34 | -------------------------------------------------------------------------------- /src/tests/allgather_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Make the Put() call blocking for more precise timing. 
8 | #define HOPLITE_PUT_BLOCKING true 9 | 10 | #include "distributed_object_store.h" 11 | #include "util/logging.h" 12 | #include "util/socket_utils.h" 13 | #include "util/test_utils.h" 14 | 15 | int main(int argc, char **argv) { 16 | // argv: *, object_directory_address, object_size, n_trials 17 | std::string object_directory_address = std::string(argv[1]); 18 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 19 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 20 | std::string my_address = get_host_ipaddress(); 21 | MPI_Init(NULL, NULL); 22 | int world_rank; 23 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 24 | int world_size; 25 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 26 | 27 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 28 | 29 | TIMELINE("main"); 30 | 31 | DistributedObjectStore store(object_directory_address); 32 | 33 | for (int trial = 0; trial < n_trials; trial++) { 34 | std::vector object_ids; 35 | float sum = 0; 36 | for (int i = 0; i < world_size; i++) { 37 | auto oid = object_id_from_integer(trial * 1000000 + i); 38 | object_ids.push_back(oid); 39 | auto rnum = get_uniform_random_float(oid.Hex()); 40 | sum += rnum; 41 | } 42 | DCHECK(object_size % sizeof(float) == 0); 43 | 44 | ObjectID rank_object_id = object_ids[world_rank]; 45 | std::unordered_map> gather_result; 46 | 47 | put_random_buffer(store, rank_object_id, object_size); 48 | 49 | MPI_Barrier(MPI_COMM_WORLD); 50 | 51 | auto start = std::chrono::system_clock::now(); 52 | for (auto &object_id : object_ids) { 53 | store.Get(object_id, &gather_result[object_id]); 54 | } 55 | auto end = std::chrono::system_clock::now(); 56 | std::chrono::duration duration = end - start; 57 | LOG(INFO) << "Allgather finished. duration = " << duration.count(); 58 | uint32_t sum_crc = 0; 59 | for (auto &object_id : object_ids) { 60 | sum_crc += gather_result[object_id]->Hash(); 61 | } 62 | LOG(INFO) << "Hash for objects is " << sum_crc; 63 | MPI_Barrier(MPI_COMM_WORLD); 64 | } 65 | MPI_Barrier(MPI_COMM_WORLD); 66 | MPI_Finalize(); 67 | return 0; 68 | } 69 | -------------------------------------------------------------------------------- /src/tests/allreduce_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // Make the Put() call blocking for more precise timing. 
9 | #define HOPLITE_PUT_BLOCKING true 10 | 11 | #include "common/buffer.h" 12 | #include "common/id.h" 13 | #include "distributed_object_store.h" 14 | #include "util/logging.h" 15 | #include "util/socket_utils.h" 16 | #include "util/test_utils.h" 17 | 18 | int main(int argc, char **argv) { 19 | // argv: *, object_directory_address, object_size, n_trials 20 | std::string object_directory_address = std::string(argv[1]); 21 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 22 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 23 | std::string my_address = get_host_ipaddress(); 24 | MPI_Init(NULL, NULL); 25 | int world_rank; 26 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 27 | int world_size; 28 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 29 | 30 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 31 | 32 | TIMELINE("main"); 33 | 34 | DistributedObjectStore store(object_directory_address); 35 | 36 | for (int trial = 0; trial < n_trials; trial++) { 37 | ObjectID reduction_id = object_id_from_integer(trial * 1000000 + 99999); 38 | std::vector object_ids; 39 | float sum = 0; 40 | for (int i = 0; i < world_size; i++) { 41 | auto oid = object_id_from_integer(trial * 1000000 + i); 42 | object_ids.push_back(oid); 43 | auto rnum = get_uniform_random_float(oid.Hex()); 44 | sum += rnum; 45 | } 46 | DCHECK(object_size % sizeof(float) == 0); 47 | 48 | ObjectID rank_object_id = object_ids[world_rank]; 49 | std::shared_ptr reduction_result; 50 | 51 | put_random_buffer(store, rank_object_id, object_size); 52 | MPI_Barrier(MPI_COMM_WORLD); 53 | 54 | auto start = std::chrono::system_clock::now(); 55 | if (world_rank == 0) { 56 | store.Reduce(object_ids, reduction_id); 57 | } 58 | store.Get(reduction_id, &reduction_result); 59 | auto end = std::chrono::system_clock::now(); 60 | std::chrono::duration duration = end - start; 61 | LOG(INFO) << reduction_id.ToString() << " is reduced. duration = " << duration.count(); 62 | print_reduction_result(reduction_id, reduction_result, sum); 63 | MPI_Barrier(MPI_COMM_WORLD); 64 | } 65 | 66 | MPI_Barrier(MPI_COMM_WORLD); 67 | MPI_Finalize(); 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /src/tests/gather_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Make the Put() call blocking for more precise timing. 
8 | #define HOPLITE_PUT_BLOCKING true 9 | 10 | #include "distributed_object_store.h" 11 | #include "util/logging.h" 12 | #include "util/socket_utils.h" 13 | #include "util/test_utils.h" 14 | 15 | int main(int argc, char **argv) { 16 | // argv: *, object_directory_address, object_size, n_trials 17 | std::string object_directory_address = std::string(argv[1]); 18 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 19 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 20 | std::string my_address = get_host_ipaddress(); 21 | MPI_Init(NULL, NULL); 22 | int world_rank; 23 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 24 | int world_size; 25 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 26 | 27 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 28 | 29 | TIMELINE("main"); 30 | 31 | DistributedObjectStore store(object_directory_address); 32 | 33 | for (int trial = 0; trial < n_trials; trial++) { 34 | std::vector object_ids; 35 | float sum = 0; 36 | for (int i = 0; i < world_size; i++) { 37 | auto oid = object_id_from_integer(trial * 1000000 + i); 38 | object_ids.push_back(oid); 39 | auto rnum = get_uniform_random_float(oid.Hex()); 40 | sum += rnum; 41 | } 42 | DCHECK(object_size % sizeof(float) == 0); 43 | 44 | ObjectID rank_object_id = object_ids[world_rank]; 45 | std::unordered_map> gather_result; 46 | 47 | put_random_buffer(store, rank_object_id, object_size); 48 | 49 | MPI_Barrier(MPI_COMM_WORLD); 50 | 51 | if (world_rank == 0) { 52 | auto start = std::chrono::system_clock::now(); 53 | for (auto &object_id : object_ids) { 54 | store.Get(object_id, &gather_result[object_id]); 55 | } 56 | auto end = std::chrono::system_clock::now(); 57 | std::chrono::duration duration = end - start; 58 | LOG(INFO) << "Objects gathered. duration = " << duration.count(); 59 | 60 | uint32_t sum_crc = 0; 61 | for (auto &object_id : object_ids) { 62 | sum_crc += gather_result[object_id]->Hash(); 63 | } 64 | LOG(INFO) << "Hash for objects is " << sum_crc; 65 | } 66 | MPI_Barrier(MPI_COMM_WORLD); 67 | } 68 | 69 | MPI_Barrier(MPI_COMM_WORLD); 70 | MPI_Finalize(); 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /src/tests/multicast_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | // Make the Put() call blocking for more precise timing. 
7 | #define HOPLITE_PUT_BLOCKING true 8 | 9 | #include "common/buffer.h" 10 | #include "common/id.h" 11 | #include "distributed_object_store.h" 12 | #include "util/logging.h" 13 | #include "util/socket_utils.h" 14 | #include "util/test_utils.h" 15 | 16 | int main(int argc, char **argv) { 17 | // argv: *, object_directory_address, object_size, n_trials 18 | std::string object_directory_address = std::string(argv[1]); 19 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 20 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 21 | std::string my_address = get_host_ipaddress(); 22 | MPI_Init(NULL, NULL); 23 | int world_rank; 24 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 25 | int world_size; 26 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 27 | 28 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 29 | 30 | TIMELINE("main"); 31 | 32 | DistributedObjectStore store(object_directory_address); 33 | 34 | for (int trial = 0; trial < n_trials; trial++) { 35 | ObjectID object_id = object_id_from_integer(trial); 36 | std::shared_ptr result; 37 | 38 | if (world_rank == 0) { 39 | result = std::make_shared(object_size); 40 | uint8_t *buf = result->MutableData(); 41 | for (int64_t i = 0; i < object_size; i++) { 42 | buf[i] = i % 256; 43 | } 44 | result->Seal(); 45 | store.Put(result, object_id); 46 | 47 | LOG(INFO) << object_id.ToString() << " is created!" 48 | << " Hash = " << result->Hash(); 49 | 50 | LOG(INFO) << "entering barrier"; 51 | MPI_Barrier(MPI_COMM_WORLD); 52 | } else { 53 | 54 | LOG(INFO) << "entering barrier"; 55 | MPI_Barrier(MPI_COMM_WORLD); 56 | auto start = std::chrono::system_clock::now(); 57 | store.Get(object_id, &result); 58 | auto end = std::chrono::system_clock::now(); 59 | std::chrono::duration duration = end - start; 60 | 61 | LOG(INFO) << object_id.ToString() << " is retrieved. Hash = " << result->Hash(); 62 | LOG(INFO) << "Retrieving duration = " << duration.count(); 63 | } 64 | MPI_Barrier(MPI_COMM_WORLD); 65 | } 66 | MPI_Barrier(MPI_COMM_WORLD); 67 | MPI_Finalize(); 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /src/tests/reduce_dependency_test.cc: -------------------------------------------------------------------------------- 1 | #include "object_directory/reduce_dependency.h" 2 | #include 3 | 4 | int main() { 5 | std::cout << ReduceTreeChain(128, 8).DebugString(); 6 | std::cout << ReduceTreeChain(152, 24).DebugString(); 7 | std::cout << ReduceTreeChain(61, 2).DebugString(); 8 | std::cout << ReduceTreeChain(32, 44).DebugString(); 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /src/tests/reduce_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Make the Put() call blocking for more precise timing. 
8 | #define HOPLITE_PUT_BLOCKING true 9 | 10 | #include "distributed_object_store.h" 11 | #include "util/logging.h" 12 | #include "util/socket_utils.h" 13 | #include "util/test_utils.h" 14 | 15 | int main(int argc, char **argv) { 16 | // argv: *, object_directory_address, object_size, n_trials 17 | std::string object_directory_address = std::string(argv[1]); 18 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 19 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 20 | std::string my_address = get_host_ipaddress(); 21 | MPI_Init(NULL, NULL); 22 | int world_rank; 23 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 24 | int world_size; 25 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 26 | 27 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 28 | 29 | TIMELINE("main"); 30 | 31 | DistributedObjectStore store(object_directory_address); 32 | 33 | for (int trial = 0; trial < n_trials; trial++) { 34 | ObjectID reduction_id = object_id_from_integer(trial * 1000000 + 99999); 35 | std::vector object_ids; 36 | float sum = 0; 37 | for (int i = 0; i < world_size; i++) { 38 | auto oid = object_id_from_integer(trial * 1000000 + i); 39 | object_ids.push_back(oid); 40 | auto rnum = get_uniform_random_float(oid.Hex()); 41 | sum += rnum; 42 | } 43 | DCHECK(object_size % sizeof(float) == 0); 44 | 45 | ObjectID rank_object_id = object_ids[world_rank]; 46 | std::shared_ptr reduction_result; 47 | 48 | put_random_buffer(store, rank_object_id, object_size); 49 | 50 | MPI_Barrier(MPI_COMM_WORLD); 51 | 52 | if (world_rank == 0) { 53 | auto start = std::chrono::system_clock::now(); 54 | store.Reduce(object_ids, reduction_id); 55 | store.Get(reduction_id, &reduction_result); 56 | auto end = std::chrono::system_clock::now(); 57 | std::chrono::duration duration = end - start; 58 | LOG(INFO) << reduction_id.ToString() << " is reduced. 
duration = " << duration.count(); 59 | print_reduction_result(reduction_id, reduction_result, sum); 60 | } 61 | MPI_Barrier(MPI_COMM_WORLD); 62 | } 63 | MPI_Barrier(MPI_COMM_WORLD); 64 | MPI_Finalize(); 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /src/tests/subset_reduce_test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "distributed_object_store.h" 8 | #include "util/logging.h" 9 | #include "util/socket_utils.h" 10 | #include "util/test_utils.h" 11 | 12 | int main(int argc, char **argv) { 13 | // argv: *, object_directory_address, object_size, n_trials 14 | std::string object_directory_address = std::string(argv[1]); 15 | int64_t object_size = std::strtoll(argv[2], NULL, 10); 16 | int64_t n_trials = std::strtoll(argv[3], NULL, 10); 17 | std::string my_address = get_host_ipaddress(); 18 | MPI_Init(NULL, NULL); 19 | int world_rank; 20 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 21 | int world_size; 22 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 23 | 24 | ::hoplite::RayLog::StartRayLog(my_address, ::hoplite::RayLogLevel::DEBUG); 25 | 26 | TIMELINE("main"); 27 | 28 | DistributedObjectStore store(object_directory_address); 29 | 30 | for (int trial = 0; trial < n_trials; trial++) { 31 | if (world_rank == 0) { 32 | LOG(INFO) << "\n\n\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Trail #" << trial << "/" << n_trials << "\n\n\n"; 33 | } 34 | ObjectID reduction_id = object_id_from_integer(trial * 1000000 + 99999); 35 | std::vector object_ids; 36 | for (int i = 0; i < world_size; i++) { 37 | auto oid = object_id_from_integer(trial * 1000000 + i); 38 | object_ids.push_back(oid); 39 | } 40 | DCHECK(object_size % sizeof(float) == 0); 41 | 42 | ObjectID rank_object_id = object_ids[world_rank]; 43 | std::shared_ptr reduction_result; 44 | int num_reduce_objects = world_size / 2; 45 | 46 | put_fixed_buffer(store, rank_object_id, object_size, (float)world_rank); 47 | 48 | MPI_Barrier(MPI_COMM_WORLD); 49 | 50 | if (world_rank == 0) { 51 | auto start = std::chrono::system_clock::now(); 52 | store.Reduce(object_ids, reduction_id, num_reduce_objects); 53 | store.Get(reduction_id, &reduction_result); 54 | auto end = std::chrono::system_clock::now(); 55 | std::chrono::duration duration = end - start; 56 | LOG(INFO) << "Reducing " << num_reduce_objects << " objects"; 57 | LOG(INFO) << reduction_id.ToString() << " is reduced. duration = " << duration.count(); 58 | std::unordered_set reduced_objects; 59 | reduced_objects = store.GetReducedObjects(reduction_id); 60 | for (const auto &reduced_object : reduced_objects) { 61 | LOG(INFO) << "Reduced object: " << reduced_object.ToString(); 62 | } 63 | print_reduction_result(reduction_id, reduction_result, 0.0); 64 | } 65 | MPI_Barrier(MPI_COMM_WORLD); 66 | } 67 | MPI_Barrier(MPI_COMM_WORLD); 68 | MPI_Finalize(); 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /src/util/hash.cc: -------------------------------------------------------------------------------- 1 | #include "util/hash.h" 2 | // This code is from https://sites.google.com/site/murmurhash/ 3 | // and is public domain. 
4 | uint64_t MurmurHash64A(const void *key, int len, unsigned int seed) { 5 | const uint64_t m = 0xc6a4a7935bd1e995; 6 | const int r = 47; 7 | 8 | uint64_t h = seed ^ (len * m); 9 | 10 | const auto *data = reinterpret_cast(key); 11 | const uint64_t *end = data + (len / 8); 12 | 13 | while (data != end) { 14 | uint64_t k = *data++; 15 | 16 | k *= m; 17 | k ^= k >> r; 18 | k *= m; 19 | 20 | h ^= k; 21 | h *= m; 22 | } 23 | 24 | const auto *data2 = reinterpret_cast(data); 25 | 26 | switch (len & 7) { 27 | case 7: 28 | h ^= uint64_t(data2[6]) << 48; 29 | case 6: 30 | h ^= uint64_t(data2[5]) << 40; 31 | case 5: 32 | h ^= uint64_t(data2[4]) << 32; 33 | case 4: 34 | h ^= uint64_t(data2[3]) << 24; 35 | case 3: 36 | h ^= uint64_t(data2[2]) << 16; 37 | case 2: 38 | h ^= uint64_t(data2[1]) << 8; 39 | case 1: 40 | h ^= uint64_t(data2[0]); 41 | h *= m; 42 | } 43 | 44 | h ^= h >> r; 45 | h *= m; 46 | h ^= h >> r; 47 | 48 | return h; 49 | } 50 | -------------------------------------------------------------------------------- /src/util/hash.h: -------------------------------------------------------------------------------- 1 | #include 2 | uint64_t MurmurHash64A(const void *key, int len, unsigned int seed); 3 | -------------------------------------------------------------------------------- /src/util/protobuf_utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util/logging.h" 4 | #include "util/socket_utils.h" 5 | 6 | template inline void SendProtobufMessage(int conn_fd, const T &message) { 7 | size_t message_size = message.ByteSizeLong(); 8 | auto status = send_all(conn_fd, (void *)&message_size, sizeof(message_size)); 9 | DCHECK(!status) << "socket send error: message_size"; 10 | 11 | std::vector message_buf(message_size); 12 | message.SerializeWithCachedSizesToArray(message_buf.data()); 13 | 14 | status = send_all(conn_fd, (void *)message_buf.data(), message_buf.size()); 15 | DCHECK(!status) << "socket send error: message"; 16 | } 17 | 18 | template inline void ReceiveProtobufMessage(int conn_fd, T *message) { 19 | size_t message_len; 20 | int status = recv_all(conn_fd, &message_len, sizeof(message_len)); 21 | DCHECK(!status) << "receive message_len failed"; 22 | 23 | std::vector message_buf(message_len); 24 | status = recv_all(conn_fd, message_buf.data(), message_len); 25 | DCHECK(!status) << "receive message failed"; 26 | 27 | message->ParseFromArray(message_buf.data(), message_buf.size()); 28 | } 29 | -------------------------------------------------------------------------------- /src/util/socket_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef SOCKET_UTILS_H 2 | #define SOCKET_UTILS_H 3 | 4 | #include 5 | 6 | int send_all(int conn_fd, const void *buf, const size_t size); 7 | 8 | int recv_all(int conn_fd, void *buf, const size_t size); 9 | 10 | int tcp_connect(const std::string &ip_address, int port, int *conn_fd); 11 | 12 | void tcp_bind_and_listen(int port, struct sockaddr_in *address, int *server_fd); 13 | 14 | void recv_ack(int fd); 15 | 16 | void send_ack(int fd); 17 | 18 | std::string get_host_ipaddress(); 19 | 20 | #endif // SOCKET_UTILS_H 21 | -------------------------------------------------------------------------------- /test_utils/get_worker_ips.py: -------------------------------------------------------------------------------- 1 | import ray 2 | import socket 3 | ray.init(address="auto") 4 | d = ray.cluster_resources() 5 | my_addr = 
--------------------------------------------------------------------------------
/src/util/protobuf_utils.h:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>
#include <vector>

#include "util/logging.h"
#include "util/socket_utils.h"

// Send a protobuf message as an 8-byte size header followed by the serialized body.
template <typename T> inline void SendProtobufMessage(int conn_fd, const T &message) {
  size_t message_size = message.ByteSizeLong();
  auto status = send_all(conn_fd, (void *)&message_size, sizeof(message_size));
  DCHECK(!status) << "socket send error: message_size";

  std::vector<uint8_t> message_buf(message_size);
  message.SerializeWithCachedSizesToArray(message_buf.data());

  status = send_all(conn_fd, (void *)message_buf.data(), message_buf.size());
  DCHECK(!status) << "socket send error: message";
}

// Receive the size header, then parse exactly that many bytes into the message.
template <typename T> inline void ReceiveProtobufMessage(int conn_fd, T *message) {
  size_t message_len;
  int status = recv_all(conn_fd, &message_len, sizeof(message_len));
  DCHECK(!status) << "receive message_len failed";

  std::vector<uint8_t> message_buf(message_len);
  status = recv_all(conn_fd, message_buf.data(), message_len);
  DCHECK(!status) << "receive message failed";

  message->ParseFromArray(message_buf.data(), message_buf.size());
}

--------------------------------------------------------------------------------
/src/util/socket_utils.h:
--------------------------------------------------------------------------------
#ifndef SOCKET_UTILS_H
#define SOCKET_UTILS_H

#include <string>

int send_all(int conn_fd, const void *buf, const size_t size);

int recv_all(int conn_fd, void *buf, const size_t size);

int tcp_connect(const std::string &ip_address, int port, int *conn_fd);

void tcp_bind_and_listen(int port, struct sockaddr_in *address, int *server_fd);

void recv_ack(int fd);

void send_ack(int fd);

std::string get_host_ipaddress();

#endif // SOCKET_UTILS_H
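
Note (not part of the repository): SendProtobufMessage and ReceiveProtobufMessage above frame every message as an 8-byte length followed by the serialized bytes. The sketch below shows the same framing for a raw byte payload, relying only on the send_all and recv_all declarations from socket_utils.h; the function names are illustrative.

#include <cstdint>
#include <vector>

#include "util/logging.h"
#include "util/socket_utils.h"

// Send a size header followed by the payload bytes.
inline void SendFramedBytes(int conn_fd, const std::vector<uint8_t> &payload) {
  size_t payload_size = payload.size();
  int status = send_all(conn_fd, &payload_size, sizeof(payload_size));
  DCHECK(!status) << "socket send error: payload_size";
  status = send_all(conn_fd, payload.data(), payload.size());
  DCHECK(!status) << "socket send error: payload";
}

// Receive the size header, then exactly that many payload bytes.
inline std::vector<uint8_t> ReceiveFramedBytes(int conn_fd) {
  size_t payload_size = 0;
  int status = recv_all(conn_fd, &payload_size, sizeof(payload_size));
  DCHECK(!status) << "receive payload_size failed";
  std::vector<uint8_t> payload(payload_size);
  status = recv_all(conn_fd, payload.data(), payload_size);
  DCHECK(!status) << "receive payload failed";
  return payload;
}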
--------------------------------------------------------------------------------
/test_utils/get_worker_ips.py:
--------------------------------------------------------------------------------
import ray
import socket

ray.init(address="auto")
d = ray.cluster_resources()
my_addr = socket.gethostbyname(socket.gethostname())
# Ray exposes one 'node:<ip>' resource per node; print every worker IP except ours.
for k in d:
    if k.startswith('node'):
        ip = k.split(':')[1]
        if ip != my_addr:
            print(ip)

--------------------------------------------------------------------------------
/test_utils/load_cluster_env.sh:
--------------------------------------------------------------------------------
# This file should only be sourced.

MY_IPADDR=$(hostname -i)
# OTHERS_IPADDR=()
# for s in $(ray get-worker-ips ~/ray_bootstrap_config.yaml); do
#   OTHERS_IPADDR+=($(ssh -o StrictHostKeyChecking=no $s hostname -i));
# done
SCRIPT_CURRENT_DIR=$(dirname $(realpath -s ${BASH_SOURCE[0]}))

OTHERS_IPADDR=($(python $SCRIPT_CURRENT_DIR/get_worker_ips.py 2>/dev/null))
ALL_IPADDR=($MY_IPADDR ${OTHERS_IPADDR[@]})
unset SCRIPT_CURRENT_DIR

--------------------------------------------------------------------------------
/test_utils/mpirun_pernode.sh:
--------------------------------------------------------------------------------
#!/bin/bash
all_hosts=$1
shift
# This syntax is for OpenMPI: run one process per node on the given hosts.
/opt/amazon/openmpi/bin/mpirun --mca btl_tcp_if_exclude lo,docker0 --map-by ppr:1:node -H $all_hosts $@

--------------------------------------------------------------------------------
/test_utils/result_parser_utils.py:
--------------------------------------------------------------------------------
import os

import numpy as np
import pandas as pd


def read_rank0_lines(log_dir, foldername):
    file_name = os.path.join(log_dir, foldername, "rank_0.log")
    with open(file_name) as f:
        return f.readlines()


def default_parse_file(task_name, log_dir, foldername):
    try:
        lines = read_rank0_lines(log_dir, foldername)
        results = []
        for line in lines:
            if 'duration = ' in line:
                tmp = line.split('duration = ')[1]
                results.append(float(tmp))
        return results
    except Exception:
        return None


def collect_log_folders(log_dir):
    tasks = {}

    for filename in os.listdir(log_dir):
        if filename == "latest":
            continue
        # log name format: $date-$time-$test_name-$world_size-$object_size
        splited = filename.split('-')
        if len(splited) != 5:
            raise Exception(f"Unexpected log name {filename}.")
        task_name, number_of_nodes, object_size = splited[2:5]
        task = (task_name, number_of_nodes, object_size)
        if task not in tasks:
            tasks[task] = []
        tasks[task].append(filename)

    return tasks


def parse(log_dir, parse_file):
    tasks = collect_log_folders(log_dir)

    results = {}

    for task, folders in tasks.items():
        task_results = []
        for foldername in folders:
            result = parse_file(task[0], log_dir, foldername)
            if isinstance(result, (list, np.ndarray)):
                task_results += list(result)
            elif result is None or np.isnan(result):
                print(f"Error parsing {foldername}: cannot read out value.")
            else:
                task_results.append(result)
        results[task] = np.array(task_results)

    task_list = sorted(list(results.keys()), reverse=True)

    df = pd.DataFrame(columns=['Benchmark Name', '#Nodes', 'Object Size (in bytes)',
                               'Average Time (s)', 'Std Time (s)', 'Repeated Times'])

    for i, task in enumerate(task_list):
        task_name, number_of_nodes, object_size = task
        df.loc[i] = [task_name, number_of_nodes, object_size, np.mean(results[task]), np.std(results[task]),
                     len(results[task])]
    return df
--------------------------------------------------------------------------------