├── src ├── replayer │ ├── src │ │ ├── worker │ │ │ └── mod.rs │ │ ├── controller │ │ │ ├── mod.rs │ │ │ └── app.rs │ │ ├── lib.rs │ │ ├── vm_ip_addrs.in │ │ └── message.rs │ ├── README.md │ └── Cargo.toml ├── mapreduce │ ├── .gitignore │ ├── figure │ │ ├── mapreduce_fattree_16_100g_7.00_m20_r20 │ │ └── mapreduce_cdf_fattree_16_100g_7.00_m20_r20 │ ├── Cargo.toml │ └── src │ │ ├── plink.rs │ │ ├── random.rs │ │ ├── argument.rs │ │ ├── inspect.rs │ │ ├── trace.rs │ │ └── config.rs ├── utils │ ├── src │ │ ├── lib.rs │ │ ├── net.rs │ │ ├── collector.rs │ │ ├── fs.rs │ │ └── cmd_helper.rs │ └── Cargo.toml ├── nhagent_v2 │ ├── src │ │ ├── sampler │ │ │ └── mod.rs │ │ ├── lib.rs │ │ ├── argument.rs │ │ └── message.rs │ ├── Cargo.toml │ └── testbed.toml ├── nhagent │ ├── README.md │ ├── src │ │ ├── sampler │ │ │ └── mod.rs │ │ ├── lib.rs │ │ ├── argument.rs │ │ ├── ssagent.rs │ │ ├── message.rs │ │ └── timing.rs │ ├── testbed.toml │ └── Cargo.toml ├── litemsg │ ├── src │ │ ├── communicator.rs │ │ ├── command.rs │ │ └── buffer.rs │ └── Cargo.toml ├── logging │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── rat_solver │ └── Cargo.toml ├── nethint │ ├── Cargo.toml │ ├── src │ │ └── runtime_est.rs │ └── tests │ │ └── toy1.rs ├── rl │ ├── Cargo.toml │ ├── src │ │ ├── lib.rs │ │ ├── argument.rs │ │ ├── random_ring.rs │ │ └── topology_aware.rs │ └── testbed.toml └── allreduce │ ├── Cargo.toml │ ├── src │ ├── lib.rs │ ├── argument.rs │ ├── random_ring.rs │ └── topology_aware.rs │ └── testbed.toml ├── README.md ├── .gitignore ├── scripts ├── clippy.sh ├── kill_background_flow.sh ├── testbed │ ├── one-click-configuration.sh │ ├── utils.sh │ ├── attach_vfs.sh │ ├── vfconfig │ │ ├── vf0.xml │ │ ├── vf1.xml │ │ ├── vf2.xml │ │ ├── vf3.xml │ │ ├── vf4.xml │ │ ├── vf5.xml │ │ ├── vf6.xml │ │ └── vf7.xml │ ├── environment │ │ ├── vf.xml.example │ │ ├── meta_config │ │ │ ├── interfaces.conf.sh │ │ │ └── set_rdma_intf.conf.sh │ │ ├── cpu_vm_stage1.sh │ │ └── README.md │ ├── provision_vms.sh │ 
├── migrate_pf.sh │ ├── enable_sriov.sh │ ├── enable_eswitch.sh │ └── setup_ovs.sh ├── testbed-2 │ ├── one-click-restore.sh │ ├── nixos-vm-setup │ │ ├── README.md │ │ ├── clean_all.sh │ │ ├── utils.sh │ │ ├── flake.nix │ │ ├── hardware-configuration.nix │ │ ├── pubkeys.nix │ │ └── bootstrap.sh │ ├── one-click-configuration.sh │ ├── meta_config │ │ └── set_rdma_intf.conf.sh │ ├── utils.sh │ ├── provision_vms.sh │ ├── setup_sflow.sh │ ├── migrate_pf.sh │ ├── setup_ovs.sh │ └── enable_sriov.sh ├── request-response.sh ├── switch_fairness_to.sh ├── recover_bandwidth_setting.sh ├── run_duplicates.sh ├── build_testbed_bins.sh ├── InfoCollectOverHead.hs └── run_duplicates_v2.sh ├── .gitmodules ├── evaluation ├── rl_configs │ ├── run_all.sh │ ├── run_paper.sh │ └── level2probe.toml ├── allreduce_configs │ ├── run_all.sh │ ├── run_paper.sh │ ├── standard3.toml │ ├── standard3_pervm.toml │ ├── standard3_pertenant.toml │ ├── background_off.toml │ ├── standard2.toml │ ├── background_dynamic_strong.toml │ └── background_static_strong.toml ├── model_serving_configs │ ├── run_paper.sh │ └── standard2.toml ├── mapreduce_configs │ ├── run_paper.sh │ └── run_all.sh ├── spectrum │ ├── run_spectrum3.sh │ ├── run_spectrum4.sh │ ├── run_spectrum5.sh │ ├── run_spectrum1.sh │ ├── run_spectrum2.sh │ ├── run_spectrum6.sh │ ├── spectrum4_base.toml │ ├── spectrum3_base.toml │ ├── spectrum1_base.toml │ ├── spectrum2_base.toml │ ├── spectrum5_base.toml │ └── spectrum6_base.toml ├── inaccuracy │ ├── run_inaccuracy2.sh │ ├── run_inaccuracy1.sh │ ├── inaccuracy1_base.toml │ └── inaccuracy2_base.toml ├── sensitivity │ ├── run_sensitivity_oversub.sh │ ├── run_sensitivity_probing_cost2.sh │ ├── run_sensitivity_probing_cost1.sh │ ├── run_sensitivity_rack_size.sh │ ├── sensitivity_probing_cost1_base.toml │ ├── sensitivity_probing_cost1_baseline.toml │ ├── sensitivity_oversub_base.toml │ └── sensitivity_rack_size_base.toml └── herd_behavior │ ├── run_mapreduce.sh │ ├── run_allreduce.sh │ └── 
allreduce_herd_base.toml ├── Cargo.toml └── run_test.sh /src/replayer/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mapreduce/.gitignore: -------------------------------------------------------------------------------- 1 | figure/*.pdf 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NetHint 2 | 3 | To be updated. 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Session.vim 3 | *.code-workspace 4 | -------------------------------------------------------------------------------- /scripts/clippy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | find . 
| grep "\.rs$" | xargs touch ; cargo clippy 4 | -------------------------------------------------------------------------------- /scripts/kill_background_flow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ps aux | grep iperf | awk '{print $2}' | xargs -I {} kill {} 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "nethint-bpf"] 2 | path = nethint-bpf 3 | url = https://github.com/crazyboycjr/nethint-bpf 4 | -------------------------------------------------------------------------------- /src/utils/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(command_access)] 2 | 3 | pub mod cmd_helper; 4 | pub mod fs; 5 | pub mod net; 6 | pub mod algo; 7 | pub mod collector; 8 | -------------------------------------------------------------------------------- /src/nhagent_v2/src/sampler/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod sflow_sampler; 2 | pub use sflow_sampler::SFlowSampler; 3 | 4 | pub mod bpf_sampler; 5 | pub use bpf_sampler::TcSampler; -------------------------------------------------------------------------------- /src/replayer/src/controller/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod app; 2 | pub mod mapreduce; 3 | pub mod allreduce; 4 | pub mod rl; 5 | pub mod plink; 6 | pub mod background_flow; 7 | -------------------------------------------------------------------------------- /src/mapreduce/figure/mapreduce_fattree_16_100g_7.00_m20_r20: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyboycjr/nethint/HEAD/src/mapreduce/figure/mapreduce_fattree_16_100g_7.00_m20_r20 
-------------------------------------------------------------------------------- /src/mapreduce/figure/mapreduce_cdf_fattree_16_100g_7.00_m20_r20: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyboycjr/nethint/HEAD/src/mapreduce/figure/mapreduce_cdf_fattree_16_100g_7.00_m20_r20 -------------------------------------------------------------------------------- /src/nhagent/README.md: -------------------------------------------------------------------------------- 1 | # NetHint Agent 2 | 3 | It samples traffic by peroidically query flow table counters in OpenvSwitch. The collected results from all agents in the cluster are all-gathered and can be queried by each tenant. -------------------------------------------------------------------------------- /scripts/testbed/one-click-configuration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | 5 | sudo ./enable_sriov.sh rdma0 5 6 | sudo ./enable_eswitch.sh rdma0 7 | sudo ./setup_ovs.sh rdma0 4 8 | sudo ./migrate_pf.sh up 9 | sudo mlnx_qos -i rdma0 --prio_tc=0,1,2,3,4,5,6,7 -r 0,0,0,0,0,0,0,0 10 | -------------------------------------------------------------------------------- /src/litemsg/src/communicator.rs: -------------------------------------------------------------------------------- 1 | pub struct Communicator { 2 | my_rank: usize, 3 | nodes: Vec, 4 | peers: Vec, 5 | } 6 | 7 | 8 | impl Communicator { 9 | pub fn new(controller_uri: &str, num_workers: usize) -> Result { 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/nhagent/src/sampler/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod ovs_sampler; 2 | pub mod ss_sampler; 3 | 4 | pub use ss_sampler::SsSampler; 5 | pub use ss_sampler::get_local_ip_table; 6 | 7 | pub use ovs_sampler::OvsSampler; 8 | pub use ovs_sampler::EthAddr; 9 | 
pub use ovs_sampler::get_local_eth_table; 10 | -------------------------------------------------------------------------------- /evaluation/rl_configs/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | for conf in `ls *.toml`; do 6 | echo $conf 7 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -c $conf & 8 | done 9 | 10 | wait 11 | -------------------------------------------------------------------------------- /scripts/testbed-2/one-click-restore.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DIR=`dirname $(realpath $0)` 4 | 5 | sudo "$DIR"/migrate_pf.sh down 6 | sudo ovs-vsctl del-br ovs0 7 | sudo "$DIR"/enable_sriov.sh rdma0 0 8 | 9 | # sudo ip link set ovs-system mtu 1500 # no such device after removing ovs0 10 | sudo ip link set rdma0 mtu 1500 11 | -------------------------------------------------------------------------------- /src/replayer/README.md: -------------------------------------------------------------------------------- 1 | # A distributed traffic pattern replayer. 
2 | 3 | Environment Variables 4 | ``` 5 | RP_CONTROLLER_URI 6 | RP_NUM_WORKER 7 | ``` 8 | 9 | 10 | Use the launcher 11 | ``` 12 | ./rplaunch --controller-ssh 192.168.211.35 --controller-uri 192.168.211.35:9000 --hostfile ~/hostfile --jobname mapreduce 13 | ``` 14 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["src/nethint", "src/logging", "src/mapreduce", "src/allreduce", "src/rl", "src/litemsg", "src/replayer", "src/nhagent_v2", "src/nhagent", "src/utils", "src/rat_solver"] 3 | 4 | [profile.dev] 5 | panic = "unwind" 6 | 7 | [profile.release] 8 | panic = "unwind" 9 | lto = false # too slow for lto = true 10 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/README.md: -------------------------------------------------------------------------------- 1 | Prepare 2 | add nixos channel 3 | install nix-install-tools to the current user profile 4 | nix-channel --add https://nixos.org/channels/nixos-unstable-small nixos 5 | nix-channel --update 6 | 7 | 8 | How to run? 
9 | ``` 10 | # ./bootstrap.sh 11 | ``` 12 | 13 | 14 | This will create 8 nixos VMs; 15 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/clean_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=`dirname $(realpath $0)` 4 | source "$DIR./utils.sh" 5 | 6 | for name in `lsnames 8`; do 7 | sudo virsh vol-delete ${name}.img --pool images 8 | sudo virsh undefine $name 9 | done 10 | 11 | sudo virsh undefine nixosbase 12 | sudo rm /var/lib/libvirt/images/nixos_vm_base.img 13 | -------------------------------------------------------------------------------- /scripts/testbed/utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function lsnames { 4 | host_id=`hostname | cut -d'-' -f2` 5 | [[ -n $1 ]] && num_vms=$1 || num_vms=8 6 | # base: host_id, len: num_vms, offset: i, id: j 7 | for ((i=0;i<$num_vms;i++)); do 8 | j=`expr $host_id \* $num_vms + $i - $num_vms`; 9 | name=cpu${j} 10 | echo $name 11 | done 12 | } 13 | -------------------------------------------------------------------------------- /src/utils/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "utils" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | anyhow = "1.0.38" 12 | lazy_static = "1.4.0" 13 | -------------------------------------------------------------------------------- /src/nhagent_v2/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(command_access)] 2 | 3 | pub mod cluster; 4 | pub mod sampler; 5 | pub mod message; 6 | pub mod communicator; 7 | pub mod argument; 8 | pub mod sdn_controller; 9 | 10 | pub use 
litemsg::Node; 11 | 12 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 13 | pub enum Role { 14 | GlobalLeader, 15 | RackLeader, 16 | } 17 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function lsnames { 4 | host_id=`hostname | cut -d'-' -f2` 5 | [[ -n $1 ]] && num_vms=$1 || num_vms=8 6 | # base: host_id, len: num_vms, offset: i, id: j 7 | for ((i=0;i<$num_vms;i++)); do 8 | j=`expr $host_id \* $num_vms + $i - $num_vms`; 9 | name=nixos${j} 10 | echo $name 11 | done 12 | } 13 | -------------------------------------------------------------------------------- /scripts/testbed-2/one-click-configuration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=`dirname $(realpath $0)` 4 | 5 | sudo "$DIR"/enable_sriov.sh rdma0 1 6 | sudo "$DIR"/setup_ovs.sh rdma0 7 | sudo "$DIR"/migrate_pf.sh up 8 | 9 | sudo ip link set rdma0 mtu 9000 10 | sudo ip link set ovs-system mtu 9000 11 | sudo mlnx_qos -i rdma0 --prio_tc=0,1,2,3,4,5,6,7 -r 0,0,0,0,0,0,0,0 12 | -------------------------------------------------------------------------------- /src/nhagent/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(command_access)] 2 | 3 | pub mod cluster; 4 | pub mod sampler; 5 | pub mod message; 6 | pub mod communicator; 7 | pub mod argument; 8 | pub mod timing; 9 | 10 | pub use litemsg::Node; 11 | 12 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 13 | pub enum Role { 14 | GlobalLeader, 15 | RackLeader, 16 | Worker, 17 | } 18 | -------------------------------------------------------------------------------- /scripts/testbed/attach_vfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $UID -ne 0 ]; then 4 | echo "Please run $0 as root" 5 
| exit 3 6 | fi 7 | 8 | source `dirname $0`/utils.sh 9 | 10 | names=(`lsnames 8`); 11 | for i in {0..7}; do 12 | name=${names[$i]}; 13 | virsh attach-device --domain $name --file /nfs/cjr/Developing/nethint-rs/scripts/testbed/vfconfig/vf${i}.xml 14 | done 15 | -------------------------------------------------------------------------------- /src/logging/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "logging" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | publish = false 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | log = "0.4.11" 12 | env_logger = "0.8.1" 13 | chrono = "0.4.19" 14 | -------------------------------------------------------------------------------- /src/rat_solver/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rat_solver" 3 | version = "0.1.0" 4 | edition = "2018" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | nethint = { path = "../nethint" } 10 | utils = { path = "../utils" } 11 | log = "0.4.14" 12 | lpsolve = "0.1.0" 13 | lpsolve-sys = "5.5.0" 14 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | # enable computation time 6 | sed -i '/^buffer_size/a computation_speed = 0.1' *.toml 7 | 8 | for conf in `ls *.toml`; do 9 | echo $conf 10 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 2 -c $conf & 11 | done 12 | 13 | wait 14 | -------------------------------------------------------------------------------- 
/scripts/testbed/vfconfig/vf0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/environment/vf.xml.example: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /evaluation/model_serving_configs/run_paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | configs=( 6 | standard2.toml 7 | nonnegligible_computing_overhead.toml 8 | coarse_grained_workloads.toml 9 | ) 10 | 11 | for conf in ${configs[@]}; do 12 | echo $conf 13 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 14 | done 15 | 16 | wait 17 | -------------------------------------------------------------------------------- /scripts/testbed/environment/meta_config/interfaces.conf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IP_ADDR_CIDR="$1" 4 | GATEWAY_ADDR="$2" 5 | 6 | if [ $# -ne 2 ]; then 7 | echo "Usage: $0 " 8 | echo "For example: $0 172.16.0.100/24 172.16.0.160" 9 | exit 1 10 | fi 11 | 12 | cat <), 12 | /// send by worker, processed by controller 13 | LeaveNode(Node), 14 | } 15 | -------------------------------------------------------------------------------- /evaluation/rl_configs/run_paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | configs=( 6 | standard2.toml 7 | nonnegligible_computing_overhead.toml 8 | # background_dynamic_strong.toml 9 | # background_off.toml 10 | # background_static_strong.toml 11 | # level2bad.toml 12 | # level2probe.toml 13 | ) 14 | 15 | for conf in ${configs[@]}; do 16 | echo $conf 17 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 18 | done 19 | 20 | wait 21 | -------------------------------------------------------------------------------- /scripts/testbed-2/meta_config/set_rdma_intf.conf.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | RDMA_IP_ADDR_CIDR="$1" 4 | 5 | if [ $# -ne 1 ]; then 6 | echo "Usage: $0 " 7 | echo "For example: $0 1.1.1.1/24" 8 | exit 1 9 | fi 10 | 11 | cat <" 16 | exit 1 17 | fi 18 | 19 | intf=\$1 20 | 21 | sudo ip link set \$intf name rdma0 22 | sudo ip link set rdma0 up 23 | sudo ip addr add ${RDMA_IP_ADDR_CIDR} dev rdma0 24 | sudo ip link set rdma0 mtu 8930 25 | EOF 26 | -------------------------------------------------------------------------------- /src/utils/src/net.rs: -------------------------------------------------------------------------------- 1 | use std::process::Command; 2 | use crate::cmd_helper::get_command_output; 3 | use std::net::Ipv4Addr; 4 | 5 | pub fn get_primary_ipv4(iface: &str) -> anyhow::Result { 6 | let mut cmd = Command::new("ip"); 7 | cmd.arg("addr").arg("show").arg(iface); 8 | let output = get_command_output(cmd).expect("ip addr failed to execute"); 9 | let start = 5 + output.find("inet ").expect("inet not found in the output"); 10 | let len = (&output[start..]).find("/").unwrap(); 11 | Ok((&output[start..start + len]).parse()?) 
12 | } 13 | -------------------------------------------------------------------------------- /scripts/switch_fairness_to.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | check_args() { 4 | [[ $# -eq 1 ]] && [[ $1 = "PerFlowMaxMin" || $1 = "TenantFlowMaxMin" ]] 5 | } 6 | 7 | usage_exit() { 8 | echo "Usage: $0 [fairness] fairness in [PerFlowMaxMin, TenantFlowMaxMin]" && exit 1 9 | } 10 | 11 | check_args $* || usage_exit 12 | 13 | fairness=$1 14 | 15 | echo switching to $fairness 16 | 17 | sed -i 's/^fairness = "\(.*\)"/Fairness = "\1"/' *.toml 18 | sed -i "s/^# fairness = \"${fairness}\"/fairness = \"${fairness}\"/" *.toml 19 | sed -i 's/^Fairness = "\(.*\)"/# fairness = "\1"/' *.toml 20 | -------------------------------------------------------------------------------- /run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) number of tests required"; exit -1; fi 3 | if [ "$#" -gt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 6 | 7 | log_dir=log 8 | mkdir -p $log_dir 9 | 10 | for ((i=0;i<$1;i++)) 11 | do 12 | ./rplaunch --controller-ssh 192.168.211.35 --controller-uri 192.168.211.35:9000 --hostfile hostfiles/$i --jobname mapreduce --config allreduce_tomls/$i.toml 2>&1 | tee $log_dir/$i & 13 | sleep 10 14 | done -------------------------------------------------------------------------------- /src/litemsg/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "litemsg" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | anyhow = "1.0.38" 12 
| thiserror = "1.0.23" 13 | bincode = "1.3.1" 14 | serde = { version = "1.0.123", features = ["derive"] } 15 | logging = { path = "../logging" } 16 | static_assertions = "1.1.0" 17 | lazy_static = "1.4.0" 18 | rand = "0.8.3" 19 | 20 | [dependencies.mio] 21 | version = "0.6.23" 22 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | inputs.nixpkgs.url = "github:nixos/nixpkgs?rev=fb45fa64ae3460d6bd2701ab5a6c4512d781f166"; 3 | outputs = { self, nixpkgs }: { 4 | nixosConfigurations.nixos = nixpkgs.lib.nixosSystem { 5 | system = "x86_64-linux"; 6 | modules = [ 7 | ( 8 | { pkgs, ... }: { 9 | nix.registry.nixpkgs = { 10 | from = { type = "indirect"; id = "nixpkgs"; }; 11 | flake = nixpkgs; 12 | }; 13 | } 14 | ) 15 | ./configuration.nix 16 | ]; 17 | }; 18 | }; 19 | } 20 | -------------------------------------------------------------------------------- /src/utils/src/collector.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | use crate::fs::append_to_file; 3 | 4 | #[derive(Debug, Clone, Default)] 5 | pub struct OverheadCollector { 6 | // controller overhead, job scale 7 | data: Vec<(Duration, usize)>, 8 | } 9 | 10 | impl OverheadCollector { 11 | pub fn collect(&mut self, duration: Duration, scale: usize) { 12 | self.data.push((duration, scale)); 13 | 14 | if let Ok(path) = std::env::var("NETHINT_COLLECT_CONTROLLER_OVERHEAD") { 15 | append_to_file(path, &format!("{} {}", scale, duration.as_nanos())); 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /scripts/testbed/environment/meta_config/set_rdma_intf.conf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RDMA_IP_ADDR_CIDR="$1" 4 | 5 | if [ $# -ne 1 ]; then 6 | echo "Usage: $0 " 
7 | echo "For example: $0 1.1.1.1/24" 8 | exit 1 9 | fi 10 | 11 | cat <" 16 | exit 1 17 | fi 18 | 19 | intf=\$1 20 | 21 | sudo ip link set \$intf name rdma0 22 | sudo ip link set rdma0 up 23 | sudo ip addr add ${RDMA_IP_ADDR_CIDR} dev rdma0 24 | sudo ip link set rdma0 mtu 1430 25 | echo 106 | sudo tee /sys/class/infiniband/mlx5_0/tc/1/traffic_class 26 | EOF 27 | -------------------------------------------------------------------------------- /scripts/testbed-2/utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function lsnames { 4 | host_id=`hostname | cut -d'-' -f2` 5 | [[ -n $1 ]] && num_vms=$1 || num_vms=8 6 | # base: host_id, len: num_vms, offset: i, id: j 7 | for ((i=0;i<$num_vms;i++)); do 8 | j=`expr $host_id \* $num_vms + $i - $num_vms`; 9 | name=cpu${j} 10 | echo $name 11 | done 12 | } 13 | 14 | function get_rack_agent_ip { 15 | # my_ip=`ip a show rdma0 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1` 16 | my_id=`hostname | cut -d'-' -f2` 17 | if [ $my_id -le 3 ]; then 18 | echo "192.168.211.2" 19 | else 20 | echo "192.168.211.130" 21 | fi 22 | } 23 | -------------------------------------------------------------------------------- /evaluation/mapreduce_configs/run_paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | # enable computation time 6 | # sed -i '/^num_reduce/a enable_computation_time = true' *.toml 7 | 8 | configs=( 9 | standard_hybrid2.toml 10 | casestudy1.toml 11 | fallback.toml 12 | # background_dynamic_strong.toml 13 | # background_off.toml 14 | # background_static_strong.toml 15 | # level2probe.toml 16 | ) 17 | 18 | for conf in ${configs[@]}; do 19 | echo $conf 20 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin mapreduce_experiment --release -- -P 5 -c $conf & 21 | done 22 | 23 | wait 24 | 
-------------------------------------------------------------------------------- /src/replayer/src/lib.rs: -------------------------------------------------------------------------------- 1 | use nethint::Token; 2 | use serde::{Deserialize, Serialize}; 3 | 4 | pub mod message; 5 | pub mod controller; 6 | pub mod worker; 7 | 8 | pub use litemsg::Node; 9 | 10 | #[derive(Debug, Clone, Serialize, Deserialize)] 11 | pub struct Flow { 12 | pub bytes: usize, 13 | pub src: Node, 14 | pub dst: Node, 15 | pub token: Option, 16 | } 17 | 18 | impl Flow { 19 | pub fn new(bytes: usize, src: Node, dst: Node, token: Option) -> Self { 20 | Flow { 21 | bytes, 22 | src, 23 | dst, 24 | token, 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | auto_tunes=( 6 | 10 7 | 20 8 | 40 9 | 80 10 | 160 11 | 320 12 | 640 13 | 1280 14 | ) 15 | 16 | cnt=0 17 | for f in ${auto_tunes[@]}; do 18 | echo $f 19 | conf=spectrum3_$f.toml 20 | cp spectrum3_base.toml $conf 21 | sed -i "s/^auto_tune = 10/auto_tune = $f/" $conf 22 | sed -i "s/spectrum3_1/spectrum3_$f/" $conf 23 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 24 | cnt=`expr $cnt + 5` # 5 threads 25 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 26 | done 27 | 28 | wait 29 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | auto_tunes=( 6 | 1 7 | 2 8 | 4 9 | 8 10 | 16 11 | 32 12 | 64 13 | 128 14 | ) 15 | 16 | cnt=0 17 | for f in ${auto_tunes[@]}; do 18 | echo $f 19 | 
conf=spectrum4_$f.toml 20 | cp spectrum4_base.toml $conf 21 | sed -i "s/^auto_tune = 10/auto_tune = $f/" $conf 22 | sed -i "s/spectrum4_1/spectrum4_$f/" $conf 23 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 24 | cnt=`expr $cnt + 5` # 5 threads 25 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 26 | done 27 | 28 | wait 29 | -------------------------------------------------------------------------------- /src/replayer/src/vm_ip_addrs.in: -------------------------------------------------------------------------------- 1 | vec![ 2 | "192.168.211.3", 3 | "192.168.211.4", 4 | "192.168.211.5", 5 | "192.168.211.6", 6 | "192.168.211.35", 7 | "192.168.211.36", 8 | "192.168.211.37", 9 | "192.168.211.38", 10 | "192.168.211.67", 11 | "192.168.211.68", 12 | "192.168.211.69", 13 | "192.168.211.70", 14 | "192.168.211.131", 15 | "192.168.211.132", 16 | "192.168.211.133", 17 | "192.168.211.134", 18 | "192.168.211.163", 19 | "192.168.211.164", 20 | "192.168.211.165", 21 | "192.168.211.166", 22 | "192.168.211.195", 23 | "192.168.211.196", 24 | "192.168.211.197", 25 | "192.168.211.198", 26 | ] -------------------------------------------------------------------------------- /evaluation/inaccuracy/run_inaccuracy2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | inaccuracies=( 6 | 0.0 7 | 0.1 8 | 0.2 9 | 0.3 10 | 0.4 11 | 0.5 12 | 0.6 13 | 0.7 14 | 0.8 15 | 0.9 16 | ) 17 | 18 | cnt=0 19 | for f in ${inaccuracies[@]}; do 20 | echo $f 21 | conf=inaccuracy2_$f.toml 22 | cp inaccuracy2_base.toml $conf 23 | sed -i "s/^inaccuracy = 0.1/inaccuracy = $f/" $conf 24 | sed -i "s/inaccuracy2_base/inaccuracy2_$f/" $conf 25 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 26 | cnt=`expr $cnt + 5` # 5 threads 27 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 
5`; } 28 | done 29 | 30 | wait 31 | -------------------------------------------------------------------------------- /evaluation/inaccuracy/run_inaccuracy1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | inaccuracies=( 6 | 0.0 7 | 0.1 8 | 0.2 9 | 0.3 10 | 0.4 11 | 0.5 12 | 0.6 13 | 0.7 14 | 0.8 15 | 0.9 16 | ) 17 | 18 | cnt=0 19 | for f in ${inaccuracies[@]}; do 20 | echo $f 21 | conf=inaccuracy1_$f.toml 22 | cp inaccuracy1_base.toml $conf 23 | sed -i "s/^inaccuracy = 0.1/inaccuracy = $f/" $conf 24 | sed -i "s/inaccuracy1_base/inaccuracy1_$f/" $conf 25 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 26 | cnt=`expr $cnt + 5` # 5 threads 27 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 28 | done 29 | 30 | wait 31 | -------------------------------------------------------------------------------- /scripts/testbed-2/provision_vms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $UID -ne 0 ]; then 4 | echo "Please run $0 as root" 5 | exit 3 6 | fi 7 | 8 | source `dirname $0`/utils.sh 9 | 10 | # cpubase m4.xlarge 11 | virt-install --virt-type kvm --vcpus 8 --name cpubase --ram 16384 --boot hd --disk /var/lib/libvirt/images/cpu_vm_base.img,format=raw --network network=default --nographic --os-type=linux --os-variant=ubuntu20.04 --noreboot --import 12 | 13 | 14 | for name in `lsnames 4`; do 15 | virsh vol-delete ${name}.img --pool images 16 | virt-clone --replace --original cpubase --name $name --file /var/lib/libvirt/images/${name}.img 17 | virt-sysprep --domain $name --enable customize,dhcp-client-state,machine-id --hostname $name 18 | done 19 | -------------------------------------------------------------------------------- /scripts/testbed/provision_vms.sh: 
#!/bin/bash
# Configure sFlow sampling on the local Open vSwitch bridge so that a
# remote collector (the NetHint rack agent) receives packet samples and
# counter polls. Must run as root because ovs-vsctl mutates OVS state.

if [ $USER != "root" ]; then
echo "Please run this script as root"
exit 1
fi

# Optional first argument: the collector's IP. Defaults to localhost
# (collector running on the same host).
if [ $# -eq 1 ]; then
COLLECTOR_IP=$1
else
COLLECTOR_IP=127.0.0.1
fi

COLLECTOR_PORT=6343
# NOTE(review): despite the name, this is an interface NAME, not an IP —
# the OVS sFlow "agent" option takes the netdev whose address stamps the
# sFlow datagrams. Consider renaming to AGENT_IFACE.
AGENT_IP=enp24s0v0
HEADER_BYTES=128      # bytes of each sampled packet header to export
SAMPLING_N=10000      # sample 1 out of every N packets
POLLING_SECS=10       # interface-counter polling interval
BRIDGE=ovs0

# Create the sflow record and attach it to the bridge in one transaction.
# The escaped quotes around target are required by the OVSDB string syntax.
ovs-vsctl -- --id=@sflow create sflow agent=${AGENT_IP} \
target="\"${COLLECTOR_IP}:${COLLECTOR_PORT}\"" header=${HEADER_BYTES} \
sampling=${SAMPLING_N} polling=${POLLING_SECS} \
-- set bridge ${BRIDGE} sflow=@sflow

# Show the resulting configuration for a quick sanity check.
ovs-vsctl list sflow

# to remove, use ovs-vsctl remove bridge $BRIDGE sflow
num_workers=`expr 6 \* $scale` 17 | num_racks=`expr 2 \* $scale` 18 | 19 | for ((i=0; i<$scale; i++)); do 20 | sampler_port=`expr 5555 + $i` 21 | 22 | RUST_BACKTRACE=full \ 23 | NH_CONTROLLER_URI=192.168.211.2:9000 \ 24 | NH_NUM_WORKER=$num_workers \ 25 | target/release/nhagent \ 26 | --shadow-id $i \ 27 | -p $sampler_port \ 28 | -i 100 \ 29 | -b 800000000000000:1:5:0.1 \ 30 | arbitrary $num_racks 3 10 10 \ 31 | & 32 | # --disable-v2 \ 33 | done 34 | 35 | wait 36 | -------------------------------------------------------------------------------- /src/nhagent_v2/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nhagent_v2" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | anyhow = "1.0.38" 12 | logging = { path = "../logging" } 13 | nethint = { path = "../nethint" } 14 | litemsg = { path = "../litemsg" } 15 | utils = { path = "../utils" } 16 | thiserror = "1.0.23" 17 | lazy_static = "1.4.0" 18 | bincode = "1.3.1" 19 | serde = { version = "1.0.123", features = ["derive"] } 20 | structopt = "0.3.21" 21 | sflow = "0.0.1" 22 | etherparse = "0.9.0" 23 | 24 | [dependencies.mio] 25 | version = "0.6.23" 26 | 27 | [[bin]] 28 | name = "nhagent_v2" 29 | path = "src/main.rs" 30 | -------------------------------------------------------------------------------- /src/nhagent/testbed.toml: -------------------------------------------------------------------------------- 1 | sample_interval_ns = 100_000_000 # 100ms 2 | 3 | max_slots = 4 4 | 5 | background_flow_hard = { enable = true, frequency_ns = 100_000_000_000, probability = 0.9, amplitude = 5 } 6 | 7 | # The topology for simulation 8 | [brain.topology] 9 | type = "Arbitrary" # another possible value is "FatTree" 10 | 11 | # [brain.topology.args] # When type = "FatTree" 12 | # nports = 
20 # the number of ports of a switch 13 | # bandwidth = 100 # in Gbps 14 | # oversub_ratio = 4.0 # oversubscription ratio 15 | 16 | [brain.topology.args] # When type = "Arbitrary" 17 | nracks = 2 # the number of racks 18 | rack_size = 3 # the number of hosts under a rack 19 | host_bw = 10 # bandwidth of a host, in Gbps 20 | rack_bw = 10 # bandwidth of a ToR switch, in Gbps -------------------------------------------------------------------------------- /src/nhagent_v2/testbed.toml: -------------------------------------------------------------------------------- 1 | sample_interval_ns = 100_000_000 # 100ms 2 | 3 | max_slots = 4 4 | 5 | background_flow_hard = { enable = true, frequency_ns = 100_000_000_000, probability = 0.9, amplitude = 5 } 6 | 7 | # The topology for simulation 8 | [brain.topology] 9 | type = "Arbitrary" # another possible value is "FatTree" 10 | 11 | # [brain.topology.args] # When type = "FatTree" 12 | # nports = 20 # the number of ports of a switch 13 | # bandwidth = 100 # in Gbps 14 | # oversub_ratio = 4.0 # oversubscription ratio 15 | 16 | [brain.topology.args] # When type = "Arbitrary" 17 | nracks = 2 # the number of racks 18 | rack_size = 3 # the number of hosts under a rack 19 | host_bw = 10 # bandwidth of a host, in Gbps 20 | rack_bw = 10 # bandwidth of a ToR switch, in Gbps -------------------------------------------------------------------------------- /src/nhagent/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nhagent" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | anyhow = "1.0.38" 12 | logging = { path = "../logging" } 13 | nethint = { path = "../nethint" } 14 | litemsg = { path = "../litemsg" } 15 | utils = { path = "../utils" } 16 | thiserror = "1.0.23" 17 | lazy_static = "1.4.0" 18 | 
use std::io::Write;
use std::path::Path;
use std::sync::Mutex;

// Serializes concurrent appends from threads within this process so that
// interleaved `writeln!` calls cannot mix their output.
// `Mutex::new` has been const-stable since Rust 1.63, so the previous
// `lazy_static!` wrapper (and the crate dependency for this static) is
// no longer needed.
static FILE_MUTEX: Mutex<()> = Mutex::new(());

/// Opens `path` in append mode, creating the file first if it does not exist.
///
/// # Panics
///
/// Panics if the file cannot be opened or created (e.g. a missing parent
/// directory or insufficient permissions).
pub fn open_with_create_append<P: AsRef<Path>>(path: P) -> std::fs::File {
    std::fs::OpenOptions::new()
        .append(true)
        .create(true)
        .open(&path)
        .unwrap_or_else(|e| panic!("fail to open or create {:?}: {}", path.as_ref(), e))
}

/// Appends `content` plus a trailing newline to `filename`, serializing
/// concurrent callers in this process through `FILE_MUTEX`.
///
/// The original explicit `seek(SeekFrom::End(0))` has been removed: a file
/// opened with `.append(true)` (O_APPEND) positions every write at the end
/// of the file automatically, so the seek was redundant.
///
/// # Panics
///
/// Panics if the file cannot be opened or the write fails, or if the
/// mutex was poisoned by a panicking writer.
pub fn append_to_file<P: AsRef<Path>>(filename: P, content: &str) {
    let _file_mutex = FILE_MUTEX.lock().unwrap();

    let mut f = open_with_create_append(filename);
    writeln!(f, "{}", content).unwrap();
}
sensitivity_probing_cost2_baseline.toml & 6 | 7 | round_mses=( 8 | 1 9 | 10 10 | 25 11 | 50 12 | 75 13 | 100 14 | ) 15 | 16 | cnt=5 17 | for f in ${round_mses[@]}; do 18 | echo round_ms: $f 19 | conf=sensitivity_probing_cost2_$f.toml 20 | cp sensitivity_probing_cost2_base.toml $conf 21 | sed -i "s/round_ms = 100/round_ms = $f/" $conf 22 | sed -i "s/sensitivity_probing_cost2_base/sensitivity_probing_cost2_$f/" $conf 23 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin mapreduce_experiment --release -- -P 5 -c $conf & 24 | cnt=`expr $cnt + 5` # 5 threads 25 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 26 | done 27 | 28 | wait 29 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | background_flow_freqs=( 6 | 25_000_000 7 | 50_000_000 8 | 1_00_000_000 9 | 2_00_000_000 10 | 4_00_000_000 11 | 8_00_000_000 12 | 1_600_000_000 13 | 3_200_000_000 14 | 6_400_000_000 15 | 12_800_000_000 16 | 25_600_000_000 17 | 51_200_000_000 18 | 102_400_000_000 19 | ) 20 | 21 | cnt=0 22 | for f in ${background_flow_freqs[@]}; do 23 | echo $f 24 | conf=spectrum5_$f.toml 25 | cp spectrum5_base.toml $conf 26 | sed -i "s/\(.*\)frequency_ns = 2_00_000_000\(.*\)/\1frequency_ns = $f\2/" $conf 27 | sed -i "s/spectrum5_1/spectrum5_$f/" $conf 28 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 29 | cnt=`expr $cnt + 5` # 5 threads 30 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 31 | done 32 | 33 | wait 34 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP 
EXIT 4 | 5 | background_flow_freqs=( 6 | 25_000_000 7 | 50_000_000 8 | 1_00_000_000 9 | 2_00_000_000 10 | 4_00_000_000 11 | 8_00_000_000 12 | 1_600_000_000 13 | 3_200_000_000 14 | 6_400_000_000 15 | 12_800_000_000 16 | 25_600_000_000 17 | 51_200_000_000 18 | 102_400_000_000 19 | ) 20 | 21 | cnt=0 22 | for f in ${background_flow_freqs[@]}; do 23 | echo $f 24 | conf=spectrum1_$f.toml 25 | cp spectrum1_base.toml $conf 26 | sed -i "s/\(.*\)frequency_ns = 2_00_000_000\(.*\)/\1frequency_ns = $f\2/" $conf 27 | sed -i "s/spectrum1_1/spectrum1_$f/" $conf 28 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 29 | cnt=`expr $cnt + 5` # 5 threads 30 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 31 | done 32 | 33 | wait 34 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | background_flow_freqs=( 6 | 25_000_000 7 | 50_000_000 8 | 1_00_000_000 9 | 2_00_000_000 10 | 4_00_000_000 11 | 8_00_000_000 12 | 1_600_000_000 13 | 3_200_000_000 14 | 6_400_000_000 15 | 12_800_000_000 16 | 25_600_000_000 17 | 51_200_000_000 18 | 102_400_000_000 19 | ) 20 | 21 | cnt=0 22 | for f in ${background_flow_freqs[@]}; do 23 | echo $f 24 | conf=spectrum2_$f.toml 25 | cp spectrum2_base.toml $conf 26 | sed -i "s/\(.*\)frequency_ns = 2_00_000_000\(.*\)/\1frequency_ns = $f\2/" $conf 27 | sed -i "s/spectrum2_1/spectrum2_$f/" $conf 28 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 29 | cnt=`expr $cnt + 5` # 5 threads 30 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 31 | done 32 | 33 | wait 34 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum6.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | background_flow_freqs=( 6 | 25_000_000 7 | 50_000_000 8 | 1_00_000_000 9 | 2_00_000_000 10 | 4_00_000_000 11 | 8_00_000_000 12 | 1_600_000_000 13 | 3_200_000_000 14 | 6_400_000_000 15 | 12_800_000_000 16 | 25_600_000_000 17 | 51_200_000_000 18 | 102_400_000_000 19 | ) 20 | 21 | cnt=0 22 | for f in ${background_flow_freqs[@]}; do 23 | echo $f 24 | conf=spectrum6_$f.toml 25 | cp spectrum6_base.toml $conf 26 | sed -i "s/\(.*\)frequency_ns = 2_00_000_000\(.*\)/\1frequency_ns = $f\2/" $conf 27 | sed -i "s/spectrum6_1/spectrum6_$f/" $conf 28 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin mapreduce_experiment --release -- -P 5 -c $conf & 29 | cnt=`expr $cnt + 5` # 5 threads 30 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 31 | done 32 | 33 | wait 34 | -------------------------------------------------------------------------------- /src/allreduce/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "allreduce" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | publish = false 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | log = "0.4.11" 12 | rand = "0.8.3" 13 | gnuplot = "0.0.37" 14 | nethint = { path = "../nethint" } 15 | logging = { path = "../logging" } 16 | mapreduce = { path = "../mapreduce" } 17 | utils = { path = "../utils" } 18 | structopt = "0.3.21" 19 | lpsolve = "0.1.0" 20 | lpsolve-sys = "5.5.0" 21 | rand_distr = "0.4.0" 22 | toml = "0.5.8" 23 | serde = { version = "1.0.122", features = ["derive"] } 24 | lazy_static = "1.4.0" 25 | rayon = "1.5.0" 26 | rat_solver = { path = "../rat_solver" } 27 | indicatif = "0.17.0-rc.4" 28 | 29 | [[bin]] 30 | name = "allreduce_experiment" 31 | path = 
#!/bin/bash
# Herd-behavior experiment: launch mapreduce_experiment for several counts
# of overlapped jobs, scaling time_scale down as the job count grows so the
# total offered load stays comparable across runs.

# Kill the whole process group on exit/interrupt so backgrounded cargo runs
# do not outlive this script.
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT

overlapped_jobs=(
5
10
20
30
)

time_scales=(
1
0.5
0.2
0.1
)

# time_scales=(
# 1
# 0.1
# 0.05
# 0.01
# )

idx=0
# FIX: cnt was used below without initialization; `expr '' + 5` fails and
# cnt stayed empty, so the nproc-based throttle never triggered. Every
# sibling evaluation script initializes cnt before the loop.
cnt=0
for f in ${overlapped_jobs[@]}; do
echo overlapped_jobs: $f
conf=mapreduce_herd_$f.toml
# Derive a per-run config from the base template.
cp mapreduce_herd_base.toml $conf
time_scale=${time_scales[$idx]}
echo time_scale: $time_scale
sed -i "s/^time_scale = 1/time_scale = ${time_scale}/" $conf
sed -i "s/mapreduce_herd_base/mapreduce_herd_$f/" $conf
RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin mapreduce_experiment --release -- -P 5 -c $conf &
idx=`expr $idx + 1`
cnt=`expr $cnt + 5` # 5 threads
# Throttle: once the estimated thread count reaches the core count, wait
# for the outstanding runs before launching more.
[[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; }
done

wait
$DIR/../nethint-bpf 14 | 15 | nix develop -c cargo build 16 | nix develop -c cargo build --release 17 | 18 | # build BPF program and its userspace program 19 | # DIR=$(dirname `realpath $0`) 20 | # nix develop $DIR/../nethint-bpf -c cargo build 21 | # nix develop $DIR/../nethint-bpf -c cargo build --release 22 | # for sec in {.BTF,.eh_frame,.text,.BTF.ext}; do 23 | # nix develop $DIR/../nethint-bpf -c \ 24 | # llvm-strip --strip-unneeded --remove-section ${sec} \ 25 | # /nfs/cjr/Developing/nethint-bpf/target/debug/build/nethint-userspace-0abfae651d38dbe6/out/target/bpf/programs/nethint/nethint.elf 26 | # done 27 | -------------------------------------------------------------------------------- /evaluation/sensitivity/run_sensitivity_probing_cost1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c sensitivity_probing_cost1_baseline.toml & 6 | 7 | round_mses=( 8 | 1 9 | 10 10 | 25 11 | # 50 12 | # 75 13 | 100 14 | ) 15 | 16 | cnt=5 17 | for f in ${round_mses[@]}; do 18 | echo round_ms: $f 19 | conf=sensitivity_probing_cost1_$f.toml 20 | cp sensitivity_probing_cost1_base.toml $conf 21 | auto_tune=`python3 -c "print('{:.0f}'.format(${f} * 10))"` 22 | echo auto_tune: $auto_tune 23 | sed -i "s/round_ms = 100/round_ms = $f/" $conf 24 | sed -i "s/auto_tune = 1000/auto_tune = ${auto_tune}/" $conf 25 | sed -i "s/sensitivity_probing_cost1_base/sensitivity_probing_cost1_$f/" $conf 26 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 27 | cnt=`expr $cnt + 5` # 5 threads 28 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 29 | done 30 | 31 | wait 32 | -------------------------------------------------------------------------------- /scripts/testbed-2/setup_ovs.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 1 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | 17 | # Create an OVS bridge (here it's named ovs-sriov). 18 | ovs-vsctl add-br ovs0 19 | 20 | # check the result 21 | ovs-vsctl get Open_vSwitch . other_config 22 | ovs-vsctl get Open_vSwitch . dpdk_initialized 23 | 24 | # Restart the openvswitch service. This step is required for HW offload changes to take effect. 25 | systemctl restart openvswitch-switch.service 26 | 27 | # Make sure to bring up the PF and representor netdevices. 28 | ovs-vsctl add-port ovs0 $pf 29 | 30 | # show something 31 | ovs-vsctl list-ports ovs0 32 | ovs-dpctl show 33 | 34 | # add sflow configuration 35 | DIR=$(dirname `realpath $0`) 36 | source "$DIR"/utils.sh 37 | rack_agent_ip=`get_rack_agent_ip` 38 | echo 'rack agent IP: ' $rack_agent_ip 39 | # do not setup sFlow, use BPF agent instead 40 | # ./setup_sflow.sh $rack_agent_ip 41 | -------------------------------------------------------------------------------- /src/allreduce/src/lib.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | pub mod argument; 4 | 5 | pub mod app; 6 | 7 | pub mod random_ring; 8 | 9 | pub mod topology_aware; 10 | 11 | pub mod rat; 12 | 13 | pub mod config; 14 | 15 | use nethint::{cluster::Topology, Flow}; 16 | use std::rc::Rc; 17 | 18 | #[derive(Debug, Clone)] 19 | pub struct JobSpec { 20 | pub num_workers: usize, 21 | pub buffer_size: usize, 22 | pub num_iterations: usize, 23 | } 24 | 25 | impl JobSpec { 26 | pub fn new(num_workers: usize, buffer_size: usize, num_iterations: usize) -> Self { 27 | JobSpec { 28 | num_workers, 29 | buffer_size, 30 | num_iterations, 31 | } 32 | } 33 | } 34 | 35 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, 
#!/bin/bash
# Herd-behavior experiment: launch allreduce_experiment for several counts
# of overlapped jobs, scaling rack_size and rack_bw together so the
# oversubscription ratio stays comparable across runs.

# Kill the whole process group on exit/interrupt so backgrounded cargo runs
# do not outlive this script.
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT

overlapped_jobs=(
5
10
20
30
)

rack_sizes=(
7
15
30
40
)

rack_bws=(
233
500
1000
1333
)

idx=0
# FIX: cnt was used below without initialization; `expr '' + 5` fails and
# cnt stayed empty, so the nproc-based throttle never triggered. Every
# sibling evaluation script initializes cnt before the loop.
cnt=0
for n in ${overlapped_jobs[@]}; do
echo n: $n
rack_size=${rack_sizes[$idx]}
rack_bw=${rack_bws[$idx]}
echo rack_size: $rack_size
echo rack_bw: $rack_bw
conf=allreduce_herd_$n.toml
# Derive a per-run config from the base template.
cp allreduce_herd_base.toml $conf
sed -i "s/^ncases = 40/ncases = ${n}/" $conf
sed -i "s/^rack_size = 40/rack_size = ${rack_size}/" $conf
sed -i "s/^rack_bw = 1333/rack_bw = ${rack_bw}/" $conf
sed -i "s/allreduce_herd_base/allreduce_herd_$n/" $conf
RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf &
idx=`expr $idx + 1`
cnt=`expr $cnt + 5` # 5 threads
# Throttle: once the estimated thread count reaches the core count, wait
# for the outstanding runs before launching more.
[[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; }
done

wait
threads 27 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 28 | done 29 | 30 | wait 31 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/pubkeys.nix: -------------------------------------------------------------------------------- 1 | [ 2 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAMvo/AjpONh9/Y4MEwSyygyucngxsAVuZwUDEt6fk3m root@danyang-01" 3 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMbY3tVYI5q/vxab4YcIejCNUd58Azp4Bv7bT0RgPATX root@danyang-02" 4 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILPFsLGA+mBhA7s+aIEt3q2if8QpkcQ542cvIjA5XKpl root@danyang-03" 5 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM22OX0cdXpl9ItXHqsfwdA+hJr0GNcgpij8R7fXKGXa root@danyang-04" 6 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM/GEWc20b7noIOP9de+tnFHGA6pFxxa69E/s//wAdDk root@danyang-05" 7 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAu83mlLBROmGdMcbWo97ssMugGzM8Mp1bs1UZFo+xPz root@danyang-06" 8 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGhs6gFH2lE9YUk2KDFQ7XyYr6MgVQwG8DHm1M/w/hCq cjr@danyang-01" 9 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEFXTx3dU1by85bCjrlIdYmazlvCEOCc3Rx8Bg+pEe5I cjr@danyang-02" 10 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJjiy+/hk2tKBHxMzHeqzhFps+T5AVHQ2nyxOltD5VdJ cjr@danyang-03" 11 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPLNdTWXsb/M8sCwHpdzbLTMojfKZBlehzliSq1wP+rd cjr@danyang-04" 12 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJfKFXbfE4W8wm6n+Sfdcwdo8wXoARpde/8BSGemGUNy cjr@danyang-05" 13 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAVG2U5Qo9Ac3nBdah4iCbhZwlPEh5jZtsduCVIkpA67 cjr@danyang-06" 14 | ] 15 | -------------------------------------------------------------------------------- /src/replayer/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "replayer" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log 
= "0.4.14" 11 | anyhow = "1.0.38" 12 | thiserror = "1.0.23" 13 | bincode = "1.3.1" 14 | serde = { version = "1.0.123", features = ["derive"] } 15 | logging = { path = "../logging" } 16 | nethint = { path = "../nethint" } 17 | litemsg = { path = "../litemsg" } 18 | mapreduce = { path = "../mapreduce" } 19 | allreduce = { path = "../allreduce" } 20 | rl = { path = "../rl" } 21 | nhagent_v2 = { path = "../nhagent_v2" } 22 | utils = { path = "../utils" } 23 | structopt = "0.3.21" 24 | nix = "0.19.1" 25 | serde_bytes = "0.11.5" 26 | num_cpus = "1.13.0" 27 | rand = "0.8.3" 28 | rand_distr = "0.4.0" 29 | toml = "0.5.8" 30 | zipf = "7.0.0" 31 | lazy_static = "1.4.0" 32 | sha2 = "0.9.3" 33 | crossbeam = "0.8.0" 34 | 35 | [dependencies.mio] 36 | version = "0.6.23" 37 | 38 | 39 | [[bin]] 40 | name = "controller" 41 | path = "src/controller/main.rs" 42 | 43 | [[bin]] 44 | name = "worker" 45 | path = "src/worker/main.rs" 46 | 47 | [[bin]] 48 | name = "rplaunch" 49 | path = "src/launcher.rs" 50 | 51 | [[bin]] 52 | name = "scheduler" 53 | path = "src/scheduler.rs" 54 | -------------------------------------------------------------------------------- /src/nhagent/src/argument.rs: -------------------------------------------------------------------------------- 1 | use nethint::architecture::TopoArgs; 2 | use nethint::background_flow_hard::BackgroundFlowHard; 3 | use structopt::StructOpt; 4 | 5 | #[derive(Debug, Clone, StructOpt)] 6 | #[structopt(name = "nhagent", about = "NetHint Agent")] 7 | pub struct Opts { 8 | /// The working interval of agent in millisecond 9 | #[structopt(short = "i", long = "interval", default_value = "100")] 10 | pub interval_ms: u64, 11 | 12 | /// The listening port of the sampler 13 | #[structopt(short = "p", long = "p", default_value = "5555")] 14 | pub sampler_listen_port: u16, 15 | 16 | /// Specify the topology for testbed 17 | #[structopt(subcommand)] 18 | pub topo: TopoArgs, 19 | 20 | /// Background flow parameter by enforcing rate limit, the 21 | /// 
format is freq:prob:amp[:avg_load] 22 | #[structopt(short, long, default_value)] 23 | pub background_flow_hard: BackgroundFlowHard, 24 | 25 | /// When specified, it represents the number of the duplicated agent. 26 | /// This option is only used to measure the system overhead by running 27 | /// multiple nhagents on the same servers. 28 | #[structopt(short, long)] 29 | pub shadow_id: Option, 30 | 31 | /// Disable HetHint v2, and only run NetHint v1. 32 | #[structopt(short, long)] 33 | pub disable_v2: bool, 34 | } 35 | -------------------------------------------------------------------------------- /src/rl/src/lib.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | pub mod argument; 4 | 5 | pub mod app; 6 | 7 | pub mod contraction; 8 | pub mod random_ring; 9 | pub mod rat; 10 | pub mod topology_aware; 11 | 12 | pub mod config; 13 | 14 | use std::rc::Rc; 15 | use nethint::{cluster::Topology, Flow}; 16 | 17 | #[derive(Debug, Clone)] 18 | pub struct JobSpec { 19 | pub num_workers: usize, 20 | pub buffer_size: usize, 21 | pub num_iterations: usize, 22 | pub root_index: usize, 23 | } 24 | 25 | impl JobSpec { 26 | pub fn new( 27 | num_workers: usize, 28 | buffer_size: usize, 29 | num_iterations: usize, 30 | root_index: usize, 31 | ) -> Self { 32 | JobSpec { 33 | num_workers, 34 | buffer_size, 35 | num_iterations, 36 | root_index, 37 | } 38 | } 39 | } 40 | 41 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] 42 | pub enum RLPolicy { 43 | Random, 44 | TopologyAware, 45 | Contraction, 46 | /// Resilient Aggregation Tree 47 | RAT, 48 | } 49 | 50 | pub trait RLAlgorithm { 51 | fn run_rl_traffic( 52 | &mut self, 53 | root_index: usize, 54 | group: Option>, // worker group 55 | size: u64, 56 | vcluster: Rc, 57 | ) -> Vec; 58 | } 59 | -------------------------------------------------------------------------------- /src/replayer/src/controller/app.rs: 
-------------------------------------------------------------------------------- 1 | use crate::Node; 2 | use litemsg::endpoint::Endpoint; 3 | use std::collections::HashMap; 4 | use nethint::hint::NetHintVersion; 5 | use nethint::TenantId; 6 | use crate::message; 7 | // use nhagent::timing::{self, TimeList}; 8 | 9 | pub trait Application { 10 | fn workers(&self) -> &HashMap; 11 | 12 | fn workers_mut(&mut self) -> &mut HashMap; 13 | 14 | fn brain(&self) -> &Endpoint; 15 | 16 | fn brain_mut(&mut self) -> &mut Endpoint; 17 | 18 | fn tenant_id(&self) -> TenantId; 19 | 20 | fn hostname_to_node(&self) -> &HashMap; 21 | 22 | fn request_nethint(&mut self, version: NetHintVersion) -> anyhow::Result<()> { 23 | // let mut time_list = TimeList::new(); 24 | // time_list.push_now(timing::ON_TENANT_SENT_REQ); 25 | // let msg = nhagent::message::Message::NetHintRequest(self.tenant_id(), version, time_list); 26 | let msg = nhagent_v2::message::Message::NetHintRequest(self.tenant_id(), version); 27 | self.brain_mut().post(msg, None)?; 28 | Ok(()) 29 | } 30 | 31 | fn start(&mut self) -> anyhow::Result<()>; 32 | 33 | fn on_event(&mut self, cmd: message::Command) -> anyhow::Result; 34 | 35 | fn finish(&mut self) -> anyhow::Result<()> { 36 | for worker in self.workers_mut().values_mut() { 37 | worker.post(message::Command::AppFinish, None)?; 38 | } 39 | Ok(()) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/rl/src/argument.rs: -------------------------------------------------------------------------------- 1 | use nethint::architecture::TopoArgs; 2 | use structopt::StructOpt; 3 | 4 | #[derive(Debug, Clone, StructOpt)] 5 | #[structopt(name = "AllReduce", about = "AllReduce Application")] 6 | pub struct Opt { 7 | /// Specify the topology for simulation 8 | #[structopt(subcommand)] 9 | pub topo: TopoArgs, 10 | 11 | /// Number of workers. 
12 | #[structopt(short = "w", long = "num_workers", default_value = "16")] 13 | pub num_workers: usize, 14 | 15 | /// Buffer size of allreduce. 16 | #[structopt(short = "s", long = "buffer_size", default_value = "100000000")] 17 | pub buffer_size: usize, 18 | 19 | /// Number of allreduce iterations. 20 | #[structopt(short = "i", long = "num_iterations", default_value = "1200")] 21 | pub num_iterations: usize, 22 | 23 | /// Number of jobs. 24 | #[structopt(short = "n", long = "ncases", default_value = "1")] 25 | pub ncases: usize, 26 | 27 | /// Nethint level. 28 | #[structopt(short = "l", long = "nethint_level", default_value = "1")] 29 | pub nethint_level: usize, 30 | 31 | /// Poisson arrival lambda. 32 | #[structopt(short = "p", long = "poisson_lambda", default_value = "24000000000")] 33 | pub poisson_lambda: f64, 34 | 35 | /// Asymmetric bandwidth 36 | #[structopt(short = "a", long = "asymmetric")] 37 | pub asym: bool, 38 | 39 | /// Auto tune after some itertions. 40 | #[structopt(long = "autotune")] 41 | pub tune: Option, 42 | } 43 | -------------------------------------------------------------------------------- /src/allreduce/src/argument.rs: -------------------------------------------------------------------------------- 1 | use nethint::architecture::TopoArgs; 2 | use structopt::StructOpt; 3 | 4 | #[derive(Debug, Clone, StructOpt)] 5 | #[structopt(name = "AllReduce", about = "AllReduce Application")] 6 | pub struct Opt { 7 | /// Specify the topology for simulation 8 | #[structopt(subcommand)] 9 | pub topo: TopoArgs, 10 | 11 | /// Number of workers. 12 | #[structopt(short = "w", long = "num_workers", default_value = "16")] 13 | pub num_workers: usize, 14 | 15 | /// Buffer size of allreduce. 16 | #[structopt(short = "s", long = "buffer_size", default_value = "100000000")] 17 | pub buffer_size: usize, 18 | 19 | /// Number of allreduce iterations. 
20 | #[structopt(short = "i", long = "num_iterations", default_value = "1200")] 21 | pub num_iterations: usize, 22 | 23 | /// Number of jobs. 24 | #[structopt(short = "n", long = "ncases", default_value = "1")] 25 | pub ncases: usize, 26 | 27 | /// Nethint level. 28 | #[structopt(short = "l", long = "nethint_level", default_value = "1")] 29 | pub nethint_level: usize, 30 | 31 | /// Poisson arrival lambda. 32 | #[structopt(short = "p", long = "poisson_lambda", default_value = "24000000000")] 33 | pub poisson_lambda: f64, 34 | 35 | /// Asymmetric bandwidth 36 | #[structopt(short = "a", long = "asymmetric")] 37 | pub asym: bool, 38 | 39 | /// Auto tune after some itertions. 40 | #[structopt(long = "autotune")] 41 | pub tune: Option, 42 | } 43 | -------------------------------------------------------------------------------- /src/nethint/src/runtime_est.rs: -------------------------------------------------------------------------------- 1 | pub struct RunningTimeEstimator { 2 | total_trials: Option, 3 | done_trials: usize, 4 | data: Vec, // running time for each single trial 5 | single_start: std::time::Instant, 6 | running_time: std::time::Duration, 7 | } 8 | 9 | impl RunningTimeEstimator { 10 | pub fn new() -> Self { 11 | RunningTimeEstimator { 12 | total_trials: None, 13 | done_trials: 0, 14 | data: Vec::new(), 15 | single_start: std::time::Instant::now(), 16 | running_time: std::time::Duration::from_nanos(0), 17 | } 18 | } 19 | 20 | pub fn set_total_trials(&mut self, total_trials: usize) { 21 | self.total_trials = Some(total_trials); 22 | } 23 | 24 | pub fn bench_single_start(&mut self) { 25 | let now = std::time::Instant::now(); 26 | self.running_time += now - self.single_start; 27 | 28 | if let Some(total_trials) = self.total_trials { 29 | if self.done_trials > 0 { 30 | log::info!( 31 | "average speed: {:?} second/trial, time left: {:?}", 32 | self.running_time / self.done_trials as u32, 33 | self.running_time * (total_trials - self.done_trials) as u32 34 | / 
self.done_trials as u32 35 | ); 36 | } 37 | } 38 | 39 | self.data.push(now - self.single_start); 40 | self.done_trials += 1; 41 | self.single_start = now; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/litemsg/src/buffer.rs: -------------------------------------------------------------------------------- 1 | /// A Buffer represents a segment of sending or receiving data (maybe unfinished). 2 | #[derive(Debug, Default)] 3 | pub struct Buffer { 4 | inner: Vec, 5 | cur_pos: usize, 6 | } 7 | 8 | impl Buffer { 9 | pub fn from_vec(v: Vec) -> Self { 10 | Buffer { 11 | inner: v, 12 | cur_pos: 0, 13 | } 14 | } 15 | 16 | pub fn with_len(len: usize) -> Self { 17 | let mut inner = Vec::with_capacity(len); 18 | unsafe { 19 | inner.set_len(len); 20 | } 21 | Buffer { inner, cur_pos: 0 } 22 | } 23 | 24 | pub fn take(&mut self) -> Vec { 25 | std::mem::take(&mut self.inner) 26 | } 27 | 28 | pub fn as_slice(&self) -> &[u8] { 29 | assert!(self.is_clear()); 30 | &self.inner 31 | } 32 | 33 | pub fn as_slice_mut(&mut self) -> &mut [u8] { 34 | assert!(self.is_clear()); 35 | &mut self.inner 36 | } 37 | 38 | pub fn mark_handled(&mut self, nbytes: usize) { 39 | self.cur_pos += nbytes; 40 | assert!(self.cur_pos <= self.inner.len()); 41 | } 42 | 43 | pub fn is_clear(&self) -> bool { 44 | self.cur_pos == self.inner.len() 45 | } 46 | 47 | pub fn get_remain_buffer(&self) -> &[u8] { 48 | &self.inner[self.cur_pos..] 49 | } 50 | 51 | pub fn get_remain_buffer_mut(&mut self) -> &mut [u8] { 52 | &mut self.inner[self.cur_pos..] 53 | } 54 | } 55 | 56 | // /// Zero-copied buffer with reference counting. 
57 | // #[derive(Debug, Default)] 58 | // struct ZBuffer { 59 | // } 60 | -------------------------------------------------------------------------------- /src/mapreduce/src/plink.rs: -------------------------------------------------------------------------------- 1 | use nethint::{ 2 | app::{AppEvent, Application, Sequence}, 3 | background_flow::{BackgroundFlowApp, BackgroundFlowPattern}, 4 | simulator::Events, 5 | Duration, 6 | }; 7 | 8 | #[derive(Debug)] 9 | pub struct PlinkApp<'a, T> { 10 | dur_ms: Duration, 11 | inner: Box>, 12 | } 13 | 14 | impl<'a, T: 'a> PlinkApp<'a, T> 15 | where 16 | T: Default + Clone + std::fmt::Debug, 17 | { 18 | pub fn new(nhosts: usize, round_ms: u64, app: Box + 'a>) -> Self { 19 | let dur_ms = (nhosts as u64 * round_ms) as _; 20 | let background_flow = Box::new(BackgroundFlowApp::new( 21 | nhosts, 22 | dur_ms, 23 | BackgroundFlowPattern::PlinkProbe, 24 | Some(100_000_000), // 8ms on 100G 25 | T::default(), 26 | )); 27 | 28 | let mut app_seq = Box::new(Sequence::new()); 29 | app_seq.add(background_flow); 30 | app_seq.add(app); 31 | 32 | PlinkApp { 33 | dur_ms, 34 | inner: app_seq, 35 | } 36 | } 37 | } 38 | 39 | impl<'a> Application for PlinkApp<'a, Option> { 40 | type Output = Option; 41 | 42 | fn on_event(&mut self, event: AppEvent) -> Events { 43 | self.inner.on_event(event) 44 | } 45 | 46 | fn answer(&mut self) -> Option { 47 | // self.inner.answer().last().unwrap().clone() 48 | self.inner 49 | .answer() 50 | .last() 51 | .unwrap() 52 | .map(|dur| dur + self.dur_ms * 1_000_000) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/logging/src/lib.rs: -------------------------------------------------------------------------------- 1 | use log::info; 2 | 3 | pub fn init_log() { 4 | use chrono::Utc; 5 | use std::io::Write; 6 | 7 | let env = env_logger::Env::default().default_filter_or("debug"); 8 | env_logger::Builder::from_env(env) 9 | .format(|buf, record| { 10 | let level_style = 
buf.default_level_style(record.level()); 11 | writeln!( 12 | buf, 13 | "[{} {} {}:{}] {}", 14 | Utc::now().format("%Y-%m-%d %H:%M:%S%.6f"), 15 | level_style.value(record.level()), 16 | record.file().unwrap_or(""), 17 | record.line().unwrap_or(0), 18 | &record.args() 19 | ) 20 | }) 21 | .init(); 22 | 23 | info!("env_logger initialized"); 24 | } 25 | 26 | pub fn init_log_with_id(id: String) { 27 | use chrono::Utc; 28 | use std::io::Write; 29 | 30 | let env = env_logger::Env::default().default_filter_or("debug"); 31 | env_logger::Builder::from_env(env) 32 | .format(move |buf, record| { 33 | let level_style = buf.default_level_style(record.level()); 34 | writeln!( 35 | buf, 36 | "[{} {} {}:{} {}] {}", 37 | Utc::now().format("%Y-%m-%d %H:%M:%S%.6f"), 38 | level_style.value(record.level()), 39 | record.file().unwrap_or(""), 40 | record.line().unwrap_or(0), 41 | id, 42 | &record.args() 43 | ) 44 | }) 45 | .init(); 46 | 47 | info!("env_logger initialized"); 48 | } 49 | 50 | -------------------------------------------------------------------------------- /src/nhagent_v2/src/argument.rs: -------------------------------------------------------------------------------- 1 | use nethint::architecture::TopoArgs; 2 | use nethint::background_flow_hard::BackgroundFlowHard; 3 | use structopt::StructOpt; 4 | use std::net::SocketAddr; 5 | 6 | #[derive(Debug, Clone, StructOpt)] 7 | #[structopt(name = "nhagent", about = "NetHint Agent")] 8 | pub struct Opts { 9 | /// The working interval of agent in millisecond 10 | #[structopt(short = "i", long = "interval", default_value = "100")] 11 | pub interval_ms: u64, 12 | 13 | /// The listening port of the sampler 14 | #[structopt(short = "p", long, default_value = "6343")] 15 | pub sampler_listen_port: u16, 16 | 17 | /// Specify the topology for testbed 18 | #[structopt(subcommand)] 19 | pub topo: TopoArgs, 20 | 21 | /// Background flow parameter by enforcing rate limit, the 22 | /// format is freq:prob:amp[:avg_load] 23 | #[structopt(short, 
long, default_value)] 24 | pub background_flow_hard: BackgroundFlowHard, 25 | 26 | /// When specified, it represents the number of the duplicated agent. 27 | /// This option is only used to measure the system overhead by running 28 | /// multiple nhagents on the same servers. 29 | #[structopt(short, long)] 30 | pub shadow_id: Option, 31 | 32 | /// Disable HetHint v2, and only run NetHint v1. 33 | #[structopt(short, long)] 34 | pub disable_v2: bool, 35 | 36 | // the two fields below are used by the BPF userspace program 37 | /// Physical interface name for the BPF program 38 | #[structopt(long)] 39 | pub iface: Option, 40 | 41 | /// Rack leader address (ip:port) 42 | #[structopt(long)] 43 | pub rack_leader: Option, 44 | } 45 | -------------------------------------------------------------------------------- /scripts/InfoCollectOverHead.hs: -------------------------------------------------------------------------------- 1 | import Text.Printf (printf) 2 | 3 | numRacks = 1000 4 | numMachinesEachRack = 20 5 | numVMsEachMachine = 10 6 | 7 | rackBandwidth = 600 * 1e9 -- Gbps 8 | -- machineBandwidth = 100 -- Gbps 9 | 10 | nethintPeriodInSec = 0.1 -- 100ms 11 | 12 | numVirtualLinksEachRack :: Float 13 | numVirtualLinksEachRack = 2 * (1 + numMachinesEachRack * numVMsEachMachine) 14 | -- 2 * (1 ToR switch + a bunch of VMs) 15 | -- 2 because of there are a upstream link and a downstream link 16 | 17 | numVirtualLinks :: Float 18 | numVirtualLinks = numVirtualLinksEachRack * numRacks 19 | 20 | -- in bytes 21 | nBrPairSize :: Float 22 | nBrPairSize = 2 * 8 23 | 24 | -- in bytes 25 | virtualLinkIDSize :: Float 26 | virtualLinkIDSize = 8 27 | 28 | -- one virtualLinkID, 8 bytes 29 | -- two (n, Br) pairs, one for traffic within the rack, and the other for traffic contributes to the cross rack link 30 | virtualLinkBytes :: Float 31 | virtualLinkBytes = virtualLinkIDSize + 2 * nBrPairSize 32 | 33 | -- for each rack, it has to send and receive so much information 34 | -- note that the 
`numVirtualLinks` contains all the virtual links in the data center 35 | crossRackTrafficBytes :: Float -- bytes 36 | crossRackTrafficBytes = numVirtualLinks * virtualLinkBytes 37 | 38 | crossRackTrafficPerSecond :: Float -- bits/second 39 | crossRackTrafficPerSecond = 8 * crossRackTrafficBytes / nethintPeriodInSec 40 | 41 | computeBandwidthOverhead :: Float 42 | computeBandwidthOverhead = crossRackTrafficPerSecond / rackBandwidth 43 | 44 | 45 | showMB :: Float -> String 46 | showMB bytes = printf "%fMB" (bytes / 1e6) 47 | 48 | main = do 49 | let percentage = computeBandwidthOverhead 50 | printf "%.5f%%\n" (percentage * 100) 51 | -------------------------------------------------------------------------------- /src/allreduce/src/random_ring.rs: -------------------------------------------------------------------------------- 1 | use crate::AllReduceAlgorithm; 2 | use nethint::{cluster::Topology, Flow}; 3 | use rand::prelude::SliceRandom; 4 | use rand::{rngs::StdRng, SeedableRng}; 5 | use std::rc::Rc; 6 | 7 | #[derive(Debug)] 8 | pub struct RandomRingAllReduce { 9 | seed: u64, 10 | num_rings: usize, 11 | rng: StdRng, 12 | } 13 | 14 | impl RandomRingAllReduce { 15 | pub fn new(seed: u64, num_rings: usize) -> Self { 16 | RandomRingAllReduce { 17 | seed, 18 | num_rings, 19 | rng: StdRng::seed_from_u64(seed), 20 | } 21 | } 22 | } 23 | 24 | impl AllReduceAlgorithm for RandomRingAllReduce { 25 | fn allreduce(&mut self, size: u64, vcluster: Rc) -> Vec { 26 | let n = vcluster.num_hosts(); 27 | 28 | let mut flows = Vec::new(); 29 | for _ in 0..self.num_rings { 30 | let mut alloced_hosts: Vec = (0..n).into_iter().collect(); 31 | alloced_hosts.shuffle(&mut self.rng); 32 | assert!(n > 0); 33 | for _ in 0..2 { 34 | for i in 0..n { 35 | let pred = format!("host_{}", alloced_hosts[i]); 36 | let succ = format!("host_{}", alloced_hosts[(i + 1) % n]); 37 | log::debug!("pred: {}, succ: {}", pred, succ); 38 | let flow = Flow::new( 39 | size as usize * (n - 1) / n / self.num_rings, 40 | 
&pred, 41 | &succ, 42 | None, 43 | ); 44 | flows.push(flow); 45 | } 46 | } 47 | } 48 | 49 | flows 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /scripts/testbed-2/enable_sriov.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 2 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | num_vfs=$2 17 | 18 | pci_addr=$(basename `readlink /sys/class/net/$pf/device`) 19 | echo "PCI address of $pf is $pci_addr" 20 | # domain:bus:slot.function 21 | # pci_slot=`echo $pci_addr | cut -d "." -f1` 22 | 23 | # check if sriov has already been enabled 24 | echo 0 > /sys/class/net/$pf/device/sriov_numvfs 25 | num=`cat /sys/class/net/$pf/device/sriov_numvfs` 26 | if [ $num -ne 0 ]; then 27 | echo "$pf SR-IOV has already been enabled, to change the number of VFs, please disable it first, then execute this script" 28 | exit 2 29 | fi 30 | 31 | echo $num_vfs > /sys/class/net/$pf/device/sriov_numvfs 32 | 33 | # set mac address of VFs 34 | for ((i=0;i<$num_vfs;i++)); do 35 | macaddr=`tr -dc A-F0-9 < /dev/urandom | head -c 10 | sed -r 's/(..)/\1:/g;s/:$//;s/^/02:/'` 36 | ip link set $pf vf $i mac $macaddr; 37 | done 38 | 39 | ip link show $pf 40 | 41 | # bind VF's driver to vfio-pci 42 | modprobe vfio-pci 43 | 44 | for ((i=0;i<$num_vfs;i++)); do 45 | vf_pci_addr=$(basename `readlink /sys/class/net/$pf/device/virtfn$i`) 46 | current_driver=$(basename `readlink /sys/bus/pci/devices/$vf_pci_addr/driver`) 47 | if [ "x$current_driver" != "xvfio-pci" ]; then 48 | # unbind current driver 49 | echo $vf_pci_addr > /sys/bus/pci/devices/$vf_pci_addr/driver/unbind 50 | numeric_id=`lspci -s $vf_pci_addr -n | cut -d " " -f3 | tr ':' ' '` 51 | # bind to vfio-pci 52 | echo $numeric_id > /sys/bus/pci/drivers/vfio-pci/new_id 53 | fi 54 | # show results 55 | lspci -k -s 
$vf_pci_addr 56 | done 57 | -------------------------------------------------------------------------------- /scripts/testbed/enable_sriov.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 2 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | num_vfs=$2 17 | 18 | pci_addr=$(basename `readlink /sys/class/net/$pf/device`) 19 | echo "PCI address of $pf is $pci_addr" 20 | # domain:bus:slot.function 21 | # pci_slot=`echo $pci_addr | cut -d "." -f1` 22 | 23 | # check if sriov has already been enabled 24 | echo 0 > /sys/class/net/$pf/device/sriov_numvfs 25 | num=`cat /sys/class/net/$pf/device/sriov_numvfs` 26 | if [ $num -ne 0 ]; then 27 | echo "$pf SR-IOV has already been enabled, to change the number of VFs, please disable it first, then execute this script" 28 | exit 2 29 | fi 30 | 31 | echo $num_vfs > /sys/class/net/$pf/device/sriov_numvfs 32 | 33 | # set mac address of VFs 34 | for ((i=0;i<$num_vfs;i++)); do 35 | macaddr=`tr -dc A-F0-9 < /dev/urandom | head -c 10 | sed -r 's/(..)/\1:/g;s/:$//;s/^/02:/'` 36 | ip link set $pf vf $i mac $macaddr; 37 | done 38 | 39 | ip link show $pf 40 | 41 | # bind VF's driver to vfio-pci 42 | modprobe vfio-pci 43 | 44 | for ((i=0;i<$num_vfs;i++)); do 45 | vf_pci_addr=$(basename `readlink /sys/class/net/$pf/device/virtfn$i`) 46 | current_driver=$(basename `readlink /sys/bus/pci/devices/$vf_pci_addr/driver`) 47 | if [ "x$current_driver" != "xvfio-pci" ]; then 48 | # unbind current driver 49 | echo $vf_pci_addr > /sys/bus/pci/devices/$vf_pci_addr/driver/unbind 50 | numeric_id=`lspci -s $vf_pci_addr -n | cut -d " " -f3 | tr ':' ' '` 51 | # bind to vfio-pci 52 | echo $numeric_id > /sys/bus/pci/drivers/vfio-pci/new_id 53 | fi 54 | # show results 55 | lspci -k -s $vf_pci_addr 56 | done 57 | 
-------------------------------------------------------------------------------- /scripts/run_duplicates_v2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $UID -ne 0 ]; then 4 | echo "Please run $0 as root" 5 | exit 3 6 | fi 7 | 8 | if [ $# -ne 1 ]; then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 14 | 15 | scale=$1 16 | rack_size=3 # you may want to change it to 20 in emulation 17 | num_racks=`expr 6 \* $scale` 18 | num_workers=`expr $num_racks \* $rack_size` 19 | 20 | if [ "x$scale" = "x1" -o "x$scale" = "x" ]; then 21 | sampler_port=6343 22 | 23 | RUST_BACKTRACE=full \ 24 | NH_CONTROLLER_URI=danyang-01.cs.duke.edu:9000 \ 25 | NH_NUM_RACKS=$num_racks \ 26 | target/release/nhagent_v2 \ 27 | -p $sampler_port \ 28 | -i 100 \ 29 | -b 10000000000:1:5:0.1 \ 30 | arbitrary $num_racks $rack_size 10 10 31 | 32 | else 33 | # sampler_port=6343 34 | 35 | # RUST_BACKTRACE=full \ 36 | # NH_CONTROLLER_URI=danyang-01.cs.duke.edu:9000 \ 37 | # NH_NUM_RACKS=$num_racks \ 38 | # valgrind --leak-check=full --show-reachable=yes target/release/nhagent_v2 \ 39 | # --shadow-id 0 \ 40 | # -p $sampler_port \ 41 | # -i 100 \ 42 | # -b 10000000000:1:5:0.1 \ 43 | # arbitrary $num_racks $rack_size 10 10 \ 44 | # & 45 | # # --disable-v2 \ 46 | 47 | for ((i=0; i<$scale; i++)); do 48 | sampler_port=`expr 6343 + $i` 49 | 50 | RUST_BACKTRACE=full \ 51 | NH_CONTROLLER_URI=danyang-01.cs.duke.edu:9000 \ 52 | NH_NUM_RACKS=$num_racks \ 53 | target/release/nhagent_v2 \ 54 | --shadow-id $i \ 55 | -p $sampler_port \ 56 | -i 100 \ 57 | -b 10000000000:1:5:0.1 \ 58 | arbitrary $num_racks $rack_size 10 10 \ 59 | & 60 | # --disable-v2 \ 61 | done 62 | 63 | wait 64 | fi 65 | 66 | # DIR=$(dirname `realpath $0`) 67 | # nix develop $DIR/../nethint-bpf -c \ 68 | # sudo -E NH_LOG=info RUST_BACKTRACE=1 \ 69 | # $DIR/../nethint-bpf/target/debug/nethint-user \ 70 | # arbitrary 
$num_racks 3 10 10 -------------------------------------------------------------------------------- /src/rl/src/random_ring.rs: -------------------------------------------------------------------------------- 1 | use crate::RLAlgorithm; 2 | use nethint::{cluster::Topology, Flow}; 3 | use rand::prelude::SliceRandom; 4 | use rand::{rngs::StdRng, SeedableRng}; 5 | use std::rc::Rc; 6 | 7 | #[derive(Debug)] 8 | pub struct RandomChain { 9 | seed: u64, 10 | num_trees: usize, 11 | rng: StdRng, 12 | } 13 | 14 | impl RandomChain { 15 | pub fn new(seed: u64, num_trees: usize) -> Self { 16 | RandomChain { 17 | seed, 18 | num_trees, 19 | rng: StdRng::seed_from_u64(seed), 20 | } 21 | } 22 | } 23 | 24 | impl RLAlgorithm for RandomChain { 25 | fn run_rl_traffic( 26 | &mut self, 27 | root_index: usize, 28 | group: Option>, 29 | size: u64, 30 | vcluster: Rc, 31 | ) -> Vec { 32 | let mut flows = Vec::new(); 33 | 34 | for _ in 0..self.num_trees { 35 | let mut alloced_hosts: Vec = if group.is_none() { 36 | let n = vcluster.num_hosts(); 37 | let mut hs: Vec = (0..n).into_iter().collect(); 38 | hs.remove(root_index); 39 | hs 40 | } else { 41 | group.clone().unwrap() 42 | }; 43 | alloced_hosts.shuffle(&mut self.rng); 44 | 45 | alloced_hosts.insert(0, root_index); 46 | 47 | assert!( 48 | alloced_hosts.len() >= 2, 49 | "vcluster size must >= 2, worker group cannot be empty" 50 | ); 51 | 52 | for (&x, &y) in alloced_hosts.iter().zip(alloced_hosts.iter().skip(1)) { 53 | let pred = format!("host_{}", x); 54 | let succ = format!("host_{}", y); 55 | let flow = Flow::new(size as usize, &pred, &succ, None); 56 | flows.push(flow); 57 | } 58 | } 59 | 60 | for f in &mut flows { 61 | f.bytes /= self.num_trees; 62 | } 63 | 64 | log::info!("flows: {:?}", flows); 65 | flows 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/allreduce/src/topology_aware.rs: -------------------------------------------------------------------------------- 1 | use 
crate::AllReduceAlgorithm; 2 | use nethint::{cluster::Topology, Flow}; 3 | use std::rc::Rc; 4 | 5 | #[derive(Debug, Default)] 6 | pub struct TopologyAwareRingAllReduce { 7 | seed: u64, 8 | num_rings: usize, 9 | } 10 | 11 | impl TopologyAwareRingAllReduce { 12 | pub fn new(seed: u64, num_rings: usize) -> Self { 13 | TopologyAwareRingAllReduce { seed, num_rings } 14 | } 15 | } 16 | 17 | impl AllReduceAlgorithm for TopologyAwareRingAllReduce { 18 | fn allreduce(&mut self, size: u64, vcluster: Rc) -> Vec { 19 | use rand::prelude::SliceRandom; 20 | use rand::{rngs::StdRng, SeedableRng}; 21 | let mut rng = StdRng::seed_from_u64(self.seed); 22 | 23 | let mut flows = Vec::new(); 24 | 25 | for _ in 0..self.num_rings { 26 | let mut ring = Vec::new(); 27 | 28 | for i in 0..vcluster.num_switches() - 1 { 29 | let mut ringlet = Vec::new(); 30 | let tor = format!("tor_{}", i); 31 | 32 | for link_ix in vcluster.get_downlinks(vcluster.get_node_index(&tor)) { 33 | let h = vcluster.get_target(*link_ix); 34 | let host_idx = vcluster[h].name.strip_prefix("host_").unwrap().parse::().unwrap(); 35 | ringlet.push(host_idx) 36 | } 37 | ringlet.shuffle(&mut rng); 38 | for node_idx in ringlet { 39 | ring.push(node_idx); 40 | } 41 | } 42 | 43 | let n = vcluster.num_hosts(); 44 | for _ in 0..2 { 45 | for i in 0..n { 46 | let sender = format!("host_{}", ring[i]); 47 | let receiver = format!("host_{}", ring[(i + 1) % n]); 48 | let flow = Flow::new(size as usize * (n - 1) / n / self.num_rings, &sender, &receiver, None); 49 | flows.push(flow); 50 | } 51 | } 52 | } 53 | 54 | flows 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /scripts/testbed/enable_eswitch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 1 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | 
17 | pci_addr=$(basename `readlink /sys/class/net/$pf/device`) 18 | echo "PCI address of $pf is $pci_addr" 19 | 20 | pci_bus=$(printf "%d" "0x`echo $pci_addr | cut -d':' -f2`") 21 | pci_slot=$(printf "%d" "0x`echo $pci_addr | cut -d':' -f3 | cut -d'.' -f1 `") 22 | pf_altname="enp${pci_bus}s${pci_slot}" 23 | echo "altname of PF: $pf_altname" 24 | 25 | # check if device has already been set to eswitch mode 26 | mode=`cat /sys/class/net/$pf/compat/devlink/mode` 27 | if [ $mode = "switchmode" ]; then 28 | echo "$pf has already been set to eswitch mode" 29 | exit 2 30 | fi 31 | 32 | # unbind the VFs 33 | num_vfs=`cat /sys/class/net/$pf/device/sriov_numvfs` 34 | drivers=(`seq 1 8`) 35 | 36 | for ((i=0;i<$num_vfs;i++)); do 37 | vf_pci_addr=$(basename `readlink /sys/class/net/$pf/device/virtfn$i`) 38 | driver=$(basename `readlink /sys/bus/pci/devices/$vf_pci_addr/driver`) 39 | drivers[$i]=$driver 40 | echo $vf_pci_addr > /sys/bus/pci/devices/$vf_pci_addr/driver/unbind 41 | # show results 42 | lspci -k -s $vf_pci_addr 43 | done 44 | 45 | echo switchdev > "/sys/class/net/$pf/compat/devlink/mode" 46 | 47 | echo "sleeping for 20 seconds..." 
48 | sleep 20 49 | 50 | # check if the VF representors has been renamed 51 | ip link 52 | 53 | # It is necessary to first set the network VF representor device names 54 | # to be in the form of $PF_$VFID where $PF is the PF netdev name, 55 | # and $VFID is the VF ID=0,1,[..], bring up these VF representors 56 | for ((i=0;i<$num_vfs;i++)); do 57 | ip link set "${pf_altname}_${i}" name "${pf}_${i}" 58 | ip link set "${pf}_${i}" up 59 | done 60 | 61 | 62 | # re-bind the VFs' drivers 63 | for ((i=0;i<$num_vfs;i++)); do 64 | vf_pci_addr=$(basename `readlink /sys/class/net/$pf/device/virtfn$i`) 65 | echo $vf_pci_addr > /sys/bus/pci/drivers/${drivers[$i]}/bind 66 | # show results 67 | lspci -k -s $vf_pci_addr 68 | done 69 | -------------------------------------------------------------------------------- /src/nethint/tests/toy1.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | use nethint::bandwidth::BandwidthTrait; 3 | use nethint::cluster::{Cluster, Node, NodeType}; 4 | use nethint::simulator::{Executor, Simulator}; 5 | use nethint::{Flow, Trace, TraceRecord}; 6 | 7 | #[test] 8 | fn toy1() { 9 | logging::init_log(); 10 | 11 | let nodes = vec![ 12 | ("a1", 3), 13 | ("a2", 3), 14 | ("a3", 3), 15 | ("a5", 3), 16 | ("a6", 3), 17 | ("vs1", 2), 18 | ("vs2", 2), 19 | ("cloud", 1), 20 | ] 21 | .into_iter() 22 | .map(|(n, depth)| { 23 | Node::new( 24 | n, 25 | depth, 26 | if depth == 3 { 27 | NodeType::Host 28 | } else { 29 | NodeType::Switch 30 | }, 31 | ) 32 | }) 33 | .collect(); 34 | 35 | let mut cluster = Cluster::from_nodes(nodes); 36 | 37 | vec![ 38 | ("a1", "vs1", 20.gbps()), 39 | ("a2", "vs1", 10.gbps()), 40 | ("a3", "vs1", 9.gbps()), 41 | ("a5", "vs2", 10.gbps()), 42 | ("a6", "vs2", 5.gbps()), 43 | ("vs1", "cloud", 35.gbps()), 44 | ("vs2", "cloud", 15.gbps()), 45 | ] 46 | .into_iter() 47 | .for_each(|args| cluster.add_link_by_name(args.1, args.0, args.2)); 48 | 49 | let mut trace = Trace::new(); 50 | let records: 
Vec<TraceRecord> = vec![ 51 | (0, 1e6 as usize, "a1", "a5"), 52 | (0, 1e6 as usize, "a2", "a6"), 53 | (1000000, 1e6 as usize, "a2", "a3"), 54 | ] 55 | .into_iter() 56 | .map(|args| TraceRecord::new(args.0, Flow::new(args.1, args.2, args.3, None), None)) 57 | .collect(); 58 | records.into_iter().for_each(|r| trace.add_record(r)); 59 | 60 | let mut simulator = Simulator::new(cluster); 61 | let output = simulator.run_with_trace(trace); 62 | println!("{:#?}", output); 63 | assert_eq!(output.recs[0].dura, Some(800_000)); 64 | assert_eq!(output.recs[1].dura, Some(1_600_000)); 65 | assert_eq!( 66 | output.recs[2].dura, 67 | Some(((600. + (10. / 16.) * 1600. * (5. / 9.)) * 1e3f64).round() as u64) 68 | ); 69 | } 70 | -------------------------------------------------------------------------------- /src/nhagent_v2/src/message.rs: -------------------------------------------------------------------------------- 1 | use nethint::{TenantId, hint::{NetHintV1Real, NetHintV2Real, NetHintVersion}}; 2 | use nethint::counterunit::CounterUnit; 3 | use nethint::cluster::LinkIx; 4 | use serde::{Deserialize, Serialize}; 5 | use std::collections::HashMap; 6 | use crate::communicator::BcastId; 7 | 8 | #[derive(Debug, Serialize, Deserialize)] 9 | pub enum Message { 10 | // do not handle these messages for now 11 | // /// send by worker, processed by leader 12 | // LeaveNode(Node), 13 | /// send by leader, processed by worker 14 | AppFinish, 15 | 16 | /// send by non global leader, processed by global leader 17 | /// barrier ID 18 | SyncRequest(u64), 19 | 20 | /// send by global leader, processed by non global leader 21 | /// barrier ID 22 | SyncResponse(u64), 23 | 24 | /// broadcast type wrapper 25 | BcastMessage(BcastId, Box<Message>), 26 | 27 | /// send by worker, processed by worker 28 | DeclareHostname(String), 29 | 30 | /// A potential problem here is that LinkIx from different machines may not be compatible 31 | /// send by rack leader, processed by rack leader 32 | RackChunk(HashMap<LinkIx, Vec<CounterUnit>>), 33 | 34 | /// send by 
experiment scheduler, processed by rack leader, 35 | /// forward by rack leader, processed by global leader 36 | /// in practice, we skip the forwarding pass 37 | /// tenant_id, nhosts, allow_delay 38 | ProvisionRequest(TenantId, usize, bool), 39 | /// send by global leader, processed by rack leader 40 | /// forward by rack leader, processed by experiment scheduler 41 | /// in practice, we skip the forwarding pass 42 | /// tenant_id, hintv1 43 | ProvisionResponse(TenantId, NetHintV1Real), 44 | /// send by app, processed by global leader 45 | DestroyRequest(TenantId), 46 | /// send by global leader, processed by app 47 | DestroyResponse(TenantId), 48 | /// send by app, processed by rack/global leader leader 49 | NetHintRequest(TenantId, NetHintVersion), 50 | /// send by rack/global leader, processed by app 51 | NetHintResponseV1(TenantId, NetHintV1Real), 52 | /// send by rack/global leader, processed by app 53 | NetHintResponseV2(TenantId, NetHintV2Real), 54 | 55 | /// send by the scheduler 56 | BatchDoneNotification, 57 | } 58 | -------------------------------------------------------------------------------- /scripts/testbed/environment/cpu_vm_stage1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script will create cpu_vm_base.img in the current directory, 4 | # the cpu_vm_base.img can be used as a disk and boot by qemu 5 | 6 | DISK_IMG=/tmp/cpu_vm_base.img 7 | 8 | umount /mnt 9 | losetup -D 10 | 11 | # create raw disk, the 5GB is not enought for the latest OFED packages, so give it 10GB 12 | dd if=/dev/zero of=$DISK_IMG bs=1G count=10 status=progress && sync 13 | 14 | # create only 1 partition, mark it bootable 15 | # an example command 16 | echo -en 'n\np\n1\n\n\na\nw\n\n' | fdisk $DISK_IMG 17 | 18 | # in the below example output, I only create 100MB file disk 19 | # cjr@cpu22 /tmp % fdisk -l raw_disk.img 20 | # Disk raw.bin: 100 MiB, 104857600 bytes, 204800 sectors 21 | # Units: sectors of 1 * 
512 = 512 bytes 22 | # Sector size (logical/physical): 512 bytes / 512 bytes 23 | # I/O size (minimum/optimal): 512 bytes / 512 bytes 24 | # Disklabel type: dos 25 | # Disk identifier: 0x7bcdb498 26 | # 27 | # Device Boot Start End Sectors Size Id Type 28 | # raw.bin1 * 2048 204799 202752 99M 83 Linux 29 | 30 | LOOP_DEV=`losetup -f` 31 | if [ $? -ne 0 ]; then 32 | echo "losetup -f cannot find free loop device" 33 | exit 1 34 | fi 35 | 36 | losetup $LOOP_DEV $DISK_IMG 37 | partprobe $LOOP_DEV 38 | # root@cpu21 /tmp # lsblk 39 | # NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT 40 | # loop0 7:0 0 4.7G 0 loop 41 | # ├─loop0p1 259:0 0 4.7G 0 loop 42 | # sda 8:0 0 1.8T 0 disk 43 | # └─sda1 8:1 0 1.8T 0 part / 44 | 45 | LOOP_PART=${LOOP_DEV}p1 46 | # format the filesystems 47 | mkfs.ext4 $LOOP_PART 48 | 49 | # mount the partitions 50 | mount $LOOP_PART /mnt 51 | 52 | # debootstrap 53 | apt install debootstrap -y 54 | debootstrap --merged-usr --keyring=/usr/share/keyrings/ubuntu-archive-keyring.gpg --verbose focal /mnt http://archive.ubuntu.com/ubuntu/ 55 | 56 | # generate fstab 57 | apt install arch-install-scripts -y 58 | genfstab -U /mnt | grep -v swap >> /mnt/etc/fstab 59 | 60 | # after command finish, chroot to that directory 61 | cp /etc/apt/sources.list /mnt/etc/apt/sources.list 62 | 63 | #mount -t proc /proc /mnt/proc 64 | #mount --rbind /sys /mnt/sys 65 | #mount --rbind /dev /mnt/dev 66 | #mount --rbind /run /mnt/run 67 | #cp /etc/resolv.conf /mnt/etc/resolv.conf 68 | #chroot /mnt /bin/bash 69 | 70 | cp ./cpu_vm_stage2.sh /mnt/root 71 | arch-chroot /mnt /bin/bash /root/cpu_vm_stage2.sh $LOOP_DEV 72 | 73 | sync 74 | umount /mnt 75 | losetup -D 76 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | [[ $UID -ne 0 ]] && echo "Please run $0 as root" && exit 1 5 | 6 | # this will create a 
disk image with nixos filesystem at /tmp/nixos_vm_base.img 7 | ./prepare_disk.sh 8 | 9 | DISK_FILE="/var/lib/libvirt/images/nixos_vm_base.img" 10 | 11 | [[ -e "$DISK_FILE" ]] && echo "$(tput setaf 1)[ERROR]$(tput sgr 0) image file already exists" && exit 2 12 | 13 | # copy the image to image pool so virt-install can see it 14 | 15 | echo "copying /tmp/nixos_vm_base.img to libvirt image pool" 16 | cp /tmp/nixos_vm_base.img $DISK_FILE 17 | 18 | # cpubase m4.xlarge 19 | virt-install --virt-type kvm --name nixosbase --vcpus 8 --ram 16384 --boot hd --disk $DISK_FILE,format=raw --network network=default --network bridge=ovs0,virtualport_type=openvswitch,model=virtio --nographic --os-type=linux --os-variant=generic --noreboot --import 20 | 21 | # provision 8 NixOS VMs 22 | source `dirname $0`/utils.sh 23 | 24 | function customize() 25 | { 26 | name=$1 27 | # customize the ip address on ens3 interface 28 | # sed -i "s/192.168.211.3/192.168.211.3/" ./mnt/etc/nixos/configuration.nix 29 | # customize hostname 30 | sed -i "s/networking.hostName = \"nixos\"/networking.hostName = \"$name\"/" "$MNT_DIR/etc/nixos/configuration.nix" 31 | sed -i "s/nixosConfigurations.nixos/nixosConfigurations.$name/" "$MNT_DIR/etc/nixos/flake.nix" 32 | } 33 | 34 | NIXOS_INSTALL=`command -v nixos-install` 35 | NIXOS_ENTER=`command -v nixos-enter` 36 | 37 | for name in `lsnames 8`; do 38 | TARGET="/var/lib/libvirt/images/${name}.img" 39 | virsh vol-delete ${name}.img --pool images 40 | virt-clone --replace --original nixosbase --name $name --file "$TARGET" 41 | 42 | # an alternative to virt-sysprep 43 | MNT_DIR=./mnt 44 | mkdir -p "$MNT_DIR"; umount "$MNT_DIR"; losetup -D 45 | LOOP_DEV=`losetup -f` 46 | losetup -P $LOOP_DEV $TARGET 47 | mount ${LOOP_DEV}p1 "$MNT_DIR" 48 | customize $name 49 | PATH_BAK=$PATH 50 | export NIX_PATH=nixpkgs=/nix/var/nix/profiles/per-user/cjr/channels/nixos/nixpkgs 51 | export 
PATH=/run/wrappers/bin:/root/.nix-profile/bin:/etc/profiles/per-user/root/bin:/nix/var/nix/profiles/default/bin:/run/current-system/sw/bin:$PATH 52 | $NIXOS_INSTALL --root $(realpath "$MNT_DIR") --no-root-passwd --flake "$MNT_DIR/etc/nixos#$name" --impure 53 | $NIXOS_INSTALL --root $(realpath "$MNT_DIR") --no-bootloader --no-root-passwd --flake "$MNT_DIR/etc/nixos#$name" --impure 54 | export PATH=$PATH_BAK 55 | sync; umount -R "$MNT_DIR"; losetup -D 56 | done 57 | -------------------------------------------------------------------------------- /src/mapreduce/src/random.rs: -------------------------------------------------------------------------------- 1 | use rand::{self, seq::SliceRandom}; 2 | use std::collections::HashMap; 3 | 4 | use nethint::cluster::{Topology, LinkIx, RouteHint}; 5 | 6 | use crate::{JobSpec, PlaceReducer, Placement, Shuffle, RNG}; 7 | 8 | #[derive(Debug, Default)] 9 | pub struct RandomReducerScheduler {} 10 | 11 | impl RandomReducerScheduler { 12 | pub fn new() -> Self { 13 | Default::default() 14 | } 15 | 16 | pub fn estimate_jct( 17 | &mut self, 18 | cluster: &dyn Topology, 19 | job_spec: &JobSpec, 20 | mapper: &Placement, 21 | shuffle_pairs: &Shuffle, 22 | collocate: bool, 23 | ) -> f64 { 24 | let reducers = self.place(cluster, job_spec, mapper, shuffle_pairs, collocate); 25 | let mut traffic: HashMap = Default::default(); 26 | 27 | for (mi, m) in mapper.0.iter().enumerate() { 28 | let m_ix = cluster.get_node_index(m); 29 | for (ri, r) in reducers.0.iter().enumerate() { 30 | let s = shuffle_pairs.0[mi][ri]; 31 | let r_ix = cluster.get_node_index(r); 32 | if m_ix != r_ix { 33 | let route = cluster.resolve_route(m, r, &RouteHint::default(), None); 34 | for link_ix in route.path { 35 | *traffic.entry(link_ix).or_insert(0) += s; 36 | } 37 | } 38 | } 39 | } 40 | 41 | let mut est: f64 = 0.0; 42 | for (&link_ix, &tr) in traffic.iter() { 43 | let bw = cluster[link_ix].bandwidth; 44 | est = est.max(tr as f64 * 8.0 / bw.val() as f64); 45 | } 46 | 47 
| // unit in seconds 48 | est 49 | } 50 | } 51 | 52 | impl PlaceReducer for RandomReducerScheduler { 53 | fn place( 54 | &mut self, 55 | cluster: &dyn Topology, 56 | job_spec: &JobSpec, 57 | mapper: &Placement, 58 | _shuffle_pairs: &Shuffle, 59 | collocate: bool, 60 | ) -> Placement { 61 | RNG.with(|rng| { 62 | let mut rng = rng.borrow_mut(); 63 | let num_hosts = cluster.num_hosts(); 64 | let mut hosts: Vec<String> = (0..num_hosts).map(|x| format!("host_{}", x)).collect(); 65 | if !collocate { 66 | hosts.retain(|h| mapper.0.iter().find(|&m| m.eq(h)).is_none()); 67 | } 68 | let mut hosts: Vec<String> = hosts 69 | .choose_multiple(&mut *rng, job_spec.num_reduce) 70 | .cloned() 71 | .collect(); 72 | hosts.shuffle(&mut *rng); 73 | Placement(hosts) 74 | }) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/nhagent/src/ssagent.rs: -------------------------------------------------------------------------------- 1 | use std::net::SocketAddr; 2 | use structopt::StructOpt; 3 | use utils::cmd_helper::get_command_output; 4 | use std::time::Duration; 5 | use std::process::Command; 6 | use nhagent::sampler::ss_sampler::SsTcpFlows; 7 | 8 | #[derive(Debug, Clone, StructOpt)] 9 | #[structopt( 10 | name = "ssagent", 11 | about = "while true; do ss -tuni | nc -w0 -u 127.0.0.1 9999; sleep 0.1; done" 12 | )] 13 | struct Opts { 14 | /// Poll interval in ms 15 | #[structopt(short = "i", long = "interval", default_value = "100")] 16 | interval_ms: u64, 17 | /// Target address 18 | #[structopt(short = "t", long = "target")] 19 | target: Option<SocketAddr>, 20 | } 21 | 22 | #[inline] 23 | fn now() -> std::time::Instant { 24 | std::time::Instant::now() 25 | } 26 | 27 | fn get_default_target() -> SocketAddr { 28 | let my_ip = utils::net::get_primary_ipv4("rdma0").unwrap(); 29 | let last_field = my_ip.octets()[3]; 30 | // the conversion is hard coded 31 | // 3,4,5,6 -> 2, 35... 
-> 34 32 | let target_num = (last_field / 32 * 32 + 2).to_string(); 33 | let numbers = my_ip.octets(); 34 | let addr = format!( 35 | "{}.{}.{}.{}:{}", 36 | numbers[0], numbers[1], numbers[2], target_num, 5555 37 | ); 38 | addr.parse().unwrap_or_else(|_| panic!("addr: {}", addr)) 39 | } 40 | 41 | fn main() { 42 | logging::init_log(); 43 | 44 | let mut opts = Opts::from_args(); 45 | if opts.target.is_none() { 46 | opts.target = Some(get_default_target()); 47 | } 48 | log::info!("opts: {:?}", opts); 49 | 50 | let sock = std::net::UdpSocket::bind("0.0.0.0:34254").expect("bind failed"); 51 | sock.connect(opts.target.unwrap()).expect("connect failed"); 52 | sock.set_write_timeout(Some(Duration::from_millis(opts.interval_ms / 2))).unwrap(); 53 | let sleep_ms = Duration::from_millis(opts.interval_ms); 54 | let mut last_ts = now(); 55 | 56 | loop { 57 | let mut cmd = Command::new("ss"); 58 | cmd.arg("-tuni"); 59 | let output = get_command_output(cmd).unwrap(); 60 | let ss_flows: SsTcpFlows = output.parse().expect("fail to parse ss output"); 61 | let ts = std::time::SystemTime::now(); 62 | let buf = bincode::serialize(&(ts, ss_flows)).expect("fail to serialize ss_flows"); 63 | assert!(buf.len() <= 65507); 64 | match sock.send(&buf) { 65 | Ok(_nbytes) => {} 66 | Err(_e) => {} 67 | } 68 | let n = now(); 69 | if last_ts + sleep_ms > n { 70 | // avoid duration < 0, which will cause a panic. 71 | std::thread::sleep(last_ts + sleep_ms - n); 72 | } 73 | last_ts = now(); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /scripts/testbed/environment/README.md: -------------------------------------------------------------------------------- 1 | # Base Image Preparation 2 | 3 | Scripts to prepare a CPU base image file disk. 
4 | 5 | ```bash 6 | sudo ./cpu_vm_stage1.sh 7 | # at the end of stage1, the script will mount the guest filesystem at 8 | # /mnt, and copy the stage2 script to /mnt/root, and chroot to that and 9 | # execute the stage2.sh. 10 | ``` 11 | 12 | To install and use the VM, use libvirt. First copy the `cpu_vm_base.img` to 13 | `/var/lib/libvirt/images/cpu_vm_base.img`. Then run virt-install. The 14 | example below create an AWS `m4.xlarge` like CPU machine, except that the 15 | network uses RDMA SR-IOV. Remember to remove the `--print-xml` to really install 16 | the profile to libvirt. 17 | ```bash 18 | # you may finish the install without graphics 19 | virt-install --virt-type kvm --name cpubase --vcpus 4 --ram 16384 --boot hd --disk /var/lib/libvirt/images/cpu_vm_base.img,format=raw --network network=default --hostdev=pci_0000_18_00_1 --nographic --os-type=linux --os-variant=ubuntu20.04 --print-xml 20 | 21 | # or with graphics 22 | virt-install --virt-type kvm --name cpubase --vcpus 4 --ram 16384 --boot hd --disk /var/lib/libvirt/images/cpu_vm_base.img,format=raw --network network=default --hostdev=pci_0000_18_00_1 --graphic vnc,listen=0.0.0.0 --os-type=linux --os-variant=ubuntu20.04 --print-xml 23 | ``` 24 | 25 | 26 | To simplify the configuration, I use the default network created by libvirt to 27 | allow the internet access (SNAT). But that would not allow the VMs to reach 28 | each other. To allow this, I configure a correct IP address for the rdma 29 | interface and use that for interconnection. The switch has already been 30 | configured to support this. 31 | 32 | 33 | A bunch of things I decide to setup later after all VMs have been booted are 34 | 1. add many sshkeys to these VMs 35 | 2. generate a script for each VM to bring up and set different IP 36 | address for the rdma interface. 37 | 3. On every boot, attach VFs to corresponding VMs by using virsh 38 | attach-device, this gives more flexibility. 39 | 4. 
configure the name address resolution in both guests and hosts to 40 | allow easy access by sth like rdma0.cpu5 41 | 42 | 43 | Then we are done! 44 | 45 | 46 | ### Notes 47 | A couple of things that have to be checked for each reboot of physical 48 | server (aka what is not persistent). 49 | 1. `enable_sriov.sh` 50 | 2. `enable_eswitch.sh` 51 | 3. `setup_ovs.sh` 52 | 53 | The order is important. 54 | 55 | To clear ovs settings 56 | 1. `ovs-vsctl del-br ovs-sriov` 57 | 2. `ovs-dpctl show` 58 | 59 | To disable eswitch and recover the configuration 60 | 1. `echo legacy | sudo tee /sys/class/net/rdma0/compat/devlink/mode` 61 | 2. `enable_sriov.sh` 62 | 3. `echo 0 | sudo tee /sys/class/net/rdma0/device/sriov_numvfs` (optionally) 63 | -------------------------------------------------------------------------------- /evaluation/sensitivity/sensitivity_probing_cost1_base.toml: -------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | # job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 7 | 8 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 9 | buffer_size = 100_000_000 10 | 11 | # Number of iterations for all jobs 12 | num_iterations = 1000 13 | 14 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 15 | poisson_lambda = 24_000_000_000.0 16 | 17 | placement_strategy = { type = "Compact" } 18 | # placement_strategy = { type = "CompactLoadBalanced" } 19 | # placement_strategy = { type = "Spread" } 20 | # placement_strategy = { type = "Random", args = 0 } 21 | 22 | # global seed 23 | seed = 1 24 | 25 | # Output path for the simulation results 26 | directory = "/tmp/sensitivity_probing_cost1/sensitivity_probing_cost1_base" 27 | 28 | # Number of repeats for each batch of experiments 29 | batch_repeat = 5 30 | 31 | [[batch]] 32 | 
policy = "RAT" 33 | probe = { enable = true, round_ms = 100 } 34 | nethint_level = 2 35 | auto_tune = 1000 36 | 37 | [simulator] 38 | nethint = true 39 | sample_interval_ns = 100_000_000 # 100ms 40 | loopback_speed = 400 41 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 42 | # fairness = "TenantFlowMaxMin" 43 | fairness = "PerFlowMaxMin" 44 | 45 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 46 | # background_flow_hard = { enable = false } 47 | # nethint_delay_ms = 100 48 | 49 | [brain] 50 | # Random seed for multiple uses 51 | seed = 1 52 | # Whether the cluster's bandwidth is asymmetric 53 | asymmetric = false 54 | # The percentage of nodes marked broken 55 | broken = 0.0 56 | # The slots of each physical machine 57 | max_slots = 1 58 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 59 | sharing_mode = "Guaranteed" 60 | guaranteed_bandwidth = 25 61 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 62 | # background_flow_high_freq = { enable = false } 63 | gc_period = 10 64 | 65 | # The topology for simulation 66 | [brain.topology] 67 | type = "Arbitrary" # another possible value is "FatTree" 68 | 69 | # [brain.topology.args] # When type = "FatTree" 70 | # nports = 20 # the number of ports of a switch 71 | # bandwidth = 100 # in Gbps 72 | # oversub_ratio = 4.0 # oversubscription ratio 73 | 74 | [brain.topology.args] # When type = "Arbitrary" 75 | nracks = 320 # the number of racks 76 | rack_size = 18 # the number of hosts under a rack 77 | host_bw = 100 # bandwidth of a host, in Gbps 78 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 79 | 80 | # [envs] 81 | # KEY = "value" 82 | -------------------------------------------------------------------------------- /scripts/testbed/setup_ovs.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 2 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | num_vfs=$2 17 | 18 | # Create an OVS bridge (here it's named ovs-sriov). 19 | ovs-vsctl add-br ovs-sriov 20 | 21 | # Enable hardware offload (disabled by default). 22 | ovs-vsctl set Open_vSwitch . other_config:hw-offload=true 23 | 24 | # The aging timeout of OVS is given is ms and can be controlled with this command: 25 | ovs-vsctl set Open_vSwitch . other_config:max-idle=30000 26 | 27 | # check the result 28 | ovs-vsctl get Open_vSwitch . other_config 29 | 30 | # Restart the openvswitch service. This step is required for HW offload changes to take effect. 31 | systemctl restart openvswitch-switch.service 32 | 33 | 34 | # Make sure to bring up the PF and representor netdevices. 35 | ovs-vsctl add-port ovs-sriov $pf 36 | for ((i=0;i<$num_vfs;i++)); do 37 | ovs-vsctl add-port ovs-sriov ${pf}_${i} 38 | done 39 | 40 | # show something 41 | ovs-vsctl list-ports ovs-sriov 42 | ovs-dpctl show 43 | 44 | 45 | # sudo ovs-appctl dpctl/dump-flows type=all -m 46 | # This will give results like below. All type of flows are displayed! 
47 | # recirc_id(0),in_port(2),eth(src=02:49:61:d4:70:e8,dst=02:bc:b6:ff:bf:97),eth_type(0x0800),ipv4(frag=no), packets:584316, bytes:36666200, used:4.940s, actions:3 48 | # recirc_id(0),in_port(2),eth(src=02:49:61:d4:70:e8,dst=02:bc:b6:ff:bf:97),eth_type(0x0806), packets:0, bytes:0, used:3.110s, actions:3 49 | # recirc_id(0),in_port(3),eth(src=02:bc:b6:ff:bf:97,dst=02:49:61:d4:70:e8),eth_type(0x0800),ipv4(frag=no), packets:29461402, bytes:34366302886, used:4.940s, actions:2 50 | # recirc_id(0),in_port(3),eth(src=02:bc:b6:ff:bf:97,dst=02:49:61:d4:70:e8),eth_type(0x0806), packets:2, bytes:120, used:2.090s, actions:2 51 | # recirc_id(0),in_port(3),eth(src=1c:34:da:a5:55:94,dst=01:80:c2:00:00:0e),eth_type(0x88cc), packets:0, bytes:0, used:4.340s, actions:drop 52 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1b:06,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.162,tip=192.168.211.2,op=1/0xff), packets:2120, bytes:127200, used:0.141s, actions:1,2 53 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1a:4e,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.130,tip=192.168.211.2,op=1/0xff), packets:2121, bytes:127260, used:0.633s, actions:1,2 54 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1b:5a,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.194,tip=192.168.211.2,op=1/0xff), packets:2120, bytes:127200, used:0.401s, actions:1,2 55 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1b:26,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.34,tip=192.168.211.2,op=1/0xff), packets:3120, bytes:187200, used:0.721s, actions:1,2 56 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1a:4a,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.66,tip=192.168.211.2,op=1/0xff), packets:2118, bytes:127080, used:0.845s, actions:1,2 57 | -------------------------------------------------------------------------------- /src/nhagent/src/message.rs: -------------------------------------------------------------------------------- 1 | use 
crate::sampler::EthAddr; 2 | 3 | use nethint::{TenantId, hint::{NetHintV1Real, NetHintV2Real, NetHintVersion}}; 4 | use nethint::counterunit::CounterUnit; 5 | use nethint::cluster::LinkIx; 6 | use serde::{Deserialize, Serialize}; 7 | use std::collections::HashMap; 8 | use std::net::IpAddr; 9 | use crate::communicator::BcastId; 10 | use crate::timing::TimeList; 11 | 12 | #[derive(Debug, Serialize, Deserialize)] 13 | pub enum Message { 14 | // do not handle these messages for now 15 | // /// send by worker, processed by leader 16 | // LeaveNode(Node), 17 | /// send by leader, processed by worker 18 | AppFinish, 19 | 20 | /// send by non global leader, processed by global leader 21 | /// barrier ID 22 | SyncRequest(u64), 23 | 24 | /// send by global leader, processed by non global leader 25 | /// barrier ID 26 | SyncResponse(u64), 27 | 28 | /// broadcast type wrapper 29 | BcastMessage(BcastId, Box<Message>), 30 | 31 | /// send by worker, processed by worker 32 | /// declare the table to map ethaddr to hostname collected locally 33 | DeclareEthHostTable(HashMap<EthAddr, String>), 34 | 35 | /// send by worker, processed by worker 36 | /// declare the table to map ip addr to hostname collected locally 37 | DeclareIpHostTable(HashMap<IpAddr, String>), 38 | 39 | /// send by worker, processed by worker 40 | DeclareHostname(String), 41 | 42 | /// send by worker, processed by rack leader 43 | ServerChunk(Vec<CounterUnit>, TimeList), 44 | /// A potential problem here is that LinkIx from different machines may not be compatible 45 | /// send by rack leader, processed by rack leader 46 | RackChunk(HashMap<LinkIx, Vec<CounterUnit>>, TimeList), 47 | /// send by rack leader, processed by worker 48 | AllHints(HashMap<LinkIx, Vec<CounterUnit>>), 49 | 50 | /// send by experiment scheduler, processed by rack leader, 51 | /// forward by rack leader, processed by global leader 52 | /// in practice, we skip the forwarding pass 53 | /// tenant_id, nhosts, allow_delay 54 | ProvisionRequest(TenantId, usize, bool), 55 | /// send by global leader, processed by rack leader 56 | /// forward by rack 
leader, processed by experiment scheduler 57 | /// in practice, we skip the forwarding pass 58 | /// tenant_id, hintv1 59 | ProvisionResponse(TenantId, NetHintV1Real), 60 | /// send by app, processed by global leader 61 | DestroyRequest(TenantId), 62 | /// send by global leader, processed by app 63 | DestroyResponse(TenantId), 64 | /// send by app, processed by rack/global leader leader 65 | NetHintRequest(TenantId, NetHintVersion, TimeList), 66 | /// send by rack/global leader, processed by app 67 | NetHintResponseV1(TenantId, NetHintV1Real), 68 | /// send by rack/global leader, processed by app 69 | NetHintResponseV2(TenantId, NetHintV2Real, TimeList), 70 | /// send by global leader, processed by all 71 | UpdateRateLimit(usize), 72 | } 73 | -------------------------------------------------------------------------------- /evaluation/sensitivity/sensitivity_probing_cost1_baseline.toml: -------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | # job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 7 | 8 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 9 | buffer_size = 100_000_000 10 | 11 | # Number of iterations for all jobs 12 | num_iterations = 1000 13 | 14 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 15 | poisson_lambda = 24_000_000_000.0 16 | 17 | placement_strategy = { type = "Compact" } 18 | # placement_strategy = { type = "CompactLoadBalanced" } 19 | # placement_strategy = { type = "Spread" } 20 | # placement_strategy = { type = "Random", args = 0 } 21 | 22 | # global seed 23 | seed = 1 24 | 25 | # Output path of for the simulation results 26 | directory = "/tmp/sensitivity_probing_cost1/sensitivity_probing_cost1_baseline" 27 | 28 | # Number of repeats for each batch of experiments 29 | batch_repeat = 
5 30 | 31 | [[batch]] 32 | policy = "Random" 33 | probe = { enable = false } 34 | nethint_level = 2 35 | 36 | [[batch]] 37 | policy = "RAT" 38 | probe = { enable = false } 39 | nethint_level = 2 40 | # Auto tune after some iterations. default is disabled 41 | auto_tune = 10 42 | 43 | [simulator] 44 | nethint = true 45 | sample_interval_ns = 100_000_000 # 100ms 46 | loopback_speed = 400 47 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 48 | # fairness = "TenantFlowMaxMin" 49 | fairness = "PerFlowMaxMin" 50 | 51 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 52 | # background_flow_hard = { enable = false } 53 | # nethint_delay_ms = 100 54 | 55 | [brain] 56 | # Random seed for multiple uses 57 | seed = 1 58 | # Whether the cluster's bandwidth is asymmetric 59 | asymmetric = false 60 | # The percentage of nodes marked broken 61 | broken = 0.0 62 | # The slots of each physical machine 63 | max_slots = 1 64 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 65 | sharing_mode = "Guaranteed" 66 | guaranteed_bandwidth = 25 67 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 68 | # background_flow_high_freq = { enable = false } 69 | gc_period = 10 70 | 71 | # The topology for simulation 72 | [brain.topology] 73 | type = "Arbitrary" # another possible value is "FatTree" 74 | 75 | # [brain.topology.args] # When type = "FatTree" 76 | # nports = 20 # the number of ports of a switch 77 | # bandwidth = 100 # in Gbps 78 | # oversub_ratio = 4.0 # oversubscription ratio 79 | 80 | [brain.topology.args] # When type = "Arbitrary" 81 | nracks = 320 # the number of racks 82 | rack_size = 18 # the number of hosts under a rack 83 | host_bw = 100 # bandwidth of a host, in Gbps 84 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 85 | 86 | # 
[envs] 87 | # KEY = "value" 88 | -------------------------------------------------------------------------------- /evaluation/inaccuracy/inaccuracy1_base.toml: -------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/inaccuracy1/inaccuracy1_base" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | [[batch]] 36 | policy = "TopologyAware" # which is ring 37 | probe = { enable = false } 38 | nethint_level = 1 39 | 40 | [[batch]] 41 | policy = "RAT" 42 | probe = { enable = false } 43 | nethint_level = 2 44 | # Auto tune after some iterations. 
default is disabled 45 | auto_tune = 10 46 | 47 | [simulator] 48 | nethint = true 49 | sample_interval_ns = 100_000_000 # 100ms 50 | loopback_speed = 400 51 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 52 | # fairness = "TenantFlowMaxMin" 53 | fairness = "PerFlowMaxMin" 54 | 55 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 56 | # background_flow_hard = { enable = false } 57 | # nethint_delay_ms = 100 58 | 59 | [brain] 60 | # Random seed for multiple uses 61 | seed = 1 62 | # Whether the cluster's bandwidth is asymmetric 63 | asymmetric = false 64 | # The percentage of nodes marked broken 65 | broken = 0.0 66 | # The slots of each physical machine 67 | max_slots = 1 68 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 69 | sharing_mode = "Guaranteed" 70 | guaranteed_bandwidth = 25 71 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 72 | # background_flow_high_freq = { enable = false } 73 | gc_period = 10 74 | # inaccuracy 75 | inaccuracy = 0.1 76 | 77 | # The topology for simulation 78 | [brain.topology] 79 | type = "Arbitrary" # another possible value is "FatTree" 80 | 81 | # [brain.topology.args] # When type = "FatTree" 82 | # nports = 20 # the number of ports of a switch 83 | # bandwidth = 100 # in Gbps 84 | # oversub_ratio = 4.0 # oversubscription ratio 85 | 86 | [brain.topology.args] # When type = "Arbitrary" 87 | nracks = 320 # the number of racks 88 | rack_size = 6 # the number of hosts under a rack 89 | host_bw = 100 # bandwidth of a host, in Gbps 90 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 91 | 92 | # [envs] 93 | # KEY = "value" 94 | -------------------------------------------------------------------------------- /evaluation/sensitivity/sensitivity_oversub_base.toml: 
-------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/sensitivity_oversub/sensitivity_oversub_base" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | # [[batch]] 36 | # policy = "RAT" 37 | # probe = { enable = true, round_ms = 100 } 38 | # nethint_level = 2 39 | # auto_tune = 1000 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = false } 44 | nethint_level = 2 45 | # Auto tune after some iterations. 
default is disabled 46 | auto_tune = 10 47 | 48 | [simulator] 49 | nethint = true 50 | sample_interval_ns = 100_000_000 # 100ms 51 | loopback_speed = 400 52 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 53 | # fairness = "TenantFlowMaxMin" 54 | fairness = "PerFlowMaxMin" 55 | 56 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 57 | # background_flow_hard = { enable = false } 58 | # nethint_delay_ms = 100 59 | 60 | [brain] 61 | # Random seed for multiple uses 62 | seed = 1 63 | # Whether the cluster's bandwidth is asymmetric 64 | asymmetric = false 65 | # The percentage of nodes marked broken 66 | broken = 0.0 67 | # The slots of each physical machine 68 | max_slots = 1 69 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 70 | sharing_mode = "Guaranteed" 71 | guaranteed_bandwidth = 25 72 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 73 | # background_flow_high_freq = { enable = false } 74 | gc_period = 10 75 | 76 | # The topology for simulation 77 | [brain.topology] 78 | type = "Arbitrary" # another possible value is "FatTree" 79 | 80 | # [brain.topology.args] # When type = "FatTree" 81 | # nports = 20 # the number of ports of a switch 82 | # bandwidth = 100 # in Gbps 83 | # oversub_ratio = 4.0 # oversubscription ratio 84 | 85 | [brain.topology.args] # When type = "Arbitrary" 86 | nracks = 320 # the number of racks 87 | rack_size = 6 # the number of hosts under a rack 88 | host_bw = 100 # bandwidth of a host, in Gbps 89 | rack_bw = 0 # bandwidth of a ToR switch, in Gbps 90 | 91 | # [envs] 92 | # KEY = "value" 93 | -------------------------------------------------------------------------------- /evaluation/sensitivity/sensitivity_rack_size_base.toml: 
-------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 8], [80, 12]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/sensitivity_rack_size/sensitivity_rack_size_base" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | # [[batch]] 36 | # policy = "RAT" 37 | # probe = { enable = true, round_ms = 100 } 38 | # nethint_level = 2 39 | # auto_tune = 1000 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = false } 44 | nethint_level = 2 45 | # Auto tune after some iterations. 
default is disabled 46 | auto_tune = 10 47 | 48 | [simulator] 49 | nethint = true 50 | sample_interval_ns = 100_000_000 # 100ms 51 | loopback_speed = 400 52 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 53 | # fairness = "TenantFlowMaxMin" 54 | fairness = "PerFlowMaxMin" 55 | 56 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 57 | # background_flow_hard = { enable = false } 58 | # nethint_delay_ms = 100 59 | 60 | [brain] 61 | # Random seed for multiple uses 62 | seed = 1 63 | # Whether the cluster's bandwidth is asymmetric 64 | asymmetric = false 65 | # The percentage of nodes marked broken 66 | broken = 0.0 67 | # The slots of each physical machine 68 | max_slots = 1 69 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 70 | sharing_mode = "Guaranteed" 71 | guaranteed_bandwidth = 25 72 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 73 | # background_flow_high_freq = { enable = false } 74 | gc_period = 10 75 | 76 | # The topology for simulation 77 | [brain.topology] 78 | type = "Arbitrary" # another possible value is "FatTree" 79 | 80 | # [brain.topology.args] # When type = "FatTree" 81 | # nports = 20 # the number of ports of a switch 82 | # bandwidth = 100 # in Gbps 83 | # oversub_ratio = 4.0 # oversubscription ratio 84 | 85 | [brain.topology.args] # When type = "Arbitrary" 86 | nracks = 320 # the number of racks 87 | rack_size = 0 # the number of hosts under a rack 88 | host_bw = 100 # bandwidth of a host, in Gbps 89 | rack_bw = 0 # bandwidth of a ToR switch, in Gbps 90 | 91 | # [envs] 92 | # KEY = "value" 93 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/standard3.toml: -------------------------------------------------------------------------------- 1 
| # Specification of an Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 30 5 | 6 | # Job size distributions [(percentage, number of workers)] 7 | job_size_distribution = [[40, 16], [40, 32]] 8 | 9 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 10 | buffer_size = 100_000_000 11 | 12 | # Number of iterations for all jobs 13 | num_iterations = 1000 14 | 15 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 16 | poisson_lambda = 24_000_000_000.0 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # global seed 24 | seed = 1 25 | 26 | # Output path for the simulation results 27 | directory = "/tmp/allreduce_result_for_paper/standard3" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 0 36 | 37 | [[batch]] 38 | policy = "RAT" 39 | probe = { enable = true, round_ms = 100 } 40 | nethint_level = 2 41 | auto_tune = 1000 42 | 43 | [[batch]] 44 | policy = "RAT" 45 | probe = { enable = false } 46 | nethint_level = 2 47 | # Auto tune after some iterations.
default is disabled 48 | auto_tune = 10 49 | 50 | [simulator] 51 | nethint = true 52 | sample_interval_ns = 100_000_000 # 100ms 53 | loopback_speed = 400 54 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 55 | # fairness = "TenantFlowMaxMin" 56 | fairness = "PerFlowMaxMin" 57 | 58 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 59 | # background_flow_hard = { enable = false } 60 | # nethint_delay_ms = 100 61 | 62 | [brain] 63 | # Random seed for multiple uses 64 | seed = 1 65 | # Whether the cluster's bandwidth is asymmetric 66 | asymmetric = false 67 | # The percentage of nodes marked broken 68 | broken = 0.0 69 | # The slots of each physical machine 70 | max_slots = 1 71 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 72 | sharing_mode = "Guaranteed" 73 | # in Gbps 74 | guaranteed_bandwidth = 25 75 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 76 | # background_flow_high_freq = { enable = false } 77 | gc_period = 100 78 | 79 | # The topology for simulation 80 | [brain.topology] 81 | type = "Arbitrary" # another possible value is "FatTree" 82 | 83 | # [brain.topology.args] # When type = "FatTree" 84 | # nports = 20 # the number of ports of a switch 85 | # bandwidth = 100 # in Gbps 86 | # oversub_ratio = 4.0 # oversubscription ratio 87 | 88 | [brain.topology.args] # When type = "Arbitrary" 89 | nracks = 300 # the number of racks 90 | rack_size = 18 # the number of hosts under a rack 91 | host_bw = 100 # bandwidth of a host, in Gbps 92 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 93 | 94 | # [envs] 95 | # KEY = "value" 96 | -------------------------------------------------------------------------------- /evaluation/inaccuracy/inaccuracy2_base.toml: -------------------------------------------------------------------------------- 1 | 
# Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[50, 8], [50, 16]] 6 | 7 | # Images in a ML serving batch request, batch size = 64, each image has 3 channels, crop to 256x256 8 | buffer_size = 12582912 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 1000 12 | 13 | # Lambda of the poisson arrival, 12582912 * 8 / 50G * 4000 = 2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | partially_sync = false 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # global seed 24 | seed = 1 25 | 26 | # Output path of for the simulation results 27 | directory = "/tmp/inaccuracy2/inaccuracy2_base" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 2 36 | 37 | [[batch]] 38 | policy = "TopologyAware" # which is ring 39 | probe = { enable = false } 40 | nethint_level = 1 41 | 42 | [[batch]] 43 | policy = "RAT" 44 | probe = { enable = false } 45 | nethint_level = 2 46 | # Auto tune after some iterations. 
default is disabled 47 | auto_tune = 10 48 | 49 | [simulator] 50 | nethint = true 51 | sample_interval_ns = 100_000_000 # 100ms 52 | loopback_speed = 400 53 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 54 | # fairness = "TenantFlowMaxMin" 55 | fairness = "PerFlowMaxMin" 56 | 57 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 58 | # background_flow_hard = { enable = false } 59 | # nethint_delay_ms = 100 60 | 61 | [brain] 62 | # Random seed for multiple uses 63 | seed = 1 64 | # Whether the cluster's bandwidth is asymmetric 65 | asymmetric = false 66 | # The percentage of nodes marked broken 67 | broken = 0.0 68 | # The slots of each physical machine 69 | max_slots = 1 70 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 71 | sharing_mode = "Guaranteed" 72 | guaranteed_bandwidth = 25 73 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 74 | # background_flow_high_freq = { enable = false } 75 | gc_period = 10 76 | # inaccuracy 77 | inaccuracy = 0.1 78 | 79 | # The topology for simulation 80 | [brain.topology] 81 | type = "Arbitrary" # another possible value is "FatTree" 82 | 83 | # [brain.topology.args] # When type = "FatTree" 84 | # nports = 20 # the number of ports of a switch 85 | # bandwidth = 100 # in Gbps 86 | # oversub_ratio = 4.0 # oversubscription ratio 87 | 88 | [brain.topology.args] # When type = "Arbitrary" 89 | nracks = 320 # the number of racks 90 | rack_size = 6 # the number of hosts under a rack 91 | host_bw = 100 # bandwidth of a host, in Gbps 92 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 93 | 94 | # [envs] 95 | # KEY = "value" 96 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/standard3_pervm.toml: 
-------------------------------------------------------------------------------- 1 | # Specification of an Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 30 5 | 6 | # Job size distributions [(percentage, number of workers)] 7 | job_size_distribution = [[40, 16], [40, 32]] 8 | 9 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 10 | buffer_size = 100_000_000 11 | 12 | # Number of iterations for all jobs 13 | num_iterations = 1000 14 | 15 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 16 | poisson_lambda = 24_000_000_000.0 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # global seed 24 | seed = 1 25 | 26 | # Output path for the simulation results 27 | directory = "/tmp/allreduce_result_for_paper/standard3_pervm" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 0 36 | 37 | [[batch]] 38 | policy = "RAT" 39 | probe = { enable = true, round_ms = 100 } 40 | nethint_level = 2 41 | auto_tune = 1000 42 | 43 | [[batch]] 44 | policy = "RAT" 45 | probe = { enable = false } 46 | nethint_level = 2 47 | # Auto tune after some iterations.
default is disabled 48 | auto_tune = 10 49 | 50 | [simulator] 51 | nethint = true 52 | sample_interval_ns = 100_000_000 # 100ms 53 | loopback_speed = 400 54 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 55 | # fairness = "TenantFlowMaxMin" 56 | # fairness = "PerFlowMaxMin" 57 | fairness = "PerVmPairMaxMin" 58 | 59 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 60 | # background_flow_hard = { enable = false } 61 | # nethint_delay_ms = 100 62 | 63 | [brain] 64 | # Random seed for multiple uses 65 | seed = 1 66 | # Whether the cluster's bandwidth is asymmetric 67 | asymmetric = false 68 | # The percentage of nodes marked broken 69 | broken = 0.0 70 | # The slots of each physical machine 71 | max_slots = 1 72 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 73 | sharing_mode = "Guaranteed" 74 | # in Gbps 75 | guaranteed_bandwidth = 25 76 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 77 | # background_flow_high_freq = { enable = false } 78 | gc_period = 100 79 | 80 | # The topology for simulation 81 | [brain.topology] 82 | type = "Arbitrary" # another possible value is "FatTree" 83 | 84 | # [brain.topology.args] # When type = "FatTree" 85 | # nports = 20 # the number of ports of a switch 86 | # bandwidth = 100 # in Gbps 87 | # oversub_ratio = 4.0 # oversubscription ratio 88 | 89 | [brain.topology.args] # When type = "Arbitrary" 90 | nracks = 300 # the number of racks 91 | rack_size = 6 # the number of hosts under a rack 92 | host_bw = 100 # bandwidth of a host, in Gbps 93 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 94 | 95 | # [envs] 96 | # KEY = "value" 97 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/standard3_pertenant.toml: 
-------------------------------------------------------------------------------- 1 | # Specification of an Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 30 5 | 6 | # Job size distributions [(percentage, number of workers)] 7 | job_size_distribution = [[40, 16], [40, 32]] 8 | 9 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 10 | buffer_size = 100_000_000 11 | 12 | # Number of iterations for all jobs 13 | num_iterations = 1000 14 | 15 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 16 | poisson_lambda = 24_000_000_000.0 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # global seed 24 | seed = 1 25 | 26 | # Output path for the simulation results 27 | directory = "/tmp/allreduce_result_for_paper/standard3_pertenant" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 0 36 | 37 | [[batch]] 38 | policy = "RAT" 39 | probe = { enable = true, round_ms = 100 } 40 | nethint_level = 2 41 | auto_tune = 1000 42 | 43 | [[batch]] 44 | policy = "RAT" 45 | probe = { enable = false } 46 | nethint_level = 2 47 | # Auto tune after some iterations.
default is disabled 48 | auto_tune = 10 49 | 50 | [simulator] 51 | nethint = true 52 | sample_interval_ns = 100_000_000 # 100ms 53 | loopback_speed = 400 54 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 55 | fairness = "TenantFlowMaxMin" 56 | # fairness = "PerFlowMaxMin" 57 | # fairness = "PerVmPairMaxMin" 58 | 59 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 60 | # background_flow_hard = { enable = false } 61 | # nethint_delay_ms = 100 62 | 63 | [brain] 64 | # Random seed for multiple uses 65 | seed = 1 66 | # Whether the cluster's bandwidth is asymmetric 67 | asymmetric = false 68 | # The percentage of nodes marked broken 69 | broken = 0.0 70 | # The slots of each physical machine 71 | max_slots = 1 72 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 73 | sharing_mode = "Guaranteed" 74 | # in Gbps 75 | guaranteed_bandwidth = 25 76 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 77 | # background_flow_high_freq = { enable = false } 78 | gc_period = 100 79 | 80 | # The topology for simulation 81 | [brain.topology] 82 | type = "Arbitrary" # another possible value is "FatTree" 83 | 84 | # [brain.topology.args] # When type = "FatTree" 85 | # nports = 20 # the number of ports of a switch 86 | # bandwidth = 100 # in Gbps 87 | # oversub_ratio = 4.0 # oversubscription ratio 88 | 89 | [brain.topology.args] # When type = "Arbitrary" 90 | nracks = 300 # the number of racks 91 | rack_size = 18 # the number of hosts under a rack 92 | host_bw = 100 # bandwidth of a host, in Gbps 93 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 94 | 95 | # [envs] 96 | # KEY = "value" 97 | -------------------------------------------------------------------------------- /src/allreduce/testbed.toml: 
-------------------------------------------------------------------------------- 1 | # Specification of an Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 20 5 | 6 | allow_delay = true 7 | 8 | # Job size distributions [(percentage, number of workers)] 9 | job_size_distribution = [[40, 2], [80, 4], [90, 6]] 10 | 11 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 12 | buffer_size = 100_000_000 13 | 14 | # Number of iterations for all jobs 15 | num_iterations = 10 16 | 17 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 18 | poisson_lambda = 4_000_000_000.0 19 | 20 | placement_strategy = { type = "Compact" } 21 | 22 | # global seed 23 | seed = 1 24 | 25 | # Output path for the simulation results 26 | directory = "/tmp/allreduce_result" 27 | 28 | # Number of repeats for each batch of experiments 29 | batch_repeat = 1 30 | 31 | [[batch]] 32 | policy = "Random" 33 | probe = { enable = false } 34 | nethint_level = 0 35 | 36 | [[batch]] 37 | policy = "TopologyAware" # which is ring 38 | probe = { enable = true, round_ms = 10 } 39 | nethint_level = 1 40 | 41 | [[batch]] 42 | policy = "TopologyAware" # which is ring 43 | probe = { enable = false } 44 | nethint_level = 1 45 | 46 | [[batch]] 47 | policy = "RAT" 48 | probe = { enable = false } 49 | nethint_level = 2 50 | # Auto tune after some iterations.
default is disabled 51 | auto_tune = 10 52 | 53 | [simulator] 54 | nethint = true 55 | sample_interval_ns = 100_000_000 # 100ms 56 | loopback_speed = 400 57 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 58 | # fairness = "TenantFlowMaxMin" 59 | fairness = "PerFlowMaxMin" 60 | 61 | background_flow_hard = { enable = true, frequency_ns = 1_000_000_000, probability = 0.1, amplitude = 1 } 62 | # background_flow_hard = { enable = false } 63 | # nethint_delay_ms = 100 64 | 65 | # These fields below are ignored 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 2 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 100 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum4_base.toml: 
-------------------------------------------------------------------------------- 1 | # background=0.64s, poisson=2.4s, 128 iterations, 0.032s each iter, 2 | # alpha from 0.05 to 6.4 (0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4) 3 | # corresponding auto_tune: 1, 2, 4, 8, 16, (20, alpha=1), 32, 64, 128 4 | 5 | # Number of jobs 6 | ncases = 30 7 | 8 | # Job size distributions [(percentage, number of workers)] 9 | job_size_distribution = [[80, 16], [80, 32]] 10 | 11 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 12 | buffer_size = 100_000_000 13 | 14 | # Number of iterations for all jobs 15 | num_iterations = 128 16 | 17 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2*1.28s 18 | poisson_lambda = 24_00_000_000.0 19 | 20 | placement_strategy = { type = "Compact" } 21 | # placement_strategy = { type = "CompactLoadBalanced" } 22 | # placement_strategy = { type = "Spread" } 23 | # placement_strategy = { type = "Random", args = 0 } 24 | 25 | # global seed 26 | seed = 1 27 | 28 | # Output path of for the simulation results 29 | directory = "/tmp/spectrum4/spectrum4_1" 30 | 31 | # Number of repeats for each batch of experiments 32 | batch_repeat = 5 33 | 34 | [[batch]] 35 | policy = "Random" 36 | probe = { enable = false } 37 | nethint_level = 2 38 | 39 | [[batch]] 40 | policy = "TopologyAware" # which is ring 41 | probe = { enable = false } 42 | nethint_level = 1 43 | 44 | [[batch]] 45 | policy = "RAT" 46 | probe = { enable = false } 47 | nethint_level = 2 48 | # Auto tune after some iterations. 
default is disabled 49 | auto_tune = 10 50 | 51 | [simulator] 52 | nethint = true 53 | sample_interval_ns = 100_000_000 # 100ms 54 | loopback_speed = 400 55 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 56 | # fairness = "TenantFlowMaxMin" 57 | fairness = "PerFlowMaxMin" 58 | 59 | background_flow_hard = { enable = true, frequency_ns = 640_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 60 | # background_flow_hard = { enable = false } 61 | # nethint_delay_ms = 100 62 | 63 | [brain] 64 | # Random seed for multiple uses 65 | seed = 1 66 | # Whether the cluster's bandwidth is asymmetric 67 | asymmetric = false 68 | # The percentage of nodes marked broken 69 | broken = 0.0 70 | # The slots of each physical machine 71 | max_slots = 1 72 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 73 | sharing_mode = "Guaranteed" 74 | guaranteed_bandwidth = 25 75 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 76 | # background_flow_high_freq = { enable = false } 77 | gc_period = 10 78 | 79 | # The topology for simulation 80 | [brain.topology] 81 | type = "Arbitrary" # another possible value is "FatTree" 82 | 83 | # [brain.topology.args] # When type = "FatTree" 84 | # nports = 20 # the number of ports of a switch 85 | # bandwidth = 100 # in Gbps 86 | # oversub_ratio = 4.0 # oversubscription ratio 87 | 88 | [brain.topology.args] # When type = "Arbitrary" 89 | nracks = 320 # the number of racks 90 | rack_size = 6 # the number of hosts under a rack 91 | host_bw = 100 # bandwidth of a host, in Gbps 92 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 93 | 94 | # [envs] 95 | # KEY = "value" 96 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum3_base.toml: -------------------------------------------------------------------------------- 1 | # 
background=0.4s, poisson=2.6s, 1280 iterations, 0.002s each iter, 2 | # alpha from 0.05 to 6.4 (0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4) 3 | # corresponding auto_tune: 10, 20, 40, 80, 160, (200, alpha=1), 320, 640, 1280 4 | 5 | # Number of jobs 6 | ncases = 30 7 | 8 | # Job size distributions [(percentage, number of workers)] 9 | job_size_distribution = [[50, 8], [50, 16]] 10 | 11 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 12 | buffer_size = 12582912 13 | 14 | # Number of iterations for all jobs 15 | num_iterations = 1280 16 | 17 | # Lambda of the poisson arrival, 12582912 * 8 / 50G * 1280 = 2.56s 18 | poisson_lambda = 2_600_000_000.0 19 | 20 | partially_sync = false 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path of for the simulation results 31 | directory = "/tmp/spectrum3/spectrum3_1" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 2 40 | 41 | [[batch]] 42 | policy = "TopologyAware" # which is ring 43 | probe = { enable = false } 44 | nethint_level = 1 45 | 46 | [[batch]] 47 | policy = "RAT" 48 | probe = { enable = false } 49 | nethint_level = 2 50 | # Auto tune after some iterations. 
default is disabled 51 | auto_tune = 10 52 | 53 | [simulator] 54 | nethint = true 55 | sample_interval_ns = 100_000_000 # 100ms 56 | loopback_speed = 400 57 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 58 | # fairness = "TenantFlowMaxMin" 59 | fairness = "PerFlowMaxMin" 60 | 61 | background_flow_hard = { enable = true, frequency_ns = 400_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 62 | # background_flow_hard = { enable = false } 63 | # nethint_delay_ms = 100 64 | 65 | [brain] 66 | # Random seed for multiple uses 67 | seed = 1 68 | # Whether the cluster's bandwidth is asymmetric 69 | asymmetric = false 70 | # The percentage of nodes marked broken 71 | broken = 0.0 72 | # The slots of each physical machine 73 | max_slots = 1 74 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 75 | sharing_mode = "Guaranteed" 76 | guaranteed_bandwidth = 25 77 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 78 | # background_flow_high_freq = { enable = false } 79 | gc_period = 10 80 | 81 | # The topology for simulation 82 | [brain.topology] 83 | type = "Arbitrary" # another possible value is "FatTree" 84 | 85 | # [brain.topology.args] # When type = "FatTree" 86 | # nports = 20 # the number of ports of a switch 87 | # bandwidth = 100 # in Gbps 88 | # oversub_ratio = 4.0 # oversubscription ratio 89 | 90 | [brain.topology.args] # When type = "Arbitrary" 91 | nracks = 320 # the number of racks 92 | rack_size = 6 # the number of hosts under a rack 93 | host_bw = 100 # bandwidth of a host, in Gbps 94 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 95 | 96 | # [envs] 97 | # KEY = "value" 98 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum1_base.toml: -------------------------------------------------------------------------------- 1 | # 
Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 14 | poisson_lambda = 24_00_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/spectrum1/spectrum1_1" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | [[batch]] 36 | policy = "TopologyAware" # which is ring 37 | probe = { enable = false } 38 | nethint_level = 1 39 | 40 | [[batch]] 41 | policy = "RAT" 42 | probe = { enable = false } 43 | nethint_level = 2 44 | # Auto tune after some iterations. default is disabled 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations. 
default is disabled 52 | auto_tune = 10 53 | auto_fallback = true 54 | alpha = 1.0 55 | 56 | [simulator] 57 | nethint = true 58 | sample_interval_ns = 100_000_000 # 100ms 59 | loopback_speed = 400 60 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 61 | # fairness = "TenantFlowMaxMin" 62 | fairness = "PerFlowMaxMin" 63 | 64 | background_flow_hard = { enable = true, frequency_ns = 2_00_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 65 | # background_flow_hard = { enable = false } 66 | # nethint_delay_ms = 100 67 | 68 | [brain] 69 | # Random seed for multiple uses 70 | seed = 1 71 | # Whether the cluster's bandwidth is asymmetric 72 | asymmetric = false 73 | # The percentage of nodes marked broken 74 | broken = 0.0 75 | # The slots of each physical machine 76 | max_slots = 1 77 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 78 | sharing_mode = "Guaranteed" 79 | guaranteed_bandwidth = 25 80 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 81 | # background_flow_high_freq = { enable = false } 82 | gc_period = 10 83 | 84 | # The topology for simulation 85 | [brain.topology] 86 | type = "Arbitrary" # another possible value is "FatTree" 87 | 88 | # [brain.topology.args] # When type = "FatTree" 89 | # nports = 20 # the number of ports of a switch 90 | # bandwidth = 100 # in Gbps 91 | # oversub_ratio = 4.0 # oversubscription ratio 92 | 93 | [brain.topology.args] # When type = "Arbitrary" 94 | nracks = 320 # the number of racks 95 | rack_size = 6 # the number of hosts under a rack 96 | host_bw = 100 # bandwidth of a host, in Gbps 97 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 98 | 99 | # [envs] 100 | # KEY = "value" 101 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum2_base.toml: 
-------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 14 | poisson_lambda = 24_00_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/spectrum2/spectrum2_1" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | [[batch]] 36 | policy = "TopologyAware" # which is ring 37 | probe = { enable = false } 38 | nethint_level = 1 39 | 40 | [[batch]] 41 | policy = "RAT" 42 | probe = { enable = false } 43 | nethint_level = 2 44 | # Auto tune after some iterations. default is disabled 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations. 
default is disabled 52 | auto_tune = 10 53 | auto_fallback = true 54 | alpha = 1.0 55 | 56 | [simulator] 57 | nethint = true 58 | sample_interval_ns = 100_000_000 # 100ms 59 | loopback_speed = 400 60 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 61 | # fairness = "TenantFlowMaxMin" 62 | fairness = "PerFlowMaxMin" 63 | 64 | background_flow_hard = { enable = true, frequency_ns = 2_00_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 65 | # background_flow_hard = { enable = false } 66 | # nethint_delay_ms = 100 67 | 68 | [brain] 69 | # Random seed for multiple uses 70 | seed = 1 71 | # Whether the cluster's bandwidth is asymmetric 72 | asymmetric = false 73 | # The percentage of nodes marked broken 74 | broken = 0.0 75 | # The slots of each physical machine 76 | max_slots = 1 77 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 78 | sharing_mode = "Guaranteed" 79 | guaranteed_bandwidth = 25 80 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 81 | # background_flow_high_freq = { enable = false } 82 | gc_period = 10 83 | 84 | # The topology for simulation 85 | [brain.topology] 86 | type = "Arbitrary" # another possible value is "FatTree" 87 | 88 | # [brain.topology.args] # When type = "FatTree" 89 | # nports = 20 # the number of ports of a switch 90 | # bandwidth = 100 # in Gbps 91 | # oversub_ratio = 4.0 # oversubscription ratio 92 | 93 | [brain.topology.args] # When type = "Arbitrary" 94 | nracks = 320 # the number of racks 95 | rack_size = 6 # the number of hosts under a rack 96 | host_bw = 100 # bandwidth of a host, in Gbps 97 | rack_bw = 400 # bandwidth of a ToR switch, in Gbps 98 | 99 | # [envs] 100 | # KEY = "value" 101 | -------------------------------------------------------------------------------- /evaluation/herd_behavior/allreduce_herd_base.toml: 
-------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (60s, 0.5, 5), max_slots = 1 2 | # Specifiation of a Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 40 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | # job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | job_size_distribution = [[40, 12]] 13 | 14 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 15 | buffer_size = 100_000_000 16 | 17 | # Number of iterations for all jobs 18 | num_iterations = 30 19 | 20 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 21 | poisson_lambda = 1_000.0 22 | 23 | # placement_strategy = { type = "Compact" } 24 | # placement_strategy = { type = "CompactLoadBalanced" } 25 | # placement_strategy = { type = "Spread" } 26 | placement_strategy = { type = "Random", args = 0 } 27 | 28 | # global seed 29 | seed = 1 30 | 31 | # Output path of for the simulation results 32 | directory = "/tmp/herd_behavior/allreduce_herd_base" 33 | 34 | # Number of repeats for each batch of experiments 35 | batch_repeat = 5 36 | 37 | [[batch]] 38 | policy = "Random" 39 | probe = { enable = false } 40 | nethint_level = 0 41 | 42 | [[batch]] 43 | policy = "RAT" 44 | probe = { enable = false } 45 | nethint_level = 2 46 | # Auto tune after some iterations. 
default is disabled 47 | auto_tune = 10 48 | 49 | [simulator] 50 | nethint = true 51 | sample_interval_ns = 100_000_000 # 100ms 52 | loopback_speed = 400 53 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 54 | # fairness = "TenantFlowMaxMin" 55 | fairness = "PerFlowMaxMin" 56 | 57 | # background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 58 | background_flow_hard = { enable = false } 59 | # nethint_delay_ms = 100 60 | 61 | [brain] 62 | # Random seed for multiple uses 63 | seed = 1 64 | # Whether the cluster's bandwidth is asymmetric 65 | asymmetric = false 66 | # The percentage of nodes marked broken 67 | broken = 0.0 68 | # The slots of each physical machine 69 | max_slots = 1 70 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 71 | sharing_mode = "Guaranteed" 72 | # in Gbps 73 | guaranteed_bandwidth = 25 74 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 75 | background_flow_high_freq = { enable = false } 76 | gc_period = 100 77 | 78 | # The topology for simulation 79 | [brain.topology] 80 | type = "Arbitrary" # another possible value is "FatTree" 81 | 82 | # [brain.topology.args] # When type = "FatTree" 83 | # nports = 20 # the number of ports of a switch 84 | # bandwidth = 100 # in Gbps 85 | # oversub_ratio = 4.0 # oversubscription ratio 86 | 87 | [brain.topology.args] # When type = "Arbitrary" 88 | nracks = 9 # the number of racks 89 | rack_size = 40 # the number of hosts under a rack 90 | host_bw = 100 # bandwidth of a host, in Gbps 91 | rack_bw = 1333 # bandwidth of a ToR switch, in Gbps 92 | 93 | # [envs] 94 | # KEY = "value" 95 | -------------------------------------------------------------------------------- /src/rl/testbed.toml: -------------------------------------------------------------------------------- 1 | # Specifiation of a 
Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 30 5 | 6 | allow_delay = true 7 | 8 | # Job size distributions [(percentage, number of workers)] 9 | job_size_distribution = [[40, 2], [80, 4], [90, 6]] 10 | # job_size_distribution = [[40, 6]] 11 | 12 | # Buffer size of all jobs, in bytes 13 | buffer_size = 100_000_000 14 | 15 | # Number of iterations for all jobs 16 | num_iterations = 10 17 | 18 | # Lambda of the poisson arrival, 2*100MB/25Gbps*120 = 7.68s 19 | # poisson_lambda = 8_000_000_000.0 20 | poisson_lambda = 10_000_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path of for the simulation results 31 | directory = "/tmp/rl_result" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 1 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 2 40 | 41 | [[batch]] 42 | policy = "TopologyAware" # which is ring 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 1 45 | 46 | [[batch]] 47 | policy = "TopologyAware" # which is ring 48 | probe = { enable = false } 49 | nethint_level = 1 50 | 51 | [[batch]] 52 | policy = "RAT" 53 | probe = { enable = false } 54 | nethint_level = 2 55 | # Auto tune after some iterations. 
default is disabled 56 | auto_tune = 10 57 | 58 | [simulator] 59 | nethint = true 60 | sample_interval_ns = 100_000_000 # 100ms 61 | loopback_speed = 400 62 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 63 | # fairness = "TenantFlowMaxMin" 64 | fairness = "PerFlowMaxMin" 65 | 66 | background_flow_hard = { enable = true, frequency_ns = 1000_000_000_000, probability = 0.5, amplitude = 5, average_load = 0.1 } 67 | # background_flow_hard = { enable = false } 68 | # nethint_delay_ms = 100 69 | 70 | [brain] 71 | # Random seed for multiple uses 72 | seed = 1 73 | # Whether the cluster's bandwidth is asymmetric 74 | asymmetric = false 75 | # The percentage of nodes marked broken 76 | broken = 0.0 77 | # The slots of each physical machine 78 | max_slots = 1 79 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 80 | sharing_mode = "Guaranteed" 81 | guaranteed_bandwidth = 2.5 82 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 83 | # background_flow_high_freq = { enable = false } 84 | gc_period = 0 85 | 86 | # The topology for simulation 87 | [brain.topology] 88 | type = "Arbitrary" # another possible value is "FatTree" 89 | 90 | # [brain.topology.args] # When type = "FatTree" 91 | # nports = 20 # the number of ports of a switch 92 | # bandwidth = 100 # in Gbps 93 | # oversub_ratio = 4.0 # oversubscription ratio 94 | 95 | [brain.topology.args] # When type = "Arbitrary" 96 | nracks = 2 # the number of racks 97 | rack_size = 3 # the number of hosts under a rack 98 | host_bw = 10 # bandwidth of a host, in Gbps 99 | rack_bw = 10 # bandwidth of a ToR switch, in Gbps 100 | 101 | # [envs] 102 | # KEY = "value" 103 | -------------------------------------------------------------------------------- /src/mapreduce/src/argument.rs: -------------------------------------------------------------------------------- 1 | use 
nethint::architecture::TopoArgs; 2 | use structopt::StructOpt; 3 | 4 | use crate::{JobSpec, ShufflePattern}; 5 | 6 | #[derive(Debug, Clone, StructOpt)] 7 | #[structopt(name = "MapReduce", about = "MapReduce Application")] 8 | pub struct Opt { 9 | /// Specify the topology for simulation 10 | #[structopt(subcommand)] 11 | pub topo: TopoArgs, 12 | 13 | /// Asymmetric bandwidth 14 | #[structopt(short = "a", long = "asymmetric")] 15 | pub asym: bool, 16 | 17 | /// Probability distribution of shuffle flows, examples: uniform_1000000, zipf_1000000_0.5 18 | #[structopt( 19 | short = "s", 20 | long = "shuffle-pattern", 21 | name = "distribution", 22 | default_value = "uniform_1000000" 23 | )] 24 | pub shuffle: ShufflePattern, 25 | 26 | /// Number of map tasks. When using trace, this parameter means map scale factor 27 | #[structopt(short = "m", long = "map", default_value = "4")] 28 | pub num_map: usize, 29 | 30 | /// Number of reduce tasks. When using trace, this parameter means reduce scale factor 31 | #[structopt(short = "r", long = "reduce", default_value = "4")] 32 | pub num_reduce: usize, 33 | 34 | /// Number of testcases 35 | #[structopt(short = "n", long = "ncases", default_value = "10")] 36 | pub ncases: usize, 37 | 38 | /// Traffic scale, multiply the traffic size by a number to allow job overlaps 39 | #[structopt(short = "t", long = "traffic-scale", default_value = "1.0")] 40 | pub traffic_scale: f64, 41 | 42 | /// Run experiments from trace file 43 | #[structopt(short = "f", long = "file")] 44 | pub trace: Option, 45 | 46 | /// Output path of the figure 47 | #[structopt(short = "d", long = "directory")] 48 | pub directory: Option, 49 | 50 | /// Run simulation experiments in parallel, default using the hardware concurrency 51 | #[structopt(short = "P", long = "parallel", name = "nthreads")] 52 | pub parallel: Option, 53 | 54 | /// Normalize, draw speed up instead of absolution job completion time 55 | #[structopt(short = "N", long = "normalize")] 56 | pub 
normalize: bool, 57 | 58 | /// Inspect the trace file, see the overlap among multiple jobs 59 | #[structopt(long = "inspect")] 60 | pub inspect: bool, 61 | 62 | /// Multi-tenant 63 | #[structopt(long = "multitenant")] 64 | pub multitenant: bool, 65 | 66 | /// Nethint level. 67 | #[structopt(short = "l", long = "nethint_level", default_value = "1")] 68 | pub nethint_level: usize, 69 | 70 | /// Collocate or De-collocate 71 | #[structopt(short = "c", long = "collocate")] 72 | pub collocate: bool, 73 | 74 | /// Mark some nodes as Broken to be more realistic 75 | #[structopt(short = "b", long = "broken")] 76 | pub broken: bool, 77 | } 78 | 79 | impl Opt { 80 | pub fn to_filename(&self, prefix: &str) -> String { 81 | if let Some(_f) = self.trace.as_ref() { 82 | format!( 83 | "{}_{}_from_trace_m{}_r{}.pdf", 84 | prefix, self.topo, self.num_map, self.num_reduce 85 | ) 86 | } else { 87 | let job_spec = JobSpec::new(self.num_map, self.num_reduce, self.shuffle.clone()); 88 | format!("{}_{}_{}.pdf", prefix, self.topo, job_spec) 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum5_base.toml: -------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[50, 8], [50, 16]] 6 | 7 | # Images in a ML serving batch request, batch size = 64, each image has 3 channels, crop to 256x256 8 | buffer_size = 12582912 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 1000 12 | 13 | # Lambda of the poisson arrival, 12582912 * 8 / 50G * 4000 = 2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | partially_sync = false 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # 
global seed 24 | seed = 1 25 | 26 | # Output path of for the simulation results 27 | directory = "/tmp/spectrum5/spectrum5_1" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 2 36 | 37 | [[batch]] 38 | policy = "TopologyAware" # which is ring 39 | probe = { enable = false } 40 | nethint_level = 1 41 | 42 | [[batch]] 43 | policy = "RAT" 44 | probe = { enable = false } 45 | nethint_level = 2 46 | # Auto tune after some iterations. default is disabled 47 | auto_tune = 10 48 | 49 | [[batch]] 50 | policy = "RAT" 51 | probe = { enable = false } 52 | nethint_level = 2 53 | # Auto tune after some iterations. default is disabled 54 | auto_tune = 10 55 | auto_fallback = true 56 | alpha = 1.0 57 | 58 | [simulator] 59 | nethint = true 60 | sample_interval_ns = 100_000_000 # 100ms 61 | loopback_speed = 400 62 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 63 | # fairness = "TenantFlowMaxMin" 64 | fairness = "PerFlowMaxMin" 65 | 66 | background_flow_hard = { enable = true, frequency_ns = 2_00_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 67 | # background_flow_hard = { enable = false } 68 | # nethint_delay_ms = 100 69 | 70 | [brain] 71 | # Random seed for multiple uses 72 | seed = 1 73 | # Whether the cluster's bandwidth is asymmetric 74 | asymmetric = false 75 | # The percentage of nodes marked broken 76 | broken = 0.0 77 | # The slots of each physical machine 78 | max_slots = 1 79 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 80 | sharing_mode = "Guaranteed" 81 | guaranteed_bandwidth = 25 82 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 83 | # background_flow_high_freq = { enable = false } 84 | gc_period = 10 85 | 86 | # The topology for simulation 87 | 
[brain.topology] 88 | type = "Arbitrary" # another possible value is "FatTree" 89 | 90 | # [brain.topology.args] # When type = "FatTree" 91 | # nports = 20 # the number of ports of a switch 92 | # bandwidth = 100 # in Gbps 93 | # oversub_ratio = 4.0 # oversubscription ratio 94 | 95 | [brain.topology.args] # When type = "Arbitrary" 96 | nracks = 320 # the number of racks 97 | rack_size = 6 # the number of hosts under a rack 98 | host_bw = 100 # bandwidth of a host, in Gbps 99 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 100 | 101 | # [envs] 102 | # KEY = "value" 103 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum6_base.toml: -------------------------------------------------------------------------------- 1 | # Run the experiments from trace file 2 | trace = "../../src/mapreduce/FB2010-1Hr-150-0.txt" 3 | 4 | # Number of testcases to run 5 | ncases = 500 6 | 7 | # Number of map tasks and reduce tasks 8 | # When running from trace, these parameters become scale factors 9 | num_map = 1 10 | num_reduce = 1 11 | enable_computation_time = false 12 | 13 | # Multiply the traffic size by a number 14 | traffic_scale = 10.0 15 | 16 | # Mapper placement policy 17 | mapper_policy = { type = "Greedy" } 18 | 19 | placement_strategy = { type = "Compact" } 20 | 21 | # Output path of for the simulation results 22 | directory = "/tmp/spectrum6/spectrum6_1" 23 | 24 | # Whether to allow a mapper to collocate with a reduce 25 | collocate = true 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | reducer_policy = "Random" 32 | probe = { enable = false } 33 | # NetHint level, possible values are 0, 1, 2 34 | nethint_level = 1 35 | 36 | [[batch]] 37 | reducer_policy = "HierarchicalGreedyLevel1" 38 | probe = { enable = false } 39 | nethint_level = 1 40 | 41 | [[batch]] 42 | reducer_policy = "HierarchicalGreedyPaper" # please use this 43 | # reducer_policy = 
"HierarchicalGreedy" 44 | probe = { enable = false } 45 | nethint_level = 2 46 | 47 | [[batch]] 48 | reducer_policy = "HierarchicalGreedyPaper" # please use this 49 | # reducer_policy = "HierarchicalGreedy" 50 | probe = { enable = false } 51 | nethint_level = 2 52 | auto_fallback = true 53 | alpha = 1.0 54 | 55 | [simulator] 56 | nethint = true 57 | sample_interval_ns = 100_000_000 # 100ms 58 | loopback_speed = 400 59 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 60 | # fairness = "TenantFlowMaxMin" 61 | fairness = "PerFlowMaxMin" 62 | 63 | background_flow_hard = { enable = true, frequency_ns = 2_00_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 64 | # background_flow_hard = { enable = false } 65 | # nethint_delay_ms = 100 66 | 67 | [brain] 68 | # Random seed for multiple uses 69 | seed = 1 70 | # Whether the cluster's bandwidth is asymmetric 71 | asymmetric = false 72 | # The percentage of nodes marked broken 73 | broken = 0.1 74 | # The slots of each physical machine 75 | max_slots = 4 76 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 77 | sharing_mode = "Guaranteed" 78 | guaranteed_bandwidth = 25 79 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | # background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 150 # the number of racks 94 | rack_size = 18 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 600 # bandwidth of a ToR 
switch, in Gbps 97 | 98 | [envs] 99 | NETHINT_SHOW_DECISION = "/tmp/spectrum6/spectrum6_1/decision.txt" 100 | -------------------------------------------------------------------------------- /src/nhagent/src/timing.rs: -------------------------------------------------------------------------------- 1 | use serde::{Serialize, Deserialize}; 2 | use std::time::SystemTime; 3 | 4 | pub const ON_COLLECTED: &str = "OnCollected"; 5 | pub const ON_SAMPLED: &str = "OnSampled"; 6 | pub const ON_CHUNK_SENT: &str = "OnChunkSent"; 7 | pub const ON_ALL_RECEIVED: &str = "OnAllReceived"; 8 | pub const ON_TENANT_SENT_REQ: &str = "OnTenantSentRequest"; 9 | pub const ON_RECV_TENANT_REQ: &str = "OnRecvTenantRequest"; 10 | pub const ON_TENANT_RECV_RES: &str = "OnTenantRecvResponse"; 11 | 12 | #[derive(Debug, Clone, Serialize, Deserialize)] 13 | pub struct TimeRecord { 14 | pub stage: String, 15 | pub ts: SystemTime, 16 | } 17 | 18 | impl TimeRecord { 19 | pub fn new(stage: &str) -> Self { 20 | TimeRecord { 21 | stage: stage.to_owned(), 22 | ts: SystemTime::now(), 23 | } 24 | } 25 | 26 | pub fn with_ts(stage: &str, ts: SystemTime) -> Self { 27 | TimeRecord { 28 | stage: stage.to_owned(), 29 | ts, 30 | } 31 | } 32 | } 33 | 34 | #[derive(Debug, Clone, Default, Serialize, Deserialize)] 35 | pub struct TimeList { 36 | recs: Vec, 37 | } 38 | 39 | impl TimeList { 40 | pub fn new() -> Self { 41 | Default::default() 42 | } 43 | 44 | pub fn clear(&mut self) { 45 | self.recs.clear() 46 | } 47 | 48 | pub fn get(&self, stage: &str) -> Option { 49 | self.recs.iter().find(|x| x.stage == stage).cloned() 50 | } 51 | 52 | pub fn push(&mut self, stage: &str, ts: SystemTime) { 53 | self.recs.push(TimeRecord::with_ts(stage, ts)) 54 | } 55 | 56 | pub fn push_now(&mut self, stage: &str) { 57 | self.recs.push(TimeRecord::new(stage)); 58 | } 59 | 60 | pub fn update(&mut self, stage: &str, ts: SystemTime) { 61 | let e = self.recs.iter_mut().rfind(|x| x.stage == stage); 62 | if let Some(x) = e { 63 | x.ts = 
ts.max(x.ts); 64 | } else { 65 | self.recs.push(TimeRecord::with_ts(stage, ts)); 66 | } 67 | } 68 | 69 | pub fn update_now(&mut self, stage: &str) { 70 | self.update(stage, SystemTime::now()); 71 | } 72 | 73 | /// sync the latest corresponding element in `self` with `other`, 74 | /// if not exists, append that element to `self`. 75 | pub fn update_time_list(&mut self, other: &TimeList) { 76 | other.recs.iter().for_each(|o| self.update(&o.stage, o.ts)); 77 | } 78 | 79 | pub fn update_min(&mut self, stage: &str, other: &TimeList) { 80 | if let Some(o) = other.get(stage) { 81 | if let Some(e) = self.recs.iter_mut().rfind(|x| x.stage == stage) { 82 | e.ts = o.ts.min(e.ts); 83 | } else { 84 | self.recs.push(TimeRecord::with_ts(&o.stage, o.ts)); 85 | } 86 | } 87 | } 88 | } 89 | 90 | impl std::fmt::Display for TimeList { 91 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 92 | let mut rs = self.recs.clone(); 93 | rs.sort_by_key(|x| x.ts); 94 | if !rs.is_empty() { 95 | let eariest = rs[0].ts; 96 | for r in rs { 97 | writeln!(f, "{} {}", r.stage, r.ts.duration_since(eariest).unwrap().as_micros())?; 98 | } 99 | } 100 | writeln!(f, "\n") 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /evaluation/model_serving_configs/standard2.toml: -------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (60s, 0.5, 5), max_slots = 1 2 | # Specifiation of a Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[50, 8], [50, 16]] 9 | 10 | # Images in a ML serving batch request, batch size = 64, each image has 3 channels, crop to 256x256 11 | buffer_size = 12582912 12 | 13 | # Number of iterations for all jobs 14 | num_iterations = 4000 15 | 16 | # Lambda of the poisson arrival, 12582912 * 8 / 50G * 4000 = 8s 17 | poisson_lambda 
= 8_000_000_000.0 18 | 19 | partially_sync = false 20 | 21 | placement_strategy = { type = "Compact" } 22 | # placement_strategy = { type = "CompactLoadBalanced" } 23 | # placement_strategy = { type = "Spread" } 24 | # placement_strategy = { type = "Random", args = 0 } 25 | 26 | # global seed 27 | seed = 1 28 | 29 | # Output path of for the simulation results 30 | directory = "/tmp/model_serving_result_for_paper/standard2" 31 | 32 | # Number of repeats for each batch of experiments 33 | batch_repeat = 1 34 | 35 | [[batch]] 36 | policy = "Random" 37 | probe = { enable = false } 38 | nethint_level = 0 39 | 40 | # [[batch]] 41 | # policy = "RAT" 42 | # probe = { enable = true, round_ms = 100 } 43 | # nethint_level = 2 44 | # auto_tune = 1 45 | 46 | [[batch]] 47 | policy = "RAT" 48 | probe = { enable = true, round_ms = 100 } 49 | nethint_level = 2 50 | auto_tune = 4000 51 | 52 | [[batch]] 53 | policy = "RAT" 54 | probe = { enable = false } 55 | nethint_level = 2 56 | # Auto tune after some iterations. 
default is disabled 57 | auto_tune = 10 58 | 59 | [simulator] 60 | nethint = true 61 | sample_interval_ns = 100_000_000 # 100ms 62 | loopback_speed = 400 63 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 64 | # fairness = "TenantFlowMaxMin" 65 | fairness = "PerFlowMaxMin" 66 | 67 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 68 | # background_flow_hard = { enable = false } 69 | # nethint_delay_ms = 100 70 | 71 | [brain] 72 | # Random seed for multiple uses 73 | seed = 1 74 | # Whether the cluster's bandwidth is asymmetric 75 | asymmetric = false 76 | # The percentage of nodes marked broken 77 | broken = 0.0 78 | # The slots of each physical machine 79 | max_slots = 1 80 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 81 | sharing_mode = "Guaranteed" 82 | # in Gbps 83 | guaranteed_bandwidth = 25 84 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 85 | # background_flow_high_freq = { enable = false } 86 | gc_period = 100 87 | 88 | # The topology for simulation 89 | [brain.topology] 90 | type = "Arbitrary" # another possible value is "FatTree" 91 | 92 | # [brain.topology.args] # When type = "FatTree" 93 | # nports = 20 # the number of ports of a switch 94 | # bandwidth = 100 # in Gbps 95 | # oversub_ratio = 4.0 # oversubscription ratio 96 | 97 | [brain.topology.args] # When type = "Arbitrary" 98 | nracks = 300 # the number of racks 99 | rack_size = 18 # the number of hosts under a rack 100 | host_bw = 100 # bandwidth of a host, in Gbps 101 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 102 | 103 | # [envs] 104 | # KEY = "value" 105 | -------------------------------------------------------------------------------- /src/mapreduce/src/inspect.rs: -------------------------------------------------------------------------------- 1 | use 
crate::{ 2 | app::run_map_reduce, argument::Opt, trace::JobTrace, JobSpec, ReducerPlacementPolicy, 3 | ShufflePattern, 4 | }; 5 | use async_std::task; 6 | use futures::stream::StreamExt; 7 | use log::{debug, info}; 8 | use nethint::{cluster::Cluster, Duration, Timestamp, ToStdDuration}; 9 | use std::sync::Arc; 10 | 11 | #[derive(Debug, Clone, Copy)] 12 | pub struct JobLifetime { 13 | // start time (ns) of the job, grabbed from trace 14 | pub start: Timestamp, 15 | // duration of the job, simulated 16 | pub dura: Duration, 17 | } 18 | 19 | pub fn run_experiments(opt: &Opt, cluster: Arc) -> Option> { 20 | assert!(opt.trace.is_some(), "need to specify a trace file"); 21 | 22 | let num_cpus = opt.parallel.unwrap_or_else(num_cpus::get); 23 | 24 | let job_trace = opt.trace.as_ref().map(|p| { 25 | JobTrace::from_path(p) 26 | .unwrap_or_else(|e| panic!("failed to load from file: {:?}, error: {}", p, e)) 27 | }); 28 | 29 | assert!(job_trace.is_some()); 30 | 31 | task::block_on(async { 32 | let experiments = futures::stream::iter({ 33 | let ncases = std::cmp::min( 34 | opt.ncases, 35 | job_trace.as_ref().map(|v| v.count).unwrap_or(usize::MAX), 36 | ); 37 | (0..ncases).map(|i| { 38 | let id = i; 39 | let cluster = Arc::clone(&cluster); 40 | 41 | let (start_ts, job_spec) = job_trace 42 | .as_ref() 43 | .map(|job_trace| { 44 | let mut record = job_trace.records[id].clone(); 45 | // mutiple traffic by a number 46 | record.reducers = record 47 | .reducers 48 | .into_iter() 49 | .map(|(a, b)| (a, b * opt.traffic_scale)) 50 | .collect(); 51 | debug!("record: {:?}", record); 52 | let ts = record.ts; 53 | let job_spec = JobSpec::new( 54 | record.num_map * opt.num_map, 55 | record.num_reduce * opt.num_reduce, 56 | ShufflePattern::FromTrace(Box::new(record)), 57 | ); 58 | (ts, job_spec) 59 | }) 60 | .unwrap(); 61 | 62 | let policy = ReducerPlacementPolicy::HierarchicalGreedy; 63 | 64 | task::spawn(async move { 65 | info!("testcase: {}", id); 66 | let jct = run_map_reduce(&cluster, 
&job_spec, policy, id as _); 67 | // let time = output.recs.into_iter().map(|r| r.dura.unwrap()).max(); 68 | info!( 69 | "{:?}, job_finish_time: {:?}", 70 | policy, 71 | jct.unwrap().to_dura() 72 | ); 73 | Some(( 74 | i, 75 | JobLifetime { 76 | start: start_ts * 1_000_000, 77 | dura: jct.unwrap(), 78 | }, 79 | )) 80 | }) 81 | }) 82 | }) 83 | .buffer_unordered(num_cpus) 84 | .collect::>>(); 85 | experiments.await.into_iter().collect() 86 | }) 87 | } 88 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/background_off.toml: -------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: off, max_slots = 1 2 | # Specifiation of a Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | 13 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 14 | buffer_size = 100_000_000 15 | 16 | # Number of iterations for all jobs 17 | num_iterations = 100 18 | 19 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 20 | poisson_lambda = 24_00_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path of for the simulation results 31 | directory = "/tmp/allreduce_result_for_paper/background_off" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | 
[[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 0 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 2 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations. default is disabled 52 | auto_tune = 10 53 | 54 | [simulator] 55 | nethint = true 56 | sample_interval_ns = 100_000_000 # 100ms 57 | loopback_speed = 400 58 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 59 | # fairness = "TenantFlowMaxMin" 60 | fairness = "PerFlowMaxMin" 61 | 62 | # background_flow_hard = { enable = true, frequency_ns = 1_000_000_000, probability = 0.1, amplitude = 1 } 63 | background_flow_hard = { enable = false } 64 | # nethint_delay_ms = 100 65 | 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 300 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw 
= 100 # bandwidth of a host, in Gbps 96 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/standard2.toml: -------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (60s, 0.5, 5), max_slots = 1 2 | # Specifiation of a Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | 13 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 14 | buffer_size = 100_000_000 15 | 16 | # Number of iterations for all jobs 17 | num_iterations = 100 18 | 19 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 20 | poisson_lambda = 24_00_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path of for the simulation results 31 | directory = "/tmp/allreduce_result_for_paper/standard2" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 0 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 2 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto 
tune after some iterations. default is disabled 52 | auto_tune = 10 53 | 54 | [simulator] 55 | nethint = true 56 | sample_interval_ns = 100_000_000 # 100ms 57 | loopback_speed = 400 58 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 59 | # fairness = "TenantFlowMaxMin" 60 | fairness = "PerFlowMaxMin" 61 | 62 | background_flow_hard = { enable = true, frequency_ns = 60_000_000_000, probability = 0.5, amplitude = 5 } 63 | # background_flow_hard = { enable = false } 64 | # nethint_delay_ms = 100 65 | 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | # background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 300 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/background_dynamic_strong.toml: 
-------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (1s, 0.9, 8), max_slots = 1 2 | # Specification of an Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | 13 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 14 | buffer_size = 100_000_000 15 | 16 | # Number of iterations for all jobs 17 | num_iterations = 100 18 | 19 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 20 | poisson_lambda = 24_00_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path for the simulation results 31 | directory = "/tmp/allreduce_result_for_paper/background_dynamic_strong" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 0 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 2 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations.
default is disabled 52 | auto_tune = 10 53 | 54 | [simulator] 55 | nethint = true 56 | sample_interval_ns = 100_000_000 # 100ms 57 | loopback_speed = 400 58 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 59 | # fairness = "TenantFlowMaxMin" 60 | fairness = "PerFlowMaxMin" 61 | 62 | background_flow_hard = { enable = true, frequency_ns = 1_000_000_000, probability = 0.9, amplitude = 8 } 63 | # background_flow_hard = { enable = false } 64 | # nethint_delay_ms = 100 65 | 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 300 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/background_static_strong.toml: 
-------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (1000s, 0.9, 8), max_slots = 1 2 | # Specification of an Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | 13 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 14 | buffer_size = 100_000_000 15 | 16 | # Number of iterations for all jobs 17 | num_iterations = 100 18 | 19 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 20 | poisson_lambda = 24_00_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path for the simulation results 31 | directory = "/tmp/allreduce_result_for_paper/background_static_strong" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 0 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 2 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations.
default is disabled 52 | auto_tune = 10 53 | 54 | [simulator] 55 | nethint = true 56 | sample_interval_ns = 100_000_000 # 100ms 57 | loopback_speed = 400 58 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 59 | # fairness = "TenantFlowMaxMin" 60 | fairness = "PerFlowMaxMin" 61 | 62 | background_flow_hard = { enable = true, frequency_ns = 1000_000_000_000, probability = 0.9, amplitude = 8 } 63 | # background_flow_hard = { enable = false } 64 | # nethint_delay_ms = 100 65 | 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 300 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /src/mapreduce/src/trace.rs: -------------------------------------------------------------------------------- 1 | use 
anyhow::Result; 2 | use nethint::Timestamp; 3 | use std::io::BufRead; 4 | 5 | pub struct JobTrace { 6 | pub nracks: usize, 7 | pub count: usize, 8 | pub records: Vec, 9 | } 10 | 11 | // example: 3 13122 2 66 138 1 38:4.0 12 | #[derive(Debug, Clone)] 13 | pub struct Record { 14 | pub id: usize, 15 | pub ts: Timestamp, 16 | pub num_map: usize, 17 | pub mappers: Vec, 18 | pub num_reduce: usize, 19 | pub reducers: Vec<(usize, f64)>, 20 | } 21 | 22 | #[derive(Debug, Clone, Copy)] 23 | pub struct ParseRecordError; 24 | 25 | impl std::fmt::Display for ParseRecordError { 26 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 27 | write!(f, "{:?}", self) 28 | } 29 | } 30 | 31 | macro_rules! parse_next { 32 | ($tokens:expr, $ret:ty) => { 33 | $tokens 34 | .next() 35 | .and_then(|f| f.parse::<$ret>().ok()) 36 | .ok_or(ParseRecordError)? 37 | }; 38 | } 39 | 40 | impl std::str::FromStr for Record { 41 | type Err = ParseRecordError; 42 | fn from_str(s: &str) -> Result { 43 | let mut tokens = s.trim().split(' '); 44 | let id = parse_next!(tokens, usize); 45 | let ts = parse_next!(tokens, Timestamp); 46 | let num_map = parse_next!(tokens, usize); 47 | let mappers: Vec = tokens 48 | .by_ref() 49 | .take(num_map) 50 | .map(|x| x.parse::().ok()) 51 | .collect::>>() 52 | .ok_or(ParseRecordError)?; 53 | let num_reduce = parse_next!(tokens, usize); 54 | let reducers: Vec<(usize, f64)> = tokens 55 | .take(num_reduce) 56 | .map(|x| { 57 | x.split_once(":") 58 | .and_then(|(a, b)| a.parse::().ok().zip(b.parse::().ok())) 59 | }) 60 | .collect::>>() 61 | .ok_or(ParseRecordError)?; 62 | 63 | assert_eq!(num_map, mappers.len()); 64 | assert_eq!(num_reduce, reducers.len()); 65 | 66 | Ok(Record { 67 | id, 68 | ts, 69 | num_map, 70 | mappers, 71 | num_reduce, 72 | reducers, 73 | }) 74 | } 75 | } 76 | 77 | impl JobTrace { 78 | pub fn from_path>(path: P) -> Result { 79 | let f = std::fs::File::open(path)?; 80 | let mut reader = std::io::BufReader::new(f); 81 | let mut line = 
String::new(); 82 | reader.read_line(&mut line)?; 83 | let v: Vec = line 84 | .trim() 85 | .split(' ') 86 | .map(|x| x.parse().ok()) 87 | .collect::>() 88 | .unwrap(); 89 | assert_eq!(v.len(), 2); 90 | 91 | let nracks = v[0]; 92 | let count = v[1]; 93 | let mut records = Vec::new(); 94 | for _i in 0..count { 95 | let mut line = String::new(); 96 | reader.read_line(&mut line)?; 97 | if line.starts_with('#') { 98 | continue; 99 | } 100 | let r: Record = line 101 | .parse() 102 | .unwrap_or_else(|e| panic!("pare line failed: {}, line: {}", e, line)); 103 | records.push(r); 104 | } 105 | 106 | Ok(JobTrace { 107 | nracks, 108 | count, 109 | records, 110 | }) 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /evaluation/rl_configs/level2probe.toml: -------------------------------------------------------------------------------- 1 | # Specifiation of a Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 50 5 | 6 | # Job size distributions [(percentage, number of workers)] 7 | # job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 8 | # job_size_distribution = [[40, 4], [40, 8], [40, 12], [40, 16], [40, 32], [40, 64]] 9 | # job_size_distribution = [[40, 4], [40, 8], [40, 12]] 10 | # job_size_distribution = [[40, 4], [40, 8], [20, 12], [20, 16]] 11 | # job_size_distribution = [[40, 32], [40, 64]] 12 | job_size_distribution = [[40, 4], [40, 8], [40, 12]] 13 | 14 | # Buffer size of all jobs, in bytes 15 | buffer_size = 100_000_000 16 | 17 | # Number of iterations for all jobs 18 | num_iterations = 100 19 | 20 | # Lambda of the poisson arrival, 2*100MB/25Gbps*120 = 7.68s 21 | # poisson_lambda = 8_000_000_000.0 22 | poisson_lambda = 8_00_000_000.0 23 | 24 | placement_strategy = { type = "Compact" } 25 | # placement_strategy = { type = "CompactLoadBalanced" } 26 | # placement_strategy = { type = "Spread" } 27 | # placement_strategy = { type = "Random", args = 0 } 28 | 29 | # global seed 30 
| seed = 1 31 | 32 | # Output path of for the simulation results 33 | directory = "/tmp/rl_result_for_paper/level2probe" 34 | 35 | # Number of repeats for each batch of experiments 36 | batch_repeat = 5 37 | 38 | [[batch]] 39 | policy = "Random" 40 | probe = { enable = false } 41 | nethint_level = 2 42 | 43 | [[batch]] 44 | policy = "RAT" 45 | probe = { enable = true, round_ms = 10 } 46 | nethint_level = 2 47 | # Auto tune after some iterations. default is disabled 48 | auto_tune = 10 49 | 50 | [[batch]] 51 | policy = "RAT" 52 | probe = { enable = false } 53 | nethint_level = 2 54 | # Auto tune after some iterations. default is disabled 55 | auto_tune = 10 56 | 57 | [simulator] 58 | nethint = true 59 | sample_interval_ns = 100_000_000 # 100ms 60 | loopback_speed = 400 61 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 62 | # fairness = "TenantFlowMaxMin" 63 | fairness = "PerFlowMaxMin" 64 | 65 | background_flow_hard = { enable = true, frequency_ns = 60_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 66 | # background_flow_hard = { enable = false } 67 | # nethint_delay_ms = 100 68 | 69 | [brain] 70 | # Random seed for multiple uses 71 | seed = 1 72 | # Whether the cluster's bandwidth is asymmetric 73 | asymmetric = false 74 | # The percentage of nodes marked broken 75 | broken = 0.0 76 | # The slots of each physical machine 77 | max_slots = 1 78 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 79 | sharing_mode = "Guaranteed" 80 | guaranteed_bandwidth = 25 81 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 82 | # background_flow_high_freq = { enable = false } 83 | gc_period = 100 84 | 85 | # The topology for simulation 86 | [brain.topology] 87 | type = "Arbitrary" # another possible value is "FatTree" 88 | 89 | # [brain.topology.args] # When type = "FatTree" 90 | # nports = 20 # 
the number of ports of a switch 91 | # bandwidth = 100 # in Gbps 92 | # oversub_ratio = 4.0 # oversubscription ratio 93 | 94 | [brain.topology.args] # When type = "Arbitrary" 95 | nracks = 320 # the number of racks 96 | rack_size = 6 # the number of hosts under a rack 97 | host_bw = 100 # bandwidth of a host, in Gbps 98 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 99 | 100 | # [envs] 101 | # KEY = "value" 102 | -------------------------------------------------------------------------------- /src/mapreduce/src/config.rs: -------------------------------------------------------------------------------- 1 | use crate::{ShufflePattern, mapper::MapperPlacementPolicy, ReducerPlacementPolicy}; 2 | use nethint::brain::{self, BrainSetting}; 3 | use nethint::simulator::SimulatorSetting; 4 | use serde::{Deserialize, Serialize}; 5 | 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] 7 | pub struct ProbeConfig { 8 | pub enable: bool, 9 | #[serde(default)] 10 | pub round_ms: u64, 11 | } 12 | 13 | #[derive(Debug, Clone, Serialize, Deserialize)] 14 | pub struct BatchConfig { 15 | /// Reducer placement policy 16 | pub reducer_policy: ReducerPlacementPolicy, 17 | /// whether to use plink 18 | pub probe: ProbeConfig, 19 | /// Nethint level. 20 | pub nethint_level: usize, 21 | /// automatically choose which solution to use, BW or TO 22 | #[serde(default)] 23 | pub auto_fallback: Option, 24 | /// the alpha, details in paper 25 | #[serde(default)] 26 | pub alpha: Option, 27 | } 28 | 29 | #[derive(Debug, Clone, Serialize, Deserialize)] 30 | #[serde(deny_unknown_fields)] 31 | pub struct ExperimentConfig { 32 | /// Run experiments from trace file 33 | #[serde(default)] 34 | pub trace: Option, 35 | 36 | /// How to generate the shuffle 37 | pub shuffle: Option, 38 | 39 | /// Number of testcases 40 | pub ncases: usize, 41 | 42 | /// Number of map tasks. 
When using trace, this parameter means map scale factor 43 | pub num_map: usize, 44 | 45 | /// Number of reduce tasks. When using trace, this parameter means reduce scale factor 46 | pub num_reduce: usize, 47 | 48 | /// The map scale used only in testbed setting to support scale down; default 1.0 49 | pub map_scale: Option, 50 | 51 | /// The reduce scale used only in testbed setting to support scale down; default 1.0 52 | pub reduce_scale: Option, 53 | 54 | /// Traffic scale, multiply the traffic size by a number to allow job overlaps 55 | pub traffic_scale: f64, 56 | 57 | /// Scale the time of job arrival; default 1.0 58 | pub time_scale: Option, 59 | 60 | /// Computation time switch 61 | pub enable_computation_time: bool, 62 | 63 | /// Mapper placement policy 64 | pub mapper_policy: MapperPlacementPolicy, 65 | 66 | /// akin to AWS Placement Group 67 | pub placement_strategy: brain::PlacementStrategy, 68 | 69 | /// Whether to allow delay scheduling, default to false, in simulation, it must be false 70 | pub allow_delay: Option, 71 | 72 | /// Whether to skip trivial jobs; default false 73 | pub skip_trivial: Option, 74 | 75 | /// Collocate or De-collocate 76 | pub collocate: bool, 77 | 78 | /// Number of repeats for each batch of experiments 79 | pub batch_repeat: usize, 80 | 81 | #[serde(rename = "batch")] 82 | pub batches: Vec, 83 | 84 | /// Output path of the figure 85 | #[serde(default)] 86 | pub directory: Option, 87 | 88 | /// Simulator settings 89 | pub simulator: SimulatorSetting, 90 | 91 | /// Brain settings 92 | pub brain: BrainSetting, 93 | 94 | /// Environment variables 95 | #[serde(default)] 96 | pub envs: toml::value::Table, 97 | } 98 | 99 | pub fn read_config>(path: P) -> ExperimentConfig { 100 | use std::io::Read; 101 | let mut file = std::fs::File::open(path).expect("fail to open file"); 102 | let mut content = String::new(); 103 | file.read_to_string(&mut content).unwrap(); 104 | toml::from_str(&content).expect("parse failed") 105 | } 106 | 
-------------------------------------------------------------------------------- /src/utils/src/cmd_helper.rs: -------------------------------------------------------------------------------- 1 | use std::process::Command; 2 | 3 | pub fn get_command_str(cmd: &Command) -> String { 4 | let prog = cmd.get_program().to_str().unwrap(); 5 | let args: Vec<&str> = cmd.get_args().map(|x| x.to_str().unwrap()).collect(); 6 | let cmd_str = std::iter::once(prog).chain(args).collect::>().join(" "); 7 | cmd_str 8 | } 9 | 10 | pub fn get_command_output(mut cmd: Command) -> anyhow::Result { 11 | let cmd_str = get_command_str(&cmd); 12 | log::debug!("executing command: {}", cmd_str); 13 | 14 | use std::os::unix::process::ExitStatusExt; // for status.signal() 15 | let result = cmd.output()?; 16 | 17 | if !result.status.success() { 18 | return match result.status.code() { 19 | Some(code) => Err(anyhow::anyhow!( 20 | "Exited with code: {}, cmd: {}", 21 | code, 22 | cmd_str 23 | )), 24 | None => Err(anyhow::anyhow!( 25 | "Process terminated by signal: {}, cmd: {}", 26 | result.status.signal().unwrap(), 27 | cmd_str, 28 | )), 29 | }; 30 | } 31 | 32 | Ok(std::str::from_utf8(&result.stdout)?.to_owned()) 33 | } 34 | 35 | #[macro_export] 36 | macro_rules! 
poll_cmd { 37 | ($cmd:expr, $stop_flag:expr) => {{ 38 | let prog = $cmd.get_program().to_str().unwrap(); 39 | let args: Vec<&str> = $cmd.get_args().map(|x| x.to_str().unwrap()).collect(); 40 | let cmd_str = (std::iter::once(prog).chain(args).collect::>()).join(" "); 41 | log::debug!("command: {}", cmd_str); 42 | 43 | use std::os::unix::process::ExitStatusExt; // for status.signal() 44 | let mut child = $cmd.spawn().expect("Failed to rplaunch"); 45 | loop { 46 | match child.try_wait() { 47 | Ok(Some(status)) => { 48 | if !status.success() { 49 | match status.code() { 50 | Some(code) => { 51 | log::error!("Exited with code: {}, cmd: {}", code, cmd_str) 52 | } 53 | None => log::error!( 54 | "Process terminated by signal: {}, cmd: {}", 55 | status.signal().unwrap(), 56 | cmd_str, 57 | ), 58 | } 59 | } 60 | break; 61 | } 62 | Ok(None) => { 63 | log::trace!("status not ready yet, sleep for 5 ms"); 64 | std::thread::sleep(std::time::Duration::from_millis(5)); 65 | } 66 | Err(e) => { 67 | panic!("Command wasn't running: {}", e); 68 | } 69 | } 70 | // check if kill is needed 71 | if $stop_flag.load(SeqCst) { 72 | log::warn!("killing the child process: {}", cmd_str); 73 | // instead of SIGKILL, we use SIGTERM here to gracefully shutdown ssh process tree. 74 | // SIGKILL can cause terminal control characters to mess up, which must be 75 | // fixed later with sth like "stty sane". 
76 | // signal::kill(nix::unistd::Pid::from_raw(child.id() as _), signal::SIGTERM) 77 | // .unwrap_or_else(|e| panic!("Failed to kill: {}", e)); 78 | child 79 | .kill() 80 | .unwrap_or_else(|e| panic!("Failed to kill: {}", e)); 81 | log::warn!("child process terminated") 82 | } 83 | } 84 | }} 85 | } -------------------------------------------------------------------------------- /src/rl/src/topology_aware.rs: -------------------------------------------------------------------------------- 1 | use crate::RLAlgorithm; 2 | use nethint::{cluster::Topology, Flow}; 3 | use std::rc::Rc; 4 | 5 | #[derive(Debug, Default)] 6 | pub struct TopologyAwareTree { 7 | seed: u64, 8 | num_trees: usize, 9 | } 10 | 11 | impl TopologyAwareTree { 12 | pub fn new(seed: u64, num_trees: usize) -> Self { 13 | TopologyAwareTree { seed, num_trees } 14 | } 15 | } 16 | 17 | impl RLAlgorithm for TopologyAwareTree { 18 | fn run_rl_traffic( 19 | &mut self, 20 | root_index: usize, 21 | group: Option>, 22 | size: u64, 23 | vcluster: Rc, 24 | ) -> Vec { 25 | use rand::prelude::SliceRandom; 26 | use rand::{rngs::StdRng, SeedableRng}; 27 | let mut rng = StdRng::seed_from_u64(self.seed); 28 | 29 | let mut flows = Vec::new(); 30 | 31 | for _ in 0..self.num_trees { 32 | let mut ring = Vec::new(); 33 | 34 | for i in 0..vcluster.num_switches() - 1 { 35 | let mut ringlet = Vec::new(); 36 | let tor = format!("tor_{}", i); 37 | 38 | for link_ix in vcluster.get_downlinks(vcluster.get_node_index(&tor)) { 39 | let h = vcluster.get_target(*link_ix); 40 | let host_idx = vcluster[h] 41 | .name 42 | .strip_prefix("host_") 43 | .unwrap() 44 | .parse::() 45 | .unwrap(); 46 | ringlet.push(host_idx) 47 | } 48 | 49 | let pos = ringlet.iter().position(|x| *x == root_index); 50 | 51 | if pos == None { 52 | ringlet.shuffle(&mut rng); 53 | } else { 54 | let pos = pos.unwrap(); 55 | ringlet.remove(pos); 56 | ringlet.shuffle(&mut rng); 57 | ringlet.insert(0, root_index); 58 | } 59 | for node_idx in ringlet { 60 | 
ring.push(node_idx); 61 | } 62 | } 63 | 64 | // filter all nodes in the communication group 65 | if group.is_some() { 66 | let g = group.clone().unwrap(); 67 | ring.retain(|x| *x == root_index || g.contains(x)); 68 | } 69 | 70 | let pos = ring.iter().position(|x| *x == root_index).unwrap(); 71 | let n = ring.len(); 72 | 73 | // log::error!("pos {} n {}", pos, n); 74 | // log::error!("{}",root_index); 75 | // log::error!("{:?}",ring); 76 | 77 | for i in pos..n { 78 | let sender = format!("host_{}", ring[i]); 79 | let receiver = format!("host_{}", ring[(i + 1) % n]); 80 | if (i + 1) % n == pos { 81 | break; 82 | } 83 | let flow = Flow::new(size as usize, &sender, &receiver, None); 84 | flows.push(flow); 85 | } 86 | 87 | if pos > 0 { 88 | for i in 0..pos - 1 { 89 | let sender = format!("host_{}", ring[i]); 90 | let receiver = format!("host_{}", ring[i + 1]); 91 | let flow = Flow::new(size as usize, &sender, &receiver, None); 92 | flows.push(flow); 93 | } 94 | } 95 | } 96 | 97 | for f in &mut flows { 98 | f.bytes /= self.num_trees; 99 | } 100 | 101 | flows 102 | } 103 | } 104 | --------------------------------------------------------------------------------