├── src ├── replayer │ ├── src │ │ ├── worker │ │ │ └── mod.rs │ │ ├── controller │ │ │ ├── mod.rs │ │ │ └── app.rs │ │ ├── lib.rs │ │ ├── vm_ip_addrs.in │ │ └── message.rs │ ├── README.md │ └── Cargo.toml ├── mapreduce │ ├── .gitignore │ ├── figure │ │ ├── mapreduce_fattree_16_100g_7.00_m20_r20 │ │ └── mapreduce_cdf_fattree_16_100g_7.00_m20_r20 │ ├── Cargo.toml │ └── src │ │ ├── plink.rs │ │ ├── random.rs │ │ ├── argument.rs │ │ ├── inspect.rs │ │ ├── trace.rs │ │ └── config.rs ├── utils │ ├── src │ │ ├── lib.rs │ │ ├── net.rs │ │ ├── collector.rs │ │ ├── fs.rs │ │ └── cmd_helper.rs │ └── Cargo.toml ├── nhagent_v2 │ ├── src │ │ ├── sampler │ │ │ └── mod.rs │ │ ├── lib.rs │ │ ├── argument.rs │ │ └── message.rs │ ├── Cargo.toml │ └── testbed.toml ├── nhagent │ ├── README.md │ ├── src │ │ ├── sampler │ │ │ └── mod.rs │ │ ├── lib.rs │ │ ├── argument.rs │ │ ├── ssagent.rs │ │ ├── message.rs │ │ └── timing.rs │ ├── testbed.toml │ └── Cargo.toml ├── litemsg │ ├── src │ │ ├── communicator.rs │ │ ├── command.rs │ │ └── buffer.rs │ └── Cargo.toml ├── logging │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── rat_solver │ └── Cargo.toml ├── nethint │ ├── Cargo.toml │ ├── src │ │ └── runtime_est.rs │ └── tests │ │ └── toy1.rs ├── rl │ ├── Cargo.toml │ ├── src │ │ ├── lib.rs │ │ ├── argument.rs │ │ ├── random_ring.rs │ │ └── topology_aware.rs │ └── testbed.toml └── allreduce │ ├── Cargo.toml │ ├── src │ ├── lib.rs │ ├── argument.rs │ ├── random_ring.rs │ └── topology_aware.rs │ └── testbed.toml ├── README.md ├── .gitignore ├── scripts ├── clippy.sh ├── kill_background_flow.sh ├── testbed │ ├── one-click-configuration.sh │ ├── utils.sh │ ├── attach_vfs.sh │ ├── vfconfig │ │ ├── vf0.xml │ │ ├── vf1.xml │ │ ├── vf2.xml │ │ ├── vf3.xml │ │ ├── vf4.xml │ │ ├── vf5.xml │ │ ├── vf6.xml │ │ └── vf7.xml │ ├── environment │ │ ├── vf.xml.example │ │ ├── meta_config │ │ │ ├── interfaces.conf.sh │ │ │ └── set_rdma_intf.conf.sh │ │ ├── cpu_vm_stage1.sh │ │ └── README.md │ ├── provision_vms.sh │ 
├── migrate_pf.sh │ ├── enable_sriov.sh │ ├── enable_eswitch.sh │ └── setup_ovs.sh ├── testbed-2 │ ├── one-click-restore.sh │ ├── nixos-vm-setup │ │ ├── README.md │ │ ├── clean_all.sh │ │ ├── utils.sh │ │ ├── flake.nix │ │ ├── hardware-configuration.nix │ │ ├── pubkeys.nix │ │ └── bootstrap.sh │ ├── one-click-configuration.sh │ ├── meta_config │ │ └── set_rdma_intf.conf.sh │ ├── utils.sh │ ├── provision_vms.sh │ ├── setup_sflow.sh │ ├── migrate_pf.sh │ ├── setup_ovs.sh │ └── enable_sriov.sh ├── request-response.sh ├── switch_fairness_to.sh ├── recover_bandwidth_setting.sh ├── run_duplicates.sh ├── build_testbed_bins.sh ├── InfoCollectOverHead.hs └── run_duplicates_v2.sh ├── .gitmodules ├── evaluation ├── rl_configs │ ├── run_all.sh │ ├── run_paper.sh │ └── level2probe.toml ├── allreduce_configs │ ├── run_all.sh │ ├── run_paper.sh │ ├── standard3.toml │ ├── standard3_pervm.toml │ ├── standard3_pertenant.toml │ ├── background_off.toml │ ├── standard2.toml │ ├── background_dynamic_strong.toml │ └── background_static_strong.toml ├── model_serving_configs │ ├── run_paper.sh │ └── standard2.toml ├── mapreduce_configs │ ├── run_paper.sh │ └── run_all.sh ├── spectrum │ ├── run_spectrum3.sh │ ├── run_spectrum4.sh │ ├── run_spectrum5.sh │ ├── run_spectrum1.sh │ ├── run_spectrum2.sh │ ├── run_spectrum6.sh │ ├── spectrum4_base.toml │ ├── spectrum3_base.toml │ ├── spectrum1_base.toml │ ├── spectrum2_base.toml │ ├── spectrum5_base.toml │ └── spectrum6_base.toml ├── inaccuracy │ ├── run_inaccuracy2.sh │ ├── run_inaccuracy1.sh │ ├── inaccuracy1_base.toml │ └── inaccuracy2_base.toml ├── sensitivity │ ├── run_sensitivity_oversub.sh │ ├── run_sensitivity_probing_cost2.sh │ ├── run_sensitivity_probing_cost1.sh │ ├── run_sensitivity_rack_size.sh │ ├── sensitivity_probing_cost1_base.toml │ ├── sensitivity_probing_cost1_baseline.toml │ ├── sensitivity_oversub_base.toml │ └── sensitivity_rack_size_base.toml └── herd_behavior │ ├── run_mapreduce.sh │ ├── run_allreduce.sh │ └── 
allreduce_herd_base.toml ├── Cargo.toml └── run_test.sh /src/replayer/src/worker/mod.rs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/mapreduce/.gitignore: -------------------------------------------------------------------------------- 1 | figure/*.pdf 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NetHint 2 | 3 | To be updated. 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Session.vim 3 | *.code-workspace 4 | -------------------------------------------------------------------------------- /scripts/clippy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | find . 
| grep "\.rs$" | xargs touch ; cargo clippy 4 | -------------------------------------------------------------------------------- /scripts/kill_background_flow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ps aux | grep iperf | awk '{print $2}' | xargs -I {} kill {} 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "nethint-bpf"] 2 | path = nethint-bpf 3 | url = https://github.com/crazyboycjr/nethint-bpf 4 | -------------------------------------------------------------------------------- /src/utils/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(command_access)] 2 | 3 | pub mod cmd_helper; 4 | pub mod fs; 5 | pub mod net; 6 | pub mod algo; 7 | pub mod collector; 8 | -------------------------------------------------------------------------------- /src/nhagent_v2/src/sampler/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod sflow_sampler; 2 | pub use sflow_sampler::SFlowSampler; 3 | 4 | pub mod bpf_sampler; 5 | pub use bpf_sampler::TcSampler; -------------------------------------------------------------------------------- /src/replayer/src/controller/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod app; 2 | pub mod mapreduce; 3 | pub mod allreduce; 4 | pub mod rl; 5 | pub mod plink; 6 | pub mod background_flow; 7 | -------------------------------------------------------------------------------- /src/mapreduce/figure/mapreduce_fattree_16_100g_7.00_m20_r20: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyboycjr/nethint/HEAD/src/mapreduce/figure/mapreduce_fattree_16_100g_7.00_m20_r20 
-------------------------------------------------------------------------------- /src/mapreduce/figure/mapreduce_cdf_fattree_16_100g_7.00_m20_r20: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazyboycjr/nethint/HEAD/src/mapreduce/figure/mapreduce_cdf_fattree_16_100g_7.00_m20_r20 -------------------------------------------------------------------------------- /src/nhagent/README.md: -------------------------------------------------------------------------------- 1 | # NetHint Agent 2 | 3 | It samples traffic by peroidically query flow table counters in OpenvSwitch. The collected results from all agents in the cluster are all-gathered and can be queried by each tenant. -------------------------------------------------------------------------------- /scripts/testbed/one-click-configuration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | 5 | sudo ./enable_sriov.sh rdma0 5 6 | sudo ./enable_eswitch.sh rdma0 7 | sudo ./setup_ovs.sh rdma0 4 8 | sudo ./migrate_pf.sh up 9 | sudo mlnx_qos -i rdma0 --prio_tc=0,1,2,3,4,5,6,7 -r 0,0,0,0,0,0,0,0 10 | -------------------------------------------------------------------------------- /src/litemsg/src/communicator.rs: -------------------------------------------------------------------------------- 1 | pub struct Communicator { 2 | my_rank: usize, 3 | nodes: Vec, 4 | peers: Vec, 5 | } 6 | 7 | 8 | impl Communicator { 9 | pub fn new(controller_uri: &str, num_workers: usize) -> Result { 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/nhagent/src/sampler/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod ovs_sampler; 2 | pub mod ss_sampler; 3 | 4 | pub use ss_sampler::SsSampler; 5 | pub use ss_sampler::get_local_ip_table; 6 | 7 | pub use ovs_sampler::OvsSampler; 8 | pub use ovs_sampler::EthAddr; 9 | 
pub use ovs_sampler::get_local_eth_table; 10 | -------------------------------------------------------------------------------- /evaluation/rl_configs/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | for conf in `ls *.toml`; do 6 | echo $conf 7 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -c $conf & 8 | done 9 | 10 | wait 11 | -------------------------------------------------------------------------------- /scripts/testbed-2/one-click-restore.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DIR=`dirname $(realpath $0)` 4 | 5 | sudo "$DIR"/migrate_pf.sh down 6 | sudo ovs-vsctl del-br ovs0 7 | sudo "$DIR"/enable_sriov.sh rdma0 0 8 | 9 | # sudo ip link set ovs-system mtu 1500 # no such device after removing ovs0 10 | sudo ip link set rdma0 mtu 1500 11 | -------------------------------------------------------------------------------- /src/replayer/README.md: -------------------------------------------------------------------------------- 1 | # A distributed traffic pattern replayer. 
2 | 3 | Environment Variables 4 | ``` 5 | RP_CONTROLLER_URI 6 | RP_NUM_WORKER 7 | ``` 8 | 9 | 10 | Use the launcher 11 | ``` 12 | ./rplaunch --controller-ssh 192.168.211.35 --controller-uri 192.168.211.35:9000 --hostfile ~/hostfile --jobname mapreduce 13 | ``` 14 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["src/nethint", "src/logging", "src/mapreduce", "src/allreduce", "src/rl", "src/litemsg", "src/replayer", "src/nhagent_v2", "src/nhagent", "src/utils", "src/rat_solver"] 3 | 4 | [profile.dev] 5 | panic = "unwind" 6 | 7 | [profile.release] 8 | panic = "unwind" 9 | lto = false # too slow for lto = true 10 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/README.md: -------------------------------------------------------------------------------- 1 | Prepare 2 | add nixos channel 3 | install nix-install-tools to the current user profile 4 | nix-channel --add https://nixos.org/channels/nixos-unstable-small nixos 5 | nix-channel --update 6 | 7 | 8 | How to run? 
9 | ``` 10 | # ./bootstrap.sh 11 | ``` 12 | 13 | 14 | This will create 8 nixos VMs; 15 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/clean_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=`dirname $(realpath $0)` 4 | source "$DIR./utils.sh" 5 | 6 | for name in `lsnames 8`; do 7 | sudo virsh vol-delete ${name}.img --pool images 8 | sudo virsh undefine $name 9 | done 10 | 11 | sudo virsh undefine nixosbase 12 | sudo rm /var/lib/libvirt/images/nixos_vm_base.img 13 | -------------------------------------------------------------------------------- /scripts/testbed/utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function lsnames { 4 | host_id=`hostname | cut -d'-' -f2` 5 | [[ -n $1 ]] && num_vms=$1 || num_vms=8 6 | # base: host_id, len: num_vms, offset: i, id: j 7 | for ((i=0;i<$num_vms;i++)); do 8 | j=`expr $host_id \* $num_vms + $i - $num_vms`; 9 | name=cpu${j} 10 | echo $name 11 | done 12 | } 13 | -------------------------------------------------------------------------------- /src/utils/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "utils" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | anyhow = "1.0.38" 12 | lazy_static = "1.4.0" 13 | -------------------------------------------------------------------------------- /src/nhagent_v2/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(command_access)] 2 | 3 | pub mod cluster; 4 | pub mod sampler; 5 | pub mod message; 6 | pub mod communicator; 7 | pub mod argument; 8 | pub mod sdn_controller; 9 | 10 | pub use 
litemsg::Node; 11 | 12 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 13 | pub enum Role { 14 | GlobalLeader, 15 | RackLeader, 16 | } 17 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function lsnames { 4 | host_id=`hostname | cut -d'-' -f2` 5 | [[ -n $1 ]] && num_vms=$1 || num_vms=8 6 | # base: host_id, len: num_vms, offset: i, id: j 7 | for ((i=0;i<$num_vms;i++)); do 8 | j=`expr $host_id \* $num_vms + $i - $num_vms`; 9 | name=nixos${j} 10 | echo $name 11 | done 12 | } 13 | -------------------------------------------------------------------------------- /scripts/testbed-2/one-click-configuration.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=`dirname $(realpath $0)` 4 | 5 | sudo "$DIR"/enable_sriov.sh rdma0 1 6 | sudo "$DIR"/setup_ovs.sh rdma0 7 | sudo "$DIR"/migrate_pf.sh up 8 | 9 | sudo ip link set rdma0 mtu 9000 10 | sudo ip link set ovs-system mtu 9000 11 | sudo mlnx_qos -i rdma0 --prio_tc=0,1,2,3,4,5,6,7 -r 0,0,0,0,0,0,0,0 12 | -------------------------------------------------------------------------------- /src/nhagent/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(command_access)] 2 | 3 | pub mod cluster; 4 | pub mod sampler; 5 | pub mod message; 6 | pub mod communicator; 7 | pub mod argument; 8 | pub mod timing; 9 | 10 | pub use litemsg::Node; 11 | 12 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 13 | pub enum Role { 14 | GlobalLeader, 15 | RackLeader, 16 | Worker, 17 | } 18 | -------------------------------------------------------------------------------- /scripts/testbed/attach_vfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $UID -ne 0 ]; then 4 | echo "Please run $0 as root" 5 
| exit 3 6 | fi 7 | 8 | source `dirname $0`/utils.sh 9 | 10 | names=(`lsnames 8`); 11 | for i in {0..7}; do 12 | name=${names[$i]}; 13 | virsh attach-device --domain $name --file /nfs/cjr/Developing/nethint-rs/scripts/testbed/vfconfig/vf${i}.xml 14 | done 15 | -------------------------------------------------------------------------------- /src/logging/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "logging" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | publish = false 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | log = "0.4.11" 12 | env_logger = "0.8.1" 13 | chrono = "0.4.19" 14 | -------------------------------------------------------------------------------- /src/rat_solver/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rat_solver" 3 | version = "0.1.0" 4 | edition = "2018" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | nethint = { path = "../nethint" } 10 | utils = { path = "../utils" } 11 | log = "0.4.14" 12 | lpsolve = "0.1.0" 13 | lpsolve-sys = "5.5.0" 14 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | # enable computation time 6 | sed -i '/^buffer_size/a computation_speed = 0.1' *.toml 7 | 8 | for conf in `ls *.toml`; do 9 | echo $conf 10 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 2 -c $conf & 11 | done 12 | 13 | wait 14 | -------------------------------------------------------------------------------- 
/scripts/testbed/vfconfig/vf0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/vfconfig/vf7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /scripts/testbed/environment/vf.xml.example: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 | 7 |
8 | 9 | -------------------------------------------------------------------------------- /evaluation/model_serving_configs/run_paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | configs=( 6 | standard2.toml 7 | nonnegligible_computing_overhead.toml 8 | coarse_grained_workloads.toml 9 | ) 10 | 11 | for conf in ${configs[@]}; do 12 | echo $conf 13 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 14 | done 15 | 16 | wait 17 | -------------------------------------------------------------------------------- /scripts/testbed/environment/meta_config/interfaces.conf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IP_ADDR_CIDR="$1" 4 | GATEWAY_ADDR="$2" 5 | 6 | if [ $# -ne 2 ]; then 7 | echo "Usage: $0 " 8 | echo "For example: $0 172.16.0.100/24 172.16.0.160" 9 | exit 1 10 | fi 11 | 12 | cat <), 12 | /// send by worker, processed by controller 13 | LeaveNode(Node), 14 | } 15 | -------------------------------------------------------------------------------- /evaluation/rl_configs/run_paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | configs=( 6 | standard2.toml 7 | nonnegligible_computing_overhead.toml 8 | # background_dynamic_strong.toml 9 | # background_off.toml 10 | # background_static_strong.toml 11 | # level2bad.toml 12 | # level2probe.toml 13 | ) 14 | 15 | for conf in ${configs[@]}; do 16 | echo $conf 17 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 18 | done 19 | 20 | wait 21 | -------------------------------------------------------------------------------- /scripts/testbed-2/meta_config/set_rdma_intf.conf.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | RDMA_IP_ADDR_CIDR="$1" 4 | 5 | if [ $# -ne 1 ]; then 6 | echo "Usage: $0 " 7 | echo "For example: $0 1.1.1.1/24" 8 | exit 1 9 | fi 10 | 11 | cat <" 16 | exit 1 17 | fi 18 | 19 | intf=\$1 20 | 21 | sudo ip link set \$intf name rdma0 22 | sudo ip link set rdma0 up 23 | sudo ip addr add ${RDMA_IP_ADDR_CIDR} dev rdma0 24 | sudo ip link set rdma0 mtu 8930 25 | EOF 26 | -------------------------------------------------------------------------------- /src/utils/src/net.rs: -------------------------------------------------------------------------------- 1 | use std::process::Command; 2 | use crate::cmd_helper::get_command_output; 3 | use std::net::Ipv4Addr; 4 | 5 | pub fn get_primary_ipv4(iface: &str) -> anyhow::Result { 6 | let mut cmd = Command::new("ip"); 7 | cmd.arg("addr").arg("show").arg(iface); 8 | let output = get_command_output(cmd).expect("ip addr failed to execute"); 9 | let start = 5 + output.find("inet ").expect("inet not found in the output"); 10 | let len = (&output[start..]).find("/").unwrap(); 11 | Ok((&output[start..start + len]).parse()?) 
12 | } 13 | -------------------------------------------------------------------------------- /scripts/switch_fairness_to.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | check_args() { 4 | [[ $# -eq 1 ]] && [[ $1 = "PerFlowMaxMin" || $1 = "TenantFlowMaxMin" ]] 5 | } 6 | 7 | usage_exit() { 8 | echo "Usage: $0 [fairness] fairness in [PerFlowMaxMin, TenantFlowMaxMin]" && exit 1 9 | } 10 | 11 | check_args $* || usage_exit 12 | 13 | fairness=$1 14 | 15 | echo switching to $fairness 16 | 17 | sed -i 's/^fairness = "\(.*\)"/Fairness = "\1"/' *.toml 18 | sed -i "s/^# fairness = \"${fairness}\"/fairness = \"${fairness}\"/" *.toml 19 | sed -i 's/^Fairness = "\(.*\)"/# fairness = "\1"/' *.toml 20 | -------------------------------------------------------------------------------- /run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" -lt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) number of tests required"; exit -1; fi 3 | if [ "$#" -gt 1 ]; then echo "$(tput setaf 1)[ERROR]$(tput sgr 0) too many arguments: $#"; exit -1; fi 4 | 5 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 6 | 7 | log_dir=log 8 | mkdir -p $log_dir 9 | 10 | for ((i=0;i<$1;i++)) 11 | do 12 | ./rplaunch --controller-ssh 192.168.211.35 --controller-uri 192.168.211.35:9000 --hostfile hostfiles/$i --jobname mapreduce --config allreduce_tomls/$i.toml 2>&1 | tee $log_dir/$i & 13 | sleep 10 14 | done -------------------------------------------------------------------------------- /src/litemsg/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "litemsg" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | anyhow = "1.0.38" 12 
| thiserror = "1.0.23" 13 | bincode = "1.3.1" 14 | serde = { version = "1.0.123", features = ["derive"] } 15 | logging = { path = "../logging" } 16 | static_assertions = "1.1.0" 17 | lazy_static = "1.4.0" 18 | rand = "0.8.3" 19 | 20 | [dependencies.mio] 21 | version = "0.6.23" 22 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | inputs.nixpkgs.url = "github:nixos/nixpkgs?rev=fb45fa64ae3460d6bd2701ab5a6c4512d781f166"; 3 | outputs = { self, nixpkgs }: { 4 | nixosConfigurations.nixos = nixpkgs.lib.nixosSystem { 5 | system = "x86_64-linux"; 6 | modules = [ 7 | ( 8 | { pkgs, ... }: { 9 | nix.registry.nixpkgs = { 10 | from = { type = "indirect"; id = "nixpkgs"; }; 11 | flake = nixpkgs; 12 | }; 13 | } 14 | ) 15 | ./configuration.nix 16 | ]; 17 | }; 18 | }; 19 | } 20 | -------------------------------------------------------------------------------- /src/utils/src/collector.rs: -------------------------------------------------------------------------------- 1 | use std::time::Duration; 2 | use crate::fs::append_to_file; 3 | 4 | #[derive(Debug, Clone, Default)] 5 | pub struct OverheadCollector { 6 | // controller overhead, job scale 7 | data: Vec<(Duration, usize)>, 8 | } 9 | 10 | impl OverheadCollector { 11 | pub fn collect(&mut self, duration: Duration, scale: usize) { 12 | self.data.push((duration, scale)); 13 | 14 | if let Ok(path) = std::env::var("NETHINT_COLLECT_CONTROLLER_OVERHEAD") { 15 | append_to_file(path, &format!("{} {}", scale, duration.as_nanos())); 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /scripts/testbed/environment/meta_config/set_rdma_intf.conf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RDMA_IP_ADDR_CIDR="$1" 4 | 5 | if [ $# -ne 1 ]; then 6 | echo "Usage: $0 " 
7 | echo "For example: $0 1.1.1.1/24" 8 | exit 1 9 | fi 10 | 11 | cat <" 16 | exit 1 17 | fi 18 | 19 | intf=\$1 20 | 21 | sudo ip link set \$intf name rdma0 22 | sudo ip link set rdma0 up 23 | sudo ip addr add ${RDMA_IP_ADDR_CIDR} dev rdma0 24 | sudo ip link set rdma0 mtu 1430 25 | echo 106 | sudo tee /sys/class/infiniband/mlx5_0/tc/1/traffic_class 26 | EOF 27 | -------------------------------------------------------------------------------- /scripts/testbed-2/utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function lsnames { 4 | host_id=`hostname | cut -d'-' -f2` 5 | [[ -n $1 ]] && num_vms=$1 || num_vms=8 6 | # base: host_id, len: num_vms, offset: i, id: j 7 | for ((i=0;i<$num_vms;i++)); do 8 | j=`expr $host_id \* $num_vms + $i - $num_vms`; 9 | name=cpu${j} 10 | echo $name 11 | done 12 | } 13 | 14 | function get_rack_agent_ip { 15 | # my_ip=`ip a show rdma0 | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1` 16 | my_id=`hostname | cut -d'-' -f2` 17 | if [ $my_id -le 3 ]; then 18 | echo "192.168.211.2" 19 | else 20 | echo "192.168.211.130" 21 | fi 22 | } 23 | -------------------------------------------------------------------------------- /evaluation/mapreduce_configs/run_paper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | # enable computation time 6 | # sed -i '/^num_reduce/a enable_computation_time = true' *.toml 7 | 8 | configs=( 9 | standard_hybrid2.toml 10 | casestudy1.toml 11 | fallback.toml 12 | # background_dynamic_strong.toml 13 | # background_off.toml 14 | # background_static_strong.toml 15 | # level2probe.toml 16 | ) 17 | 18 | for conf in ${configs[@]}; do 19 | echo $conf 20 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin mapreduce_experiment --release -- -P 5 -c $conf & 21 | done 22 | 23 | wait 24 | 
-------------------------------------------------------------------------------- /src/replayer/src/lib.rs: -------------------------------------------------------------------------------- 1 | use nethint::Token; 2 | use serde::{Deserialize, Serialize}; 3 | 4 | pub mod message; 5 | pub mod controller; 6 | pub mod worker; 7 | 8 | pub use litemsg::Node; 9 | 10 | #[derive(Debug, Clone, Serialize, Deserialize)] 11 | pub struct Flow { 12 | pub bytes: usize, 13 | pub src: Node, 14 | pub dst: Node, 15 | pub token: Option, 16 | } 17 | 18 | impl Flow { 19 | pub fn new(bytes: usize, src: Node, dst: Node, token: Option) -> Self { 20 | Flow { 21 | bytes, 22 | src, 23 | dst, 24 | token, 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | auto_tunes=( 6 | 10 7 | 20 8 | 40 9 | 80 10 | 160 11 | 320 12 | 640 13 | 1280 14 | ) 15 | 16 | cnt=0 17 | for f in ${auto_tunes[@]}; do 18 | echo $f 19 | conf=spectrum3_$f.toml 20 | cp spectrum3_base.toml $conf 21 | sed -i "s/^auto_tune = 10/auto_tune = $f/" $conf 22 | sed -i "s/spectrum3_1/spectrum3_$f/" $conf 23 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 24 | cnt=`expr $cnt + 5` # 5 threads 25 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 26 | done 27 | 28 | wait 29 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | auto_tunes=( 6 | 1 7 | 2 8 | 4 9 | 8 10 | 16 11 | 32 12 | 64 13 | 128 14 | ) 15 | 16 | cnt=0 17 | for f in ${auto_tunes[@]}; do 18 | echo $f 19 | 
conf=spectrum4_$f.toml 20 | cp spectrum4_base.toml $conf 21 | sed -i "s/^auto_tune = 10/auto_tune = $f/" $conf 22 | sed -i "s/spectrum4_1/spectrum4_$f/" $conf 23 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 24 | cnt=`expr $cnt + 5` # 5 threads 25 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 26 | done 27 | 28 | wait 29 | -------------------------------------------------------------------------------- /src/replayer/src/vm_ip_addrs.in: -------------------------------------------------------------------------------- 1 | vec![ 2 | "192.168.211.3", 3 | "192.168.211.4", 4 | "192.168.211.5", 5 | "192.168.211.6", 6 | "192.168.211.35", 7 | "192.168.211.36", 8 | "192.168.211.37", 9 | "192.168.211.38", 10 | "192.168.211.67", 11 | "192.168.211.68", 12 | "192.168.211.69", 13 | "192.168.211.70", 14 | "192.168.211.131", 15 | "192.168.211.132", 16 | "192.168.211.133", 17 | "192.168.211.134", 18 | "192.168.211.163", 19 | "192.168.211.164", 20 | "192.168.211.165", 21 | "192.168.211.166", 22 | "192.168.211.195", 23 | "192.168.211.196", 24 | "192.168.211.197", 25 | "192.168.211.198", 26 | ] -------------------------------------------------------------------------------- /evaluation/inaccuracy/run_inaccuracy2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | inaccuracies=( 6 | 0.0 7 | 0.1 8 | 0.2 9 | 0.3 10 | 0.4 11 | 0.5 12 | 0.6 13 | 0.7 14 | 0.8 15 | 0.9 16 | ) 17 | 18 | cnt=0 19 | for f in ${inaccuracies[@]}; do 20 | echo $f 21 | conf=inaccuracy2_$f.toml 22 | cp inaccuracy2_base.toml $conf 23 | sed -i "s/^inaccuracy = 0.1/inaccuracy = $f/" $conf 24 | sed -i "s/inaccuracy2_base/inaccuracy2_$f/" $conf 25 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 26 | cnt=`expr $cnt + 5` # 5 threads 27 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 
5`; } 28 | done 29 | 30 | wait 31 | -------------------------------------------------------------------------------- /evaluation/inaccuracy/run_inaccuracy1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | inaccuracies=( 6 | 0.0 7 | 0.1 8 | 0.2 9 | 0.3 10 | 0.4 11 | 0.5 12 | 0.6 13 | 0.7 14 | 0.8 15 | 0.9 16 | ) 17 | 18 | cnt=0 19 | for f in ${inaccuracies[@]}; do 20 | echo $f 21 | conf=inaccuracy1_$f.toml 22 | cp inaccuracy1_base.toml $conf 23 | sed -i "s/^inaccuracy = 0.1/inaccuracy = $f/" $conf 24 | sed -i "s/inaccuracy1_base/inaccuracy1_$f/" $conf 25 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 26 | cnt=`expr $cnt + 5` # 5 threads 27 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 28 | done 29 | 30 | wait 31 | -------------------------------------------------------------------------------- /scripts/testbed-2/provision_vms.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $UID -ne 0 ]; then 4 | echo "Please run $0 as root" 5 | exit 3 6 | fi 7 | 8 | source `dirname $0`/utils.sh 9 | 10 | # cpubase m4.xlarge 11 | virt-install --virt-type kvm --vcpus 8 --name cpubase --ram 16384 --boot hd --disk /var/lib/libvirt/images/cpu_vm_base.img,format=raw --network network=default --nographic --os-type=linux --os-variant=ubuntu20.04 --noreboot --import 12 | 13 | 14 | for name in `lsnames 4`; do 15 | virsh vol-delete ${name}.img --pool images 16 | virt-clone --replace --original cpubase --name $name --file /var/lib/libvirt/images/${name}.img 17 | virt-sysprep --domain $name --enable customize,dhcp-client-state,machine-id --hostname $name 18 | done 19 | -------------------------------------------------------------------------------- /scripts/testbed/provision_vms.sh: 
#!/bin/bash
# Configure sFlow sampling on the local Open vSwitch bridge so that a
# remote collector (the NetHint rack agent) receives packet samples and
# counter polls. Must run as root because ovs-vsctl mutates OVS state.

if [ $USER != "root" ]; then
echo "Please run this script as root"
exit 1
fi

# Optional first argument: the collector's IP. Defaults to localhost
# (collector running on the same host).
if [ $# -eq 1 ]; then
COLLECTOR_IP=$1
else
COLLECTOR_IP=127.0.0.1
fi

COLLECTOR_PORT=6343
# NOTE(review): despite the name, this is an interface NAME, not an IP —
# the OVS sFlow "agent" option takes the netdev whose address stamps the
# sFlow datagrams. Consider renaming to AGENT_IFACE.
AGENT_IP=enp24s0v0
HEADER_BYTES=128      # bytes of each sampled packet header to export
SAMPLING_N=10000      # sample 1 out of every N packets
POLLING_SECS=10       # interface-counter polling interval
BRIDGE=ovs0

# Create the sflow record and attach it to the bridge in one transaction.
# The escaped quotes around target are required by the OVSDB string syntax.
ovs-vsctl -- --id=@sflow create sflow agent=${AGENT_IP} \
target="\"${COLLECTOR_IP}:${COLLECTOR_PORT}\"" header=${HEADER_BYTES} \
sampling=${SAMPLING_N} polling=${POLLING_SECS} \
-- set bridge ${BRIDGE} sflow=@sflow

# Show the resulting configuration for a quick sanity check.
ovs-vsctl list sflow

# to remove, use ovs-vsctl remove bridge $BRIDGE sflow
num_workers=`expr 6 \* $scale` 17 | num_racks=`expr 2 \* $scale` 18 | 19 | for ((i=0; i<$scale; i++)); do 20 | sampler_port=`expr 5555 + $i` 21 | 22 | RUST_BACKTRACE=full \ 23 | NH_CONTROLLER_URI=192.168.211.2:9000 \ 24 | NH_NUM_WORKER=$num_workers \ 25 | target/release/nhagent \ 26 | --shadow-id $i \ 27 | -p $sampler_port \ 28 | -i 100 \ 29 | -b 800000000000000:1:5:0.1 \ 30 | arbitrary $num_racks 3 10 10 \ 31 | & 32 | # --disable-v2 \ 33 | done 34 | 35 | wait 36 | -------------------------------------------------------------------------------- /src/nhagent_v2/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nhagent_v2" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | anyhow = "1.0.38" 12 | logging = { path = "../logging" } 13 | nethint = { path = "../nethint" } 14 | litemsg = { path = "../litemsg" } 15 | utils = { path = "../utils" } 16 | thiserror = "1.0.23" 17 | lazy_static = "1.4.0" 18 | bincode = "1.3.1" 19 | serde = { version = "1.0.123", features = ["derive"] } 20 | structopt = "0.3.21" 21 | sflow = "0.0.1" 22 | etherparse = "0.9.0" 23 | 24 | [dependencies.mio] 25 | version = "0.6.23" 26 | 27 | [[bin]] 28 | name = "nhagent_v2" 29 | path = "src/main.rs" 30 | -------------------------------------------------------------------------------- /src/nhagent/testbed.toml: -------------------------------------------------------------------------------- 1 | sample_interval_ns = 100_000_000 # 100ms 2 | 3 | max_slots = 4 4 | 5 | background_flow_hard = { enable = true, frequency_ns = 100_000_000_000, probability = 0.9, amplitude = 5 } 6 | 7 | # The topology for simulation 8 | [brain.topology] 9 | type = "Arbitrary" # another possible value is "FatTree" 10 | 11 | # [brain.topology.args] # When type = "FatTree" 12 | # nports = 
20 # the number of ports of a switch 13 | # bandwidth = 100 # in Gbps 14 | # oversub_ratio = 4.0 # oversubscription ratio 15 | 16 | [brain.topology.args] # When type = "Arbitrary" 17 | nracks = 2 # the number of racks 18 | rack_size = 3 # the number of hosts under a rack 19 | host_bw = 10 # bandwidth of a host, in Gbps 20 | rack_bw = 10 # bandwidth of a ToR switch, in Gbps -------------------------------------------------------------------------------- /src/nhagent_v2/testbed.toml: -------------------------------------------------------------------------------- 1 | sample_interval_ns = 100_000_000 # 100ms 2 | 3 | max_slots = 4 4 | 5 | background_flow_hard = { enable = true, frequency_ns = 100_000_000_000, probability = 0.9, amplitude = 5 } 6 | 7 | # The topology for simulation 8 | [brain.topology] 9 | type = "Arbitrary" # another possible value is "FatTree" 10 | 11 | # [brain.topology.args] # When type = "FatTree" 12 | # nports = 20 # the number of ports of a switch 13 | # bandwidth = 100 # in Gbps 14 | # oversub_ratio = 4.0 # oversubscription ratio 15 | 16 | [brain.topology.args] # When type = "Arbitrary" 17 | nracks = 2 # the number of racks 18 | rack_size = 3 # the number of hosts under a rack 19 | host_bw = 10 # bandwidth of a host, in Gbps 20 | rack_bw = 10 # bandwidth of a ToR switch, in Gbps -------------------------------------------------------------------------------- /src/nhagent/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nhagent" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log = "0.4.14" 11 | anyhow = "1.0.38" 12 | logging = { path = "../logging" } 13 | nethint = { path = "../nethint" } 14 | litemsg = { path = "../litemsg" } 15 | utils = { path = "../utils" } 16 | thiserror = "1.0.23" 17 | lazy_static = "1.4.0" 18 | 
use std::io::Write;
use std::path::Path;
use std::sync::Mutex;

// Serializes concurrent appends from threads within this process so that
// interleaved `writeln!` calls cannot mix their output.
// `Mutex::new` has been const-stable since Rust 1.63, so the previous
// `lazy_static!` wrapper (and the crate dependency for this static) is
// no longer needed.
static FILE_MUTEX: Mutex<()> = Mutex::new(());

/// Opens `path` in append mode, creating the file first if it does not exist.
///
/// # Panics
///
/// Panics if the file cannot be opened or created (e.g. a missing parent
/// directory or insufficient permissions).
pub fn open_with_create_append<P: AsRef<Path>>(path: P) -> std::fs::File {
    std::fs::OpenOptions::new()
        .append(true)
        .create(true)
        .open(&path)
        .unwrap_or_else(|e| panic!("fail to open or create {:?}: {}", path.as_ref(), e))
}

/// Appends `content` plus a trailing newline to `filename`, serializing
/// concurrent callers in this process through `FILE_MUTEX`.
///
/// The original explicit `seek(SeekFrom::End(0))` has been removed: a file
/// opened with `.append(true)` (O_APPEND) positions every write at the end
/// of the file automatically, so the seek was redundant.
///
/// # Panics
///
/// Panics if the file cannot be opened or the write fails, or if the
/// mutex was poisoned by a panicking writer.
pub fn append_to_file<P: AsRef<Path>>(filename: P, content: &str) {
    let _file_mutex = FILE_MUTEX.lock().unwrap();

    let mut f = open_with_create_append(filename);
    writeln!(f, "{}", content).unwrap();
}
sensitivity_probing_cost2_baseline.toml & 6 | 7 | round_mses=( 8 | 1 9 | 10 10 | 25 11 | 50 12 | 75 13 | 100 14 | ) 15 | 16 | cnt=5 17 | for f in ${round_mses[@]}; do 18 | echo round_ms: $f 19 | conf=sensitivity_probing_cost2_$f.toml 20 | cp sensitivity_probing_cost2_base.toml $conf 21 | sed -i "s/round_ms = 100/round_ms = $f/" $conf 22 | sed -i "s/sensitivity_probing_cost2_base/sensitivity_probing_cost2_$f/" $conf 23 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin mapreduce_experiment --release -- -P 5 -c $conf & 24 | cnt=`expr $cnt + 5` # 5 threads 25 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 26 | done 27 | 28 | wait 29 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | background_flow_freqs=( 6 | 25_000_000 7 | 50_000_000 8 | 1_00_000_000 9 | 2_00_000_000 10 | 4_00_000_000 11 | 8_00_000_000 12 | 1_600_000_000 13 | 3_200_000_000 14 | 6_400_000_000 15 | 12_800_000_000 16 | 25_600_000_000 17 | 51_200_000_000 18 | 102_400_000_000 19 | ) 20 | 21 | cnt=0 22 | for f in ${background_flow_freqs[@]}; do 23 | echo $f 24 | conf=spectrum5_$f.toml 25 | cp spectrum5_base.toml $conf 26 | sed -i "s/\(.*\)frequency_ns = 2_00_000_000\(.*\)/\1frequency_ns = $f\2/" $conf 27 | sed -i "s/spectrum5_1/spectrum5_$f/" $conf 28 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin rl_experiment --release -- -P 5 -c $conf & 29 | cnt=`expr $cnt + 5` # 5 threads 30 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 31 | done 32 | 33 | wait 34 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP 
EXIT 4 | 5 | background_flow_freqs=( 6 | 25_000_000 7 | 50_000_000 8 | 1_00_000_000 9 | 2_00_000_000 10 | 4_00_000_000 11 | 8_00_000_000 12 | 1_600_000_000 13 | 3_200_000_000 14 | 6_400_000_000 15 | 12_800_000_000 16 | 25_600_000_000 17 | 51_200_000_000 18 | 102_400_000_000 19 | ) 20 | 21 | cnt=0 22 | for f in ${background_flow_freqs[@]}; do 23 | echo $f 24 | conf=spectrum1_$f.toml 25 | cp spectrum1_base.toml $conf 26 | sed -i "s/\(.*\)frequency_ns = 2_00_000_000\(.*\)/\1frequency_ns = $f\2/" $conf 27 | sed -i "s/spectrum1_1/spectrum1_$f/" $conf 28 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 29 | cnt=`expr $cnt + 5` # 5 threads 30 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 31 | done 32 | 33 | wait 34 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | background_flow_freqs=( 6 | 25_000_000 7 | 50_000_000 8 | 1_00_000_000 9 | 2_00_000_000 10 | 4_00_000_000 11 | 8_00_000_000 12 | 1_600_000_000 13 | 3_200_000_000 14 | 6_400_000_000 15 | 12_800_000_000 16 | 25_600_000_000 17 | 51_200_000_000 18 | 102_400_000_000 19 | ) 20 | 21 | cnt=0 22 | for f in ${background_flow_freqs[@]}; do 23 | echo $f 24 | conf=spectrum2_$f.toml 25 | cp spectrum2_base.toml $conf 26 | sed -i "s/\(.*\)frequency_ns = 2_00_000_000\(.*\)/\1frequency_ns = $f\2/" $conf 27 | sed -i "s/spectrum2_1/spectrum2_$f/" $conf 28 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 29 | cnt=`expr $cnt + 5` # 5 threads 30 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 31 | done 32 | 33 | wait 34 | -------------------------------------------------------------------------------- /evaluation/spectrum/run_spectrum6.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | background_flow_freqs=( 6 | 25_000_000 7 | 50_000_000 8 | 1_00_000_000 9 | 2_00_000_000 10 | 4_00_000_000 11 | 8_00_000_000 12 | 1_600_000_000 13 | 3_200_000_000 14 | 6_400_000_000 15 | 12_800_000_000 16 | 25_600_000_000 17 | 51_200_000_000 18 | 102_400_000_000 19 | ) 20 | 21 | cnt=0 22 | for f in ${background_flow_freqs[@]}; do 23 | echo $f 24 | conf=spectrum6_$f.toml 25 | cp spectrum6_base.toml $conf 26 | sed -i "s/\(.*\)frequency_ns = 2_00_000_000\(.*\)/\1frequency_ns = $f\2/" $conf 27 | sed -i "s/spectrum6_1/spectrum6_$f/" $conf 28 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin mapreduce_experiment --release -- -P 5 -c $conf & 29 | cnt=`expr $cnt + 5` # 5 threads 30 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 31 | done 32 | 33 | wait 34 | -------------------------------------------------------------------------------- /src/allreduce/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "allreduce" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | publish = false 7 | 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | log = "0.4.11" 12 | rand = "0.8.3" 13 | gnuplot = "0.0.37" 14 | nethint = { path = "../nethint" } 15 | logging = { path = "../logging" } 16 | mapreduce = { path = "../mapreduce" } 17 | utils = { path = "../utils" } 18 | structopt = "0.3.21" 19 | lpsolve = "0.1.0" 20 | lpsolve-sys = "5.5.0" 21 | rand_distr = "0.4.0" 22 | toml = "0.5.8" 23 | serde = { version = "1.0.122", features = ["derive"] } 24 | lazy_static = "1.4.0" 25 | rayon = "1.5.0" 26 | rat_solver = { path = "../rat_solver" } 27 | indicatif = "0.17.0-rc.4" 28 | 29 | [[bin]] 30 | name = "allreduce_experiment" 31 | path = 
#!/bin/bash
# Herd-behavior experiment: launch mapreduce_experiment for several counts
# of overlapped jobs, scaling time_scale down as the job count grows so the
# total offered load stays comparable across runs.

# Kill the whole process group on exit/interrupt so backgrounded cargo runs
# do not outlive this script.
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT

overlapped_jobs=(
5
10
20
30
)

time_scales=(
1
0.5
0.2
0.1
)

# time_scales=(
# 1
# 0.1
# 0.05
# 0.01
# )

idx=0
# FIX: cnt was used below without initialization; `expr '' + 5` fails and
# cnt stayed empty, so the nproc-based throttle never triggered. Every
# sibling evaluation script initializes cnt before the loop.
cnt=0
for f in ${overlapped_jobs[@]}; do
echo overlapped_jobs: $f
conf=mapreduce_herd_$f.toml
# Derive a per-run config from the base template.
cp mapreduce_herd_base.toml $conf
time_scale=${time_scales[$idx]}
echo time_scale: $time_scale
sed -i "s/^time_scale = 1/time_scale = ${time_scale}/" $conf
sed -i "s/mapreduce_herd_base/mapreduce_herd_$f/" $conf
RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin mapreduce_experiment --release -- -P 5 -c $conf &
idx=`expr $idx + 1`
cnt=`expr $cnt + 5` # 5 threads
# Throttle: once the estimated thread count reaches the core count, wait
# for the outstanding runs before launching more.
[[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; }
done

wait
$DIR/../nethint-bpf 14 | 15 | nix develop -c cargo build 16 | nix develop -c cargo build --release 17 | 18 | # build BPF program and its userspace program 19 | # DIR=$(dirname `realpath $0`) 20 | # nix develop $DIR/../nethint-bpf -c cargo build 21 | # nix develop $DIR/../nethint-bpf -c cargo build --release 22 | # for sec in {.BTF,.eh_frame,.text,.BTF.ext}; do 23 | # nix develop $DIR/../nethint-bpf -c \ 24 | # llvm-strip --strip-unneeded --remove-section ${sec} \ 25 | # /nfs/cjr/Developing/nethint-bpf/target/debug/build/nethint-userspace-0abfae651d38dbe6/out/target/bpf/programs/nethint/nethint.elf 26 | # done 27 | -------------------------------------------------------------------------------- /evaluation/sensitivity/run_sensitivity_probing_cost1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 4 | 5 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c sensitivity_probing_cost1_baseline.toml & 6 | 7 | round_mses=( 8 | 1 9 | 10 10 | 25 11 | # 50 12 | # 75 13 | 100 14 | ) 15 | 16 | cnt=5 17 | for f in ${round_mses[@]}; do 18 | echo round_ms: $f 19 | conf=sensitivity_probing_cost1_$f.toml 20 | cp sensitivity_probing_cost1_base.toml $conf 21 | auto_tune=`python3 -c "print('{:.0f}'.format(${f} * 10))"` 22 | echo auto_tune: $auto_tune 23 | sed -i "s/round_ms = 100/round_ms = $f/" $conf 24 | sed -i "s/auto_tune = 1000/auto_tune = ${auto_tune}/" $conf 25 | sed -i "s/sensitivity_probing_cost1_base/sensitivity_probing_cost1_$f/" $conf 26 | RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf & 27 | cnt=`expr $cnt + 5` # 5 threads 28 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 29 | done 30 | 31 | wait 32 | -------------------------------------------------------------------------------- /scripts/testbed-2/setup_ovs.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 1 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | 17 | # Create an OVS bridge (here it's named ovs-sriov). 18 | ovs-vsctl add-br ovs0 19 | 20 | # check the result 21 | ovs-vsctl get Open_vSwitch . other_config 22 | ovs-vsctl get Open_vSwitch . dpdk_initialized 23 | 24 | # Restart the openvswitch service. This step is required for HW offload changes to take effect. 25 | systemctl restart openvswitch-switch.service 26 | 27 | # Make sure to bring up the PF and representor netdevices. 28 | ovs-vsctl add-port ovs0 $pf 29 | 30 | # show something 31 | ovs-vsctl list-ports ovs0 32 | ovs-dpctl show 33 | 34 | # add sflow configuration 35 | DIR=$(dirname `realpath $0`) 36 | source "$DIR"/utils.sh 37 | rack_agent_ip=`get_rack_agent_ip` 38 | echo 'rack agent IP: ' $rack_agent_ip 39 | # do not setup sFlow, use BPF agent instead 40 | # ./setup_sflow.sh $rack_agent_ip 41 | -------------------------------------------------------------------------------- /src/allreduce/src/lib.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | pub mod argument; 4 | 5 | pub mod app; 6 | 7 | pub mod random_ring; 8 | 9 | pub mod topology_aware; 10 | 11 | pub mod rat; 12 | 13 | pub mod config; 14 | 15 | use nethint::{cluster::Topology, Flow}; 16 | use std::rc::Rc; 17 | 18 | #[derive(Debug, Clone)] 19 | pub struct JobSpec { 20 | pub num_workers: usize, 21 | pub buffer_size: usize, 22 | pub num_iterations: usize, 23 | } 24 | 25 | impl JobSpec { 26 | pub fn new(num_workers: usize, buffer_size: usize, num_iterations: usize) -> Self { 27 | JobSpec { 28 | num_workers, 29 | buffer_size, 30 | num_iterations, 31 | } 32 | } 33 | } 34 | 35 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, 
#!/bin/bash
# Herd-behavior experiment: launch allreduce_experiment for several counts
# of overlapped jobs, scaling rack_size and rack_bw together so the
# oversubscription ratio stays comparable across runs.

# Kill the whole process group on exit/interrupt so backgrounded cargo runs
# do not outlive this script.
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT

overlapped_jobs=(
5
10
20
30
)

rack_sizes=(
7
15
30
40
)

rack_bws=(
233
500
1000
1333
)

idx=0
# FIX: cnt was used below without initialization; `expr '' + 5` fails and
# cnt stayed empty, so the nproc-based throttle never triggered. Every
# sibling evaluation script initializes cnt before the loop.
cnt=0
for n in ${overlapped_jobs[@]}; do
echo n: $n
rack_size=${rack_sizes[$idx]}
rack_bw=${rack_bws[$idx]}
echo rack_size: $rack_size
echo rack_bw: $rack_bw
conf=allreduce_herd_$n.toml
# Derive a per-run config from the base template.
cp allreduce_herd_base.toml $conf
sed -i "s/^ncases = 40/ncases = ${n}/" $conf
sed -i "s/^rack_size = 40/rack_size = ${rack_size}/" $conf
sed -i "s/^rack_bw = 1333/rack_bw = ${rack_bw}/" $conf
sed -i "s/allreduce_herd_base/allreduce_herd_$n/" $conf
RUST_BACKTRACE=1 RUST_LOG=error cargo run --bin allreduce_experiment --release -- -P 5 -c $conf &
idx=`expr $idx + 1`
cnt=`expr $cnt + 5` # 5 threads
# Throttle: once the estimated thread count reaches the core count, wait
# for the outstanding runs before launching more.
[[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; }
done

wait
threads 27 | [[ $cnt -ge $(nproc) ]] && { wait; cnt=`expr $cnt - 5`; } 28 | done 29 | 30 | wait 31 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/pubkeys.nix: -------------------------------------------------------------------------------- 1 | [ 2 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAMvo/AjpONh9/Y4MEwSyygyucngxsAVuZwUDEt6fk3m root@danyang-01" 3 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMbY3tVYI5q/vxab4YcIejCNUd58Azp4Bv7bT0RgPATX root@danyang-02" 4 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILPFsLGA+mBhA7s+aIEt3q2if8QpkcQ542cvIjA5XKpl root@danyang-03" 5 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM22OX0cdXpl9ItXHqsfwdA+hJr0GNcgpij8R7fXKGXa root@danyang-04" 6 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM/GEWc20b7noIOP9de+tnFHGA6pFxxa69E/s//wAdDk root@danyang-05" 7 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAu83mlLBROmGdMcbWo97ssMugGzM8Mp1bs1UZFo+xPz root@danyang-06" 8 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGhs6gFH2lE9YUk2KDFQ7XyYr6MgVQwG8DHm1M/w/hCq cjr@danyang-01" 9 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEFXTx3dU1by85bCjrlIdYmazlvCEOCc3Rx8Bg+pEe5I cjr@danyang-02" 10 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJjiy+/hk2tKBHxMzHeqzhFps+T5AVHQ2nyxOltD5VdJ cjr@danyang-03" 11 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPLNdTWXsb/M8sCwHpdzbLTMojfKZBlehzliSq1wP+rd cjr@danyang-04" 12 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJfKFXbfE4W8wm6n+Sfdcwdo8wXoARpde/8BSGemGUNy cjr@danyang-05" 13 | "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAVG2U5Qo9Ac3nBdah4iCbhZwlPEh5jZtsduCVIkpA67 cjr@danyang-06" 14 | ] 15 | -------------------------------------------------------------------------------- /src/replayer/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "replayer" 3 | version = "0.1.0" 4 | authors = ["Jingrong Chen "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | log 
= "0.4.14" 11 | anyhow = "1.0.38" 12 | thiserror = "1.0.23" 13 | bincode = "1.3.1" 14 | serde = { version = "1.0.123", features = ["derive"] } 15 | logging = { path = "../logging" } 16 | nethint = { path = "../nethint" } 17 | litemsg = { path = "../litemsg" } 18 | mapreduce = { path = "../mapreduce" } 19 | allreduce = { path = "../allreduce" } 20 | rl = { path = "../rl" } 21 | nhagent_v2 = { path = "../nhagent_v2" } 22 | utils = { path = "../utils" } 23 | structopt = "0.3.21" 24 | nix = "0.19.1" 25 | serde_bytes = "0.11.5" 26 | num_cpus = "1.13.0" 27 | rand = "0.8.3" 28 | rand_distr = "0.4.0" 29 | toml = "0.5.8" 30 | zipf = "7.0.0" 31 | lazy_static = "1.4.0" 32 | sha2 = "0.9.3" 33 | crossbeam = "0.8.0" 34 | 35 | [dependencies.mio] 36 | version = "0.6.23" 37 | 38 | 39 | [[bin]] 40 | name = "controller" 41 | path = "src/controller/main.rs" 42 | 43 | [[bin]] 44 | name = "worker" 45 | path = "src/worker/main.rs" 46 | 47 | [[bin]] 48 | name = "rplaunch" 49 | path = "src/launcher.rs" 50 | 51 | [[bin]] 52 | name = "scheduler" 53 | path = "src/scheduler.rs" 54 | -------------------------------------------------------------------------------- /src/nhagent/src/argument.rs: -------------------------------------------------------------------------------- 1 | use nethint::architecture::TopoArgs; 2 | use nethint::background_flow_hard::BackgroundFlowHard; 3 | use structopt::StructOpt; 4 | 5 | #[derive(Debug, Clone, StructOpt)] 6 | #[structopt(name = "nhagent", about = "NetHint Agent")] 7 | pub struct Opts { 8 | /// The working interval of agent in millisecond 9 | #[structopt(short = "i", long = "interval", default_value = "100")] 10 | pub interval_ms: u64, 11 | 12 | /// The listening port of the sampler 13 | #[structopt(short = "p", long = "p", default_value = "5555")] 14 | pub sampler_listen_port: u16, 15 | 16 | /// Specify the topology for testbed 17 | #[structopt(subcommand)] 18 | pub topo: TopoArgs, 19 | 20 | /// Background flow parameter by enforcing rate limit, the 21 | /// 
format is freq:prob:amp[:avg_load] 22 | #[structopt(short, long, default_value)] 23 | pub background_flow_hard: BackgroundFlowHard, 24 | 25 | /// When specified, it represents the number of the duplicated agent. 26 | /// This option is only used to measure the system overhead by running 27 | /// multiple nhagents on the same servers. 28 | #[structopt(short, long)] 29 | pub shadow_id: Option, 30 | 31 | /// Disable HetHint v2, and only run NetHint v1. 32 | #[structopt(short, long)] 33 | pub disable_v2: bool, 34 | } 35 | -------------------------------------------------------------------------------- /src/rl/src/lib.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | pub mod argument; 4 | 5 | pub mod app; 6 | 7 | pub mod contraction; 8 | pub mod random_ring; 9 | pub mod rat; 10 | pub mod topology_aware; 11 | 12 | pub mod config; 13 | 14 | use std::rc::Rc; 15 | use nethint::{cluster::Topology, Flow}; 16 | 17 | #[derive(Debug, Clone)] 18 | pub struct JobSpec { 19 | pub num_workers: usize, 20 | pub buffer_size: usize, 21 | pub num_iterations: usize, 22 | pub root_index: usize, 23 | } 24 | 25 | impl JobSpec { 26 | pub fn new( 27 | num_workers: usize, 28 | buffer_size: usize, 29 | num_iterations: usize, 30 | root_index: usize, 31 | ) -> Self { 32 | JobSpec { 33 | num_workers, 34 | buffer_size, 35 | num_iterations, 36 | root_index, 37 | } 38 | } 39 | } 40 | 41 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] 42 | pub enum RLPolicy { 43 | Random, 44 | TopologyAware, 45 | Contraction, 46 | /// Resilient Aggregation Tree 47 | RAT, 48 | } 49 | 50 | pub trait RLAlgorithm { 51 | fn run_rl_traffic( 52 | &mut self, 53 | root_index: usize, 54 | group: Option>, // worker group 55 | size: u64, 56 | vcluster: Rc, 57 | ) -> Vec; 58 | } 59 | -------------------------------------------------------------------------------- /src/replayer/src/controller/app.rs: 
-------------------------------------------------------------------------------- 1 | use crate::Node; 2 | use litemsg::endpoint::Endpoint; 3 | use std::collections::HashMap; 4 | use nethint::hint::NetHintVersion; 5 | use nethint::TenantId; 6 | use crate::message; 7 | // use nhagent::timing::{self, TimeList}; 8 | 9 | pub trait Application { 10 | fn workers(&self) -> &HashMap; 11 | 12 | fn workers_mut(&mut self) -> &mut HashMap; 13 | 14 | fn brain(&self) -> &Endpoint; 15 | 16 | fn brain_mut(&mut self) -> &mut Endpoint; 17 | 18 | fn tenant_id(&self) -> TenantId; 19 | 20 | fn hostname_to_node(&self) -> &HashMap; 21 | 22 | fn request_nethint(&mut self, version: NetHintVersion) -> anyhow::Result<()> { 23 | // let mut time_list = TimeList::new(); 24 | // time_list.push_now(timing::ON_TENANT_SENT_REQ); 25 | // let msg = nhagent::message::Message::NetHintRequest(self.tenant_id(), version, time_list); 26 | let msg = nhagent_v2::message::Message::NetHintRequest(self.tenant_id(), version); 27 | self.brain_mut().post(msg, None)?; 28 | Ok(()) 29 | } 30 | 31 | fn start(&mut self) -> anyhow::Result<()>; 32 | 33 | fn on_event(&mut self, cmd: message::Command) -> anyhow::Result; 34 | 35 | fn finish(&mut self) -> anyhow::Result<()> { 36 | for worker in self.workers_mut().values_mut() { 37 | worker.post(message::Command::AppFinish, None)?; 38 | } 39 | Ok(()) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/rl/src/argument.rs: -------------------------------------------------------------------------------- 1 | use nethint::architecture::TopoArgs; 2 | use structopt::StructOpt; 3 | 4 | #[derive(Debug, Clone, StructOpt)] 5 | #[structopt(name = "AllReduce", about = "AllReduce Application")] 6 | pub struct Opt { 7 | /// Specify the topology for simulation 8 | #[structopt(subcommand)] 9 | pub topo: TopoArgs, 10 | 11 | /// Number of workers. 
12 | #[structopt(short = "w", long = "num_workers", default_value = "16")] 13 | pub num_workers: usize, 14 | 15 | /// Buffer size of allreduce. 16 | #[structopt(short = "s", long = "buffer_size", default_value = "100000000")] 17 | pub buffer_size: usize, 18 | 19 | /// Number of allreduce iterations. 20 | #[structopt(short = "i", long = "num_iterations", default_value = "1200")] 21 | pub num_iterations: usize, 22 | 23 | /// Number of jobs. 24 | #[structopt(short = "n", long = "ncases", default_value = "1")] 25 | pub ncases: usize, 26 | 27 | /// Nethint level. 28 | #[structopt(short = "l", long = "nethint_level", default_value = "1")] 29 | pub nethint_level: usize, 30 | 31 | /// Poisson arrival lambda. 32 | #[structopt(short = "p", long = "poisson_lambda", default_value = "24000000000")] 33 | pub poisson_lambda: f64, 34 | 35 | /// Asymmetric bandwidth 36 | #[structopt(short = "a", long = "asymmetric")] 37 | pub asym: bool, 38 | 39 | /// Auto tune after some itertions. 40 | #[structopt(long = "autotune")] 41 | pub tune: Option, 42 | } 43 | -------------------------------------------------------------------------------- /src/allreduce/src/argument.rs: -------------------------------------------------------------------------------- 1 | use nethint::architecture::TopoArgs; 2 | use structopt::StructOpt; 3 | 4 | #[derive(Debug, Clone, StructOpt)] 5 | #[structopt(name = "AllReduce", about = "AllReduce Application")] 6 | pub struct Opt { 7 | /// Specify the topology for simulation 8 | #[structopt(subcommand)] 9 | pub topo: TopoArgs, 10 | 11 | /// Number of workers. 12 | #[structopt(short = "w", long = "num_workers", default_value = "16")] 13 | pub num_workers: usize, 14 | 15 | /// Buffer size of allreduce. 16 | #[structopt(short = "s", long = "buffer_size", default_value = "100000000")] 17 | pub buffer_size: usize, 18 | 19 | /// Number of allreduce iterations. 
20 | #[structopt(short = "i", long = "num_iterations", default_value = "1200")] 21 | pub num_iterations: usize, 22 | 23 | /// Number of jobs. 24 | #[structopt(short = "n", long = "ncases", default_value = "1")] 25 | pub ncases: usize, 26 | 27 | /// Nethint level. 28 | #[structopt(short = "l", long = "nethint_level", default_value = "1")] 29 | pub nethint_level: usize, 30 | 31 | /// Poisson arrival lambda. 32 | #[structopt(short = "p", long = "poisson_lambda", default_value = "24000000000")] 33 | pub poisson_lambda: f64, 34 | 35 | /// Asymmetric bandwidth 36 | #[structopt(short = "a", long = "asymmetric")] 37 | pub asym: bool, 38 | 39 | /// Auto tune after some itertions. 40 | #[structopt(long = "autotune")] 41 | pub tune: Option, 42 | } 43 | -------------------------------------------------------------------------------- /src/nethint/src/runtime_est.rs: -------------------------------------------------------------------------------- 1 | pub struct RunningTimeEstimator { 2 | total_trials: Option, 3 | done_trials: usize, 4 | data: Vec, // running time for each single trial 5 | single_start: std::time::Instant, 6 | running_time: std::time::Duration, 7 | } 8 | 9 | impl RunningTimeEstimator { 10 | pub fn new() -> Self { 11 | RunningTimeEstimator { 12 | total_trials: None, 13 | done_trials: 0, 14 | data: Vec::new(), 15 | single_start: std::time::Instant::now(), 16 | running_time: std::time::Duration::from_nanos(0), 17 | } 18 | } 19 | 20 | pub fn set_total_trials(&mut self, total_trials: usize) { 21 | self.total_trials = Some(total_trials); 22 | } 23 | 24 | pub fn bench_single_start(&mut self) { 25 | let now = std::time::Instant::now(); 26 | self.running_time += now - self.single_start; 27 | 28 | if let Some(total_trials) = self.total_trials { 29 | if self.done_trials > 0 { 30 | log::info!( 31 | "average speed: {:?} second/trial, time left: {:?}", 32 | self.running_time / self.done_trials as u32, 33 | self.running_time * (total_trials - self.done_trials) as u32 34 | / 
self.done_trials as u32 35 | ); 36 | } 37 | } 38 | 39 | self.data.push(now - self.single_start); 40 | self.done_trials += 1; 41 | self.single_start = now; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/litemsg/src/buffer.rs: -------------------------------------------------------------------------------- 1 | /// A Buffer represents a segment of sending or receiving data (maybe unfinished). 2 | #[derive(Debug, Default)] 3 | pub struct Buffer { 4 | inner: Vec, 5 | cur_pos: usize, 6 | } 7 | 8 | impl Buffer { 9 | pub fn from_vec(v: Vec) -> Self { 10 | Buffer { 11 | inner: v, 12 | cur_pos: 0, 13 | } 14 | } 15 | 16 | pub fn with_len(len: usize) -> Self { 17 | let mut inner = Vec::with_capacity(len); 18 | unsafe { 19 | inner.set_len(len); 20 | } 21 | Buffer { inner, cur_pos: 0 } 22 | } 23 | 24 | pub fn take(&mut self) -> Vec { 25 | std::mem::take(&mut self.inner) 26 | } 27 | 28 | pub fn as_slice(&self) -> &[u8] { 29 | assert!(self.is_clear()); 30 | &self.inner 31 | } 32 | 33 | pub fn as_slice_mut(&mut self) -> &mut [u8] { 34 | assert!(self.is_clear()); 35 | &mut self.inner 36 | } 37 | 38 | pub fn mark_handled(&mut self, nbytes: usize) { 39 | self.cur_pos += nbytes; 40 | assert!(self.cur_pos <= self.inner.len()); 41 | } 42 | 43 | pub fn is_clear(&self) -> bool { 44 | self.cur_pos == self.inner.len() 45 | } 46 | 47 | pub fn get_remain_buffer(&self) -> &[u8] { 48 | &self.inner[self.cur_pos..] 49 | } 50 | 51 | pub fn get_remain_buffer_mut(&mut self) -> &mut [u8] { 52 | &mut self.inner[self.cur_pos..] 53 | } 54 | } 55 | 56 | // /// Zero-copied buffer with reference counting. 
57 | // #[derive(Debug, Default)] 58 | // struct ZBuffer { 59 | // } 60 | -------------------------------------------------------------------------------- /src/mapreduce/src/plink.rs: -------------------------------------------------------------------------------- 1 | use nethint::{ 2 | app::{AppEvent, Application, Sequence}, 3 | background_flow::{BackgroundFlowApp, BackgroundFlowPattern}, 4 | simulator::Events, 5 | Duration, 6 | }; 7 | 8 | #[derive(Debug)] 9 | pub struct PlinkApp<'a, T> { 10 | dur_ms: Duration, 11 | inner: Box>, 12 | } 13 | 14 | impl<'a, T: 'a> PlinkApp<'a, T> 15 | where 16 | T: Default + Clone + std::fmt::Debug, 17 | { 18 | pub fn new(nhosts: usize, round_ms: u64, app: Box + 'a>) -> Self { 19 | let dur_ms = (nhosts as u64 * round_ms) as _; 20 | let background_flow = Box::new(BackgroundFlowApp::new( 21 | nhosts, 22 | dur_ms, 23 | BackgroundFlowPattern::PlinkProbe, 24 | Some(100_000_000), // 8ms on 100G 25 | T::default(), 26 | )); 27 | 28 | let mut app_seq = Box::new(Sequence::new()); 29 | app_seq.add(background_flow); 30 | app_seq.add(app); 31 | 32 | PlinkApp { 33 | dur_ms, 34 | inner: app_seq, 35 | } 36 | } 37 | } 38 | 39 | impl<'a> Application for PlinkApp<'a, Option> { 40 | type Output = Option; 41 | 42 | fn on_event(&mut self, event: AppEvent) -> Events { 43 | self.inner.on_event(event) 44 | } 45 | 46 | fn answer(&mut self) -> Option { 47 | // self.inner.answer().last().unwrap().clone() 48 | self.inner 49 | .answer() 50 | .last() 51 | .unwrap() 52 | .map(|dur| dur + self.dur_ms * 1_000_000) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/logging/src/lib.rs: -------------------------------------------------------------------------------- 1 | use log::info; 2 | 3 | pub fn init_log() { 4 | use chrono::Utc; 5 | use std::io::Write; 6 | 7 | let env = env_logger::Env::default().default_filter_or("debug"); 8 | env_logger::Builder::from_env(env) 9 | .format(|buf, record| { 10 | let level_style = 
buf.default_level_style(record.level()); 11 | writeln!( 12 | buf, 13 | "[{} {} {}:{}] {}", 14 | Utc::now().format("%Y-%m-%d %H:%M:%S%.6f"), 15 | level_style.value(record.level()), 16 | record.file().unwrap_or(""), 17 | record.line().unwrap_or(0), 18 | &record.args() 19 | ) 20 | }) 21 | .init(); 22 | 23 | info!("env_logger initialized"); 24 | } 25 | 26 | pub fn init_log_with_id(id: String) { 27 | use chrono::Utc; 28 | use std::io::Write; 29 | 30 | let env = env_logger::Env::default().default_filter_or("debug"); 31 | env_logger::Builder::from_env(env) 32 | .format(move |buf, record| { 33 | let level_style = buf.default_level_style(record.level()); 34 | writeln!( 35 | buf, 36 | "[{} {} {}:{} {}] {}", 37 | Utc::now().format("%Y-%m-%d %H:%M:%S%.6f"), 38 | level_style.value(record.level()), 39 | record.file().unwrap_or(""), 40 | record.line().unwrap_or(0), 41 | id, 42 | &record.args() 43 | ) 44 | }) 45 | .init(); 46 | 47 | info!("env_logger initialized"); 48 | } 49 | 50 | -------------------------------------------------------------------------------- /src/nhagent_v2/src/argument.rs: -------------------------------------------------------------------------------- 1 | use nethint::architecture::TopoArgs; 2 | use nethint::background_flow_hard::BackgroundFlowHard; 3 | use structopt::StructOpt; 4 | use std::net::SocketAddr; 5 | 6 | #[derive(Debug, Clone, StructOpt)] 7 | #[structopt(name = "nhagent", about = "NetHint Agent")] 8 | pub struct Opts { 9 | /// The working interval of agent in millisecond 10 | #[structopt(short = "i", long = "interval", default_value = "100")] 11 | pub interval_ms: u64, 12 | 13 | /// The listening port of the sampler 14 | #[structopt(short = "p", long, default_value = "6343")] 15 | pub sampler_listen_port: u16, 16 | 17 | /// Specify the topology for testbed 18 | #[structopt(subcommand)] 19 | pub topo: TopoArgs, 20 | 21 | /// Background flow parameter by enforcing rate limit, the 22 | /// format is freq:prob:amp[:avg_load] 23 | #[structopt(short, 
long, default_value)] 24 | pub background_flow_hard: BackgroundFlowHard, 25 | 26 | /// When specified, it represents the number of the duplicated agent. 27 | /// This option is only used to measure the system overhead by running 28 | /// multiple nhagents on the same servers. 29 | #[structopt(short, long)] 30 | pub shadow_id: Option, 31 | 32 | /// Disable HetHint v2, and only run NetHint v1. 33 | #[structopt(short, long)] 34 | pub disable_v2: bool, 35 | 36 | // the two fields below are used by the BPF userspace program 37 | /// Physical interface name for the BPF program 38 | #[structopt(long)] 39 | pub iface: Option, 40 | 41 | /// Rack leader address (ip:port) 42 | #[structopt(long)] 43 | pub rack_leader: Option, 44 | } 45 | -------------------------------------------------------------------------------- /scripts/InfoCollectOverHead.hs: -------------------------------------------------------------------------------- 1 | import Text.Printf (printf) 2 | 3 | numRacks = 1000 4 | numMachinesEachRack = 20 5 | numVMsEachMachine = 10 6 | 7 | rackBandwidth = 600 * 1e9 -- Gbps 8 | -- machineBandwidth = 100 -- Gbps 9 | 10 | nethintPeriodInSec = 0.1 -- 100ms 11 | 12 | numVirtualLinksEachRack :: Float 13 | numVirtualLinksEachRack = 2 * (1 + numMachinesEachRack * numVMsEachMachine) 14 | -- 2 * (1 ToR switch + a bunch of VMs) 15 | -- 2 because of there are a upstream link and a downstream link 16 | 17 | numVirtualLinks :: Float 18 | numVirtualLinks = numVirtualLinksEachRack * numRacks 19 | 20 | -- in bytes 21 | nBrPairSize :: Float 22 | nBrPairSize = 2 * 8 23 | 24 | -- in bytes 25 | virtualLinkIDSize :: Float 26 | virtualLinkIDSize = 8 27 | 28 | -- one virtualLinkID, 8 bytes 29 | -- two (n, Br) pairs, one for traffic within the rack, and the other for traffic contributes to the cross rack link 30 | virtualLinkBytes :: Float 31 | virtualLinkBytes = virtualLinkIDSize + 2 * nBrPairSize 32 | 33 | -- for each rack, it has to send and receive so much information 34 | -- note that the 
`numVirtualLinks` contains all the virtual links in the data center 35 | crossRackTrafficBytes :: Float -- bytes 36 | crossRackTrafficBytes = numVirtualLinks * virtualLinkBytes 37 | 38 | crossRackTrafficPerSecond :: Float -- bits/second 39 | crossRackTrafficPerSecond = 8 * crossRackTrafficBytes / nethintPeriodInSec 40 | 41 | computeBandwidthOverhead :: Float 42 | computeBandwidthOverhead = crossRackTrafficPerSecond / rackBandwidth 43 | 44 | 45 | showMB :: Float -> String 46 | showMB bytes = printf "%fMB" (bytes / 1e6) 47 | 48 | main = do 49 | let percentage = computeBandwidthOverhead 50 | printf "%.5f%%\n" (percentage * 100) 51 | -------------------------------------------------------------------------------- /src/allreduce/src/random_ring.rs: -------------------------------------------------------------------------------- 1 | use crate::AllReduceAlgorithm; 2 | use nethint::{cluster::Topology, Flow}; 3 | use rand::prelude::SliceRandom; 4 | use rand::{rngs::StdRng, SeedableRng}; 5 | use std::rc::Rc; 6 | 7 | #[derive(Debug)] 8 | pub struct RandomRingAllReduce { 9 | seed: u64, 10 | num_rings: usize, 11 | rng: StdRng, 12 | } 13 | 14 | impl RandomRingAllReduce { 15 | pub fn new(seed: u64, num_rings: usize) -> Self { 16 | RandomRingAllReduce { 17 | seed, 18 | num_rings, 19 | rng: StdRng::seed_from_u64(seed), 20 | } 21 | } 22 | } 23 | 24 | impl AllReduceAlgorithm for RandomRingAllReduce { 25 | fn allreduce(&mut self, size: u64, vcluster: Rc) -> Vec { 26 | let n = vcluster.num_hosts(); 27 | 28 | let mut flows = Vec::new(); 29 | for _ in 0..self.num_rings { 30 | let mut alloced_hosts: Vec = (0..n).into_iter().collect(); 31 | alloced_hosts.shuffle(&mut self.rng); 32 | assert!(n > 0); 33 | for _ in 0..2 { 34 | for i in 0..n { 35 | let pred = format!("host_{}", alloced_hosts[i]); 36 | let succ = format!("host_{}", alloced_hosts[(i + 1) % n]); 37 | log::debug!("pred: {}, succ: {}", pred, succ); 38 | let flow = Flow::new( 39 | size as usize * (n - 1) / n / self.num_rings, 40 | 
&pred, 41 | &succ, 42 | None, 43 | ); 44 | flows.push(flow); 45 | } 46 | } 47 | } 48 | 49 | flows 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /scripts/testbed-2/enable_sriov.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 2 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | num_vfs=$2 17 | 18 | pci_addr=$(basename `readlink /sys/class/net/$pf/device`) 19 | echo "PCI address of $pf is $pci_addr" 20 | # domain:bus:slot.function 21 | # pci_slot=`echo $pci_addr | cut -d "." -f1` 22 | 23 | # check if sriov has already been enabled 24 | echo 0 > /sys/class/net/$pf/device/sriov_numvfs 25 | num=`cat /sys/class/net/$pf/device/sriov_numvfs` 26 | if [ $num -ne 0 ]; then 27 | echo "$pf SR-IOV has already been enabled, to change the number of VFs, please disable it first, then execute this script" 28 | exit 2 29 | fi 30 | 31 | echo $num_vfs > /sys/class/net/$pf/device/sriov_numvfs 32 | 33 | # set mac address of VFs 34 | for ((i=0;i<$num_vfs;i++)); do 35 | macaddr=`tr -dc A-F0-9 < /dev/urandom | head -c 10 | sed -r 's/(..)/\1:/g;s/:$//;s/^/02:/'` 36 | ip link set $pf vf $i mac $macaddr; 37 | done 38 | 39 | ip link show $pf 40 | 41 | # bind VF's driver to vfio-pci 42 | modprobe vfio-pci 43 | 44 | for ((i=0;i<$num_vfs;i++)); do 45 | vf_pci_addr=$(basename `readlink /sys/class/net/$pf/device/virtfn$i`) 46 | current_driver=$(basename `readlink /sys/bus/pci/devices/$vf_pci_addr/driver`) 47 | if [ "x$current_driver" != "xvfio-pci" ]; then 48 | # unbind current driver 49 | echo $vf_pci_addr > /sys/bus/pci/devices/$vf_pci_addr/driver/unbind 50 | numeric_id=`lspci -s $vf_pci_addr -n | cut -d " " -f3 | tr ':' ' '` 51 | # bind to vfio-pci 52 | echo $numeric_id > /sys/bus/pci/drivers/vfio-pci/new_id 53 | fi 54 | # show results 55 | lspci -k -s 
$vf_pci_addr 56 | done 57 | -------------------------------------------------------------------------------- /scripts/testbed/enable_sriov.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 2 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | num_vfs=$2 17 | 18 | pci_addr=$(basename `readlink /sys/class/net/$pf/device`) 19 | echo "PCI address of $pf is $pci_addr" 20 | # domain:bus:slot.function 21 | # pci_slot=`echo $pci_addr | cut -d "." -f1` 22 | 23 | # check if sriov has already been enabled 24 | echo 0 > /sys/class/net/$pf/device/sriov_numvfs 25 | num=`cat /sys/class/net/$pf/device/sriov_numvfs` 26 | if [ $num -ne 0 ]; then 27 | echo "$pf SR-IOV has already been enabled, to change the number of VFs, please disable it first, then execute this script" 28 | exit 2 29 | fi 30 | 31 | echo $num_vfs > /sys/class/net/$pf/device/sriov_numvfs 32 | 33 | # set mac address of VFs 34 | for ((i=0;i<$num_vfs;i++)); do 35 | macaddr=`tr -dc A-F0-9 < /dev/urandom | head -c 10 | sed -r 's/(..)/\1:/g;s/:$//;s/^/02:/'` 36 | ip link set $pf vf $i mac $macaddr; 37 | done 38 | 39 | ip link show $pf 40 | 41 | # bind VF's driver to vfio-pci 42 | modprobe vfio-pci 43 | 44 | for ((i=0;i<$num_vfs;i++)); do 45 | vf_pci_addr=$(basename `readlink /sys/class/net/$pf/device/virtfn$i`) 46 | current_driver=$(basename `readlink /sys/bus/pci/devices/$vf_pci_addr/driver`) 47 | if [ "x$current_driver" != "xvfio-pci" ]; then 48 | # unbind current driver 49 | echo $vf_pci_addr > /sys/bus/pci/devices/$vf_pci_addr/driver/unbind 50 | numeric_id=`lspci -s $vf_pci_addr -n | cut -d " " -f3 | tr ':' ' '` 51 | # bind to vfio-pci 52 | echo $numeric_id > /sys/bus/pci/drivers/vfio-pci/new_id 53 | fi 54 | # show results 55 | lspci -k -s $vf_pci_addr 56 | done 57 | 
-------------------------------------------------------------------------------- /scripts/run_duplicates_v2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $UID -ne 0 ]; then 4 | echo "Please run $0 as root" 5 | exit 3 6 | fi 7 | 8 | if [ $# -ne 1 ]; then 9 | echo "Usage: $0 " 10 | exit 1 11 | fi 12 | 13 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM SIGHUP EXIT 14 | 15 | scale=$1 16 | rack_size=3 # you may want to change it to 20 in emulation 17 | num_racks=`expr 6 \* $scale` 18 | num_workers=`expr $num_racks \* $rack_size` 19 | 20 | if [ "x$scale" = "x1" -o "x$scale" = "x" ]; then 21 | sampler_port=6343 22 | 23 | RUST_BACKTRACE=full \ 24 | NH_CONTROLLER_URI=danyang-01.cs.duke.edu:9000 \ 25 | NH_NUM_RACKS=$num_racks \ 26 | target/release/nhagent_v2 \ 27 | -p $sampler_port \ 28 | -i 100 \ 29 | -b 10000000000:1:5:0.1 \ 30 | arbitrary $num_racks $rack_size 10 10 31 | 32 | else 33 | # sampler_port=6343 34 | 35 | # RUST_BACKTRACE=full \ 36 | # NH_CONTROLLER_URI=danyang-01.cs.duke.edu:9000 \ 37 | # NH_NUM_RACKS=$num_racks \ 38 | # valgrind --leak-check=full --show-reachable=yes target/release/nhagent_v2 \ 39 | # --shadow-id 0 \ 40 | # -p $sampler_port \ 41 | # -i 100 \ 42 | # -b 10000000000:1:5:0.1 \ 43 | # arbitrary $num_racks $rack_size 10 10 \ 44 | # & 45 | # # --disable-v2 \ 46 | 47 | for ((i=0; i<$scale; i++)); do 48 | sampler_port=`expr 6343 + $i` 49 | 50 | RUST_BACKTRACE=full \ 51 | NH_CONTROLLER_URI=danyang-01.cs.duke.edu:9000 \ 52 | NH_NUM_RACKS=$num_racks \ 53 | target/release/nhagent_v2 \ 54 | --shadow-id $i \ 55 | -p $sampler_port \ 56 | -i 100 \ 57 | -b 10000000000:1:5:0.1 \ 58 | arbitrary $num_racks $rack_size 10 10 \ 59 | & 60 | # --disable-v2 \ 61 | done 62 | 63 | wait 64 | fi 65 | 66 | # DIR=$(dirname `realpath $0`) 67 | # nix develop $DIR/../nethint-bpf -c \ 68 | # sudo -E NH_LOG=info RUST_BACKTRACE=1 \ 69 | # $DIR/../nethint-bpf/target/debug/nethint-user \ 70 | # arbitrary 
$num_racks 3 10 10 -------------------------------------------------------------------------------- /src/rl/src/random_ring.rs: -------------------------------------------------------------------------------- 1 | use crate::RLAlgorithm; 2 | use nethint::{cluster::Topology, Flow}; 3 | use rand::prelude::SliceRandom; 4 | use rand::{rngs::StdRng, SeedableRng}; 5 | use std::rc::Rc; 6 | 7 | #[derive(Debug)] 8 | pub struct RandomChain { 9 | seed: u64, 10 | num_trees: usize, 11 | rng: StdRng, 12 | } 13 | 14 | impl RandomChain { 15 | pub fn new(seed: u64, num_trees: usize) -> Self { 16 | RandomChain { 17 | seed, 18 | num_trees, 19 | rng: StdRng::seed_from_u64(seed), 20 | } 21 | } 22 | } 23 | 24 | impl RLAlgorithm for RandomChain { 25 | fn run_rl_traffic( 26 | &mut self, 27 | root_index: usize, 28 | group: Option>, 29 | size: u64, 30 | vcluster: Rc, 31 | ) -> Vec { 32 | let mut flows = Vec::new(); 33 | 34 | for _ in 0..self.num_trees { 35 | let mut alloced_hosts: Vec = if group.is_none() { 36 | let n = vcluster.num_hosts(); 37 | let mut hs: Vec = (0..n).into_iter().collect(); 38 | hs.remove(root_index); 39 | hs 40 | } else { 41 | group.clone().unwrap() 42 | }; 43 | alloced_hosts.shuffle(&mut self.rng); 44 | 45 | alloced_hosts.insert(0, root_index); 46 | 47 | assert!( 48 | alloced_hosts.len() >= 2, 49 | "vcluster size must >= 2, worker group cannot be empty" 50 | ); 51 | 52 | for (&x, &y) in alloced_hosts.iter().zip(alloced_hosts.iter().skip(1)) { 53 | let pred = format!("host_{}", x); 54 | let succ = format!("host_{}", y); 55 | let flow = Flow::new(size as usize, &pred, &succ, None); 56 | flows.push(flow); 57 | } 58 | } 59 | 60 | for f in &mut flows { 61 | f.bytes /= self.num_trees; 62 | } 63 | 64 | log::info!("flows: {:?}", flows); 65 | flows 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/allreduce/src/topology_aware.rs: -------------------------------------------------------------------------------- 1 | use 
crate::AllReduceAlgorithm; 2 | use nethint::{cluster::Topology, Flow}; 3 | use std::rc::Rc; 4 | 5 | #[derive(Debug, Default)] 6 | pub struct TopologyAwareRingAllReduce { 7 | seed: u64, 8 | num_rings: usize, 9 | } 10 | 11 | impl TopologyAwareRingAllReduce { 12 | pub fn new(seed: u64, num_rings: usize) -> Self { 13 | TopologyAwareRingAllReduce { seed, num_rings } 14 | } 15 | } 16 | 17 | impl AllReduceAlgorithm for TopologyAwareRingAllReduce { 18 | fn allreduce(&mut self, size: u64, vcluster: Rc) -> Vec { 19 | use rand::prelude::SliceRandom; 20 | use rand::{rngs::StdRng, SeedableRng}; 21 | let mut rng = StdRng::seed_from_u64(self.seed); 22 | 23 | let mut flows = Vec::new(); 24 | 25 | for _ in 0..self.num_rings { 26 | let mut ring = Vec::new(); 27 | 28 | for i in 0..vcluster.num_switches() - 1 { 29 | let mut ringlet = Vec::new(); 30 | let tor = format!("tor_{}", i); 31 | 32 | for link_ix in vcluster.get_downlinks(vcluster.get_node_index(&tor)) { 33 | let h = vcluster.get_target(*link_ix); 34 | let host_idx = vcluster[h].name.strip_prefix("host_").unwrap().parse::().unwrap(); 35 | ringlet.push(host_idx) 36 | } 37 | ringlet.shuffle(&mut rng); 38 | for node_idx in ringlet { 39 | ring.push(node_idx); 40 | } 41 | } 42 | 43 | let n = vcluster.num_hosts(); 44 | for _ in 0..2 { 45 | for i in 0..n { 46 | let sender = format!("host_{}", ring[i]); 47 | let receiver = format!("host_{}", ring[(i + 1) % n]); 48 | let flow = Flow::new(size as usize * (n - 1) / n / self.num_rings, &sender, &receiver, None); 49 | flows.push(flow); 50 | } 51 | } 52 | } 53 | 54 | flows 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /scripts/testbed/enable_eswitch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 1 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | 
17 | pci_addr=$(basename `readlink /sys/class/net/$pf/device`) 18 | echo "PCI address of $pf is $pci_addr" 19 | 20 | pci_bus=$(printf "%d" "0x`echo $pci_addr | cut -d':' -f2`") 21 | pci_slot=$(printf "%d" "0x`echo $pci_addr | cut -d':' -f3 | cut -d'.' -f1 `") 22 | pf_altname="enp${pci_bus}s${pci_slot}" 23 | echo "altname of PF: $pf_altname" 24 | 25 | # check if device has already been set to eswitch mode 26 | mode=`cat /sys/class/net/$pf/compat/devlink/mode` 27 | if [ $mode = "switchmode" ]; then 28 | echo "$pf has already been set to eswitch mode" 29 | exit 2 30 | fi 31 | 32 | # unbind the VFs 33 | num_vfs=`cat /sys/class/net/$pf/device/sriov_numvfs` 34 | drivers=(`seq 1 8`) 35 | 36 | for ((i=0;i<$num_vfs;i++)); do 37 | vf_pci_addr=$(basename `readlink /sys/class/net/$pf/device/virtfn$i`) 38 | driver=$(basename `readlink /sys/bus/pci/devices/$vf_pci_addr/driver`) 39 | drivers[$i]=$driver 40 | echo $vf_pci_addr > /sys/bus/pci/devices/$vf_pci_addr/driver/unbind 41 | # show results 42 | lspci -k -s $vf_pci_addr 43 | done 44 | 45 | echo switchdev > "/sys/class/net/$pf/compat/devlink/mode" 46 | 47 | echo "sleeping for 20 seconds..." 
48 | sleep 20 49 | 50 | # check if the VF representors has been renamed 51 | ip link 52 | 53 | # It is necessary to first set the network VF representor device names 54 | # to be in the form of $PF_$VFID where $PF is the PF netdev name, 55 | # and $VFID is the VF ID=0,1,[..], bring up these VF representors 56 | for ((i=0;i<$num_vfs;i++)); do 57 | ip link set "${pf_altname}_${i}" name "${pf}_${i}" 58 | ip link set "${pf}_${i}" up 59 | done 60 | 61 | 62 | # re-bind the VFs' drivers 63 | for ((i=0;i<$num_vfs;i++)); do 64 | vf_pci_addr=$(basename `readlink /sys/class/net/$pf/device/virtfn$i`) 65 | echo $vf_pci_addr > /sys/bus/pci/drivers/${drivers[$i]}/bind 66 | # show results 67 | lspci -k -s $vf_pci_addr 68 | done 69 | -------------------------------------------------------------------------------- /src/nethint/tests/toy1.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | use nethint::bandwidth::BandwidthTrait; 3 | use nethint::cluster::{Cluster, Node, NodeType}; 4 | use nethint::simulator::{Executor, Simulator}; 5 | use nethint::{Flow, Trace, TraceRecord}; 6 | 7 | #[test] 8 | fn toy1() { 9 | logging::init_log(); 10 | 11 | let nodes = vec![ 12 | ("a1", 3), 13 | ("a2", 3), 14 | ("a3", 3), 15 | ("a5", 3), 16 | ("a6", 3), 17 | ("vs1", 2), 18 | ("vs2", 2), 19 | ("cloud", 1), 20 | ] 21 | .into_iter() 22 | .map(|(n, depth)| { 23 | Node::new( 24 | n, 25 | depth, 26 | if depth == 3 { 27 | NodeType::Host 28 | } else { 29 | NodeType::Switch 30 | }, 31 | ) 32 | }) 33 | .collect(); 34 | 35 | let mut cluster = Cluster::from_nodes(nodes); 36 | 37 | vec![ 38 | ("a1", "vs1", 20.gbps()), 39 | ("a2", "vs1", 10.gbps()), 40 | ("a3", "vs1", 9.gbps()), 41 | ("a5", "vs2", 10.gbps()), 42 | ("a6", "vs2", 5.gbps()), 43 | ("vs1", "cloud", 35.gbps()), 44 | ("vs2", "cloud", 15.gbps()), 45 | ] 46 | .into_iter() 47 | .for_each(|args| cluster.add_link_by_name(args.1, args.0, args.2)); 48 | 49 | let mut trace = Trace::new(); 50 | let records: 
Vec<TraceRecord> = vec![ 51 | (0, 1e6 as usize, "a1", "a5"), 52 | (0, 1e6 as usize, "a2", "a6"), 53 | (1000000, 1e6 as usize, "a2", "a3"), 54 | ] 55 | .into_iter() 56 | .map(|args| TraceRecord::new(args.0, Flow::new(args.1, args.2, args.3, None), None)) 57 | .collect(); 58 | records.into_iter().for_each(|r| trace.add_record(r)); 59 | 60 | let mut simulator = Simulator::new(cluster); 61 | let output = simulator.run_with_trace(trace); 62 | println!("{:#?}", output); 63 | assert_eq!(output.recs[0].dura, Some(800_000)); 64 | assert_eq!(output.recs[1].dura, Some(1_600_000)); 65 | assert_eq!( 66 | output.recs[2].dura, 67 | Some(((600. + (10. / 16.) * 1600. * (5. / 9.)) * 1e3f64).round() as u64) 68 | ); 69 | } 70 | -------------------------------------------------------------------------------- /src/nhagent_v2/src/message.rs: -------------------------------------------------------------------------------- 1 | use nethint::{TenantId, hint::{NetHintV1Real, NetHintV2Real, NetHintVersion}}; 2 | use nethint::counterunit::CounterUnit; 3 | use nethint::cluster::LinkIx; 4 | use serde::{Deserialize, Serialize}; 5 | use std::collections::HashMap; 6 | use crate::communicator::BcastId; 7 | 8 | #[derive(Debug, Serialize, Deserialize)] 9 | pub enum Message { 10 | // do not handle these messages for now 11 | // /// send by worker, processed by leader 12 | // LeaveNode(Node), 13 | /// send by leader, processed by worker 14 | AppFinish, 15 | 16 | /// send by non global leader, processed by global leader 17 | /// barrier ID 18 | SyncRequest(u64), 19 | 20 | /// send by global leader, processed by non global leader 21 | /// barrier ID 22 | SyncResponse(u64), 23 | 24 | /// broadcast type wrapper 25 | BcastMessage(BcastId, Box<Message>), 26 | 27 | /// send by worker, processed by worker 28 | DeclareHostname(String), 29 | 30 | /// A potential problem here is that LinkIx from different machines may not be compatible 31 | /// send by rack leader, processed by rack leader 32 | RackChunk(HashMap<LinkIx, Vec<CounterUnit>>), 33 | 34 | /// send by 
experiment scheduler, processed by rack leader, 35 | /// forward by rack leader, processed by global leader 36 | /// in practice, we skip the forwarding pass 37 | /// tenant_id, nhosts, allow_delay 38 | ProvisionRequest(TenantId, usize, bool), 39 | /// send by global leader, processed by rack leader 40 | /// forward by rack leader, processed by experiment scheduler 41 | /// in practice, we skip the forwarding pass 42 | /// tenant_id, hintv1 43 | ProvisionResponse(TenantId, NetHintV1Real), 44 | /// send by app, processed by global leader 45 | DestroyRequest(TenantId), 46 | /// send by global leader, processed by app 47 | DestroyResponse(TenantId), 48 | /// send by app, processed by rack/global leader leader 49 | NetHintRequest(TenantId, NetHintVersion), 50 | /// send by rack/global leader, processed by app 51 | NetHintResponseV1(TenantId, NetHintV1Real), 52 | /// send by rack/global leader, processed by app 53 | NetHintResponseV2(TenantId, NetHintV2Real), 54 | 55 | /// send by the scheduler 56 | BatchDoneNotification, 57 | } 58 | -------------------------------------------------------------------------------- /scripts/testbed/environment/cpu_vm_stage1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script will create cpu_vm_base.img in the current directory, 4 | # the cpu_vm_base.img can be used as a disk and boot by qemu 5 | 6 | DISK_IMG=/tmp/cpu_vm_base.img 7 | 8 | umount /mnt 9 | losetup -D 10 | 11 | # create raw disk, the 5GB is not enought for the latest OFED packages, so give it 10GB 12 | dd if=/dev/zero of=$DISK_IMG bs=1G count=10 status=progress && sync 13 | 14 | # create only 1 partition, mark it bootable 15 | # an example command 16 | echo -en 'n\np\n1\n\n\na\nw\n\n' | fdisk $DISK_IMG 17 | 18 | # in the below example output, I only create 100MB file disk 19 | # cjr@cpu22 /tmp % fdisk -l raw_disk.img 20 | # Disk raw.bin: 100 MiB, 104857600 bytes, 204800 sectors 21 | # Units: sectors of 1 * 
512 = 512 bytes 22 | # Sector size (logical/physical): 512 bytes / 512 bytes 23 | # I/O size (minimum/optimal): 512 bytes / 512 bytes 24 | # Disklabel type: dos 25 | # Disk identifier: 0x7bcdb498 26 | # 27 | # Device Boot Start End Sectors Size Id Type 28 | # raw.bin1 * 2048 204799 202752 99M 83 Linux 29 | 30 | LOOP_DEV=`losetup -f` 31 | if [ $? -ne 0 ]; then 32 | echo "losetup -f cannot find free loop device" 33 | exit 1 34 | fi 35 | 36 | losetup $LOOP_DEV $DISK_IMG 37 | partprobe $LOOP_DEV 38 | # root@cpu21 /tmp # lsblk 39 | # NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT 40 | # loop0 7:0 0 4.7G 0 loop 41 | # ├─loop0p1 259:0 0 4.7G 0 loop 42 | # sda 8:0 0 1.8T 0 disk 43 | # └─sda1 8:1 0 1.8T 0 part / 44 | 45 | LOOP_PART=${LOOP_DEV}p1 46 | # format the filesystems 47 | mkfs.ext4 $LOOP_PART 48 | 49 | # mount the partitions 50 | mount $LOOP_PART /mnt 51 | 52 | # debootstrap 53 | apt install debootstrap -y 54 | debootstrap --merged-usr --keyring=/usr/share/keyrings/ubuntu-archive-keyring.gpg --verbose focal /mnt http://archive.ubuntu.com/ubuntu/ 55 | 56 | # generate fstab 57 | apt install arch-install-scripts -y 58 | genfstab -U /mnt | grep -v swap >> /mnt/etc/fstab 59 | 60 | # after command finish, chroot to that directory 61 | cp /etc/apt/sources.list /mnt/etc/apt/sources.list 62 | 63 | #mount -t proc /proc /mnt/proc 64 | #mount --rbind /sys /mnt/sys 65 | #mount --rbind /dev /mnt/dev 66 | #mount --rbind /run /mnt/run 67 | #cp /etc/resolv.conf /mnt/etc/resolv.conf 68 | #chroot /mnt /bin/bash 69 | 70 | cp ./cpu_vm_stage2.sh /mnt/root 71 | arch-chroot /mnt /bin/bash /root/cpu_vm_stage2.sh $LOOP_DEV 72 | 73 | sync 74 | umount /mnt 75 | losetup -D 76 | -------------------------------------------------------------------------------- /scripts/testbed-2/nixos-vm-setup/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | [[ $UID -ne 0 ]] && echo "Please run $0 as root" && exit 1 5 | 6 | # this will create a 
disk image with nixos filesystem at /tmp/nixos_vm_base.img 7 | ./prepare_disk.sh 8 | 9 | DISK_FILE="/var/lib/libvirt/images/nixos_vm_base.img" 10 | 11 | [[ -e "$DISK_FILE" ]] && echo "$(tput setaf 1)[ERROR]$(tput sgr 0) image file already exists" && exit 2 12 | 13 | # copy the image to image pool so virt-install can see it 14 | 15 | echo "copying /tmp/nixos_vm_base.img to libvirt image pool" 16 | cp /tmp/nixos_vm_base.img $DISK_FILE 17 | 18 | # cpubase m4.xlarge 19 | virt-install --virt-type kvm --name nixosbase --vcpus 8 --ram 16384 --boot hd --disk $DISK_FILE,format=raw --network network=default --network bridge=ovs0,virtualport_type=openvswitch,model=virtio --nographic --os-type=linux --os-variant=generic --noreboot --import 20 | 21 | # provision 8 NixOS VMs 22 | source `dirname $0`/utils.sh 23 | 24 | function customize() 25 | { 26 | name=$1 27 | # customize the ip address on ens3 interface 28 | # sed -i "s/192.168.211.3/192.168.211.3/" ./mnt/etc/nixos/configuration.nix 29 | # customize hostname 30 | sed -i "s/networking.hostName = \"nixos\"/networking.hostName = \"$name\"/" "$MNT_DIR/etc/nixos/configuration.nix" 31 | sed -i "s/nixosConfigurations.nixos/nixosConfigurations.$name/" "$MNT_DIR/etc/nixos/flake.nix" 32 | } 33 | 34 | NIXOS_INSTALL=`command -v nixos-install` 35 | NIXOS_ENTER=`command -v nixos-enter` 36 | 37 | for name in `lsnames 8`; do 38 | TARGET="/var/lib/libvirt/images/${name}.img" 39 | virsh vol-delete ${name}.img --pool images 40 | virt-clone --replace --original nixosbase --name $name --file "$TARGET" 41 | 42 | # an alternative to virt-sysprep 43 | MNT_DIR=./mnt 44 | mkdir -p "$MNT_DIR"; umount "$MNT_DIR"; losetup -D 45 | LOOP_DEV=`losetup -f` 46 | losetup -P $LOOP_DEV $TARGET 47 | mount ${LOOP_DEV}p1 "$MNT_DIR" 48 | customize $name 49 | PATH_BAK=$PATH 50 | export NIX_PATH=nixpkgs=/nix/var/nix/profiles/per-user/cjr/channels/nixos/nixpkgs 51 | export 
PATH=/run/wrappers/bin:/root/.nix-profile/bin:/etc/profiles/per-user/root/bin:/nix/var/nix/profiles/default/bin:/run/current-system/sw/bin:$PATH 52 | $NIXOS_INSTALL --root $(realpath "$MNT_DIR") --no-root-passwd --flake "$MNT_DIR/etc/nixos#$name" --impure 53 | $NIXOS_INSTALL --root $(realpath "$MNT_DIR") --no-bootloader --no-root-passwd --flake "$MNT_DIR/etc/nixos#$name" --impure 54 | export PATH=$PATH_BAK 55 | sync; umount -R "$MNT_DIR"; losetup -D 56 | done 57 | -------------------------------------------------------------------------------- /src/mapreduce/src/random.rs: -------------------------------------------------------------------------------- 1 | use rand::{self, seq::SliceRandom}; 2 | use std::collections::HashMap; 3 | 4 | use nethint::cluster::{Topology, LinkIx, RouteHint}; 5 | 6 | use crate::{JobSpec, PlaceReducer, Placement, Shuffle, RNG}; 7 | 8 | #[derive(Debug, Default)] 9 | pub struct RandomReducerScheduler {} 10 | 11 | impl RandomReducerScheduler { 12 | pub fn new() -> Self { 13 | Default::default() 14 | } 15 | 16 | pub fn estimate_jct( 17 | &mut self, 18 | cluster: &dyn Topology, 19 | job_spec: &JobSpec, 20 | mapper: &Placement, 21 | shuffle_pairs: &Shuffle, 22 | collocate: bool, 23 | ) -> f64 { 24 | let reducers = self.place(cluster, job_spec, mapper, shuffle_pairs, collocate); 25 | let mut traffic: HashMap = Default::default(); 26 | 27 | for (mi, m) in mapper.0.iter().enumerate() { 28 | let m_ix = cluster.get_node_index(m); 29 | for (ri, r) in reducers.0.iter().enumerate() { 30 | let s = shuffle_pairs.0[mi][ri]; 31 | let r_ix = cluster.get_node_index(r); 32 | if m_ix != r_ix { 33 | let route = cluster.resolve_route(m, r, &RouteHint::default(), None); 34 | for link_ix in route.path { 35 | *traffic.entry(link_ix).or_insert(0) += s; 36 | } 37 | } 38 | } 39 | } 40 | 41 | let mut est: f64 = 0.0; 42 | for (&link_ix, &tr) in traffic.iter() { 43 | let bw = cluster[link_ix].bandwidth; 44 | est = est.max(tr as f64 * 8.0 / bw.val() as f64); 45 | } 46 | 47 
| // unit in seconds 48 | est 49 | } 50 | } 51 | 52 | impl PlaceReducer for RandomReducerScheduler { 53 | fn place( 54 | &mut self, 55 | cluster: &dyn Topology, 56 | job_spec: &JobSpec, 57 | mapper: &Placement, 58 | _shuffle_pairs: &Shuffle, 59 | collocate: bool, 60 | ) -> Placement { 61 | RNG.with(|rng| { 62 | let mut rng = rng.borrow_mut(); 63 | let num_hosts = cluster.num_hosts(); 64 | let mut hosts: Vec<String> = (0..num_hosts).map(|x| format!("host_{}", x)).collect(); 65 | if !collocate { 66 | hosts.retain(|h| mapper.0.iter().find(|&m| m.eq(h)).is_none()); 67 | } 68 | let mut hosts: Vec<String> = hosts 69 | .choose_multiple(&mut *rng, job_spec.num_reduce) 70 | .cloned() 71 | .collect(); 72 | hosts.shuffle(&mut *rng); 73 | Placement(hosts) 74 | }) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/nhagent/src/ssagent.rs: -------------------------------------------------------------------------------- 1 | use std::net::SocketAddr; 2 | use structopt::StructOpt; 3 | use utils::cmd_helper::get_command_output; 4 | use std::time::Duration; 5 | use std::process::Command; 6 | use nhagent::sampler::ss_sampler::SsTcpFlows; 7 | 8 | #[derive(Debug, Clone, StructOpt)] 9 | #[structopt( 10 | name = "ssagent", 11 | about = "while true; do ss -tuni | nc -w0 -u 127.0.0.1 9999; sleep 0.1; done" 12 | )] 13 | struct Opts { 14 | /// Poll interval in ms 15 | #[structopt(short = "i", long = "interval", default_value = "100")] 16 | interval_ms: u64, 17 | /// Target address 18 | #[structopt(short = "t", long = "target")] 19 | target: Option<SocketAddr>, 20 | } 21 | 22 | #[inline] 23 | fn now() -> std::time::Instant { 24 | std::time::Instant::now() 25 | } 26 | 27 | fn get_default_target() -> SocketAddr { 28 | let my_ip = utils::net::get_primary_ipv4("rdma0").unwrap(); 29 | let last_field = my_ip.octets()[3]; 30 | // the conversion is hard coded 31 | // 3,4,5,6 -> 2, 35... 
-> 34 32 | let target_num = (last_field / 32 * 32 + 2).to_string(); 33 | let numbers = my_ip.octets(); 34 | let addr = format!( 35 | "{}.{}.{}.{}:{}", 36 | numbers[0], numbers[1], numbers[2], target_num, 5555 37 | ); 38 | addr.parse().unwrap_or_else(|_| panic!("addr: {}", addr)) 39 | } 40 | 41 | fn main() { 42 | logging::init_log(); 43 | 44 | let mut opts = Opts::from_args(); 45 | if opts.target.is_none() { 46 | opts.target = Some(get_default_target()); 47 | } 48 | log::info!("opts: {:?}", opts); 49 | 50 | let sock = std::net::UdpSocket::bind("0.0.0.0:34254").expect("bind failed"); 51 | sock.connect(opts.target.unwrap()).expect("connect failed"); 52 | sock.set_write_timeout(Some(Duration::from_millis(opts.interval_ms / 2))).unwrap(); 53 | let sleep_ms = Duration::from_millis(opts.interval_ms); 54 | let mut last_ts = now(); 55 | 56 | loop { 57 | let mut cmd = Command::new("ss"); 58 | cmd.arg("-tuni"); 59 | let output = get_command_output(cmd).unwrap(); 60 | let ss_flows: SsTcpFlows = output.parse().expect("fail to parse ss output"); 61 | let ts = std::time::SystemTime::now(); 62 | let buf = bincode::serialize(&(ts, ss_flows)).expect("fail to serialize ss_flows"); 63 | assert!(buf.len() <= 65507); 64 | match sock.send(&buf) { 65 | Ok(_nbytes) => {} 66 | Err(_e) => {} 67 | } 68 | let n = now(); 69 | if last_ts + sleep_ms > n { 70 | // avoid duration < 0, which will cause a panic. 71 | std::thread::sleep(last_ts + sleep_ms - n); 72 | } 73 | last_ts = now(); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /scripts/testbed/environment/README.md: -------------------------------------------------------------------------------- 1 | # Base Image Preparation 2 | 3 | Scripts to prepare a CPU base image file disk. 
4 | 5 | ```bash 6 | sudo ./cpu_vm_stage1.sh 7 | # at the end of stage1, the script will mount the guest filesystem at 8 | # /mnt, and copy the stage2 script to /mnt/root, and chroot to that and 9 | # execute the stage2.sh. 10 | ``` 11 | 12 | To install and use the VM, use libvirt. First copy the `cpu_vm_base.img` to 13 | `/var/lib/libvirt/images/cpu_vm_base.img`. Then run virt-install. The 14 | example below create an AWS `m4.xlarge` like CPU machine, except that the 15 | network uses RDMA SR-IOV. Remember to remove the `--print-xml` to really install 16 | the profile to libvirt. 17 | ```bash 18 | # you may finish the install without graphics 19 | virt-install --virt-type kvm --name cpubase --vcpus 4 --ram 16384 --boot hd --disk /var/lib/libvirt/images/cpu_vm_base.img,format=raw --network network=default --hostdev=pci_0000_18_00_1 --nographic --os-type=linux --os-variant=ubuntu20.04 --print-xml 20 | 21 | # or with graphics 22 | virt-install --virt-type kvm --name cpubase --vcpus 4 --ram 16384 --boot hd --disk /var/lib/libvirt/images/cpu_vm_base.img,format=raw --network network=default --hostdev=pci_0000_18_00_1 --graphic vnc,listen=0.0.0.0 --os-type=linux --os-variant=ubuntu20.04 --print-xml 23 | ``` 24 | 25 | 26 | To simplify the configuration, I use the default network created by libvirt to 27 | allow the internet access (SNAT). But that would not allow the VMs to reach 28 | each other. To allow this, I configure a correct IP address for the rdma 29 | interface and use that for interconnection. The switch has already been 30 | configured to support this. 31 | 32 | 33 | A bunch of things I decide to setup later after all VMs have been booted are 34 | 1. add many sshkeys to these VMs 35 | 2. generate a script for each VM to bring up and set different IP 36 | address for the rdma interface. 37 | 3. On every boot, attach VFs to corresponding VMs by using virsh 38 | attach-device, this gives more flexibility. 39 | 4. 
configure the name address resolution in both guests and hosts to 40 | allow easy access by sth like rdma0.cpu5 41 | 42 | 43 | Then we are done! 44 | 45 | 46 | ### Notes 47 | A couple of things that have to be checked for each reboot of physical 48 | server (aka what is not persistent). 49 | 1. `enable_sriov.sh` 50 | 2. `enable_eswitch.sh` 51 | 3. `setup_ovs.sh` 52 | 53 | The order is important. 54 | 55 | To clear ovs settings 56 | 1. `ovs-vsctl del-br ovs-sriov` 57 | 2. `ovs-dpctl show` 58 | 59 | To disable eswitch and recover the configuration 60 | 1. `echo legacy | sudo tee /sys/class/net/rdma0/compat/devlink/mode` 61 | 2. `enable_sriov.sh` 62 | 3. `echo 0 | sudo tee /sys/class/net/rdma0/device/sriov_numvfs` (optionally) 63 | -------------------------------------------------------------------------------- /evaluation/sensitivity/sensitivity_probing_cost1_base.toml: -------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | # job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 7 | 8 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 9 | buffer_size = 100_000_000 10 | 11 | # Number of iterations for all jobs 12 | num_iterations = 1000 13 | 14 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 15 | poisson_lambda = 24_000_000_000.0 16 | 17 | placement_strategy = { type = "Compact" } 18 | # placement_strategy = { type = "CompactLoadBalanced" } 19 | # placement_strategy = { type = "Spread" } 20 | # placement_strategy = { type = "Random", args = 0 } 21 | 22 | # global seed 23 | seed = 1 24 | 25 | # Output path for the simulation results 26 | directory = "/tmp/sensitivity_probing_cost1/sensitivity_probing_cost1_base" 27 | 28 | # Number of repeats for each batch of experiments 29 | batch_repeat = 5 30 | 31 | [[batch]] 32 | 
policy = "RAT" 33 | probe = { enable = true, round_ms = 100 } 34 | nethint_level = 2 35 | auto_tune = 1000 36 | 37 | [simulator] 38 | nethint = true 39 | sample_interval_ns = 100_000_000 # 100ms 40 | loopback_speed = 400 41 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 42 | # fairness = "TenantFlowMaxMin" 43 | fairness = "PerFlowMaxMin" 44 | 45 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 46 | # background_flow_hard = { enable = false } 47 | # nethint_delay_ms = 100 48 | 49 | [brain] 50 | # Random seed for multiple uses 51 | seed = 1 52 | # Whether the cluster's bandwidth is asymmetric 53 | asymmetric = false 54 | # The percentage of nodes marked broken 55 | broken = 0.0 56 | # The slots of each physical machine 57 | max_slots = 1 58 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 59 | sharing_mode = "Guaranteed" 60 | guaranteed_bandwidth = 25 61 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 62 | # background_flow_high_freq = { enable = false } 63 | gc_period = 10 64 | 65 | # The topology for simulation 66 | [brain.topology] 67 | type = "Arbitrary" # another possible value is "FatTree" 68 | 69 | # [brain.topology.args] # When type = "FatTree" 70 | # nports = 20 # the number of ports of a switch 71 | # bandwidth = 100 # in Gbps 72 | # oversub_ratio = 4.0 # oversubscription ratio 73 | 74 | [brain.topology.args] # When type = "Arbitrary" 75 | nracks = 320 # the number of racks 76 | rack_size = 18 # the number of hosts under a rack 77 | host_bw = 100 # bandwidth of a host, in Gbps 78 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 79 | 80 | # [envs] 81 | # KEY = "value" 82 | -------------------------------------------------------------------------------- /scripts/testbed/setup_ovs.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [ $UID -ne 0 ]; then 6 | echo "Please run $0 as root" 7 | exit 3 8 | fi 9 | 10 | if [ $# -ne 2 ]; then 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | pf=$1 16 | num_vfs=$2 17 | 18 | # Create an OVS bridge (here it's named ovs-sriov). 19 | ovs-vsctl add-br ovs-sriov 20 | 21 | # Enable hardware offload (disabled by default). 22 | ovs-vsctl set Open_vSwitch . other_config:hw-offload=true 23 | 24 | # The aging timeout of OVS is given is ms and can be controlled with this command: 25 | ovs-vsctl set Open_vSwitch . other_config:max-idle=30000 26 | 27 | # check the result 28 | ovs-vsctl get Open_vSwitch . other_config 29 | 30 | # Restart the openvswitch service. This step is required for HW offload changes to take effect. 31 | systemctl restart openvswitch-switch.service 32 | 33 | 34 | # Make sure to bring up the PF and representor netdevices. 35 | ovs-vsctl add-port ovs-sriov $pf 36 | for ((i=0;i<$num_vfs;i++)); do 37 | ovs-vsctl add-port ovs-sriov ${pf}_${i} 38 | done 39 | 40 | # show something 41 | ovs-vsctl list-ports ovs-sriov 42 | ovs-dpctl show 43 | 44 | 45 | # sudo ovs-appctl dpctl/dump-flows type=all -m 46 | # This will give results like below. All type of flows are displayed! 
47 | # recirc_id(0),in_port(2),eth(src=02:49:61:d4:70:e8,dst=02:bc:b6:ff:bf:97),eth_type(0x0800),ipv4(frag=no), packets:584316, bytes:36666200, used:4.940s, actions:3 48 | # recirc_id(0),in_port(2),eth(src=02:49:61:d4:70:e8,dst=02:bc:b6:ff:bf:97),eth_type(0x0806), packets:0, bytes:0, used:3.110s, actions:3 49 | # recirc_id(0),in_port(3),eth(src=02:bc:b6:ff:bf:97,dst=02:49:61:d4:70:e8),eth_type(0x0800),ipv4(frag=no), packets:29461402, bytes:34366302886, used:4.940s, actions:2 50 | # recirc_id(0),in_port(3),eth(src=02:bc:b6:ff:bf:97,dst=02:49:61:d4:70:e8),eth_type(0x0806), packets:2, bytes:120, used:2.090s, actions:2 51 | # recirc_id(0),in_port(3),eth(src=1c:34:da:a5:55:94,dst=01:80:c2:00:00:0e),eth_type(0x88cc), packets:0, bytes:0, used:4.340s, actions:drop 52 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1b:06,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.162,tip=192.168.211.2,op=1/0xff), packets:2120, bytes:127200, used:0.141s, actions:1,2 53 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1a:4e,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.130,tip=192.168.211.2,op=1/0xff), packets:2121, bytes:127260, used:0.633s, actions:1,2 54 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1b:5a,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.194,tip=192.168.211.2,op=1/0xff), packets:2120, bytes:127200, used:0.401s, actions:1,2 55 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1b:26,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.34,tip=192.168.211.2,op=1/0xff), packets:3120, bytes:187200, used:0.721s, actions:1,2 56 | # recirc_id(0),in_port(3),eth(src=0c:42:a1:ef:1a:4a,dst=ff:ff:ff:ff:ff:ff),eth_type(0x0806),arp(sip=192.168.211.66,tip=192.168.211.2,op=1/0xff), packets:2118, bytes:127080, used:0.845s, actions:1,2 57 | -------------------------------------------------------------------------------- /src/nhagent/src/message.rs: -------------------------------------------------------------------------------- 1 | use 
crate::sampler::EthAddr; 2 | 3 | use nethint::{TenantId, hint::{NetHintV1Real, NetHintV2Real, NetHintVersion}}; 4 | use nethint::counterunit::CounterUnit; 5 | use nethint::cluster::LinkIx; 6 | use serde::{Deserialize, Serialize}; 7 | use std::collections::HashMap; 8 | use std::net::IpAddr; 9 | use crate::communicator::BcastId; 10 | use crate::timing::TimeList; 11 | 12 | #[derive(Debug, Serialize, Deserialize)] 13 | pub enum Message { 14 | // do not handle these messages for now 15 | // /// send by worker, processed by leader 16 | // LeaveNode(Node), 17 | /// send by leader, processed by worker 18 | AppFinish, 19 | 20 | /// send by non global leader, processed by global leader 21 | /// barrier ID 22 | SyncRequest(u64), 23 | 24 | /// send by global leader, processed by non global leader 25 | /// barrier ID 26 | SyncResponse(u64), 27 | 28 | /// broadcast type wrapper 29 | BcastMessage(BcastId, Box<Message>), 30 | 31 | /// send by worker, processed by worker 32 | /// declare the table to map ethaddr to hostname collected locally 33 | DeclareEthHostTable(HashMap<EthAddr, String>), 34 | 35 | /// send by worker, processed by worker 36 | /// declare the table to map ip addr to hostname collected locally 37 | DeclareIpHostTable(HashMap<IpAddr, String>), 38 | 39 | /// send by worker, processed by worker 40 | DeclareHostname(String), 41 | 42 | /// send by worker, processed by rack leader 43 | ServerChunk(Vec<CounterUnit>, TimeList), 44 | /// A potential problem here is that LinkIx from different machines may not be compatible 45 | /// send by rack leader, processed by rack leader 46 | RackChunk(HashMap<LinkIx, Vec<CounterUnit>>, TimeList), 47 | /// send by rack leader, processed by worker 48 | AllHints(HashMap<LinkIx, Vec<CounterUnit>>), 49 | 50 | /// send by experiment scheduler, processed by rack leader, 51 | /// forward by rack leader, processed by global leader 52 | /// in practice, we skip the forwarding pass 53 | /// tenant_id, nhosts, allow_delay 54 | ProvisionRequest(TenantId, usize, bool), 55 | /// send by global leader, processed by rack leader 56 | /// forward by rack 
leader, processed by experiment scheduler 57 | /// in practice, we skip the forwarding pass 58 | /// tenant_id, hintv1 59 | ProvisionResponse(TenantId, NetHintV1Real), 60 | /// send by app, processed by global leader 61 | DestroyRequest(TenantId), 62 | /// send by global leader, processed by app 63 | DestroyResponse(TenantId), 64 | /// send by app, processed by rack/global leader leader 65 | NetHintRequest(TenantId, NetHintVersion, TimeList), 66 | /// send by rack/global leader, processed by app 67 | NetHintResponseV1(TenantId, NetHintV1Real), 68 | /// send by rack/global leader, processed by app 69 | NetHintResponseV2(TenantId, NetHintV2Real, TimeList), 70 | /// send by global leader, processed by all 71 | UpdateRateLimit(usize), 72 | } 73 | -------------------------------------------------------------------------------- /evaluation/sensitivity/sensitivity_probing_cost1_baseline.toml: -------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | # job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 7 | 8 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 9 | buffer_size = 100_000_000 10 | 11 | # Number of iterations for all jobs 12 | num_iterations = 1000 13 | 14 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 15 | poisson_lambda = 24_000_000_000.0 16 | 17 | placement_strategy = { type = "Compact" } 18 | # placement_strategy = { type = "CompactLoadBalanced" } 19 | # placement_strategy = { type = "Spread" } 20 | # placement_strategy = { type = "Random", args = 0 } 21 | 22 | # global seed 23 | seed = 1 24 | 25 | # Output path of for the simulation results 26 | directory = "/tmp/sensitivity_probing_cost1/sensitivity_probing_cost1_baseline" 27 | 28 | # Number of repeats for each batch of experiments 29 | batch_repeat = 
5 30 | 31 | [[batch]] 32 | policy = "Random" 33 | probe = { enable = false } 34 | nethint_level = 2 35 | 36 | [[batch]] 37 | policy = "RAT" 38 | probe = { enable = false } 39 | nethint_level = 2 40 | # Auto tune after some iterations. default is disabled 41 | auto_tune = 10 42 | 43 | [simulator] 44 | nethint = true 45 | sample_interval_ns = 100_000_000 # 100ms 46 | loopback_speed = 400 47 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 48 | # fairness = "TenantFlowMaxMin" 49 | fairness = "PerFlowMaxMin" 50 | 51 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 52 | # background_flow_hard = { enable = false } 53 | # nethint_delay_ms = 100 54 | 55 | [brain] 56 | # Random seed for multiple uses 57 | seed = 1 58 | # Whether the cluster's bandwidth is asymmetric 59 | asymmetric = false 60 | # The percentage of nodes marked broken 61 | broken = 0.0 62 | # The slots of each physical machine 63 | max_slots = 1 64 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 65 | sharing_mode = "Guaranteed" 66 | guaranteed_bandwidth = 25 67 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 68 | # background_flow_high_freq = { enable = false } 69 | gc_period = 10 70 | 71 | # The topology for simulation 72 | [brain.topology] 73 | type = "Arbitrary" # another possible value is "FatTree" 74 | 75 | # [brain.topology.args] # When type = "FatTree" 76 | # nports = 20 # the number of ports of a switch 77 | # bandwidth = 100 # in Gbps 78 | # oversub_ratio = 4.0 # oversubscription ratio 79 | 80 | [brain.topology.args] # When type = "Arbitrary" 81 | nracks = 320 # the number of racks 82 | rack_size = 18 # the number of hosts under a rack 83 | host_bw = 100 # bandwidth of a host, in Gbps 84 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 85 | 86 | # 
[envs] 87 | # KEY = "value" 88 | -------------------------------------------------------------------------------- /evaluation/inaccuracy/inaccuracy1_base.toml: -------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/inaccuracy1/inaccuracy1_base" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | [[batch]] 36 | policy = "TopologyAware" # which is ring 37 | probe = { enable = false } 38 | nethint_level = 1 39 | 40 | [[batch]] 41 | policy = "RAT" 42 | probe = { enable = false } 43 | nethint_level = 2 44 | # Auto tune after some iterations. 
default is disabled 45 | auto_tune = 10 46 | 47 | [simulator] 48 | nethint = true 49 | sample_interval_ns = 100_000_000 # 100ms 50 | loopback_speed = 400 51 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 52 | # fairness = "TenantFlowMaxMin" 53 | fairness = "PerFlowMaxMin" 54 | 55 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 56 | # background_flow_hard = { enable = false } 57 | # nethint_delay_ms = 100 58 | 59 | [brain] 60 | # Random seed for multiple uses 61 | seed = 1 62 | # Whether the cluster's bandwidth is asymmetric 63 | asymmetric = false 64 | # The percentage of nodes marked broken 65 | broken = 0.0 66 | # The slots of each physical machine 67 | max_slots = 1 68 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 69 | sharing_mode = "Guaranteed" 70 | guaranteed_bandwidth = 25 71 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 72 | # background_flow_high_freq = { enable = false } 73 | gc_period = 10 74 | # inaccuracy 75 | inaccuracy = 0.1 76 | 77 | # The topology for simulation 78 | [brain.topology] 79 | type = "Arbitrary" # another possible value is "FatTree" 80 | 81 | # [brain.topology.args] # When type = "FatTree" 82 | # nports = 20 # the number of ports of a switch 83 | # bandwidth = 100 # in Gbps 84 | # oversub_ratio = 4.0 # oversubscription ratio 85 | 86 | [brain.topology.args] # When type = "Arbitrary" 87 | nracks = 320 # the number of racks 88 | rack_size = 6 # the number of hosts under a rack 89 | host_bw = 100 # bandwidth of a host, in Gbps 90 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 91 | 92 | # [envs] 93 | # KEY = "value" 94 | -------------------------------------------------------------------------------- /evaluation/sensitivity/sensitivity_oversub_base.toml: 
-------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/sensitivity_oversub/sensitivity_oversub_base" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | # [[batch]] 36 | # policy = "RAT" 37 | # probe = { enable = true, round_ms = 100 } 38 | # nethint_level = 2 39 | # auto_tune = 1000 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = false } 44 | nethint_level = 2 45 | # Auto tune after some iterations. 
default is disabled 46 | auto_tune = 10 47 | 48 | [simulator] 49 | nethint = true 50 | sample_interval_ns = 100_000_000 # 100ms 51 | loopback_speed = 400 52 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 53 | # fairness = "TenantFlowMaxMin" 54 | fairness = "PerFlowMaxMin" 55 | 56 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 57 | # background_flow_hard = { enable = false } 58 | # nethint_delay_ms = 100 59 | 60 | [brain] 61 | # Random seed for multiple uses 62 | seed = 1 63 | # Whether the cluster's bandwidth is asymmetric 64 | asymmetric = false 65 | # The percentage of nodes marked broken 66 | broken = 0.0 67 | # The slots of each physical machine 68 | max_slots = 1 69 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 70 | sharing_mode = "Guaranteed" 71 | guaranteed_bandwidth = 25 72 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 73 | # background_flow_high_freq = { enable = false } 74 | gc_period = 10 75 | 76 | # The topology for simulation 77 | [brain.topology] 78 | type = "Arbitrary" # another possible value is "FatTree" 79 | 80 | # [brain.topology.args] # When type = "FatTree" 81 | # nports = 20 # the number of ports of a switch 82 | # bandwidth = 100 # in Gbps 83 | # oversub_ratio = 4.0 # oversubscription ratio 84 | 85 | [brain.topology.args] # When type = "Arbitrary" 86 | nracks = 320 # the number of racks 87 | rack_size = 6 # the number of hosts under a rack 88 | host_bw = 100 # bandwidth of a host, in Gbps 89 | rack_bw = 0 # bandwidth of a ToR switch, in Gbps 90 | 91 | # [envs] 92 | # KEY = "value" 93 | -------------------------------------------------------------------------------- /evaluation/sensitivity/sensitivity_rack_size_base.toml: 
-------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 8], [80, 12]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/sensitivity_rack_size/sensitivity_rack_size_base" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | # [[batch]] 36 | # policy = "RAT" 37 | # probe = { enable = true, round_ms = 100 } 38 | # nethint_level = 2 39 | # auto_tune = 1000 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = false } 44 | nethint_level = 2 45 | # Auto tune after some iterations. 
default is disabled 46 | auto_tune = 10 47 | 48 | [simulator] 49 | nethint = true 50 | sample_interval_ns = 100_000_000 # 100ms 51 | loopback_speed = 400 52 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 53 | # fairness = "TenantFlowMaxMin" 54 | fairness = "PerFlowMaxMin" 55 | 56 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 57 | # background_flow_hard = { enable = false } 58 | # nethint_delay_ms = 100 59 | 60 | [brain] 61 | # Random seed for multiple uses 62 | seed = 1 63 | # Whether the cluster's bandwidth is asymmetric 64 | asymmetric = false 65 | # The percentage of nodes marked broken 66 | broken = 0.0 67 | # The slots of each physical machine 68 | max_slots = 1 69 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 70 | sharing_mode = "Guaranteed" 71 | guaranteed_bandwidth = 25 72 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 73 | # background_flow_high_freq = { enable = false } 74 | gc_period = 10 75 | 76 | # The topology for simulation 77 | [brain.topology] 78 | type = "Arbitrary" # another possible value is "FatTree" 79 | 80 | # [brain.topology.args] # When type = "FatTree" 81 | # nports = 20 # the number of ports of a switch 82 | # bandwidth = 100 # in Gbps 83 | # oversub_ratio = 4.0 # oversubscription ratio 84 | 85 | [brain.topology.args] # When type = "Arbitrary" 86 | nracks = 320 # the number of racks 87 | rack_size = 0 # the number of hosts under a rack 88 | host_bw = 100 # bandwidth of a host, in Gbps 89 | rack_bw = 0 # bandwidth of a ToR switch, in Gbps 90 | 91 | # [envs] 92 | # KEY = "value" 93 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/standard3.toml: -------------------------------------------------------------------------------- 1 
| # Specification of an Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 30 5 | 6 | # Job size distributions [(percentage, number of workers)] 7 | job_size_distribution = [[40, 16], [40, 32]] 8 | 9 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 10 | buffer_size = 100_000_000 11 | 12 | # Number of iterations for all jobs 13 | num_iterations = 1000 14 | 15 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 16 | poisson_lambda = 24_000_000_000.0 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # global seed 24 | seed = 1 25 | 26 | # Output path for the simulation results 27 | directory = "/tmp/allreduce_result_for_paper/standard3" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 0 36 | 37 | [[batch]] 38 | policy = "RAT" 39 | probe = { enable = true, round_ms = 100 } 40 | nethint_level = 2 41 | auto_tune = 1000 42 | 43 | [[batch]] 44 | policy = "RAT" 45 | probe = { enable = false } 46 | nethint_level = 2 47 | # Auto tune after some iterations.
default is disabled 48 | auto_tune = 10 49 | 50 | [simulator] 51 | nethint = true 52 | sample_interval_ns = 100_000_000 # 100ms 53 | loopback_speed = 400 54 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 55 | # fairness = "TenantFlowMaxMin" 56 | fairness = "PerFlowMaxMin" 57 | 58 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 59 | # background_flow_hard = { enable = false } 60 | # nethint_delay_ms = 100 61 | 62 | [brain] 63 | # Random seed for multiple uses 64 | seed = 1 65 | # Whether the cluster's bandwidth is asymmetric 66 | asymmetric = false 67 | # The percentage of nodes marked broken 68 | broken = 0.0 69 | # The slots of each physical machine 70 | max_slots = 1 71 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 72 | sharing_mode = "Guaranteed" 73 | # in Gbps 74 | guaranteed_bandwidth = 25 75 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 76 | # background_flow_high_freq = { enable = false } 77 | gc_period = 100 78 | 79 | # The topology for simulation 80 | [brain.topology] 81 | type = "Arbitrary" # another possible value is "FatTree" 82 | 83 | # [brain.topology.args] # When type = "FatTree" 84 | # nports = 20 # the number of ports of a switch 85 | # bandwidth = 100 # in Gbps 86 | # oversub_ratio = 4.0 # oversubscription ratio 87 | 88 | [brain.topology.args] # When type = "Arbitrary" 89 | nracks = 300 # the number of racks 90 | rack_size = 18 # the number of hosts under a rack 91 | host_bw = 100 # bandwidth of a host, in Gbps 92 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 93 | 94 | # [envs] 95 | # KEY = "value" 96 | -------------------------------------------------------------------------------- /evaluation/inaccuracy/inaccuracy2_base.toml: -------------------------------------------------------------------------------- 1 | 
# Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[50, 8], [50, 16]] 6 | 7 | # Images in a ML serving batch request, batch size = 64, each image has 3 channels, crop to 256x256 8 | buffer_size = 12582912 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 1000 12 | 13 | # Lambda of the poisson arrival, 12582912 * 8 / 50G * 4000 = 2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | partially_sync = false 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # global seed 24 | seed = 1 25 | 26 | # Output path of for the simulation results 27 | directory = "/tmp/inaccuracy2/inaccuracy2_base" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 2 36 | 37 | [[batch]] 38 | policy = "TopologyAware" # which is ring 39 | probe = { enable = false } 40 | nethint_level = 1 41 | 42 | [[batch]] 43 | policy = "RAT" 44 | probe = { enable = false } 45 | nethint_level = 2 46 | # Auto tune after some iterations. 
default is disabled 47 | auto_tune = 10 48 | 49 | [simulator] 50 | nethint = true 51 | sample_interval_ns = 100_000_000 # 100ms 52 | loopback_speed = 400 53 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 54 | # fairness = "TenantFlowMaxMin" 55 | fairness = "PerFlowMaxMin" 56 | 57 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 58 | # background_flow_hard = { enable = false } 59 | # nethint_delay_ms = 100 60 | 61 | [brain] 62 | # Random seed for multiple uses 63 | seed = 1 64 | # Whether the cluster's bandwidth is asymmetric 65 | asymmetric = false 66 | # The percentage of nodes marked broken 67 | broken = 0.0 68 | # The slots of each physical machine 69 | max_slots = 1 70 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 71 | sharing_mode = "Guaranteed" 72 | guaranteed_bandwidth = 25 73 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 74 | # background_flow_high_freq = { enable = false } 75 | gc_period = 10 76 | # inaccuracy 77 | inaccuracy = 0.1 78 | 79 | # The topology for simulation 80 | [brain.topology] 81 | type = "Arbitrary" # another possible value is "FatTree" 82 | 83 | # [brain.topology.args] # When type = "FatTree" 84 | # nports = 20 # the number of ports of a switch 85 | # bandwidth = 100 # in Gbps 86 | # oversub_ratio = 4.0 # oversubscription ratio 87 | 88 | [brain.topology.args] # When type = "Arbitrary" 89 | nracks = 320 # the number of racks 90 | rack_size = 6 # the number of hosts under a rack 91 | host_bw = 100 # bandwidth of a host, in Gbps 92 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 93 | 94 | # [envs] 95 | # KEY = "value" 96 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/standard3_pervm.toml: 
-------------------------------------------------------------------------------- 1 | # Specification of an Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 30 5 | 6 | # Job size distributions [(percentage, number of workers)] 7 | job_size_distribution = [[40, 16], [40, 32]] 8 | 9 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 10 | buffer_size = 100_000_000 11 | 12 | # Number of iterations for all jobs 13 | num_iterations = 1000 14 | 15 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 16 | poisson_lambda = 24_000_000_000.0 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # global seed 24 | seed = 1 25 | 26 | # Output path for the simulation results 27 | directory = "/tmp/allreduce_result_for_paper/standard3_pervm" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 0 36 | 37 | [[batch]] 38 | policy = "RAT" 39 | probe = { enable = true, round_ms = 100 } 40 | nethint_level = 2 41 | auto_tune = 1000 42 | 43 | [[batch]] 44 | policy = "RAT" 45 | probe = { enable = false } 46 | nethint_level = 2 47 | # Auto tune after some iterations.
default is disabled 48 | auto_tune = 10 49 | 50 | [simulator] 51 | nethint = true 52 | sample_interval_ns = 100_000_000 # 100ms 53 | loopback_speed = 400 54 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 55 | # fairness = "TenantFlowMaxMin" 56 | # fairness = "PerFlowMaxMin" 57 | fairness = "PerVmPairMaxMin" 58 | 59 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 60 | # background_flow_hard = { enable = false } 61 | # nethint_delay_ms = 100 62 | 63 | [brain] 64 | # Random seed for multiple uses 65 | seed = 1 66 | # Whether the cluster's bandwidth is asymmetric 67 | asymmetric = false 68 | # The percentage of nodes marked broken 69 | broken = 0.0 70 | # The slots of each physical machine 71 | max_slots = 1 72 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 73 | sharing_mode = "Guaranteed" 74 | # in Gbps 75 | guaranteed_bandwidth = 25 76 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 77 | # background_flow_high_freq = { enable = false } 78 | gc_period = 100 79 | 80 | # The topology for simulation 81 | [brain.topology] 82 | type = "Arbitrary" # another possible value is "FatTree" 83 | 84 | # [brain.topology.args] # When type = "FatTree" 85 | # nports = 20 # the number of ports of a switch 86 | # bandwidth = 100 # in Gbps 87 | # oversub_ratio = 4.0 # oversubscription ratio 88 | 89 | [brain.topology.args] # When type = "Arbitrary" 90 | nracks = 300 # the number of racks 91 | rack_size = 6 # the number of hosts under a rack 92 | host_bw = 100 # bandwidth of a host, in Gbps 93 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 94 | 95 | # [envs] 96 | # KEY = "value" 97 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/standard3_pertenant.toml: 
-------------------------------------------------------------------------------- 1 | # Specification of an Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 30 5 | 6 | # Job size distributions [(percentage, number of workers)] 7 | job_size_distribution = [[40, 16], [40, 32]] 8 | 9 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 10 | buffer_size = 100_000_000 11 | 12 | # Number of iterations for all jobs 13 | num_iterations = 1000 14 | 15 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 16 | poisson_lambda = 24_000_000_000.0 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # global seed 24 | seed = 1 25 | 26 | # Output path for the simulation results 27 | directory = "/tmp/allreduce_result_for_paper/standard3_pertenant" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 0 36 | 37 | [[batch]] 38 | policy = "RAT" 39 | probe = { enable = true, round_ms = 100 } 40 | nethint_level = 2 41 | auto_tune = 1000 42 | 43 | [[batch]] 44 | policy = "RAT" 45 | probe = { enable = false } 46 | nethint_level = 2 47 | # Auto tune after some iterations.
default is disabled 48 | auto_tune = 10 49 | 50 | [simulator] 51 | nethint = true 52 | sample_interval_ns = 100_000_000 # 100ms 53 | loopback_speed = 400 54 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 55 | fairness = "TenantFlowMaxMin" 56 | # fairness = "PerFlowMaxMin" 57 | # fairness = "PerVmPairMaxMin" 58 | 59 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 60 | # background_flow_hard = { enable = false } 61 | # nethint_delay_ms = 100 62 | 63 | [brain] 64 | # Random seed for multiple uses 65 | seed = 1 66 | # Whether the cluster's bandwidth is asymmetric 67 | asymmetric = false 68 | # The percentage of nodes marked broken 69 | broken = 0.0 70 | # The slots of each physical machine 71 | max_slots = 1 72 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 73 | sharing_mode = "Guaranteed" 74 | # in Gbps 75 | guaranteed_bandwidth = 25 76 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 77 | # background_flow_high_freq = { enable = false } 78 | gc_period = 100 79 | 80 | # The topology for simulation 81 | [brain.topology] 82 | type = "Arbitrary" # another possible value is "FatTree" 83 | 84 | # [brain.topology.args] # When type = "FatTree" 85 | # nports = 20 # the number of ports of a switch 86 | # bandwidth = 100 # in Gbps 87 | # oversub_ratio = 4.0 # oversubscription ratio 88 | 89 | [brain.topology.args] # When type = "Arbitrary" 90 | nracks = 300 # the number of racks 91 | rack_size = 18 # the number of hosts under a rack 92 | host_bw = 100 # bandwidth of a host, in Gbps 93 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 94 | 95 | # [envs] 96 | # KEY = "value" 97 | -------------------------------------------------------------------------------- /src/allreduce/testbed.toml: 
-------------------------------------------------------------------------------- 1 | # Specification of an Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 20 5 | 6 | allow_delay = true 7 | 8 | # Job size distributions [(percentage, number of workers)] 9 | job_size_distribution = [[40, 2], [80, 4], [90, 6]] 10 | 11 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 12 | buffer_size = 100_000_000 13 | 14 | # Number of iterations for all jobs 15 | num_iterations = 10 16 | 17 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 18 | poisson_lambda = 4_000_000_000.0 19 | 20 | placement_strategy = { type = "Compact" } 21 | 22 | # global seed 23 | seed = 1 24 | 25 | # Output path for the simulation results 26 | directory = "/tmp/allreduce_result" 27 | 28 | # Number of repeats for each batch of experiments 29 | batch_repeat = 1 30 | 31 | [[batch]] 32 | policy = "Random" 33 | probe = { enable = false } 34 | nethint_level = 0 35 | 36 | [[batch]] 37 | policy = "TopologyAware" # which is ring 38 | probe = { enable = true, round_ms = 10 } 39 | nethint_level = 1 40 | 41 | [[batch]] 42 | policy = "TopologyAware" # which is ring 43 | probe = { enable = false } 44 | nethint_level = 1 45 | 46 | [[batch]] 47 | policy = "RAT" 48 | probe = { enable = false } 49 | nethint_level = 2 50 | # Auto tune after some iterations.
default is disabled 51 | auto_tune = 10 52 | 53 | [simulator] 54 | nethint = true 55 | sample_interval_ns = 100_000_000 # 100ms 56 | loopback_speed = 400 57 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 58 | # fairness = "TenantFlowMaxMin" 59 | fairness = "PerFlowMaxMin" 60 | 61 | background_flow_hard = { enable = true, frequency_ns = 1_000_000_000, probability = 0.1, amplitude = 1 } 62 | # background_flow_hard = { enable = false } 63 | # nethint_delay_ms = 100 64 | 65 | # These fields below are ignored 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 2 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 100 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum4_base.toml: 
-------------------------------------------------------------------------------- 1 | # background=0.64s, poisson=2.4s, 128 iterations, 0.032s each iter, 2 | # alpha from 0.05 to 6.4 (0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4) 3 | # corresponding auto_tune: 1, 2, 4, 8, 16, (20, alpha=1), 32, 64, 128 4 | 5 | # Number of jobs 6 | ncases = 30 7 | 8 | # Job size distributions [(percentage, number of workers)] 9 | job_size_distribution = [[80, 16], [80, 32]] 10 | 11 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 12 | buffer_size = 100_000_000 13 | 14 | # Number of iterations for all jobs 15 | num_iterations = 128 16 | 17 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 3.2*1.28s 18 | poisson_lambda = 24_00_000_000.0 19 | 20 | placement_strategy = { type = "Compact" } 21 | # placement_strategy = { type = "CompactLoadBalanced" } 22 | # placement_strategy = { type = "Spread" } 23 | # placement_strategy = { type = "Random", args = 0 } 24 | 25 | # global seed 26 | seed = 1 27 | 28 | # Output path of for the simulation results 29 | directory = "/tmp/spectrum4/spectrum4_1" 30 | 31 | # Number of repeats for each batch of experiments 32 | batch_repeat = 5 33 | 34 | [[batch]] 35 | policy = "Random" 36 | probe = { enable = false } 37 | nethint_level = 2 38 | 39 | [[batch]] 40 | policy = "TopologyAware" # which is ring 41 | probe = { enable = false } 42 | nethint_level = 1 43 | 44 | [[batch]] 45 | policy = "RAT" 46 | probe = { enable = false } 47 | nethint_level = 2 48 | # Auto tune after some iterations. 
default is disabled 49 | auto_tune = 10 50 | 51 | [simulator] 52 | nethint = true 53 | sample_interval_ns = 100_000_000 # 100ms 54 | loopback_speed = 400 55 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 56 | # fairness = "TenantFlowMaxMin" 57 | fairness = "PerFlowMaxMin" 58 | 59 | background_flow_hard = { enable = true, frequency_ns = 640_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 60 | # background_flow_hard = { enable = false } 61 | # nethint_delay_ms = 100 62 | 63 | [brain] 64 | # Random seed for multiple uses 65 | seed = 1 66 | # Whether the cluster's bandwidth is asymmetric 67 | asymmetric = false 68 | # The percentage of nodes marked broken 69 | broken = 0.0 70 | # The slots of each physical machine 71 | max_slots = 1 72 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 73 | sharing_mode = "Guaranteed" 74 | guaranteed_bandwidth = 25 75 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 76 | # background_flow_high_freq = { enable = false } 77 | gc_period = 10 78 | 79 | # The topology for simulation 80 | [brain.topology] 81 | type = "Arbitrary" # another possible value is "FatTree" 82 | 83 | # [brain.topology.args] # When type = "FatTree" 84 | # nports = 20 # the number of ports of a switch 85 | # bandwidth = 100 # in Gbps 86 | # oversub_ratio = 4.0 # oversubscription ratio 87 | 88 | [brain.topology.args] # When type = "Arbitrary" 89 | nracks = 320 # the number of racks 90 | rack_size = 6 # the number of hosts under a rack 91 | host_bw = 100 # bandwidth of a host, in Gbps 92 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 93 | 94 | # [envs] 95 | # KEY = "value" 96 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum3_base.toml: -------------------------------------------------------------------------------- 1 | # 
background=0.4s, poisson=2.6s, 1280 iterations, 0.002s each iter, 2 | # alpha from 0.05 to 6.4 (0.05, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4) 3 | # corresponding auto_tune: 10, 20, 40, 80, 160, (200, alpha=1), 320, 640, 1280 4 | 5 | # Number of jobs 6 | ncases = 30 7 | 8 | # Job size distributions [(percentage, number of workers)] 9 | job_size_distribution = [[50, 8], [50, 16]] 10 | 11 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 12 | buffer_size = 12582912 13 | 14 | # Number of iterations for all jobs 15 | num_iterations = 1280 16 | 17 | # Lambda of the poisson arrival, 12582912 * 8 / 50G * 1280 = 2.56s 18 | poisson_lambda = 2_600_000_000.0 19 | 20 | partially_sync = false 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path of for the simulation results 31 | directory = "/tmp/spectrum3/spectrum3_1" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 2 40 | 41 | [[batch]] 42 | policy = "TopologyAware" # which is ring 43 | probe = { enable = false } 44 | nethint_level = 1 45 | 46 | [[batch]] 47 | policy = "RAT" 48 | probe = { enable = false } 49 | nethint_level = 2 50 | # Auto tune after some iterations. 
default is disabled 51 | auto_tune = 10 52 | 53 | [simulator] 54 | nethint = true 55 | sample_interval_ns = 100_000_000 # 100ms 56 | loopback_speed = 400 57 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 58 | # fairness = "TenantFlowMaxMin" 59 | fairness = "PerFlowMaxMin" 60 | 61 | background_flow_hard = { enable = true, frequency_ns = 400_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 62 | # background_flow_hard = { enable = false } 63 | # nethint_delay_ms = 100 64 | 65 | [brain] 66 | # Random seed for multiple uses 67 | seed = 1 68 | # Whether the cluster's bandwidth is asymmetric 69 | asymmetric = false 70 | # The percentage of nodes marked broken 71 | broken = 0.0 72 | # The slots of each physical machine 73 | max_slots = 1 74 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 75 | sharing_mode = "Guaranteed" 76 | guaranteed_bandwidth = 25 77 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 78 | # background_flow_high_freq = { enable = false } 79 | gc_period = 10 80 | 81 | # The topology for simulation 82 | [brain.topology] 83 | type = "Arbitrary" # another possible value is "FatTree" 84 | 85 | # [brain.topology.args] # When type = "FatTree" 86 | # nports = 20 # the number of ports of a switch 87 | # bandwidth = 100 # in Gbps 88 | # oversub_ratio = 4.0 # oversubscription ratio 89 | 90 | [brain.topology.args] # When type = "Arbitrary" 91 | nracks = 320 # the number of racks 92 | rack_size = 6 # the number of hosts under a rack 93 | host_bw = 100 # bandwidth of a host, in Gbps 94 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 95 | 96 | # [envs] 97 | # KEY = "value" 98 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum1_base.toml: -------------------------------------------------------------------------------- 1 | # 
Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 14 | poisson_lambda = 24_00_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/spectrum1/spectrum1_1" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | [[batch]] 36 | policy = "TopologyAware" # which is ring 37 | probe = { enable = false } 38 | nethint_level = 1 39 | 40 | [[batch]] 41 | policy = "RAT" 42 | probe = { enable = false } 43 | nethint_level = 2 44 | # Auto tune after some iterations. default is disabled 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations. 
default is disabled 52 | auto_tune = 10 53 | auto_fallback = true 54 | alpha = 1.0 55 | 56 | [simulator] 57 | nethint = true 58 | sample_interval_ns = 100_000_000 # 100ms 59 | loopback_speed = 400 60 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 61 | # fairness = "TenantFlowMaxMin" 62 | fairness = "PerFlowMaxMin" 63 | 64 | background_flow_hard = { enable = true, frequency_ns = 2_00_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 65 | # background_flow_hard = { enable = false } 66 | # nethint_delay_ms = 100 67 | 68 | [brain] 69 | # Random seed for multiple uses 70 | seed = 1 71 | # Whether the cluster's bandwidth is asymmetric 72 | asymmetric = false 73 | # The percentage of nodes marked broken 74 | broken = 0.0 75 | # The slots of each physical machine 76 | max_slots = 1 77 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 78 | sharing_mode = "Guaranteed" 79 | guaranteed_bandwidth = 25 80 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 81 | # background_flow_high_freq = { enable = false } 82 | gc_period = 10 83 | 84 | # The topology for simulation 85 | [brain.topology] 86 | type = "Arbitrary" # another possible value is "FatTree" 87 | 88 | # [brain.topology.args] # When type = "FatTree" 89 | # nports = 20 # the number of ports of a switch 90 | # bandwidth = 100 # in Gbps 91 | # oversub_ratio = 4.0 # oversubscription ratio 92 | 93 | [brain.topology.args] # When type = "Arbitrary" 94 | nracks = 320 # the number of racks 95 | rack_size = 6 # the number of hosts under a rack 96 | host_bw = 100 # bandwidth of a host, in Gbps 97 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 98 | 99 | # [envs] 100 | # KEY = "value" 101 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum2_base.toml: 
-------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[80, 16], [80, 32]] 6 | 7 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 8 | buffer_size = 100_000_000 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 100 12 | 13 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 14 | poisson_lambda = 24_00_000_000.0 15 | 16 | placement_strategy = { type = "Compact" } 17 | # placement_strategy = { type = "CompactLoadBalanced" } 18 | # placement_strategy = { type = "Spread" } 19 | # placement_strategy = { type = "Random", args = 0 } 20 | 21 | # global seed 22 | seed = 1 23 | 24 | # Output path of for the simulation results 25 | directory = "/tmp/spectrum2/spectrum2_1" 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | policy = "Random" 32 | probe = { enable = false } 33 | nethint_level = 2 34 | 35 | [[batch]] 36 | policy = "TopologyAware" # which is ring 37 | probe = { enable = false } 38 | nethint_level = 1 39 | 40 | [[batch]] 41 | policy = "RAT" 42 | probe = { enable = false } 43 | nethint_level = 2 44 | # Auto tune after some iterations. default is disabled 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations. 
default is disabled 52 | auto_tune = 10 53 | auto_fallback = true 54 | alpha = 1.0 55 | 56 | [simulator] 57 | nethint = true 58 | sample_interval_ns = 100_000_000 # 100ms 59 | loopback_speed = 400 60 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 61 | # fairness = "TenantFlowMaxMin" 62 | fairness = "PerFlowMaxMin" 63 | 64 | background_flow_hard = { enable = true, frequency_ns = 2_00_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 65 | # background_flow_hard = { enable = false } 66 | # nethint_delay_ms = 100 67 | 68 | [brain] 69 | # Random seed for multiple uses 70 | seed = 1 71 | # Whether the cluster's bandwidth is asymmetric 72 | asymmetric = false 73 | # The percentage of nodes marked broken 74 | broken = 0.0 75 | # The slots of each physical machine 76 | max_slots = 1 77 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 78 | sharing_mode = "Guaranteed" 79 | guaranteed_bandwidth = 25 80 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 81 | # background_flow_high_freq = { enable = false } 82 | gc_period = 10 83 | 84 | # The topology for simulation 85 | [brain.topology] 86 | type = "Arbitrary" # another possible value is "FatTree" 87 | 88 | # [brain.topology.args] # When type = "FatTree" 89 | # nports = 20 # the number of ports of a switch 90 | # bandwidth = 100 # in Gbps 91 | # oversub_ratio = 4.0 # oversubscription ratio 92 | 93 | [brain.topology.args] # When type = "Arbitrary" 94 | nracks = 320 # the number of racks 95 | rack_size = 6 # the number of hosts under a rack 96 | host_bw = 100 # bandwidth of a host, in Gbps 97 | rack_bw = 400 # bandwidth of a ToR switch, in Gbps 98 | 99 | # [envs] 100 | # KEY = "value" 101 | -------------------------------------------------------------------------------- /evaluation/herd_behavior/allreduce_herd_base.toml: 
-------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (60s, 0.5, 5), max_slots = 1 2 | # Specifiation of a Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 40 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | # job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | job_size_distribution = [[40, 12]] 13 | 14 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 15 | buffer_size = 100_000_000 16 | 17 | # Number of iterations for all jobs 18 | num_iterations = 30 19 | 20 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 21 | poisson_lambda = 1_000.0 22 | 23 | # placement_strategy = { type = "Compact" } 24 | # placement_strategy = { type = "CompactLoadBalanced" } 25 | # placement_strategy = { type = "Spread" } 26 | placement_strategy = { type = "Random", args = 0 } 27 | 28 | # global seed 29 | seed = 1 30 | 31 | # Output path of for the simulation results 32 | directory = "/tmp/herd_behavior/allreduce_herd_base" 33 | 34 | # Number of repeats for each batch of experiments 35 | batch_repeat = 5 36 | 37 | [[batch]] 38 | policy = "Random" 39 | probe = { enable = false } 40 | nethint_level = 0 41 | 42 | [[batch]] 43 | policy = "RAT" 44 | probe = { enable = false } 45 | nethint_level = 2 46 | # Auto tune after some iterations. 
default is disabled 47 | auto_tune = 10 48 | 49 | [simulator] 50 | nethint = true 51 | sample_interval_ns = 100_000_000 # 100ms 52 | loopback_speed = 400 53 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 54 | # fairness = "TenantFlowMaxMin" 55 | fairness = "PerFlowMaxMin" 56 | 57 | # background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 58 | background_flow_hard = { enable = false } 59 | # nethint_delay_ms = 100 60 | 61 | [brain] 62 | # Random seed for multiple uses 63 | seed = 1 64 | # Whether the cluster's bandwidth is asymmetric 65 | asymmetric = false 66 | # The percentage of nodes marked broken 67 | broken = 0.0 68 | # The slots of each physical machine 69 | max_slots = 1 70 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 71 | sharing_mode = "Guaranteed" 72 | # in Gbps 73 | guaranteed_bandwidth = 25 74 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 75 | background_flow_high_freq = { enable = false } 76 | gc_period = 100 77 | 78 | # The topology for simulation 79 | [brain.topology] 80 | type = "Arbitrary" # another possible value is "FatTree" 81 | 82 | # [brain.topology.args] # When type = "FatTree" 83 | # nports = 20 # the number of ports of a switch 84 | # bandwidth = 100 # in Gbps 85 | # oversub_ratio = 4.0 # oversubscription ratio 86 | 87 | [brain.topology.args] # When type = "Arbitrary" 88 | nracks = 9 # the number of racks 89 | rack_size = 40 # the number of hosts under a rack 90 | host_bw = 100 # bandwidth of a host, in Gbps 91 | rack_bw = 1333 # bandwidth of a ToR switch, in Gbps 92 | 93 | # [envs] 94 | # KEY = "value" 95 | -------------------------------------------------------------------------------- /src/rl/testbed.toml: -------------------------------------------------------------------------------- 1 | # Specifiation of a 
Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 30 5 | 6 | allow_delay = true 7 | 8 | # Job size distributions [(percentage, number of workers)] 9 | job_size_distribution = [[40, 2], [80, 4], [90, 6]] 10 | # job_size_distribution = [[40, 6]] 11 | 12 | # Buffer size of all jobs, in bytes 13 | buffer_size = 100_000_000 14 | 15 | # Number of iterations for all jobs 16 | num_iterations = 10 17 | 18 | # Lambda of the poisson arrival, 2*100MB/25Gbps*120 = 7.68s 19 | # poisson_lambda = 8_000_000_000.0 20 | poisson_lambda = 10_000_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path of for the simulation results 31 | directory = "/tmp/rl_result" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 1 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 2 40 | 41 | [[batch]] 42 | policy = "TopologyAware" # which is ring 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 1 45 | 46 | [[batch]] 47 | policy = "TopologyAware" # which is ring 48 | probe = { enable = false } 49 | nethint_level = 1 50 | 51 | [[batch]] 52 | policy = "RAT" 53 | probe = { enable = false } 54 | nethint_level = 2 55 | # Auto tune after some iterations. 
default is disabled 56 | auto_tune = 10 57 | 58 | [simulator] 59 | nethint = true 60 | sample_interval_ns = 100_000_000 # 100ms 61 | loopback_speed = 400 62 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 63 | # fairness = "TenantFlowMaxMin" 64 | fairness = "PerFlowMaxMin" 65 | 66 | background_flow_hard = { enable = true, frequency_ns = 1000_000_000_000, probability = 0.5, amplitude = 5, average_load = 0.1 } 67 | # background_flow_hard = { enable = false } 68 | # nethint_delay_ms = 100 69 | 70 | [brain] 71 | # Random seed for multiple uses 72 | seed = 1 73 | # Whether the cluster's bandwidth is asymmetric 74 | asymmetric = false 75 | # The percentage of nodes marked broken 76 | broken = 0.0 77 | # The slots of each physical machine 78 | max_slots = 1 79 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 80 | sharing_mode = "Guaranteed" 81 | guaranteed_bandwidth = 2.5 82 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 83 | # background_flow_high_freq = { enable = false } 84 | gc_period = 0 85 | 86 | # The topology for simulation 87 | [brain.topology] 88 | type = "Arbitrary" # another possible value is "FatTree" 89 | 90 | # [brain.topology.args] # When type = "FatTree" 91 | # nports = 20 # the number of ports of a switch 92 | # bandwidth = 100 # in Gbps 93 | # oversub_ratio = 4.0 # oversubscription ratio 94 | 95 | [brain.topology.args] # When type = "Arbitrary" 96 | nracks = 2 # the number of racks 97 | rack_size = 3 # the number of hosts under a rack 98 | host_bw = 10 # bandwidth of a host, in Gbps 99 | rack_bw = 10 # bandwidth of a ToR switch, in Gbps 100 | 101 | # [envs] 102 | # KEY = "value" 103 | -------------------------------------------------------------------------------- /src/mapreduce/src/argument.rs: -------------------------------------------------------------------------------- 1 | use 
nethint::architecture::TopoArgs; 2 | use structopt::StructOpt; 3 | 4 | use crate::{JobSpec, ShufflePattern}; 5 | 6 | #[derive(Debug, Clone, StructOpt)] 7 | #[structopt(name = "MapReduce", about = "MapReduce Application")] 8 | pub struct Opt { 9 | /// Specify the topology for simulation 10 | #[structopt(subcommand)] 11 | pub topo: TopoArgs, 12 | 13 | /// Asymmetric bandwidth 14 | #[structopt(short = "a", long = "asymmetric")] 15 | pub asym: bool, 16 | 17 | /// Probability distribution of shuffle flows, examples: uniform_1000000, zipf_1000000_0.5 18 | #[structopt( 19 | short = "s", 20 | long = "shuffle-pattern", 21 | name = "distribution", 22 | default_value = "uniform_1000000" 23 | )] 24 | pub shuffle: ShufflePattern, 25 | 26 | /// Number of map tasks. When using trace, this parameter means map scale factor 27 | #[structopt(short = "m", long = "map", default_value = "4")] 28 | pub num_map: usize, 29 | 30 | /// Number of reduce tasks. When using trace, this parameter means reduce scale factor 31 | #[structopt(short = "r", long = "reduce", default_value = "4")] 32 | pub num_reduce: usize, 33 | 34 | /// Number of testcases 35 | #[structopt(short = "n", long = "ncases", default_value = "10")] 36 | pub ncases: usize, 37 | 38 | /// Traffic scale, multiply the traffic size by a number to allow job overlaps 39 | #[structopt(short = "t", long = "traffic-scale", default_value = "1.0")] 40 | pub traffic_scale: f64, 41 | 42 | /// Run experiments from trace file 43 | #[structopt(short = "f", long = "file")] 44 | pub trace: Option, 45 | 46 | /// Output path of the figure 47 | #[structopt(short = "d", long = "directory")] 48 | pub directory: Option, 49 | 50 | /// Run simulation experiments in parallel, default using the hardware concurrency 51 | #[structopt(short = "P", long = "parallel", name = "nthreads")] 52 | pub parallel: Option, 53 | 54 | /// Normalize, draw speed up instead of absolution job completion time 55 | #[structopt(short = "N", long = "normalize")] 56 | pub 
normalize: bool, 57 | 58 | /// Inspect the trace file, see the overlap among multiple jobs 59 | #[structopt(long = "inspect")] 60 | pub inspect: bool, 61 | 62 | /// Multi-tenant 63 | #[structopt(long = "multitenant")] 64 | pub multitenant: bool, 65 | 66 | /// Nethint level. 67 | #[structopt(short = "l", long = "nethint_level", default_value = "1")] 68 | pub nethint_level: usize, 69 | 70 | /// Collocate or De-collocate 71 | #[structopt(short = "c", long = "collocate")] 72 | pub collocate: bool, 73 | 74 | /// Mark some nodes as Broken to be more realistic 75 | #[structopt(short = "b", long = "broken")] 76 | pub broken: bool, 77 | } 78 | 79 | impl Opt { 80 | pub fn to_filename(&self, prefix: &str) -> String { 81 | if let Some(_f) = self.trace.as_ref() { 82 | format!( 83 | "{}_{}_from_trace_m{}_r{}.pdf", 84 | prefix, self.topo, self.num_map, self.num_reduce 85 | ) 86 | } else { 87 | let job_spec = JobSpec::new(self.num_map, self.num_reduce, self.shuffle.clone()); 88 | format!("{}_{}_{}.pdf", prefix, self.topo, job_spec) 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum5_base.toml: -------------------------------------------------------------------------------- 1 | # Number of jobs 2 | ncases = 30 3 | 4 | # Job size distributions [(percentage, number of workers)] 5 | job_size_distribution = [[50, 8], [50, 16]] 6 | 7 | # Images in a ML serving batch request, batch size = 64, each image has 3 channels, crop to 256x256 8 | buffer_size = 12582912 9 | 10 | # Number of iterations for all jobs 11 | num_iterations = 1000 12 | 13 | # Lambda of the poisson arrival, 12582912 * 8 / 50G * 4000 = 2s 14 | poisson_lambda = 2_400_000_000.0 15 | 16 | partially_sync = false 17 | 18 | placement_strategy = { type = "Compact" } 19 | # placement_strategy = { type = "CompactLoadBalanced" } 20 | # placement_strategy = { type = "Spread" } 21 | # placement_strategy = { type = "Random", args = 0 } 22 | 23 | # 
global seed 24 | seed = 1 25 | 26 | # Output path of for the simulation results 27 | directory = "/tmp/spectrum5/spectrum5_1" 28 | 29 | # Number of repeats for each batch of experiments 30 | batch_repeat = 5 31 | 32 | [[batch]] 33 | policy = "Random" 34 | probe = { enable = false } 35 | nethint_level = 2 36 | 37 | [[batch]] 38 | policy = "TopologyAware" # which is ring 39 | probe = { enable = false } 40 | nethint_level = 1 41 | 42 | [[batch]] 43 | policy = "RAT" 44 | probe = { enable = false } 45 | nethint_level = 2 46 | # Auto tune after some iterations. default is disabled 47 | auto_tune = 10 48 | 49 | [[batch]] 50 | policy = "RAT" 51 | probe = { enable = false } 52 | nethint_level = 2 53 | # Auto tune after some iterations. default is disabled 54 | auto_tune = 10 55 | auto_fallback = true 56 | alpha = 1.0 57 | 58 | [simulator] 59 | nethint = true 60 | sample_interval_ns = 100_000_000 # 100ms 61 | loopback_speed = 400 62 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 63 | # fairness = "TenantFlowMaxMin" 64 | fairness = "PerFlowMaxMin" 65 | 66 | background_flow_hard = { enable = true, frequency_ns = 2_00_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 67 | # background_flow_hard = { enable = false } 68 | # nethint_delay_ms = 100 69 | 70 | [brain] 71 | # Random seed for multiple uses 72 | seed = 1 73 | # Whether the cluster's bandwidth is asymmetric 74 | asymmetric = false 75 | # The percentage of nodes marked broken 76 | broken = 0.0 77 | # The slots of each physical machine 78 | max_slots = 1 79 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 80 | sharing_mode = "Guaranteed" 81 | guaranteed_bandwidth = 25 82 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 83 | # background_flow_high_freq = { enable = false } 84 | gc_period = 10 85 | 86 | # The topology for simulation 87 | 
[brain.topology] 88 | type = "Arbitrary" # another possible value is "FatTree" 89 | 90 | # [brain.topology.args] # When type = "FatTree" 91 | # nports = 20 # the number of ports of a switch 92 | # bandwidth = 100 # in Gbps 93 | # oversub_ratio = 4.0 # oversubscription ratio 94 | 95 | [brain.topology.args] # When type = "Arbitrary" 96 | nracks = 320 # the number of racks 97 | rack_size = 6 # the number of hosts under a rack 98 | host_bw = 100 # bandwidth of a host, in Gbps 99 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 100 | 101 | # [envs] 102 | # KEY = "value" 103 | -------------------------------------------------------------------------------- /evaluation/spectrum/spectrum6_base.toml: -------------------------------------------------------------------------------- 1 | # Run the experiments from trace file 2 | trace = "../../src/mapreduce/FB2010-1Hr-150-0.txt" 3 | 4 | # Number of testcases to run 5 | ncases = 500 6 | 7 | # Number of map tasks and reduce tasks 8 | # When running from trace, these parameters become scale factors 9 | num_map = 1 10 | num_reduce = 1 11 | enable_computation_time = false 12 | 13 | # Multiply the traffic size by a number 14 | traffic_scale = 10.0 15 | 16 | # Mapper placement policy 17 | mapper_policy = { type = "Greedy" } 18 | 19 | placement_strategy = { type = "Compact" } 20 | 21 | # Output path of for the simulation results 22 | directory = "/tmp/spectrum6/spectrum6_1" 23 | 24 | # Whether to allow a mapper to collocate with a reduce 25 | collocate = true 26 | 27 | # Number of repeats for each batch of experiments 28 | batch_repeat = 5 29 | 30 | [[batch]] 31 | reducer_policy = "Random" 32 | probe = { enable = false } 33 | # NetHint level, possible values are 0, 1, 2 34 | nethint_level = 1 35 | 36 | [[batch]] 37 | reducer_policy = "HierarchicalGreedyLevel1" 38 | probe = { enable = false } 39 | nethint_level = 1 40 | 41 | [[batch]] 42 | reducer_policy = "HierarchicalGreedyPaper" # please use this 43 | # reducer_policy = 
"HierarchicalGreedy" 44 | probe = { enable = false } 45 | nethint_level = 2 46 | 47 | [[batch]] 48 | reducer_policy = "HierarchicalGreedyPaper" # please use this 49 | # reducer_policy = "HierarchicalGreedy" 50 | probe = { enable = false } 51 | nethint_level = 2 52 | auto_fallback = true 53 | alpha = 1.0 54 | 55 | [simulator] 56 | nethint = true 57 | sample_interval_ns = 100_000_000 # 100ms 58 | loopback_speed = 400 59 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 60 | # fairness = "TenantFlowMaxMin" 61 | fairness = "PerFlowMaxMin" 62 | 63 | background_flow_hard = { enable = true, frequency_ns = 2_00_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 64 | # background_flow_hard = { enable = false } 65 | # nethint_delay_ms = 100 66 | 67 | [brain] 68 | # Random seed for multiple uses 69 | seed = 1 70 | # Whether the cluster's bandwidth is asymmetric 71 | asymmetric = false 72 | # The percentage of nodes marked broken 73 | broken = 0.1 74 | # The slots of each physical machine 75 | max_slots = 4 76 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 77 | sharing_mode = "Guaranteed" 78 | guaranteed_bandwidth = 25 79 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | # background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 150 # the number of racks 94 | rack_size = 18 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 600 # bandwidth of a ToR 
switch, in Gbps 97 | 98 | [envs] 99 | NETHINT_SHOW_DECISION = "/tmp/spectrum6/spectrum6_1/decision.txt" 100 | -------------------------------------------------------------------------------- /src/nhagent/src/timing.rs: -------------------------------------------------------------------------------- 1 | use serde::{Serialize, Deserialize}; 2 | use std::time::SystemTime; 3 | 4 | pub const ON_COLLECTED: &str = "OnCollected"; 5 | pub const ON_SAMPLED: &str = "OnSampled"; 6 | pub const ON_CHUNK_SENT: &str = "OnChunkSent"; 7 | pub const ON_ALL_RECEIVED: &str = "OnAllReceived"; 8 | pub const ON_TENANT_SENT_REQ: &str = "OnTenantSentRequest"; 9 | pub const ON_RECV_TENANT_REQ: &str = "OnRecvTenantRequest"; 10 | pub const ON_TENANT_RECV_RES: &str = "OnTenantRecvResponse"; 11 | 12 | #[derive(Debug, Clone, Serialize, Deserialize)] 13 | pub struct TimeRecord { 14 | pub stage: String, 15 | pub ts: SystemTime, 16 | } 17 | 18 | impl TimeRecord { 19 | pub fn new(stage: &str) -> Self { 20 | TimeRecord { 21 | stage: stage.to_owned(), 22 | ts: SystemTime::now(), 23 | } 24 | } 25 | 26 | pub fn with_ts(stage: &str, ts: SystemTime) -> Self { 27 | TimeRecord { 28 | stage: stage.to_owned(), 29 | ts, 30 | } 31 | } 32 | } 33 | 34 | #[derive(Debug, Clone, Default, Serialize, Deserialize)] 35 | pub struct TimeList { 36 | recs: Vec, 37 | } 38 | 39 | impl TimeList { 40 | pub fn new() -> Self { 41 | Default::default() 42 | } 43 | 44 | pub fn clear(&mut self) { 45 | self.recs.clear() 46 | } 47 | 48 | pub fn get(&self, stage: &str) -> Option { 49 | self.recs.iter().find(|x| x.stage == stage).cloned() 50 | } 51 | 52 | pub fn push(&mut self, stage: &str, ts: SystemTime) { 53 | self.recs.push(TimeRecord::with_ts(stage, ts)) 54 | } 55 | 56 | pub fn push_now(&mut self, stage: &str) { 57 | self.recs.push(TimeRecord::new(stage)); 58 | } 59 | 60 | pub fn update(&mut self, stage: &str, ts: SystemTime) { 61 | let e = self.recs.iter_mut().rfind(|x| x.stage == stage); 62 | if let Some(x) = e { 63 | x.ts = 
ts.max(x.ts); 64 | } else { 65 | self.recs.push(TimeRecord::with_ts(stage, ts)); 66 | } 67 | } 68 | 69 | pub fn update_now(&mut self, stage: &str) { 70 | self.update(stage, SystemTime::now()); 71 | } 72 | 73 | /// sync the latest corresponding element in `self` with `other`, 74 | /// if not exists, append that element to `self`. 75 | pub fn update_time_list(&mut self, other: &TimeList) { 76 | other.recs.iter().for_each(|o| self.update(&o.stage, o.ts)); 77 | } 78 | 79 | pub fn update_min(&mut self, stage: &str, other: &TimeList) { 80 | if let Some(o) = other.get(stage) { 81 | if let Some(e) = self.recs.iter_mut().rfind(|x| x.stage == stage) { 82 | e.ts = o.ts.min(e.ts); 83 | } else { 84 | self.recs.push(TimeRecord::with_ts(&o.stage, o.ts)); 85 | } 86 | } 87 | } 88 | } 89 | 90 | impl std::fmt::Display for TimeList { 91 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 92 | let mut rs = self.recs.clone(); 93 | rs.sort_by_key(|x| x.ts); 94 | if !rs.is_empty() { 95 | let eariest = rs[0].ts; 96 | for r in rs { 97 | writeln!(f, "{} {}", r.stage, r.ts.duration_since(eariest).unwrap().as_micros())?; 98 | } 99 | } 100 | writeln!(f, "\n") 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /evaluation/model_serving_configs/standard2.toml: -------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (60s, 0.5, 5), max_slots = 1 2 | # Specifiation of a Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[50, 8], [50, 16]] 9 | 10 | # Images in a ML serving batch request, batch size = 64, each image has 3 channels, crop to 256x256 11 | buffer_size = 12582912 12 | 13 | # Number of iterations for all jobs 14 | num_iterations = 4000 15 | 16 | # Lambda of the poisson arrival, 12582912 * 8 / 50G * 4000 = 8s 17 | poisson_lambda 
= 8_000_000_000.0 18 | 19 | partially_sync = false 20 | 21 | placement_strategy = { type = "Compact" } 22 | # placement_strategy = { type = "CompactLoadBalanced" } 23 | # placement_strategy = { type = "Spread" } 24 | # placement_strategy = { type = "Random", args = 0 } 25 | 26 | # global seed 27 | seed = 1 28 | 29 | # Output path of for the simulation results 30 | directory = "/tmp/model_serving_result_for_paper/standard2" 31 | 32 | # Number of repeats for each batch of experiments 33 | batch_repeat = 1 34 | 35 | [[batch]] 36 | policy = "Random" 37 | probe = { enable = false } 38 | nethint_level = 0 39 | 40 | # [[batch]] 41 | # policy = "RAT" 42 | # probe = { enable = true, round_ms = 100 } 43 | # nethint_level = 2 44 | # auto_tune = 1 45 | 46 | [[batch]] 47 | policy = "RAT" 48 | probe = { enable = true, round_ms = 100 } 49 | nethint_level = 2 50 | auto_tune = 4000 51 | 52 | [[batch]] 53 | policy = "RAT" 54 | probe = { enable = false } 55 | nethint_level = 2 56 | # Auto tune after some iterations. 
default is disabled 57 | auto_tune = 10 58 | 59 | [simulator] 60 | nethint = true 61 | sample_interval_ns = 100_000_000 # 100ms 62 | loopback_speed = 400 63 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 64 | # fairness = "TenantFlowMaxMin" 65 | fairness = "PerFlowMaxMin" 66 | 67 | background_flow_hard = { enable = true, frequency_ns = 10_000_000_000, probability = 1.0, amplitude = 5 } 68 | # background_flow_hard = { enable = false } 69 | # nethint_delay_ms = 100 70 | 71 | [brain] 72 | # Random seed for multiple uses 73 | seed = 1 74 | # Whether the cluster's bandwidth is asymmetric 75 | asymmetric = false 76 | # The percentage of nodes marked broken 77 | broken = 0.0 78 | # The slots of each physical machine 79 | max_slots = 1 80 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 81 | sharing_mode = "Guaranteed" 82 | # in Gbps 83 | guaranteed_bandwidth = 25 84 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 85 | # background_flow_high_freq = { enable = false } 86 | gc_period = 100 87 | 88 | # The topology for simulation 89 | [brain.topology] 90 | type = "Arbitrary" # another possible value is "FatTree" 91 | 92 | # [brain.topology.args] # When type = "FatTree" 93 | # nports = 20 # the number of ports of a switch 94 | # bandwidth = 100 # in Gbps 95 | # oversub_ratio = 4.0 # oversubscription ratio 96 | 97 | [brain.topology.args] # When type = "Arbitrary" 98 | nracks = 300 # the number of racks 99 | rack_size = 18 # the number of hosts under a rack 100 | host_bw = 100 # bandwidth of a host, in Gbps 101 | rack_bw = 600 # bandwidth of a ToR switch, in Gbps 102 | 103 | # [envs] 104 | # KEY = "value" 105 | -------------------------------------------------------------------------------- /src/mapreduce/src/inspect.rs: -------------------------------------------------------------------------------- 1 | use 
crate::{ 2 | app::run_map_reduce, argument::Opt, trace::JobTrace, JobSpec, ReducerPlacementPolicy, 3 | ShufflePattern, 4 | }; 5 | use async_std::task; 6 | use futures::stream::StreamExt; 7 | use log::{debug, info}; 8 | use nethint::{cluster::Cluster, Duration, Timestamp, ToStdDuration}; 9 | use std::sync::Arc; 10 | 11 | #[derive(Debug, Clone, Copy)] 12 | pub struct JobLifetime { 13 | // start time (ns) of the job, grabbed from trace 14 | pub start: Timestamp, 15 | // duration of the job, simulated 16 | pub dura: Duration, 17 | } 18 | 19 | pub fn run_experiments(opt: &Opt, cluster: Arc) -> Option> { 20 | assert!(opt.trace.is_some(), "need to specify a trace file"); 21 | 22 | let num_cpus = opt.parallel.unwrap_or_else(num_cpus::get); 23 | 24 | let job_trace = opt.trace.as_ref().map(|p| { 25 | JobTrace::from_path(p) 26 | .unwrap_or_else(|e| panic!("failed to load from file: {:?}, error: {}", p, e)) 27 | }); 28 | 29 | assert!(job_trace.is_some()); 30 | 31 | task::block_on(async { 32 | let experiments = futures::stream::iter({ 33 | let ncases = std::cmp::min( 34 | opt.ncases, 35 | job_trace.as_ref().map(|v| v.count).unwrap_or(usize::MAX), 36 | ); 37 | (0..ncases).map(|i| { 38 | let id = i; 39 | let cluster = Arc::clone(&cluster); 40 | 41 | let (start_ts, job_spec) = job_trace 42 | .as_ref() 43 | .map(|job_trace| { 44 | let mut record = job_trace.records[id].clone(); 45 | // mutiple traffic by a number 46 | record.reducers = record 47 | .reducers 48 | .into_iter() 49 | .map(|(a, b)| (a, b * opt.traffic_scale)) 50 | .collect(); 51 | debug!("record: {:?}", record); 52 | let ts = record.ts; 53 | let job_spec = JobSpec::new( 54 | record.num_map * opt.num_map, 55 | record.num_reduce * opt.num_reduce, 56 | ShufflePattern::FromTrace(Box::new(record)), 57 | ); 58 | (ts, job_spec) 59 | }) 60 | .unwrap(); 61 | 62 | let policy = ReducerPlacementPolicy::HierarchicalGreedy; 63 | 64 | task::spawn(async move { 65 | info!("testcase: {}", id); 66 | let jct = run_map_reduce(&cluster, 
&job_spec, policy, id as _); 67 | // let time = output.recs.into_iter().map(|r| r.dura.unwrap()).max(); 68 | info!( 69 | "{:?}, job_finish_time: {:?}", 70 | policy, 71 | jct.unwrap().to_dura() 72 | ); 73 | Some(( 74 | i, 75 | JobLifetime { 76 | start: start_ts * 1_000_000, 77 | dura: jct.unwrap(), 78 | }, 79 | )) 80 | }) 81 | }) 82 | }) 83 | .buffer_unordered(num_cpus) 84 | .collect::>>(); 85 | experiments.await.into_iter().collect() 86 | }) 87 | } 88 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/background_off.toml: -------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: off, max_slots = 1 2 | # Specifiation of a Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | 13 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 14 | buffer_size = 100_000_000 15 | 16 | # Number of iterations for all jobs 17 | num_iterations = 100 18 | 19 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 20 | poisson_lambda = 24_00_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path of for the simulation results 31 | directory = "/tmp/allreduce_result_for_paper/background_off" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | 
[[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 0 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 2 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations. default is disabled 52 | auto_tune = 10 53 | 54 | [simulator] 55 | nethint = true 56 | sample_interval_ns = 100_000_000 # 100ms 57 | loopback_speed = 400 58 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 59 | # fairness = "TenantFlowMaxMin" 60 | fairness = "PerFlowMaxMin" 61 | 62 | # background_flow_hard = { enable = true, frequency_ns = 1_000_000_000, probability = 0.1, amplitude = 1 } 63 | background_flow_hard = { enable = false } 64 | # nethint_delay_ms = 100 65 | 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 300 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw 
= 100 # bandwidth of a host, in Gbps 96 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/standard2.toml: -------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (60s, 0.5, 5), max_slots = 1 2 | # Specifiation of a Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | 13 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 14 | buffer_size = 100_000_000 15 | 16 | # Number of iterations for all jobs 17 | num_iterations = 100 18 | 19 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 20 | poisson_lambda = 24_00_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path of for the simulation results 31 | directory = "/tmp/allreduce_result_for_paper/standard2" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 0 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 2 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto 
tune after some iterations. default is disabled 52 | auto_tune = 10 53 | 54 | [simulator] 55 | nethint = true 56 | sample_interval_ns = 100_000_000 # 100ms 57 | loopback_speed = 400 58 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 59 | # fairness = "TenantFlowMaxMin" 60 | fairness = "PerFlowMaxMin" 61 | 62 | background_flow_hard = { enable = true, frequency_ns = 60_000_000_000, probability = 0.5, amplitude = 5 } 63 | # background_flow_hard = { enable = false } 64 | # nethint_delay_ms = 100 65 | 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | # background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 300 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/background_dynamic_strong.toml: 
-------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (1s, 0.9, 8), max_slots = 1 2 | # Specification of an Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | 13 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 14 | buffer_size = 100_000_000 15 | 16 | # Number of iterations for all jobs 17 | num_iterations = 100 18 | 19 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 20 | poisson_lambda = 24_00_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path for the simulation results 31 | directory = "/tmp/allreduce_result_for_paper/background_dynamic_strong" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 0 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 2 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations.
default is disabled 52 | auto_tune = 10 53 | 54 | [simulator] 55 | nethint = true 56 | sample_interval_ns = 100_000_000 # 100ms 57 | loopback_speed = 400 58 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 59 | # fairness = "TenantFlowMaxMin" 60 | fairness = "PerFlowMaxMin" 61 | 62 | background_flow_hard = { enable = true, frequency_ns = 1_000_000_000, probability = 0.9, amplitude = 8 } 63 | # background_flow_hard = { enable = false } 64 | # nethint_delay_ms = 100 65 | 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 300 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /evaluation/allreduce_configs/background_static_strong.toml: 
-------------------------------------------------------------------------------- 1 | # 3:1, jobsize = 4n, poisson = 24s, background: (1000s, 0.9, 8), max_slots = 1 2 | # Specification of an Allreduce application experiment 3 | 4 | # Number of jobs 5 | ncases = 100 6 | 7 | # Job size distributions [(percentage, number of workers)] 8 | job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 9 | # job_size_distribution = [[40, 2], [80, 4], [90, 8], [25, 16], [5, 32]] 10 | # job_size_distribution = [[40, 6], [80, 12], [90, 24], [25, 48], [5, 96]] 11 | # job_size_distribution = [[80, 12], [90, 24], [25, 48], [5, 96]] 12 | 13 | # Buffer size of all jobs, in bytes, similar to ResNet50 ~= 98MB 14 | buffer_size = 100_000_000 15 | 16 | # Number of iterations for all jobs 17 | num_iterations = 100 18 | 19 | # Lambda of the poisson arrival, 2*100MB/50Gbps = 0.032s each iteration, total in 32s 20 | poisson_lambda = 24_00_000_000.0 21 | 22 | placement_strategy = { type = "Compact" } 23 | # placement_strategy = { type = "CompactLoadBalanced" } 24 | # placement_strategy = { type = "Spread" } 25 | # placement_strategy = { type = "Random", args = 0 } 26 | 27 | # global seed 28 | seed = 1 29 | 30 | # Output path for the simulation results 31 | directory = "/tmp/allreduce_result_for_paper/background_static_strong" 32 | 33 | # Number of repeats for each batch of experiments 34 | batch_repeat = 5 35 | 36 | [[batch]] 37 | policy = "Random" 38 | probe = { enable = false } 39 | nethint_level = 0 40 | 41 | [[batch]] 42 | policy = "RAT" 43 | probe = { enable = true, round_ms = 10 } 44 | nethint_level = 2 45 | auto_tune = 10 46 | 47 | [[batch]] 48 | policy = "RAT" 49 | probe = { enable = false } 50 | nethint_level = 2 51 | # Auto tune after some iterations.
default is disabled 52 | auto_tune = 10 53 | 54 | [simulator] 55 | nethint = true 56 | sample_interval_ns = 100_000_000 # 100ms 57 | loopback_speed = 400 58 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 59 | # fairness = "TenantFlowMaxMin" 60 | fairness = "PerFlowMaxMin" 61 | 62 | background_flow_hard = { enable = true, frequency_ns = 1000_000_000_000, probability = 0.9, amplitude = 8 } 63 | # background_flow_hard = { enable = false } 64 | # nethint_delay_ms = 100 65 | 66 | [brain] 67 | # Random seed for multiple uses 68 | seed = 1 69 | # Whether the cluster's bandwidth is asymmetric 70 | asymmetric = false 71 | # The percentage of nodes marked broken 72 | broken = 0.0 73 | # The slots of each physical machine 74 | max_slots = 1 75 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 76 | sharing_mode = "Guaranteed" 77 | # in Gbps 78 | guaranteed_bandwidth = 25 79 | # background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 80 | background_flow_high_freq = { enable = false } 81 | gc_period = 100 82 | 83 | # The topology for simulation 84 | [brain.topology] 85 | type = "Arbitrary" # another possible value is "FatTree" 86 | 87 | # [brain.topology.args] # When type = "FatTree" 88 | # nports = 20 # the number of ports of a switch 89 | # bandwidth = 100 # in Gbps 90 | # oversub_ratio = 4.0 # oversubscription ratio 91 | 92 | [brain.topology.args] # When type = "Arbitrary" 93 | nracks = 300 # the number of racks 94 | rack_size = 6 # the number of hosts under a rack 95 | host_bw = 100 # bandwidth of a host, in Gbps 96 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 97 | 98 | # [envs] 99 | # KEY = "value" 100 | -------------------------------------------------------------------------------- /src/mapreduce/src/trace.rs: -------------------------------------------------------------------------------- 1 | use 
anyhow::Result; 2 | use nethint::Timestamp; 3 | use std::io::BufRead; 4 | 5 | pub struct JobTrace { 6 | pub nracks: usize, 7 | pub count: usize, 8 | pub records: Vec, 9 | } 10 | 11 | // example: 3 13122 2 66 138 1 38:4.0 12 | #[derive(Debug, Clone)] 13 | pub struct Record { 14 | pub id: usize, 15 | pub ts: Timestamp, 16 | pub num_map: usize, 17 | pub mappers: Vec, 18 | pub num_reduce: usize, 19 | pub reducers: Vec<(usize, f64)>, 20 | } 21 | 22 | #[derive(Debug, Clone, Copy)] 23 | pub struct ParseRecordError; 24 | 25 | impl std::fmt::Display for ParseRecordError { 26 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 27 | write!(f, "{:?}", self) 28 | } 29 | } 30 | 31 | macro_rules! parse_next { 32 | ($tokens:expr, $ret:ty) => { 33 | $tokens 34 | .next() 35 | .and_then(|f| f.parse::<$ret>().ok()) 36 | .ok_or(ParseRecordError)? 37 | }; 38 | } 39 | 40 | impl std::str::FromStr for Record { 41 | type Err = ParseRecordError; 42 | fn from_str(s: &str) -> Result { 43 | let mut tokens = s.trim().split(' '); 44 | let id = parse_next!(tokens, usize); 45 | let ts = parse_next!(tokens, Timestamp); 46 | let num_map = parse_next!(tokens, usize); 47 | let mappers: Vec = tokens 48 | .by_ref() 49 | .take(num_map) 50 | .map(|x| x.parse::().ok()) 51 | .collect::>>() 52 | .ok_or(ParseRecordError)?; 53 | let num_reduce = parse_next!(tokens, usize); 54 | let reducers: Vec<(usize, f64)> = tokens 55 | .take(num_reduce) 56 | .map(|x| { 57 | x.split_once(":") 58 | .and_then(|(a, b)| a.parse::().ok().zip(b.parse::().ok())) 59 | }) 60 | .collect::>>() 61 | .ok_or(ParseRecordError)?; 62 | 63 | assert_eq!(num_map, mappers.len()); 64 | assert_eq!(num_reduce, reducers.len()); 65 | 66 | Ok(Record { 67 | id, 68 | ts, 69 | num_map, 70 | mappers, 71 | num_reduce, 72 | reducers, 73 | }) 74 | } 75 | } 76 | 77 | impl JobTrace { 78 | pub fn from_path>(path: P) -> Result { 79 | let f = std::fs::File::open(path)?; 80 | let mut reader = std::io::BufReader::new(f); 81 | let mut line = 
String::new(); 82 | reader.read_line(&mut line)?; 83 | let v: Vec = line 84 | .trim() 85 | .split(' ') 86 | .map(|x| x.parse().ok()) 87 | .collect::>() 88 | .unwrap(); 89 | assert_eq!(v.len(), 2); 90 | 91 | let nracks = v[0]; 92 | let count = v[1]; 93 | let mut records = Vec::new(); 94 | for _i in 0..count { 95 | let mut line = String::new(); 96 | reader.read_line(&mut line)?; 97 | if line.starts_with('#') { 98 | continue; 99 | } 100 | let r: Record = line 101 | .parse() 102 | .unwrap_or_else(|e| panic!("pare line failed: {}, line: {}", e, line)); 103 | records.push(r); 104 | } 105 | 106 | Ok(JobTrace { 107 | nracks, 108 | count, 109 | records, 110 | }) 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /evaluation/rl_configs/level2probe.toml: -------------------------------------------------------------------------------- 1 | # Specifiation of a Allreduce application experiment 2 | 3 | # Number of jobs 4 | ncases = 50 5 | 6 | # Job size distributions [(percentage, number of workers)] 7 | # job_size_distribution = [[40, 4], [80, 8], [90, 16], [25, 32], [5, 64]] 8 | # job_size_distribution = [[40, 4], [40, 8], [40, 12], [40, 16], [40, 32], [40, 64]] 9 | # job_size_distribution = [[40, 4], [40, 8], [40, 12]] 10 | # job_size_distribution = [[40, 4], [40, 8], [20, 12], [20, 16]] 11 | # job_size_distribution = [[40, 32], [40, 64]] 12 | job_size_distribution = [[40, 4], [40, 8], [40, 12]] 13 | 14 | # Buffer size of all jobs, in bytes 15 | buffer_size = 100_000_000 16 | 17 | # Number of iterations for all jobs 18 | num_iterations = 100 19 | 20 | # Lambda of the poisson arrival, 2*100MB/25Gbps*120 = 7.68s 21 | # poisson_lambda = 8_000_000_000.0 22 | poisson_lambda = 8_00_000_000.0 23 | 24 | placement_strategy = { type = "Compact" } 25 | # placement_strategy = { type = "CompactLoadBalanced" } 26 | # placement_strategy = { type = "Spread" } 27 | # placement_strategy = { type = "Random", args = 0 } 28 | 29 | # global seed 30 
| seed = 1 31 | 32 | # Output path of for the simulation results 33 | directory = "/tmp/rl_result_for_paper/level2probe" 34 | 35 | # Number of repeats for each batch of experiments 36 | batch_repeat = 5 37 | 38 | [[batch]] 39 | policy = "Random" 40 | probe = { enable = false } 41 | nethint_level = 2 42 | 43 | [[batch]] 44 | policy = "RAT" 45 | probe = { enable = true, round_ms = 10 } 46 | nethint_level = 2 47 | # Auto tune after some iterations. default is disabled 48 | auto_tune = 10 49 | 50 | [[batch]] 51 | policy = "RAT" 52 | probe = { enable = false } 53 | nethint_level = 2 54 | # Auto tune after some iterations. default is disabled 55 | auto_tune = 10 56 | 57 | [simulator] 58 | nethint = true 59 | sample_interval_ns = 100_000_000 # 100ms 60 | loopback_speed = 400 61 | # possible values of fairness model are "PerFlowMaxMin", "PerVmPairMaxMin", and "TenantFlowMaxMin" 62 | # fairness = "TenantFlowMaxMin" 63 | fairness = "PerFlowMaxMin" 64 | 65 | background_flow_hard = { enable = true, frequency_ns = 60_000_000_000, probability = 1.0, amplitude = 5, average_load = 0.1 } 66 | # background_flow_hard = { enable = false } 67 | # nethint_delay_ms = 100 68 | 69 | [brain] 70 | # Random seed for multiple uses 71 | seed = 1 72 | # Whether the cluster's bandwidth is asymmetric 73 | asymmetric = false 74 | # The percentage of nodes marked broken 75 | broken = 0.0 76 | # The slots of each physical machine 77 | max_slots = 1 78 | # how bandwidth is partitioned among multiple VMs in the same physical server, possible values are "RateLimited", "Guaranteed" 79 | sharing_mode = "Guaranteed" 80 | guaranteed_bandwidth = 25 81 | background_flow_high_freq = { enable = true, probability = 1.0, amplitude = 10 } 82 | # background_flow_high_freq = { enable = false } 83 | gc_period = 100 84 | 85 | # The topology for simulation 86 | [brain.topology] 87 | type = "Arbitrary" # another possible value is "FatTree" 88 | 89 | # [brain.topology.args] # When type = "FatTree" 90 | # nports = 20 # 
the number of ports of a switch 91 | # bandwidth = 100 # in Gbps 92 | # oversub_ratio = 4.0 # oversubscription ratio 93 | 94 | [brain.topology.args] # When type = "Arbitrary" 95 | nracks = 320 # the number of racks 96 | rack_size = 6 # the number of hosts under a rack 97 | host_bw = 100 # bandwidth of a host, in Gbps 98 | rack_bw = 200 # bandwidth of a ToR switch, in Gbps 99 | 100 | # [envs] 101 | # KEY = "value" 102 | -------------------------------------------------------------------------------- /src/mapreduce/src/config.rs: -------------------------------------------------------------------------------- 1 | use crate::{ShufflePattern, mapper::MapperPlacementPolicy, ReducerPlacementPolicy}; 2 | use nethint::brain::{self, BrainSetting}; 3 | use nethint::simulator::SimulatorSetting; 4 | use serde::{Deserialize, Serialize}; 5 | 6 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] 7 | pub struct ProbeConfig { 8 | pub enable: bool, 9 | #[serde(default)] 10 | pub round_ms: u64, 11 | } 12 | 13 | #[derive(Debug, Clone, Serialize, Deserialize)] 14 | pub struct BatchConfig { 15 | /// Reducer placement policy 16 | pub reducer_policy: ReducerPlacementPolicy, 17 | /// whether to use plink 18 | pub probe: ProbeConfig, 19 | /// Nethint level. 20 | pub nethint_level: usize, 21 | /// automatically choose which solution to use, BW or TO 22 | #[serde(default)] 23 | pub auto_fallback: Option, 24 | /// the alpha, details in paper 25 | #[serde(default)] 26 | pub alpha: Option, 27 | } 28 | 29 | #[derive(Debug, Clone, Serialize, Deserialize)] 30 | #[serde(deny_unknown_fields)] 31 | pub struct ExperimentConfig { 32 | /// Run experiments from trace file 33 | #[serde(default)] 34 | pub trace: Option, 35 | 36 | /// How to generate the shuffle 37 | pub shuffle: Option, 38 | 39 | /// Number of testcases 40 | pub ncases: usize, 41 | 42 | /// Number of map tasks. 
When using trace, this parameter means map scale factor 43 | pub num_map: usize, 44 | 45 | /// Number of reduce tasks. When using trace, this parameter means reduce scale factor 46 | pub num_reduce: usize, 47 | 48 | /// The map scale used only in testbed setting to support scale down; default 1.0 49 | pub map_scale: Option, 50 | 51 | /// The reduce scale used only in testbed setting to support scale down; default 1.0 52 | pub reduce_scale: Option, 53 | 54 | /// Traffic scale, multiply the traffic size by a number to allow job overlaps 55 | pub traffic_scale: f64, 56 | 57 | /// Scale the time of job arrival; default 1.0 58 | pub time_scale: Option, 59 | 60 | /// Computation time switch 61 | pub enable_computation_time: bool, 62 | 63 | /// Mapper placement policy 64 | pub mapper_policy: MapperPlacementPolicy, 65 | 66 | /// akin to AWS Placement Group 67 | pub placement_strategy: brain::PlacementStrategy, 68 | 69 | /// Whether to allow delay scheduling, default to false, in simulation, it must be false 70 | pub allow_delay: Option, 71 | 72 | /// Whether to skip trivial jobs; default false 73 | pub skip_trivial: Option, 74 | 75 | /// Collocate or De-collocate 76 | pub collocate: bool, 77 | 78 | /// Number of repeats for each batch of experiments 79 | pub batch_repeat: usize, 80 | 81 | #[serde(rename = "batch")] 82 | pub batches: Vec, 83 | 84 | /// Output path of the figure 85 | #[serde(default)] 86 | pub directory: Option, 87 | 88 | /// Simulator settings 89 | pub simulator: SimulatorSetting, 90 | 91 | /// Brain settings 92 | pub brain: BrainSetting, 93 | 94 | /// Environment variables 95 | #[serde(default)] 96 | pub envs: toml::value::Table, 97 | } 98 | 99 | pub fn read_config>(path: P) -> ExperimentConfig { 100 | use std::io::Read; 101 | let mut file = std::fs::File::open(path).expect("fail to open file"); 102 | let mut content = String::new(); 103 | file.read_to_string(&mut content).unwrap(); 104 | toml::from_str(&content).expect("parse failed") 105 | } 106 | 
-------------------------------------------------------------------------------- /src/utils/src/cmd_helper.rs: -------------------------------------------------------------------------------- 1 | use std::process::Command; 2 | 3 | pub fn get_command_str(cmd: &Command) -> String { 4 | let prog = cmd.get_program().to_str().unwrap(); 5 | let args: Vec<&str> = cmd.get_args().map(|x| x.to_str().unwrap()).collect(); 6 | let cmd_str = std::iter::once(prog).chain(args).collect::>().join(" "); 7 | cmd_str 8 | } 9 | 10 | pub fn get_command_output(mut cmd: Command) -> anyhow::Result { 11 | let cmd_str = get_command_str(&cmd); 12 | log::debug!("executing command: {}", cmd_str); 13 | 14 | use std::os::unix::process::ExitStatusExt; // for status.signal() 15 | let result = cmd.output()?; 16 | 17 | if !result.status.success() { 18 | return match result.status.code() { 19 | Some(code) => Err(anyhow::anyhow!( 20 | "Exited with code: {}, cmd: {}", 21 | code, 22 | cmd_str 23 | )), 24 | None => Err(anyhow::anyhow!( 25 | "Process terminated by signal: {}, cmd: {}", 26 | result.status.signal().unwrap(), 27 | cmd_str, 28 | )), 29 | }; 30 | } 31 | 32 | Ok(std::str::from_utf8(&result.stdout)?.to_owned()) 33 | } 34 | 35 | #[macro_export] 36 | macro_rules! 
poll_cmd { 37 | ($cmd:expr, $stop_flag:expr) => {{ 38 | let prog = $cmd.get_program().to_str().unwrap(); 39 | let args: Vec<&str> = $cmd.get_args().map(|x| x.to_str().unwrap()).collect(); 40 | let cmd_str = (std::iter::once(prog).chain(args).collect::>()).join(" "); 41 | log::debug!("command: {}", cmd_str); 42 | 43 | use std::os::unix::process::ExitStatusExt; // for status.signal() 44 | let mut child = $cmd.spawn().expect("Failed to rplaunch"); 45 | loop { 46 | match child.try_wait() { 47 | Ok(Some(status)) => { 48 | if !status.success() { 49 | match status.code() { 50 | Some(code) => { 51 | log::error!("Exited with code: {}, cmd: {}", code, cmd_str) 52 | } 53 | None => log::error!( 54 | "Process terminated by signal: {}, cmd: {}", 55 | status.signal().unwrap(), 56 | cmd_str, 57 | ), 58 | } 59 | } 60 | break; 61 | } 62 | Ok(None) => { 63 | log::trace!("status not ready yet, sleep for 5 ms"); 64 | std::thread::sleep(std::time::Duration::from_millis(5)); 65 | } 66 | Err(e) => { 67 | panic!("Command wasn't running: {}", e); 68 | } 69 | } 70 | // check if kill is needed 71 | if $stop_flag.load(SeqCst) { 72 | log::warn!("killing the child process: {}", cmd_str); 73 | // instead of SIGKILL, we use SIGTERM here to gracefully shutdown ssh process tree. 74 | // SIGKILL can cause terminal control characters to mess up, which must be 75 | // fixed later with sth like "stty sane". 
76 | // signal::kill(nix::unistd::Pid::from_raw(child.id() as _), signal::SIGTERM) 77 | // .unwrap_or_else(|e| panic!("Failed to kill: {}", e)); 78 | child 79 | .kill() 80 | .unwrap_or_else(|e| panic!("Failed to kill: {}", e)); 81 | log::warn!("child process terminated") 82 | } 83 | } 84 | }} 85 | } -------------------------------------------------------------------------------- /src/rl/src/topology_aware.rs: -------------------------------------------------------------------------------- 1 | use crate::RLAlgorithm; 2 | use nethint::{cluster::Topology, Flow}; 3 | use std::rc::Rc; 4 | 5 | #[derive(Debug, Default)] 6 | pub struct TopologyAwareTree { 7 | seed: u64, 8 | num_trees: usize, 9 | } 10 | 11 | impl TopologyAwareTree { 12 | pub fn new(seed: u64, num_trees: usize) -> Self { 13 | TopologyAwareTree { seed, num_trees } 14 | } 15 | } 16 | 17 | impl RLAlgorithm for TopologyAwareTree { 18 | fn run_rl_traffic( 19 | &mut self, 20 | root_index: usize, 21 | group: Option>, 22 | size: u64, 23 | vcluster: Rc, 24 | ) -> Vec { 25 | use rand::prelude::SliceRandom; 26 | use rand::{rngs::StdRng, SeedableRng}; 27 | let mut rng = StdRng::seed_from_u64(self.seed); 28 | 29 | let mut flows = Vec::new(); 30 | 31 | for _ in 0..self.num_trees { 32 | let mut ring = Vec::new(); 33 | 34 | for i in 0..vcluster.num_switches() - 1 { 35 | let mut ringlet = Vec::new(); 36 | let tor = format!("tor_{}", i); 37 | 38 | for link_ix in vcluster.get_downlinks(vcluster.get_node_index(&tor)) { 39 | let h = vcluster.get_target(*link_ix); 40 | let host_idx = vcluster[h] 41 | .name 42 | .strip_prefix("host_") 43 | .unwrap() 44 | .parse::() 45 | .unwrap(); 46 | ringlet.push(host_idx) 47 | } 48 | 49 | let pos = ringlet.iter().position(|x| *x == root_index); 50 | 51 | if pos == None { 52 | ringlet.shuffle(&mut rng); 53 | } else { 54 | let pos = pos.unwrap(); 55 | ringlet.remove(pos); 56 | ringlet.shuffle(&mut rng); 57 | ringlet.insert(0, root_index); 58 | } 59 | for node_idx in ringlet { 60 | 
ring.push(node_idx); 61 | } 62 | } 63 | 64 | // filter all nodes in the communication group 65 | if group.is_some() { 66 | let g = group.clone().unwrap(); 67 | ring.retain(|x| *x == root_index || g.contains(x)); 68 | } 69 | 70 | let pos = ring.iter().position(|x| *x == root_index).unwrap(); 71 | let n = ring.len(); 72 | 73 | // log::error!("pos {} n {}", pos, n); 74 | // log::error!("{}",root_index); 75 | // log::error!("{:?}",ring); 76 | 77 | for i in pos..n { 78 | let sender = format!("host_{}", ring[i]); 79 | let receiver = format!("host_{}", ring[(i + 1) % n]); 80 | if (i + 1) % n == pos { 81 | break; 82 | } 83 | let flow = Flow::new(size as usize, &sender, &receiver, None); 84 | flows.push(flow); 85 | } 86 | 87 | if pos > 0 { 88 | for i in 0..pos - 1 { 89 | let sender = format!("host_{}", ring[i]); 90 | let receiver = format!("host_{}", ring[i + 1]); 91 | let flow = Flow::new(size as usize, &sender, &receiver, None); 92 | flows.push(flow); 93 | } 94 | } 95 | } 96 | 97 | for f in &mut flows { 98 | f.bytes /= self.num_trees; 99 | } 100 | 101 | flows 102 | } 103 | } 104 | --------------------------------------------------------------------------------