├── .gitignore
├── .gitmodules
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── docs
    ├── overview.md
    ├── setup.md
    └── topology.png
├── eval
    ├── dynamic-config
    │   ├── dynamic-patch.toml
    │   ├── launch-allreduce-ring-reconfig.toml
    │   ├── launch-gpt-1.toml
    │   ├── launch-gpt-2.toml
    │   ├── launch-ring-reconfig.toml
    │   ├── launch.toml
    │   ├── reconfig-patch.toml
    │   ├── reconfig.toml
    │   └── setup4-trace-fair.toml
    ├── multi-app
    │   ├── .gitignore
    │   ├── collect_multi.py
    │   ├── collect_real_workload.py
    │   ├── ecmp-setup1.toml
    │   ├── ecmp-setup2.toml
    │   ├── ecmp-setup3.toml
    │   ├── ecmp-setup4.toml
    │   ├── flow-setup1.toml
    │   ├── flow-setup2.toml
    │   ├── flow-setup3.toml
    │   ├── flow-setup4.toml
    │   ├── gen_config.py
    │   ├── gen_traffic_gen_config.py
    │   ├── interval.py
    │   ├── nccl-test.patch
    │   ├── setup1-trace-fair.toml
    │   ├── setup1-trace-profile.toml
    │   ├── setup1-trace-qos.toml
    │   ├── setup2-trace-fair.toml
    │   ├── setup2-trace-qosv1.toml
    │   ├── setup2-trace-qosv2.toml
    │   ├── setup4-trace-ecmp-fair.toml
    │   ├── setup4-trace-ecmp-qosv1.toml
    │   ├── setup4-trace-fair.toml
    │   └── setup4-trace-qosv1.toml
    ├── plot
    │   ├── data
    │   │   └── .gitignore
    │   ├── multi_app
    │   │   ├── __init__.py
    │   │   ├── main.py
    │   │   ├── setting1.csv
    │   │   ├── setting2.csv
    │   │   ├── setting3.csv
    │   │   └── setting4.csv
    │   ├── plt_show.py
    │   ├── real_workload
    │   │   ├── jct.csv
    │   │   └── plot_jct.py
    │   └── single_app
    │   │   ├── __init__.py
    │   │   ├── allgather_4gpu.csv
    │   │   ├── allgather_8gpu.csv
    │   │   ├── allreduce_4gpu.csv
    │   │   ├── allreduce_8gpu.csv
    │   │   ├── main.py
    │   │   └── style.py
    ├── set_ecmp_hashing_algo.sh
    └── single-app
    │   ├── .gitignore
    │   ├── 4gpu.toml
    │   ├── 8gpu.toml
    │   ├── collect.py
    │   └── gen_config.py
├── justfile
├── launcher
    ├── Cargo.toml
    ├── README.md
    ├── benchmark
    │   ├── allgather.toml
    │   ├── alltoall-3w-1mb.toml
    │   └── write_lat-32b.toml
    ├── config.toml
    └── src
    │   ├── line_reader.rs
    │   ├── main.rs
    │   └── tee.rs
├── mccs.toml
├── nccl-tests-mccs
    ├── .gitignore
    ├── LICENSE.txt
    ├── Makefile
    ├── README.md
    ├── doc
    │   └── PERFORMANCE.md
    ├── microbenchmark
    │   ├── 4gpu.toml
    │   ├── 8gpu.toml
    │   ├── collect_nccl.py
    │   ├── nccl_result.csv
    │   ├── one_click_run_nccl_all.sh
    │   ├── run_nccl_multiple_times.sh
    │   ├── run_nccl_once.sh
    │   └── set_ecmp_hashing_algo.sh
    ├── nccl-test.patch
    ├── setting1
    │   ├── collect_nccl.py
    │   ├── run_all_jobs_once.sh
    │   ├── run_nccl_all_jobs_multiple_times.sh
    │   └── run_nccl_job_small.sh
    ├── setting2
    │   ├── collect_nccl.py
    │   ├── run_all_jobs.sh
    │   ├── run_all_jobs_once.sh
    │   ├── run_nccl_all_jobs_multiple_times.sh
    │   ├── run_nccl_job_blue.sh
    │   └── run_nccl_job_small.sh
    ├── setting3
    │   ├── collect_nccl.py
    │   ├── run_all_jobs_once.sh
    │   ├── run_nccl_all_jobs_multiple_times.sh
    │   └── run_nccl_job_small.sh
    ├── setting4
    │   ├── collect_nccl.py
    │   ├── run_all_jobs_once.sh
    │   ├── run_nccl_all_jobs_multiple_times.sh
    │   └── run_nccl_job.sh
    ├── src
    │   ├── Makefile
    │   ├── all_gather.cu
    │   ├── all_reduce.cu
    │   ├── alltoall.cu
    │   ├── broadcast.cu
    │   ├── common.cu
    │   ├── common.h
    │   ├── gather.cu
    │   ├── hypercube.cu
    │   ├── nccl1_compat.h
    │   ├── reduce.cu
    │   ├── reduce_scatter.cu
    │   ├── scatter.cu
    │   ├── sendrecv.cu
    │   ├── timer.cc
    │   └── timer.h
    └── verifiable
    │   ├── Makefile
    │   ├── inexact_regress.cu
    │   ├── verifiable.cu
    │   ├── verifiable.h
    │   └── verifiable.mk
├── rust-toolchain
├── src
    ├── collectives-sys
    │   ├── Cargo.toml
    │   ├── build.rs
    │   ├── src
    │   │   └── lib.rs
    │   └── wrapper.h
    ├── collectives
    │   ├── Makefile
    │   ├── gen_rules.sh
    │   ├── include
    │   │   ├── align.h
    │   │   ├── collectives.h
    │   │   └── devcomm.h
    │   ├── makefiles
    │   │   └── common.mk
    │   └── src
    │   │   ├── all_gather.cu
    │   │   ├── all_gather.h
    │   │   ├── all_reduce.cu
    │   │   ├── all_reduce.h
    │   │   ├── common.h
    │   │   ├── common_kernel.h
    │   │   ├── functions.cu
    │   │   ├── op128.h
    │   │   ├── primitives.h
    │   │   ├── prims_simple.h
    │   │   └── reduce_kernel.h
    ├── cuda-sys
    │   ├── cuda-driver-sys
    │   │   ├── Cargo.toml
    │   │   ├── build.rs
    │   │   ├── src
    │   │   │   └── lib.rs
    │   │   └── wrapper.h
    │   ├── cuda-finder
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   └── lib.rs
    │   ├── cuda-runtime-sys
    │   │   ├── Cargo.toml
    │   │   ├── build.rs
    │   │   ├── src
    │   │   │   └── lib.rs
    │   │   └── wrapper.h
    │   └── nvml-sys
    │   │   ├── Cargo.toml
    │   │   ├── build.rs
    │   │   ├── src
    │   │       └── lib.rs
    │   │   └── wrapper.h
    ├── experimental
    │   ├── Cargo.toml
    │   ├── examples
    │   │   ├── cuda_ipc_client.rs
    │   │   ├── cuda_ipc_server.rs
    │   │   └── get_hwinfo.rs
    │   └── src
    │   │   └── lib.rs
    ├── gdrcopy-sys
    │   ├── Cargo.toml
    │   ├── build.rs
    │   ├── src
    │   │   └── lib.rs
    │   └── wrapper.h
    ├── ibverbs
    │   ├── Cargo.toml
    │   ├── build.rs
    │   ├── src
    │   │   ├── ffi.rs
    │   │   ├── lib.rs
    │   │   └── sliceindex.rs
    │   └── wrapper.h
    ├── ipc
    │   ├── Cargo.toml
    │   ├── core
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   ├── buf.rs
    │   │   │   ├── channel
    │   │   │       ├── flavors
    │   │   │       │   ├── concurrent.rs
    │   │   │       │   ├── mod.rs
    │   │   │       │   └── sequential.rs
    │   │   │       └── mod.rs
    │   │   │   ├── control.rs
    │   │   │   ├── customer.rs
    │   │   │   ├── ipc_channel.rs
    │   │   │   ├── lib.rs
    │   │   │   ├── service.rs
    │   │   │   ├── shmem_ipc.rs
    │   │   │   ├── shmobj.rs
    │   │   │   └── unix.rs
    │   ├── mccs
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   ├── command.rs
    │   │   │   ├── dp.rs
    │   │   │   ├── handle.rs
    │   │   │   ├── lib.rs
    │   │   │   └── reconfig.rs
    │   └── src
    │   │   └── lib.rs
    ├── libmccs
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── collectives.rs
    │   │   ├── communicator.rs
    │   │   ├── lib.rs
    │   │   └── memory.rs
    ├── mccs
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── bootstrap
    │   │       ├── mod.rs
    │   │       └── task.rs
    │   │   ├── comm
    │   │       ├── device.rs
    │   │       ├── mod.rs
    │   │       └── profile.rs
    │   │   ├── config.rs
    │   │   ├── control.rs
    │   │   ├── cuda
    │   │       ├── alloc.rs
    │   │       ├── mapped_ptr.rs
    │   │       ├── mod.rs
    │   │       └── ptr.rs
    │   │   ├── daemon
    │   │       ├── engine.rs
    │   │       └── mod.rs
    │   │   ├── engine.rs
    │   │   ├── exchange
    │   │       ├── command.rs
    │   │       ├── engine.rs
    │   │       ├── message.rs
    │   │       └── mod.rs
    │   │   ├── lib.rs
    │   │   ├── main.rs
    │   │   ├── message.rs
    │   │   ├── pattern.rs
    │   │   ├── proxy
    │   │       ├── command.rs
    │   │       ├── engine.rs
    │   │       ├── init.rs
    │   │       ├── message.rs
    │   │       ├── mod.rs
    │   │       ├── op.rs
    │   │       ├── plan.rs
    │   │       └── task.rs
    │   │   ├── registry.rs
    │   │   ├── runtime
    │   │       ├── affinity.rs
    │   │       ├── executor.rs
    │   │       ├── manager.rs
    │   │       └── mod.rs
    │   │   ├── transport
    │   │       ├── catalog.rs
    │   │       ├── channel.rs
    │   │       ├── delegator.rs
    │   │       ├── engine.rs
    │   │       ├── message.rs
    │   │       ├── meta.rs
    │   │       ├── mod.rs
    │   │       ├── net
    │   │       │   ├── agent.rs
    │   │       │   ├── buffer.rs
    │   │       │   ├── config.rs
    │   │       │   ├── mod.rs
    │   │       │   ├── provider
    │   │       │   │   ├── mod.rs
    │   │       │   │   └── rdma.rs
    │   │       │   ├── resources.rs
    │   │       │   └── transporter.rs
    │   │       ├── op.rs
    │   │       ├── queue.rs
    │   │       ├── setup.rs
    │   │       ├── shm
    │   │       │   ├── agent.rs
    │   │       │   ├── buffer.rs
    │   │       │   ├── config.rs
    │   │       │   ├── mod.rs
    │   │       │   ├── resources.rs
    │   │       │   └── transporter.rs
    │   │       ├── task.rs
    │   │       └── transporter.rs
    │   │   └── utils
    │   │       ├── duplex_chan.rs
    │   │       ├── gdr.rs
    │   │       ├── interfaces.rs
    │   │       ├── mod.rs
    │   │       ├── pool.rs
    │   │       └── tcp.rs
    ├── mccs_examples
    │   ├── allgather_bench
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   └── main.rs
    │   ├── allgather_proto
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   └── main.rs
    │   ├── allreduce_bench
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   └── main.rs
    │   ├── allreduce_proto
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   └── main.rs
    │   ├── cuda_hello
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   └── main.rs
    │   ├── ring_config
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │   │   └── main.rs
    │   └── traffic_gen
    │   │   ├── Cargo.toml
    │   │   └── src
    │   │       └── main.rs
    ├── mccs_tests
    │   └── rdma_transport
    │   │   ├── Cargo.toml
    │   │   ├── examples
    │   │       ├── client.rs
    │   │       └── server.rs
    │   │   └── src
    │   │       └── lib.rs
    └── qos-service
    │   ├── Cargo.toml
    │   └── src
    │       └── lib.rs
└── workloads
    ├── reconfig_gpt.toml
    ├── setup-1_gpt_0.toml
    ├── setup-1_gpt_1.toml
    ├── setup-1_resnet_0.toml
    ├── setup-1_resnet_1.toml
    ├── setup-1_vgg_0.toml
    ├── setup-1_vgg_1.toml
    ├── setup-2_gpt_1.toml
    ├── setup-2_gpt_2.toml
    ├── setup-2_resnet.toml
    ├── setup-2_vgg.toml
    ├── setup-3_gpt_1.toml
    ├── setup-3_gpt_2.toml
    ├── setup-4_gpt_1.toml
    ├── setup-4_gpt_2.toml
    ├── setup-4_resnet_0.toml
    └── setup-4_vgg.toml


/.gitignore:
--------------------------------------------------------------------------------
  1 | # MPI hostfile
  2 | hostfile*
  3 | 
  4 | # Prerequisites
  5 | *.d
  6 | 
  7 | # Compiled Object files
  8 | *.slo
  9 | *.lo
 10 | *.o
 11 | *.obj
 12 | 
 13 | # Precompiled Headers
 14 | *.gch
 15 | *.pch
 16 | 
 17 | # Compiled Dynamic libraries
 18 | *.so
 19 | *.dylib
 20 | *.dll
 21 | 
 22 | # Fortran module files
 23 | *.mod
 24 | *.smod
 25 | 
 26 | # Compiled Static libraries
 27 | *.lai
 28 | *.la
 29 | *.a
 30 | *.lib
 31 | 
 32 | # Executables
 33 | *.exe
 34 | *.out
 35 | *.app
 36 | 
 37 | 
 38 | # build
 39 | build/
 40 | # clangd
 41 | .clangd/
 42 | .cache/clangd/
 43 | compile_commands.json
 44 | 
 45 | # vim session file
 46 | Session.vim
 47 | # vscode
 48 | *.workspace
 49 | *.code-workspace
 50 | .vscode/
 51 | 
 52 | 
 53 | # Added by cargo
 54 | 
 55 | /target
 56 | 
 57 | # macOS
 58 | # General
 59 | .DS_Store
 60 | .AppleDouble
 61 | .LSOverride
 62 | 
 63 | # Icon must end with two \r
 64 | Icon
 65 | 
 66 | 
 67 | # Thumbnails
 68 | ._*
 69 | 
 70 | # Jetbrains IDE
 71 | .idea/
 72 | cmake-build-*/
 73 | 
 74 | # Private folder for development
 75 | /_private
 76 | # Prerequisites
 77 | *.d
 78 | 
 79 | # Compiled Object files
 80 | *.slo
 81 | *.lo
 82 | *.o
 83 | *.obj
 84 | 
 85 | # Precompiled Headers
 86 | *.gch
 87 | *.pch
 88 | 
 89 | # Compiled Dynamic libraries
 90 | *.so
 91 | *.dylib
 92 | *.dll
 93 | 
 94 | # Fortran module files
 95 | *.mod
 96 | *.smod
 97 | 
 98 | # Compiled Static libraries
 99 | *.lai
100 | *.la
101 | *.a
102 | *.lib
103 | 
104 | # Executables
105 | *.exe
106 | *.out
107 | *.app
108 | 
109 | 
110 | # build
111 | build/
112 | # clangd
113 | .clangd/
114 | .cache/clangd/
115 | compile_commands.json
116 | 
117 | # vim session file
118 | Session.vim
119 | # vscode
120 | *.workspace
121 | *.code-workspace
122 | .vscode/
123 | 
124 | 
125 | # Added by cargo
126 | 
127 | /target
128 | 
129 | # macOS
130 | # General
131 | .DS_Store
132 | .AppleDouble
133 | .LSOverride
134 | 
135 | # Icon must end with two \r
136 | Icon
137 | 
138 | 
139 | # Thumbnails
140 | ._*
141 | 
142 | # Jetbrains IDE
143 | .idea/
144 | cmake-build-*/
145 | 
146 | # Private folder for development
147 | /_private
148 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/ibverbs/vendor/rdma-core"]
2 | 	path = src/ibverbs/vendor/rdma-core
3 | 	url = https://github.com/linux-rdma/rdma-core.git
4 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | resolver = "2"
 3 | 
 4 | members = [
 5 |     "src/cuda-sys/cuda-driver-sys",
 6 |     "src/cuda-sys/cuda-runtime-sys",
 7 |     "src/cuda-sys/nvml-sys",
 8 |     "src/cuda-sys/cuda-finder",
 9 |     "src/gdrcopy-sys",
10 |     "src/collectives-sys",
11 |     "src/ibverbs",
12 |     "src/experimental",
13 |     "src/ipc",
14 |     "src/ipc/core",
15 |     "src/ipc/mccs",
16 |     "src/libmccs",
17 |     "src/mccs",
18 |     "src/mccs_examples/cuda_hello",
19 |     "src/mccs_examples/allgather_proto",
20 |     "src/mccs_examples/allreduce_proto",
21 |     "src/mccs_examples/allgather_bench",
22 |     "src/mccs_examples/allreduce_bench",
23 |     "src/mccs_examples/ring_config",
24 |     "src/mccs_tests/rdma_transport", 
25 |     "src/qos-service", 
26 |     "src/mccs_examples/traffic_gen",
27 |     "launcher",
28 | ]
29 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | run:
2 | 	cargo build --release
3 | 	RUST_LOG=INFO ./target/release/mccs
4 | 
5 | build-all:
6 | 	make -j -C src/collectives
7 | 	cargo build --release


--------------------------------------------------------------------------------
/docs/setup.md:
--------------------------------------------------------------------------------
 1 | # MCCS Evaluation Setup
 2 | 
 3 | ## Hardware Setup
 4 | <img src="topology.png" alt="Topology Setup" width="600"/>
 5 | 
 6 | As shown in the figure, our evaluation testbed consists of four nodes, each equipped with 2 NVIDIA RTX 3090 GPUs and a 100 Gbps Mellanox ConnectX-5 NIC. Using a single 100 Gbps Mellanox SN2100 switch, we emulate a spine-leaf topology with 2 leaf switches and 2 spine switches through self-wiring. The four nodes are placed in two racks, where each rack corresponds to a leaf switch. The links between the switches are limited to 50 Gbps, while the links between each host and its leaf switch are limited to 100 Gbps. On each host, we use IB traffic classes (TCs) and rate-limit each TC to emulate two 50 Gbps virtual NICs (one per GPU).
 7 | 
 8 | ## System Environment
 9 | - NVIDIA GPU drivers and CUDA must be installed. Our code is tested with CUDA 12.1.
10 | - The GDRCopy library must be installed; it can be found here: https://github.com/NVIDIA/gdrcopy
11 | - Mellanox OFED drivers must be installed; they can be found here: https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/
12 | - Rust version `nightly-2023-11-11`. Follow https://www.rust-lang.org/tools/install to install `rustup`; `rustup` will then automatically install the toolchain version configured in the `rust-toolchain` file. The `Cargo.lock` file records the exact version we used for each Rust dependency.


--------------------------------------------------------------------------------
/docs/topology.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phoenix-dataplane/mCCS/86a25c71a9b050bd7b8a1a1f08fa116470492ce9/docs/topology.png


--------------------------------------------------------------------------------
/eval/dynamic-config/dynamic-patch.toml:
--------------------------------------------------------------------------------
 1 | mccs_addrs = [
 2 |     "192.168.211.2",
 3 |     "192.168.211.34",
 4 |     "192.168.211.66",
 5 |     "192.168.211.162",
 6 | ]
 7 | mccs_port = 5000
 8 | 
 9 | [[comm_patterns_reconfig]]
10 | communicator_id = 200
11 | channels = [
12 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
13 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
14 | ]
15 | 
16 | [[comm_patterns_reconfig]]
17 | communicator_id = 201
18 | channels = [
19 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
20 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
21 | ]
22 | 
23 | 
24 | [[comm_patterns_reconfig]]
25 | communicator_id = 202
26 | channels = [
27 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
28 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
29 | ]


--------------------------------------------------------------------------------
/eval/dynamic-config/launch-allreduce-ring-reconfig.toml:
--------------------------------------------------------------------------------
 1 | name = "8gpu-dynamic-allreduce"
 2 | group = "8gpu-dynamic-allreduce"
 3 | [[worker]]
 4 | host = "danyang-01"
 5 | bin = "mccs"
 6 | args = "--host 1 --config eval/dynamic-config/reconfig.toml"
 7 | weak = true
 8 | dependencies = []
 9 | 
10 | [[worker]]
11 | host = "danyang-02"
12 | bin = "mccs"
13 | args = "--host 2 --config eval/dynamic-config/reconfig.toml"
14 | weak = true
15 | dependencies = []
16 | 
17 | [[worker]]
18 | host = "danyang-03"
19 | bin = "mccs"
20 | args = "--host 3 --config eval/dynamic-config/reconfig.toml"
21 | weak = true
22 | dependencies = []
23 | 
24 | [[worker]]
25 | host = "danyang-05"
26 | bin = "mccs"
27 | args = "--host 5 --config eval/dynamic-config/reconfig.toml"
28 | weak = true
29 | dependencies = []
30 | 
31 | [[worker]]
32 | host = "danyang-02"
33 | bin = "allreduce_bench"
34 | args = "--root-addr 192.168.211.34 --rank 0 --num-ranks 8 --cuda-device-idx 0 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4"
35 | dependencies = [ 0, 1, 2, 3,]
36 | 
37 | [[worker]]
38 | host = "danyang-02"
39 | bin = "allreduce_bench"
40 | args = "--root-addr 192.168.211.34 --rank 1 --num-ranks 8 --cuda-device-idx 1 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4"
41 | dependencies = [ 0, 1, 2, 3,]
42 | 
43 | [[worker]]
44 | host = "danyang-03"
45 | bin = "allreduce_bench"
46 | args = "--root-addr 192.168.211.34 --rank 2 --num-ranks 8 --cuda-device-idx 0 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4"
47 | dependencies = [ 0, 1, 2, 3,]
48 | 
49 | [[worker]]
50 | host = "danyang-03"
51 | bin = "allreduce_bench"
52 | args = "--root-addr 192.168.211.34 --rank 3 --num-ranks 8 --cuda-device-idx 1 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4"
53 | dependencies = [ 0, 1, 2, 3,]
54 | 
55 | [[worker]]
56 | host = "danyang-01"
57 | bin = "allreduce_bench"
58 | args = "--root-addr 192.168.211.34 --rank 4 --num-ranks 8 --cuda-device-idx 0 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4"
59 | dependencies = [ 0, 1, 2, 3,]
60 | 
61 | [[worker]]
62 | host = "danyang-01"
63 | bin = "allreduce_bench"
64 | args = "--root-addr 192.168.211.34 --rank 5 --num-ranks 8 --cuda-device-idx 1 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4"
65 | dependencies = [ 0, 1, 2, 3,]
66 | 
67 | [[worker]]
68 | host = "danyang-05"
69 | bin = "allreduce_bench"
70 | args = "--root-addr 192.168.211.34 --rank 6 --num-ranks 8 --cuda-device-idx 0 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4"
71 | dependencies = [ 0, 1, 2, 3,]
72 | 
73 | [[worker]]
74 | host = "danyang-05"
75 | bin = "allreduce_bench"
76 | args = "--root-addr 192.168.211.34 --rank 7 --num-ranks 8 --cuda-device-idx 1 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4"
77 | dependencies = [ 0, 1, 2, 3,]
78 | 
79 | 


--------------------------------------------------------------------------------
/eval/dynamic-config/launch-gpt-1.toml:
--------------------------------------------------------------------------------
 1 | name = "setup4-dynamic-gpt-1"
 2 | group = "setup4-dynamic-gpt-1"
 3 | [[worker]]
 4 | host = "danyang-03"
 5 | bin = "traffic_gen"
 6 | args = "--root-addr 192.168.211.66 --rank 0 --iters 40001 --config workloads/setup-4_gpt_1.toml --verbose --name gpt_1"
 7 | dependencies = []
 8 | 
 9 | [[worker]]
10 | host = "danyang-05"
11 | bin = "traffic_gen"
12 | args = "--root-addr 192.168.211.66 --rank 1 --iters 40001 --config workloads/setup-4_gpt_1.toml --verbose --name gpt_1"
13 | dependencies = []
14 | 
15 | 


--------------------------------------------------------------------------------
/eval/dynamic-config/launch-gpt-2.toml:
--------------------------------------------------------------------------------
 1 | name = "setup4-dynamic-gpt-2"
 2 | group = "setup4-dynamic-gpt-2"
 3 | [[worker]]
 4 | host = "danyang-03"
 5 | bin = "traffic_gen"
 6 | args = "--root-addr 192.168.211.66 --rank 0 --iters 40001 --config workloads/setup-4_gpt_2.toml --verbose --name gpt_2"
 7 | dependencies = []
 8 | 
 9 | [[worker]]
10 | host = "danyang-05"
11 | bin = "traffic_gen"
12 | args = "--root-addr 192.168.211.66 --rank 1 --iters 40001 --config workloads/setup-4_gpt_2.toml --verbose --name gpt_2"
13 | dependencies = []
14 | 
15 | 


--------------------------------------------------------------------------------
/eval/dynamic-config/launch-ring-reconfig.toml:
--------------------------------------------------------------------------------
 1 | name = "setup4-dynamic"
 2 | group = "setup4-dynamic"
 3 | [[worker]]
 4 | host = "danyang-01"
 5 | bin = "mccs"
 6 | args = "--host 1 --config eval/dynamic-config/reconfig.toml"
 7 | weak = true
 8 | dependencies = []
 9 | 
10 | [[worker]]
11 | host = "danyang-02"
12 | bin = "mccs"
13 | args = "--host 2 --config eval/dynamic-config/reconfig.toml"
14 | weak = true
15 | dependencies = []
16 | 
17 | [[worker]]
18 | host = "danyang-03"
19 | bin = "mccs"
20 | args = "--host 3 --config eval/dynamic-config/reconfig.toml"
21 | weak = true
22 | dependencies = []
23 | 
24 | [[worker]]
25 | host = "danyang-05"
26 | bin = "mccs"
27 | args = "--host 5 --config eval/dynamic-config/reconfig.toml"
28 | weak = true
29 | dependencies = []
30 | 
31 | [[worker]]
32 | host = "danyang-02"
33 | bin = "traffic_gen"
34 | args = "--root-addr 192.168.211.34 --rank 0 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt"
35 | dependencies = [ 0, 1, 2, 3,]
36 | 
37 | [[worker]]
38 | host = "danyang-02"
39 | bin = "traffic_gen"
40 | args = "--root-addr 192.168.211.34 --rank 1 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt"
41 | dependencies = [ 0, 1, 2, 3,]
42 | 
43 | [[worker]]
44 | host = "danyang-03"
45 | bin = "traffic_gen"
46 | args = "--root-addr 192.168.211.34 --rank 2 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt"
47 | dependencies = [ 0, 1, 2, 3,]
48 | 
49 | [[worker]]
50 | host = "danyang-03"
51 | bin = "traffic_gen"
52 | args = "--root-addr 192.168.211.34 --rank 3 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt"
53 | dependencies = [ 0, 1, 2, 3,]
54 | 
55 | [[worker]]
56 | host = "danyang-01"
57 | bin = "traffic_gen"
58 | args = "--root-addr 192.168.211.34 --rank 4 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt"
59 | dependencies = [ 0, 1, 2, 3,]
60 | 
61 | [[worker]]
62 | host = "danyang-01"
63 | bin = "traffic_gen"
64 | args = "--root-addr 192.168.211.34 --rank 5 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt"
65 | dependencies = [ 0, 1, 2, 3,]
66 | 
67 | [[worker]]
68 | host = "danyang-05"
69 | bin = "traffic_gen"
70 | args = "--root-addr 192.168.211.34 --rank 6 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt"
71 | dependencies = [ 0, 1, 2, 3,]
72 | 
73 | [[worker]]
74 | host = "danyang-05"
75 | bin = "traffic_gen"
76 | args = "--root-addr 192.168.211.34 --rank 7 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt"
77 | dependencies = [ 0, 1, 2, 3,]
78 | 
79 | 


--------------------------------------------------------------------------------
/eval/dynamic-config/launch.toml:
--------------------------------------------------------------------------------
 1 | name = "setup4-dynamic"
 2 | group = "setup4-dynamic"
 3 | [[worker]]
 4 | host = "danyang-01"
 5 | bin = "mccs"
 6 | args = "--host 1 --config eval/dynamic-config/setup4-trace-fair.toml"
 7 | weak = true
 8 | dependencies = []
 9 | 
10 | [[worker]]
11 | host = "danyang-02"
12 | bin = "mccs"
13 | args = "--host 2 --config eval/dynamic-config/setup4-trace-fair.toml"
14 | weak = true
15 | dependencies = []
16 | 
17 | [[worker]]
18 | host = "danyang-03"
19 | bin = "mccs"
20 | args = "--host 3 --config eval/dynamic-config/setup4-trace-fair.toml"
21 | weak = true
22 | dependencies = []
23 | 
24 | [[worker]]
25 | host = "danyang-05"
26 | bin = "mccs"
27 | args = "--host 5 --config eval/dynamic-config/setup4-trace-fair.toml"
28 | weak = true
29 | dependencies = []
30 | 
31 | [[worker]]
32 | host = "danyang-02"
33 | bin = "traffic_gen"
34 | args = "--root-addr 192.168.211.34 --rank 0 --iters 5001 --config workloads/setup-4_vgg.toml --verbose --name vgg"
35 | dependencies = [ 0, 1, 2, 3,]
36 | 
37 | [[worker]]
38 | host = "danyang-02"
39 | bin = "traffic_gen"
40 | args = "--root-addr 192.168.211.34 --rank 1 --iters 5001 --config workloads/setup-4_vgg.toml --verbose --name vgg"
41 | dependencies = [ 0, 1, 2, 3,]
42 | 
43 | [[worker]]
44 | host = "danyang-01"
45 | bin = "traffic_gen"
46 | args = "--root-addr 192.168.211.34 --rank 2 --iters 5001 --config workloads/setup-4_vgg.toml --verbose --name vgg"
47 | dependencies = [ 0, 1, 2, 3,]
48 | 
49 | [[worker]]
50 | host = "danyang-01"
51 | bin = "traffic_gen"
52 | args = "--root-addr 192.168.211.34 --rank 3 --iters 5001 --config workloads/setup-4_vgg.toml --verbose --name vgg"
53 | dependencies = [ 0, 1, 2, 3,]
54 | 
55 | 


--------------------------------------------------------------------------------
/eval/dynamic-config/reconfig-patch.toml:
--------------------------------------------------------------------------------
 1 | mccs_addrs = [
 2 |     "192.168.211.2",
 3 |     "192.168.211.34",
 4 |     "192.168.211.66",
 5 |     "192.168.211.162",
 6 | ]
 7 | mccs_port = 5000
 8 | 
 9 | [[comm_patterns_reconfig]]
10 | communicator_id = 600
11 | channels = [
12 |     { channel_id = 0, ring = [7, 6, 5, 4, 3, 2, 1, 0], udp_sport = [[4, 3, 49200], [0,7, 49200]], net_dev = "mlx5_0" },
13 |     { channel_id = 1, ring = [7, 6, 5, 4, 3, 2, 1, 0], udp_sport = [[4, 3, 49202], [0,7, 49202]], net_dev = "mlx5_0" },
14 | ]
15 | ib_traffic_class = 0


--------------------------------------------------------------------------------
/eval/dynamic-config/reconfig.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers: 49200 and 49202 are the UDP source ports used in the udp_sport entries below
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 600
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49200], [7, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49202], [7, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 0


--------------------------------------------------------------------------------
/eval/multi-app/.gitignore:
--------------------------------------------------------------------------------
1 | output/


--------------------------------------------------------------------------------
/eval/multi-app/collect_real_workload.py:
--------------------------------------------------------------------------------
 1 | import subprocess as sb
 2 | data = sb.getoutput('grep --recursive "Rank 0: run time" /tmp').split('\n')
 3 | # filter with /tmp/ at the beginning, and remove the first 5 characters
 4 | data = [i[5:] for i in data if i.find('/tmp/')==0]
 5 | txt = ['setting,job,jct']
 6 | for i in data:
 7 |     setting = '-'.join(i.split('/')[1].split('-')[2:])
 8 |     if setting.find('ecmp-qosv2')==-1:
 9 |         app = i.split('[')[1].split(']')[0]
10 |         time = i.split(': ')[-1].split(' ')[0]
11 |         txt.append(f'{setting},{app},{time}')
12 | with open('../plot/data/real_workload.csv', 'w') as f:
13 |     f.write('\n'.join(txt))
14 |     


--------------------------------------------------------------------------------
/eval/multi-app/ecmp-setup1.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers 49200 & 49202: not referenced in this file (the channel overrides below set no udp_sport)
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 81
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 0
58 | 
59 | 
60 | [[comm_patterns_override]]
61 | communicator_id = 82
62 | channels = [
63 |     { channel_id = 0, ring = [0, 1, 2, 3],  net_dev = "mlx5_0" },
64 |     { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
65 | ]
66 | ib_traffic_class = 0
67 | 
68 | 


--------------------------------------------------------------------------------
/eval/multi-app/ecmp-setup2.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers 49200 & 49202: not referenced in this file (the channel overrides below set no udp_sport)
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 81
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 66
58 | 
59 | 
60 | [[comm_patterns_override]]
61 | communicator_id = 82
62 | channels = [
63 |     { channel_id = 0, ring = [0, 1], net_dev = "mlx5_0" },
64 |     { channel_id = 1, ring = [0, 1], net_dev = "mlx5_0" },
65 | ]
66 | ib_traffic_class = 106
67 | 
68 | [[comm_patterns_override]]
69 | communicator_id = 83
70 | channels = [
71 |     { channel_id = 0, ring = [0, 1], net_dev = "mlx5_0" },
72 |     { channel_id = 1, ring = [0, 1], net_dev = "mlx5_0" },
73 | ]
74 | ib_traffic_class = 106


--------------------------------------------------------------------------------
/eval/multi-app/ecmp-setup3.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers 49200 & 49202: not referenced in this file (the channel overrides below set no udp_sport)
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 81
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 106
58 | 
59 | 
60 | [[comm_patterns_override]]
61 | communicator_id = 82
62 | channels = [
63 |     { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
64 |     { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
65 | ]
66 | ib_traffic_class = 66
67 | 
68 | 


--------------------------------------------------------------------------------
/eval/multi-app/ecmp-setup4.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers 49200 & 49202: not referenced in this file (the channel overrides below set no udp_sport)
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 81
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 66
58 | 
59 | 
60 | [[comm_patterns_override]]
61 | communicator_id = 82
62 | channels = [
63 |     { channel_id = 0, ring = [0, 1], net_dev = "mlx5_0" },
64 |     { channel_id = 1, ring = [0, 1], net_dev = "mlx5_0" },
65 | ]
66 | ib_traffic_class = 106
67 | 
68 | [[comm_patterns_override]]
69 | communicator_id = 83
70 | channels = [
71 |     { channel_id = 0, ring = [0, 1], net_dev = "mlx5_0" },
72 |     { channel_id = 1, ring = [0, 1], net_dev = "mlx5_0" },
73 | ]
74 | ib_traffic_class = 106


--------------------------------------------------------------------------------
/eval/multi-app/flow-setup1.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers: 49200 & 49202 are the udp_sport values assigned in the channel overrides below
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 81
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 0
58 | 
59 | 
60 | [[comm_patterns_override]]
61 | communicator_id = 82
62 | channels = [
63 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
64 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
65 | ]
66 | ib_traffic_class = 0
67 | 
68 | 


--------------------------------------------------------------------------------
/eval/multi-app/flow-setup2.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers: 49200 & 49202 are the udp_sport values assigned in the channel overrides below
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 81
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 66
58 | 
59 | 
60 | [[comm_patterns_override]]
61 | communicator_id = 82
62 | channels = [
63 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
64 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
65 | ]
66 | ib_traffic_class = 106
67 | 
68 | [[comm_patterns_override]]
69 | communicator_id = 83
70 | channels = [
71 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
72 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
73 | ]
74 | ib_traffic_class = 106


--------------------------------------------------------------------------------
/eval/multi-app/flow-setup3.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers: 49200 & 49202 are the udp_sport values assigned in the channel overrides below
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 81
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 106
58 | 
59 | 
60 | [[comm_patterns_override]]
61 | communicator_id = 82
62 | channels = [
63 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
64 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
65 | ]
66 | ib_traffic_class = 66
67 | 
68 | 


--------------------------------------------------------------------------------
/eval/multi-app/flow-setup4.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic numbers: 49200 & 49202 are the udp_sport values assigned in the channel overrides below
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 81
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | ib_traffic_class = 66
58 | 
59 | 
60 | [[comm_patterns_override]]
61 | communicator_id = 82
62 | channels = [
63 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
64 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
65 | ]
66 | ib_traffic_class = 106
67 | 
68 | [[comm_patterns_override]]
69 | communicator_id = 83
70 | channels = [
71 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
72 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
73 | ]
74 | ib_traffic_class = 106


--------------------------------------------------------------------------------
/eval/multi-app/nccl-test.patch:
--------------------------------------------------------------------------------
 1 | # Patch for the NCCL test sources (src/Makefile, src/common.cu); nccl branch: v2.13.8
 2 | diff --git a/src/Makefile b/src/Makefile
 3 | index 393de8e..ba21a59 100644
 4 | --- a/src/Makefile
 5 | +++ b/src/Makefile
 6 | @@ -59,7 +59,7 @@ endif
 7 |  BUILDDIR ?= ../build
 8 |  ifneq ($(NCCL_HOME), "")
 9 |  NVCUFLAGS += -I$(NCCL_HOME)/include/
10 | -NVLDFLAGS += -L$(NCCL_HOME)/lib
11 | +NVLDFLAGS += -L$(NCCL_HOME)/lib -Xlinker=-rpath,$(NCCL_HOME)/lib
12 |  endif
13 |  
14 |  ifeq ($(MPI), 1)
15 | diff --git a/src/common.cu b/src/common.cu
16 | index 8588047..c91461b 100644
17 | --- a/src/common.cu
18 | +++ b/src/common.cu
19 | @@ -595,14 +595,21 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
20 |    TESTCHECK(completeColl(args));
21 |  
22 |    // Benchmark
23 | +  int epochs = 1;
24 | +  char* epochs_str = getenv("NCCL_EPOCHS");
25 | +  if (epochs_str) {
26 | +      epochs = std::stoi(epochs_str);
27 | +  }
28 |    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
29 |        setupArgs(size, type, args);
30 |        char rootName[100];
31 |        sprintf(rootName, "%6i", root);
32 | -      PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
33 | -      TESTCHECK(BenchTime(args, type, op, root, 0));
34 | -      TESTCHECK(BenchTime(args, type, op, root, 1));
35 | -      PRINT("\n");
36 | +      for (int i = 0; i < epochs; i++) {
37 | +          PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
38 | +          TESTCHECK(BenchTime(args, type, op, root, 0));
39 | +          TESTCHECK(BenchTime(args, type, op, root, 1));
40 | +          PRINT("\n");
41 | +      }
42 |    }
43 |    return testSuccess;
44 |  }
45 | 


--------------------------------------------------------------------------------
/eval/multi-app/setup1-trace-fair.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | # magic numbers: 49200 & 49202 are the udp_sport values assigned in the channel overrides below
49 | 
50 | [[comm_patterns_override]]
51 | communicator_id = 100
52 | channels = [
53 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
54 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
55 | ]
56 | ib_traffic_class = 0
57 | 
58 | [[comm_patterns_override]]
59 | communicator_id = 101
60 | channels = [
61 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
62 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
63 | ]
64 | ib_traffic_class = 0


--------------------------------------------------------------------------------
/eval/multi-app/setup1-trace-profile.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | # magic numbers: 49200 & 49202 are the udp_sport values assigned in the channel overrides below
49 | 
50 | [[comm_patterns_override]]
51 | communicator_id = 100
52 | channels = [
53 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
54 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
55 | ]
56 | ib_traffic_class = 0
57 | 
58 | [[comm_patterns_override]]
59 | communicator_id = 101
60 | channels = [
61 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
62 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
63 | ]
64 | ib_traffic_class = 0


--------------------------------------------------------------------------------
/eval/multi-app/setup2-trace-fair.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | # magic numbers: 49200 & 49202 are the udp_sport values assigned in the channel overrides below
49 | 
50 | [[comm_patterns_override]]
51 | communicator_id = 200
52 | channels = [
53 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
54 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
55 | ]
56 | ib_traffic_class = 106
57 | 
58 | [[comm_patterns_override]]
59 | communicator_id = 201
60 | channels = [
61 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
62 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
63 | ]
64 | ib_traffic_class = 66
65 | 
66 | 
67 | [[comm_patterns_override]]
68 | communicator_id = 202
69 | channels = [
70 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
71 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
72 | ]
73 | ib_traffic_class = 66


--------------------------------------------------------------------------------
/eval/multi-app/setup2-trace-qosv1.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | # magic numbers: 49200 & 49202 are the udp_sport values assigned in the channel overrides below
49 | 
50 | [[comm_patterns_override]]
51 | communicator_id = 200
52 | channels = [
53 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
54 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
55 |     # { channel_id = 2, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
56 |     # { channel_id = 3, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
57 |     # { channel_id = 4, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
58 |     # { channel_id = 5, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
59 |     # { channel_id = 6, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
60 |     # { channel_id = 7, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
61 | ]
62 | ib_traffic_class = 106
63 | 
64 | [[comm_patterns_override]]
65 | communicator_id = 201
66 | channels = [
67 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
68 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
69 | ]
70 | ib_traffic_class = 66
71 | 
72 | 
73 | [[comm_patterns_override]]
74 | communicator_id = 202
75 | channels = [
76 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
77 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
78 | ]
79 | ib_traffic_class = 66


--------------------------------------------------------------------------------
/eval/multi-app/setup4-trace-ecmp-fair.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | # magic numbers 49200 & 49202: not referenced in this file (the channel overrides below set no udp_sport)
49 | 
50 | [[comm_patterns_override]]
51 | communicator_id = 200
52 | channels = [
53 |     { channel_id = 0, ring = [0, 1, 2, 3],net_dev = "mlx5_0" },
54 |     { channel_id = 1, ring = [0, 1, 2, 3],net_dev = "mlx5_0" },
55 | ]
56 | ib_traffic_class = 0
57 | 
58 | [[comm_patterns_override]]
59 | communicator_id = 201
60 | channels = [
61 |     { channel_id = 0, ring = [0, 1],net_dev = "mlx5_0" },
62 |     { channel_id = 1, ring = [0, 1],net_dev = "mlx5_0" },
63 | ]
64 | ib_traffic_class = 106
65 | 
66 | 
67 | [[comm_patterns_override]]
68 | communicator_id = 202
69 | channels = [
70 |     { channel_id = 0, ring = [0, 1],net_dev = "mlx5_0" },
71 |     { channel_id = 1, ring = [0, 1],net_dev = "mlx5_0" },
72 | ]
73 | ib_traffic_class = 66


--------------------------------------------------------------------------------
/eval/multi-app/setup4-trace-ecmp-qosv1.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | # magic numbers 49200 & 49202: not referenced in this file (the channel overrides below set no udp_sport)
49 | 
50 | [[comm_patterns_override]]
51 | communicator_id = 200
52 | channels = [
53 |     { channel_id = 0, ring = [0, 1, 2, 3],net_dev = "mlx5_0" },
54 |     { channel_id = 1, ring = [0, 1, 2, 3],net_dev = "mlx5_0" },
55 | ]
56 | ib_traffic_class = 0
57 | 
58 | [[comm_patterns_override]]
59 | communicator_id = 201
60 | channels = [
61 |     { channel_id = 0, ring = [0, 1],net_dev = "mlx5_0" },
62 |     { channel_id = 1, ring = [0, 1],net_dev = "mlx5_0" },
63 | ]
64 | ib_traffic_class = 106
65 | 
66 | 
67 | [[comm_patterns_override]]
68 | communicator_id = 202
69 | channels = [
70 |     { channel_id = 0, ring = [0, 1],net_dev = "mlx5_0" },
71 |     { channel_id = 1, ring = [0, 1],net_dev = "mlx5_0" },
72 | ]
73 | ib_traffic_class = 66


--------------------------------------------------------------------------------
/eval/multi-app/setup4-trace-fair.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | # magic number: 49200 & 49202
49 | 
50 | [[comm_patterns_override]]
51 | communicator_id = 200
52 | channels = [
53 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
54 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
55 | ]
56 | ib_traffic_class = 0
57 | 
58 | [[comm_patterns_override]]
59 | communicator_id = 201
60 | channels = [
61 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
62 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
63 | ]
64 | ib_traffic_class = 106
65 | 
66 | 
67 | [[comm_patterns_override]]
68 | communicator_id = 202
69 | channels = [
70 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
71 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
72 | ]
73 | ib_traffic_class = 66


--------------------------------------------------------------------------------
/eval/multi-app/setup4-trace-qosv1.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | # magic number: 49200 & 49202
49 | 
50 | [[comm_patterns_override]]
51 | communicator_id = 200
52 | channels = [
53 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
54 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
55 | ]
56 | ib_traffic_class = 0
57 | 
58 | [[comm_patterns_override]]
59 | communicator_id = 201
60 | channels = [
61 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
62 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
63 | ]
64 | ib_traffic_class = 106
65 | 
66 | 
67 | [[comm_patterns_override]]
68 | communicator_id = 202
69 | channels = [
70 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
71 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
72 | ]
73 | ib_traffic_class = 66


--------------------------------------------------------------------------------
/eval/plot/data/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore


--------------------------------------------------------------------------------
/eval/plot/multi_app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phoenix-dataplane/mCCS/86a25c71a9b050bd7b8a1a1f08fa116470492ce9/eval/plot/multi_app/__init__.py


--------------------------------------------------------------------------------
/eval/plot/plt_show.py:
--------------------------------------------------------------------------------
 1 | import matplotlib
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | def is_notebook() -> bool:
 5 |     try:
 6 |         shell = get_ipython().__class__.__name__
 7 |         # print(shell)
 8 |         if shell == 'ZMQInteractiveShell':
 9 |             return True   # Jupyter notebook or qtconsole
10 |         elif shell == 'TerminalInteractiveShell':
11 |             return False  # Terminal running IPython
12 |         else:
13 |             return False  # Other type (?)
14 |     except NameError:
15 |         return False      # Probably standard Python interpreter
16 | 
17 | def plt_show():
18 |     if is_notebook():
19 |         plt.show()
20 | 


--------------------------------------------------------------------------------
/eval/plot/single_app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phoenix-dataplane/mCCS/86a25c71a9b050bd7b8a1a1f08fa116470492ce9/eval/plot/single_app/__init__.py


--------------------------------------------------------------------------------
/eval/set_ecmp_hashing_algo.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | usage() {
 4 |         echo "Usage: $0: [everything|source-port]"
 5 | }
 6 | 
 7 | if [ $# -ne 1 ]; then
 8 |         usage
 9 |         exit 1
10 | fi
11 | 
12 | algo=$1
13 | 
14 | case $algo in
15 |         everything)
16 |                 algo_args="source-destination-mac source-destination-ip source-destination-port l3-protocol l2-protocol flow-label"
17 |                 ;;
18 |         source-port)
19 |                 algo_args="source-port"
20 |                 ;;
21 |         *)
22 |                 echo "Error: algo should be either 'everything' or 'source-port', got $algo"
23 |                 usage
24 |                 exit 1
25 |                 ;;
26 | esac
27 | 
28 | ssh danyang-01 \
29 | "ssh -oKexAlgorithms=+diffie-hellman-group14-sha1 danyang@danyang-mellanox-switch.cs.duke.edu \
30 |         cli -h '\"enable\" \"config terminal\" \"port-channel load-balance ethernet $algo_args\" \"show interfaces port-channel load-balance\"'"
31 | 
32 |         # cli -h '\"enable\" \"show lldp remote\"'"
33 | 


--------------------------------------------------------------------------------
/eval/single-app/.gitignore:
--------------------------------------------------------------------------------
1 | output/


--------------------------------------------------------------------------------
/eval/single-app/4gpu.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 106
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic number: 49200 & 49202
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 114
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | 
58 | 
59 | [[comm_patterns_override]]
60 | communicator_id = 137
61 | channels = [
62 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
63 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
64 | ]
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/eval/single-app/8gpu.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic number: 49200 & 49202
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 137
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49200], [7, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49202], [7, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | 
58 | 


--------------------------------------------------------------------------------
/launcher/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "launcher"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | ansi_term = "0.12.1"
10 | anyhow = "1.0.57"
11 | bytes = "1.1.0"
12 | chrono = "0.4.19"
13 | env_logger = "0.9.0"
14 | lazy_static = "1.4.0"
15 | log = "0.4.17"
16 | nix = { version = "0.24.1", default-features = false, features = ["signal"] }
17 | serde = { version = "1.0.137", features = ["derive"] }
18 | shellexpand = "2.1.0"
19 | structopt = "0.3.26"
20 | tokio-anyfd = "0.2.0"
21 | toml = "0.5.9"
22 | walkdir = "2.3.2"
23 | 
24 | [[bin]]
25 | name = "launcher"
26 | path = "src/main.rs"
27 | 


--------------------------------------------------------------------------------
/launcher/README.md:
--------------------------------------------------------------------------------
 1 | # Phoenix benchmark suites
 2 | 
 3 | There is a set of defined benchmark configurations under `benchmark/`.
 4 | To use this launcher, you currently need to first edit `config.toml`,
 5 | and then run the launcher from within this directory (the same directory as this
 6 | README). You need to at least update the `workdir` to point to the project's
 7 | path on your file system. To run Phoenix examples, the `workdir` should
 8 | be set to the path of the Phoenix project. To run mRPC examples, the `workdir`
 9 | must be set to the path of `phoenix/experimental/mrpc`.
10 | You also need to have ssh connections to the worker machines
11 | specified in the benchmark configurations.
12 | 
13 | ```
14 | $ cargo rr --bin launcher -- --help
15 |     Finished release [optimized + debuginfo] target(s) in 0.41s
16 |      Running `target/release/launcher --help`
17 | [2023-03-02 11:46:08.073007 INFO benchmark/src/main.rs:632] env_logger initialized
18 | launcher 0.1.0
19 | Launcher of the benchmark suite.
20 | 
21 | USAGE:
22 |     launcher [FLAGS] [OPTIONS] --benchmark <benchmark>
23 | 
24 | FLAGS:
25 |         --debug          Run with debug mode (cargo build without --release)
26 |         --dry-run        Dry-run. Use this option to check the configs
27 |     -h, --help           Prints help information
28 |         --logical-and    kill all threads if any thread ends
29 |     -s, --silent         Do out print to stdout
30 |     -V, --version        Prints version information
31 | 
32 | OPTIONS:
33 |     -b, --benchmark <benchmark>            Run a single benchmark task
34 |     -c, --configfile <configfile>          configfile [default: config.toml]
35 |         --timeout <global-timeout-secs>    Timeout in seconds, 0 means infinity. Can be overwritten by specific case
36 |                                            configs [default: 60]
37 |     -g, --group <group>                    Run a benchmark group
38 |     -o, --output-dir <output-dir>          Output directory of log files
39 | ```
40 | 
41 | To start with a basic connectivity test, first update `benchmark/rpc_hello.toml`. Then run (make sure `workdir` is correct):
42 | ```
43 | $ cargo rr --bin launcher -- --benchmark benchmark/rpc_hello.toml
44 | ```
45 | 
46 | To test the latency for mRPC, you can run (make sure `workdir` is
47 | correct).
48 | ```
49 | $ cargo rr --bin launcher -- -o /tmp/output --benchmark benchmark/rpc_bench_latency/rpc_bench_latency_64b.toml
50 | ```
51 | 
52 | Similarly, to test the bandwidth or RPC rate, you can run
53 | ```
54 | $ cargo rr --bin launcher -- -o /tmp/output --benchmark benchmark/rpc_bench_tput/rpc_bench_tput_1mb.toml
55 | or
56 | $ cargo rr --bin launcher -- -o /tmp/output --benchmark benchmark/rpc_bench_rate/rpc_bench_tput_32b_4c.toml
57 | ```
58 | 
59 | You can also specify a __benchmark group__ and run a group of tests.
60 | For more information, please read the commandline usage and the
61 | benchmark configuration files.
62 | 


--------------------------------------------------------------------------------
/launcher/benchmark/allgather.toml:
--------------------------------------------------------------------------------
 1 | name = "benchmark/allgather"
 2 | group = "allgather"
 3 | 
 4 | [[worker]]
 5 | host = "danyang-03"
 6 | bin = "mccs"
 7 | args = "--host 3"
 8 | dependencies = []
 9 | weak = true
10 | 
11 | [[worker]]
12 | host = "danyang-01"
13 | bin = "mccs"
14 | args = "--host 1"
15 | dependencies = []
16 | weak = true
17 | 
18 | [[worker]]
19 | host = "danyang-03"
20 | bin = "allgather_bench"
21 | args = "--root-addr 192.168.211.66 --rank 0 --num-ranks 2 --cuda-device-idx 0 --size 128 --communicator 114 --round 20"
22 | dependencies = [0, 1]
23 | 
24 | [[worker]]
25 | host = "danyang-01"
26 | bin = "allgather_bench"
27 | args = "--root-addr 192.168.211.66 --rank 1 --num-ranks 2 --cuda-device-idx 0 --size 128 --communicator 114 --round 20"
28 | dependencies = [0, 1]


--------------------------------------------------------------------------------
/launcher/benchmark/alltoall-3w-1mb.toml:
--------------------------------------------------------------------------------
 1 | name = "benchmark/alltoall-3w-1mb"
 2 | description = "Run bandwidth benchmark for all-to-all traffic pattern"
 3 | group = "alltoall"
 4 | 
 5 | [[worker]]
 6 | host = "danyang-06"
 7 | bin = "alltoall"
 8 | args = "st --hosts rdma0.danyang-06,rdma0.danyang-05,rdma0.danyang-04 -m 1024000 -p 4000"
 9 | dependencies = []
10 | 
11 | [[worker]]
12 | host = "danyang-05"
13 | bin = "alltoall"
14 | args = "st --hosts rdma0.danyang-06,rdma0.danyang-05,rdma0.danyang-04 -m 1024000 -p 4000"
15 | dependencies = [0]
16 | 
17 | [[worker]]
18 | host = "danyang-04"
19 | bin = "alltoall"
20 | args = "st --hosts rdma0.danyang-06,rdma0.danyang-05,rdma0.danyang-04 -m 1024000 -p 4000"
21 | dependencies = [0, 1]
22 | 


--------------------------------------------------------------------------------
/launcher/benchmark/write_lat-32b.toml:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/launcher/config.toml:
--------------------------------------------------------------------------------
1 | workdir = "~/nfs/mCCS"
2 | 
3 | [env]
4 | RUST_BACKTRACE = "1"
5 | RUST_LOG_STYLE = "never"
6 | CARGO_TERM_COLOR = "never"


--------------------------------------------------------------------------------
/launcher/src/tee.rs:
--------------------------------------------------------------------------------
 1 | use std::io;
 2 | use std::io::{Read, Write};
 3 | use std::os::unix::io::{AsRawFd, RawFd};
 4 | 
 5 | pub struct DevNull;
 6 | 
 7 | impl Write for DevNull {
 8 |     #[inline]
 9 |     fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
10 |         Ok(buf.len())
11 |     }
12 | 
13 |     #[inline]
14 |     fn flush(&mut self) -> io::Result<()> {
15 |         Ok(())
16 |     }
17 | }
18 | 
19 | /// An adapter for readers whose inputs
20 | /// are written to a "tee"'d writer
21 | pub struct TeeReader<R: Read, W: Write> {
22 |     reader: R,
23 |     writer: W,
24 | }
25 | 
26 | impl<R: Read + AsRawFd, W: Write> AsRawFd for TeeReader<R, W> {
27 |     fn as_raw_fd(&self) -> RawFd {
28 |         self.reader.as_raw_fd()
29 |     }
30 | }
31 | 
32 | impl<R: Read, W: Write> TeeReader<R, W> {
33 |     /// Returns a `TeeReader` that implements `Read`: each read is delegated
34 |     /// to the provided reader, and the bytes read are then written to the provided
35 |     /// writer. The write operation must complete before the read completes.
36 |     ///
37 |     /// Errors reported by the write operation will be interpreted as errors for the read
38 |     pub fn new(reader: R, writer: W) -> TeeReader<R, W> {
39 |         TeeReader { reader, writer }
40 |     }
41 | }
42 | 
43 | impl<R: Read, W: Write> Read for TeeReader<R, W> {
44 |     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
45 |         let n = self.reader.read(buf)?;
46 |         self.writer.write_all(&buf[..n])?;
47 |         Ok(n)
48 |     }
49 | }
50 | 
51 | #[cfg(test)]
52 | mod tests {
53 |     use super::*;
54 |     use std::io::Read;
55 | 
56 |     #[test]
57 |     fn tee() {
58 |         let mut reader = "It's over 9000!".as_bytes();
59 |         let mut teeout = Vec::new();
60 |         let mut stdout = Vec::new();
61 |         {
62 |             let mut tee = TeeReader::new(&mut reader, &mut teeout);
63 |             let _ = tee.read_to_end(&mut stdout);
64 |         }
65 |         assert_eq!(teeout, stdout);
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/mccs.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | [qos_schedule]
49 | epoch_microsecs = 85000
50 | 
51 | [qos_schedule.schedule.99]
52 | intervals = [[0, 35000]]
53 | mode = "Allow"
54 | 
55 | [qos_schedule.schedule.100]
56 | intervals = [[35000, 69000]]
57 | mode = "Allow"
58 | 
59 | # magic number: 49200 & 49202
60 | 
61 | [[comm_patterns_override]]
62 | communicator_id = 114
63 | channels = [
64 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
65 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
66 | ]
67 | 
68 | 
69 | [[comm_patterns_override]]
70 | communicator_id = 137
71 | channels = [
72 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49153], [3, 0, 49155]], net_dev = "mlx5_0" },
73 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49152], [3, 0, 49154]], net_dev = "mlx5_0" },
74 | ]
75 | 
76 | 
77 | [[comm_patterns_override]]
78 | communicator_id = 138
79 | channels = [
80 |     { channel_id = 0, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49200], [7, 0, 49200]], net_dev = "mlx5_0" },
81 |     { channel_id = 1, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49202], [7, 0, 49202]], net_dev = "mlx5_0" },
82 | ]
83 | 
84 | 
85 | [[comm_patterns_override]]
86 | communicator_id = 46
87 | channels = [
88 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
89 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
90 | ]
91 | 
92 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | 
 2 |  Copyright (c) 2016-2017, NVIDIA CORPORATION.  All rights reserved.
 3 | 
 4 |  Redistribution and use in source and binary forms, with or without
 5 |  modification, are permitted provided that the following conditions
 6 |  are met:
 7 |   * Redistributions of source code must retain the above copyright
 8 |     notice, this list of conditions and the following disclaimer.
 9 |   * Redistributions in binary form must reproduce the above copyright
10 |     notice, this list of conditions and the following disclaimer in the
11 |     documentation and/or other materials provided with the distribution.
12 |   * Neither the name of NVIDIA CORPORATION, nor the names of their
13 |     contributors may be used to endorse or promote products derived
14 |     from this software without specific prior written permission.
15 | 
16 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 |  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 |  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 |  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 |  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 |  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 |  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 |  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 |  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 |  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 |  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | BUILDDIR ?= build
 8 | override BUILDDIR := $(abspath $(BUILDDIR))
 9 | 
10 | .PHONY: all clean
11 | 
12 | default: src.build
13 | 
14 | TARGETS=src
15 | 
16 | all:   ${TARGETS:%=%.build}
17 | clean: ${TARGETS:%=%.clean}
18 | 
19 | %.build:
20 | 	${MAKE} -C $* build BUILDDIR=${BUILDDIR}
21 | 
22 | %.clean:
23 | 	${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
24 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/microbenchmark/4gpu.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 106
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic number: 49200 & 49202
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 114
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | 
58 | 
59 | [[comm_patterns_override]]
60 | communicator_id = 137
61 | channels = [
62 |     { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" },
63 |     { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" },
64 | ]
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/microbenchmark/8gpu.toml:
--------------------------------------------------------------------------------
 1 | mccs_daemon_basename = "mccs-deamon"
 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}"
 3 | addrs = [
 4 |     "0.0.0.0",
 5 |     "192.168.211.2",
 6 |     "192.168.211.34",
 7 |     "192.168.211.66",
 8 |     "192.168.211.130",
 9 |     "192.168.211.162",
10 |     "192.168.211.195",
11 | ]
12 | listen_port = 5000
13 | 
14 | [control]
15 | prefix = "/tmp/mccs-${USER}"
16 | path = "control.sock"
17 | 
18 | [comm_default_config]
19 | buffer_sizes = [4194304]
20 | channel_count = 2
21 | 
22 | [comm_global_config]
23 | [comm_global_config.net_rdma]
24 | gid_index = 3
25 | qps_per_conn = 1
26 | timeout = 18
27 | retry_count = 7
28 | pkey = 0
29 | use_inline = false
30 | service_level = 0
31 | traffic_class = 0
32 | adaptive_routing = false
33 | ar_threshold = 8192
34 | pci_relaxed_ordering = false
35 | gdr_flush_disable = true
36 | socket_if_prefix = "rdma"
37 | 
38 | [comm_global_config.net]
39 | gdr_enable = false
40 | gdr_copy_sync_enable = false
41 | gdr_copy_flush_enable = false
42 | 
43 | [comm_global_config.shm]
44 | locality = "Sender"
45 | memcpy_send = false
46 | memcpy_recv = false
47 | 
48 | 
49 | # magic number: 49200 & 49202
50 | 
51 | [[comm_patterns_override]]
52 | communicator_id = 137
53 | channels = [
54 |     { channel_id = 0, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49200], [7, 0, 49200]], net_dev = "mlx5_0" },
55 |     { channel_id = 1, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49202], [7, 0, 49202]], net_dev = "mlx5_0" },
56 | ]
57 | 
58 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/microbenchmark/collect_nccl.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import re
 5 | import sys
 6 | import csv
 7 | import glob
 8 | import os.path
 9 | import argparse
10 | 
11 | OUTPUT_DIR = "/tmp/nccl_single_app"
12 | 
13 | parser = argparse.ArgumentParser(description='')
14 | parser.add_argument('--app', '--app', type=str,
15 |                     help = 'The app of the trial, either allgather or allreduce')
16 | parser.add_argument('--num-gpus', '--num-gpus', type=str,
17 |                     help = 'The number of gpus to match, either 1 or 2')
18 | 
19 | args = parser.parse_args()
20 | assert args.app
21 | assert args.num_gpus
22 | 
23 | nccl_results = glob.glob(os.path.join(OUTPUT_DIR, "*{}_*_{}.stdout".format(args.app, args.num_gpus)))
24 | 
25 | writer = csv.writer(sys.stdout)
26 | writer.writerow(['Solution', 'App', 'Size (Bytes)', 'Dtype', 'Latency (us)', 'AlgBW (GB/s)', 'BusBW (GB/s)'])
27 | 
28 | pat = re.compile(r'\s*\d+\s+\d+.*')
29 | for path in nccl_results:
30 |     with open(path, 'r') as fin:
31 |         solution = path.split('/')[-1].split('_')[-2]
32 |         for line in fin:
33 |             match = pat.match(line)
34 |             if match is not None:
35 |                 # print(line.split())
36 |                 tokens = line.split()
37 |                 # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0']
38 |                 size = tokens[0]
39 |                 dtype = tokens[2]
40 |                 latency_us = tokens[9]
41 |                 algbw = tokens[10]
42 |                 busbw = tokens[11]
43 |                 writer.writerow([solution, args.app, size, dtype, latency_us, algbw, busbw])
44 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/microbenchmark/nccl_result.csv:
--------------------------------------------------------------------------------
 1 | Solution,App,Size (Bytes),Dtype,Latency (us),AlgBW (GB/s),BusBW (GB/s)
 2 | NCCL Bad Ring,allgather,32768,float,46.59,0.70,0.53
 3 | NCCL Bad Ring,allgather,131072,float,72.53,1.81,1.36
 4 | NCCL Bad Ring,allgather,524288,float,138.6,3.78,2.84
 5 | NCCL Bad Ring,allgather,2097152,float,455.7,4.60,3.45
 6 | NCCL Bad Ring,allgather,8388608,float,1782.5,4.71,3.53
 7 | NCCL Bad Ring,allgather,33554432,float,7152.5,4.69,3.52
 8 | NCCL Bad Ring,allgather,134217728,float,28699,4.68,3.51
 9 | NCCL Bad Ring,allgather,536870912,float,113508,4.73,3.55
10 | NCCL Bad Ring,allgather,32768,float,45.54,0.72,0.54
11 | NCCL Bad Ring,allgather,131072,float,66.93,1.96,1.47
12 | NCCL Bad Ring,allgather,524288,float,120.6,4.35,3.26
13 | NCCL Bad Ring,allgather,2097152,float,347.0,6.04,4.53
14 | NCCL Bad Ring,allgather,8388608,float,1256.1,6.68,5.01
15 | NCCL Bad Ring,allgather,33554432,float,4878.2,6.88,5.16
16 | NCCL Bad Ring,allgather,134217728,float,18866,7.11,5.34
17 | NCCL Bad Ring,allgather,536870912,float,73930,7.26,5.45
18 | NCCL Bad Ring,allgather,32768,float,45.99,0.71,0.53
19 | NCCL Bad Ring,allgather,131072,float,71.36,1.84,1.38
20 | NCCL Bad Ring,allgather,524288,float,135.5,3.87,2.90
21 | NCCL Bad Ring,allgather,2097152,float,450.6,4.65,3.49
22 | NCCL Bad Ring,allgather,8388608,float,1783.7,4.70,3.53
23 | NCCL Bad Ring,allgather,33554432,float,7360.0,4.56,3.42
24 | NCCL Bad Ring,allgather,134217728,float,28552,4.70,3.53
25 | NCCL Bad Ring,allgather,536870912,float,112582,4.77,3.58
26 | NCCL Bad Ring,allgather,32768,float,45.05,0.73,0.55
27 | NCCL Bad Ring,allgather,131072,float,67.81,1.93,1.45
28 | NCCL Bad Ring,allgather,524288,float,121.7,4.31,3.23
29 | NCCL Bad Ring,allgather,2097152,float,346.4,6.05,4.54
30 | NCCL Bad Ring,allgather,8388608,float,1250.8,6.71,5.03
31 | NCCL Bad Ring,allgather,33554432,float,4820.9,6.96,5.22
32 | NCCL Bad Ring,allgather,134217728,float,18698,7.18,5.38
33 | NCCL Bad Ring,allgather,536870912,float,73600,7.29,5.47
34 | NCCL Bad Ring,allgather,32768,float,45.12,0.73,0.54
35 | NCCL Bad Ring,allgather,131072,float,68.43,1.92,1.44
36 | NCCL Bad Ring,allgather,524288,float,131.1,4.00,3.00
37 | NCCL Bad Ring,allgather,2097152,float,424.5,4.94,3.71
38 | NCCL Bad Ring,allgather,8388608,float,1803.3,4.65,3.49
39 | NCCL Bad Ring,allgather,33554432,float,7326.4,4.58,3.43
40 | NCCL Bad Ring,allgather,134217728,float,28911,4.64,3.48
41 | NCCL Bad Ring,allgather,536870912,float,114558,4.69,3.51
42 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/microbenchmark/one_click_run_nccl_all.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | usage() {
 4 | 	echo "Usage: $0 <num_iters>"
 5 | }
 6 | 
 7 | if [ $# -ne 1 ]; then
 8 | 	usage
 9 | 	exit 1
10 | fi
11 | 
12 | num_iters=$1
13 | if [ $num_iters -gt 20 ]; then
14 | 	echo "$num_iters too large"
15 | 	exit 1
16 | fi
17 | 
18 | suffix=`date +%Y%m%d.%H.%M.%S`
19 | # output_dir=/tmp/${app}_${ring_type}_${num_gpus}gpus.${suffix}
20 | output_dir=/tmp/nccl_single_app.${suffix}
21 | mkdir -p $output_dir
22 | unlink /tmp/nccl_single_app
23 | ln -sf $output_dir /tmp/nccl_single_app
24 | 
25 | ./run_nccl_multiple_times.sh $num_iters 1 badring allgather
26 | ./run_nccl_multiple_times.sh $num_iters 1 goodring allgather
27 | ./run_nccl_multiple_times.sh $num_iters 2 badring allgather
28 | ./run_nccl_multiple_times.sh $num_iters 2 goodring allgather
29 | 
30 | ./run_nccl_multiple_times.sh $num_iters 1 badring allreduce
31 | ./run_nccl_multiple_times.sh $num_iters 1 goodring allreduce
32 | ./run_nccl_multiple_times.sh $num_iters 2 badring allreduce
33 | ./run_nccl_multiple_times.sh $num_iters 2 goodring allreduce
34 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/microbenchmark/run_nccl_multiple_times.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | usage() {
 4 | 	echo "Usage: $0 <num_iters> <num_gpus> <ring_type> <app>     num_gpus=1|2, ring_type=goodring|badring, app=allgather|allreduce"
 5 | }
 6 | 
 7 | if [ $# -ne 4 ]; then
 8 | 	usage
 9 | 	exit 1
10 | fi
11 | 
12 | num_iters=$1
13 | if [ $num_iters -gt 20 ]; then
14 | 	echo "$num_iters too large"
15 | 	exit 1
16 | fi
17 | 
18 | shift
19 | num_gpus=$1
20 | ring_type=$2
21 | app=$3
22 | 
23 | output_dir=/tmp/nccl_single_app
24 | 
25 | for i in `seq 1 $num_iters`; do
26 | 	echo Case $i
27 | 	./run_nccl_once.sh $num_gpus $ring_type $app |& tee $output_dir/${i}_${app}_${ring_type}_${num_gpus}.stdout
28 | done
29 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/microbenchmark/run_nccl_once.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | usage() {
 6 | 	echo "Usage: $0 <num_gpus> <ring_type> <app>     num_gpus=1|2, ring_type=goodring|badring, app=allgather|allreduce"
 7 | }
 8 | 
 9 | if [ $# -ne 3 ]; then
10 | 	usage
11 | 	exit 1
12 | fi
13 | 
14 | num_gpus=$1
15 | ring_type=$2
16 | app_type=$3
17 | 
18 | case $num_gpus in
19 | 	1)
20 | 		tclass=106
21 | 		;;
22 | 	2)
23 | 		tclass=0
24 | 		;;
25 | 	*)
26 | 		echo "Error: num_gpus should be either '1' or '2', got $num_gpus"
27 | 		usage
28 | 		exit 1
29 | 		;;
30 | esac
31 | 
32 | echo "Traffic class=$tclass"
33 | 
34 | case $ring_type in
35 | 	goodring)
36 | 		cat > hostfile.$ring_type <<EOF
37 | danyang-02 slots=$num_gpus
38 | danyang-03 slots=$num_gpus
39 | danyang-01 slots=$num_gpus
40 | danyang-05 slots=$num_gpus
41 | EOF
42 | 		;;
43 | 	badring)
44 | 		cat > hostfile.$ring_type <<EOF
45 | danyang-02 slots=$num_gpus
46 | danyang-01 slots=$num_gpus
47 | danyang-03 slots=$num_gpus
48 | danyang-05 slots=$num_gpus
49 | EOF
50 | 		;;
51 | 	*)
52 | 		echo "Error: ring_type should be either 'goodring' or 'badring', got $ring_type"
53 | 		usage
54 | 		exit 1
55 | 		;;
56 | esac
57 | 
58 | case $app_type in
59 | 	allgather)
60 | 		app=all_gather_perf
61 | 		dtype=" "
62 | 		;;
63 | 	allreduce)
64 | 		app=all_reduce_perf
65 | 		dtype="--datatype=half"
66 | 		;;
67 | 	*)
68 | 		echo "Error: app must be either 'allgather' or 'allreduce', got $app_type"
69 | 		usage
70 | 		exit 1
71 | 		;;
72 | esac
73 | 
74 | mpirun --hostfile hostfile.$ring_type -mca pml ob1 -mca btl tcp,self -mca btl_tcp_if_include eno1 \
75 | 	-x CUDA_VISIBLE_DEVICES=0,1 \
76 | 	-x NCCL_DEBUG=INFO -x NCCL_ALGO=Ring -x NCCL_PROTO=Simple \
77 | 	-x NCCL_IB_GID_INDEX=3 -x NCCL_SOCKET_IFNAME=rdma0 \
78 | 	-x NCCL_MAX_NCHANNELS=2 -x NCCL_MIN_NCHANNELS=2 -x NCCL_IB_QPS_PER_CONNECTION=1 \
79 | 	-x NCCL_IB_TC=$tclass \
80 | 		$WORKDIR/../build/$app $dtype -b 32K -e 512M -f 4
81 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/microbenchmark/set_ecmp_hashing_algo.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | usage() {
 4 | 	echo "Usage: $0: [everything|source-port]"
 5 | }
 6 | 
 7 | if [ $# -ne 1 ]; then
 8 | 	usage
 9 | 	exit 1
10 | fi
11 | 
12 | algo=$1
13 | 
14 | case $algo in
15 | 	everything)
16 | 		algo_args="source-destination-mac source-destination-ip source-destination-port l3-protocol l2-protocol flow-label"
17 | 		;;
18 | 	source-port)
19 | 		algo_args="source-port"
20 | 		;;
21 | 	*)
22 | 		echo "Error: algo should be either 'everything' or 'source-port', got $algo"
23 | 		usage
24 | 		exit 1
25 | 		;;
26 | esac
27 | 
28 | sudo -u cjr \
29 | ssh danyang-01 \
30 | "ssh -oKexAlgorithms=+diffie-hellman-group14-sha1 danyang@danyang-mellanox-switch.cs.duke.edu \
31 | 	cli -h '\"enable\" \"config terminal\" \"port-channel load-balance ethernet $algo_args\" \"show interfaces port-channel load-balance\"'"
32 | 
33 | 	# cli -h '\"enable\" \"show lldp remote\"'"
34 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/nccl-test.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/src/Makefile b/src/Makefile
 2 | index 393de8e..ba21a59 100644
 3 | --- a/src/Makefile
 4 | +++ b/src/Makefile
 5 | @@ -59,7 +59,7 @@ endif
 6 |  BUILDDIR ?= ../build
 7 |  ifneq ($(NCCL_HOME), "")
 8 |  NVCUFLAGS += -I$(NCCL_HOME)/include/
 9 | -NVLDFLAGS += -L$(NCCL_HOME)/lib
10 | +NVLDFLAGS += -L$(NCCL_HOME)/lib -Xlinker=-rpath,$(NCCL_HOME)/lib
11 |  endif
12 |  
13 |  ifeq ($(MPI), 1)
14 | diff --git a/src/common.cu b/src/common.cu
15 | index 8588047..c91461b 100644
16 | --- a/src/common.cu
17 | +++ b/src/common.cu
18 | @@ -595,14 +595,21 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
19 |    TESTCHECK(completeColl(args));
20 |  
21 |    // Benchmark
22 | +  int epochs = 1;
23 | +  char* epochs_str = getenv("NCCL_EPOCHS");
24 | +  if (epochs_str) {
25 | +      epochs = std::stoi(epochs_str);
26 | +  }
27 |    for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) {
28 |        setupArgs(size, type, args);
29 |        char rootName[100];
30 |        sprintf(rootName, "%6i", root);
31 | -      PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
32 | -      TESTCHECK(BenchTime(args, type, op, root, 0));
33 | -      TESTCHECK(BenchTime(args, type, op, root, 1));
34 | -      PRINT("\n");
35 | +      for (int i = 0; i < epochs; i++) {
36 | +          PRINT("%12li  %12li  %8s  %6s  %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
37 | +          TESTCHECK(BenchTime(args, type, op, root, 0));
38 | +          TESTCHECK(BenchTime(args, type, op, root, 1));
39 | +          PRINT("\n");
40 | +      }
41 |    }
42 |    return testSuccess;
43 |  }
44 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting1/collect_nccl.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import re
 5 | import sys
 6 | import csv
 7 | import glob
 8 | import os.path
 9 | import argparse
10 | 
11 | OUTPUT_DIR = "/tmp/nccl_setting1"
12 | 
13 | parser = argparse.ArgumentParser(description='Launch a dsagent and deepscheduler')
14 | parser.add_argument('--solution', '--solution', required=True, type=str,
15 |                     help = 'Give a name to the trial, either NCCL Bad Ring|NCCL Good Ring.')
16 | parser.add_argument('--strip-head', '--strip-head', required=False, type=int, default=1,
17 |                     help = 'Omit the first few lines of the output.')
18 | parser.add_argument('--strip-tail', '--strip-tail', required=False, type=int, default=0,
19 |                     help = 'Omit the last few lines of the output.')
20 | 
21 | args = parser.parse_args()
22 | assert args.solution
23 | 
24 | writer = csv.writer(sys.stdout)
25 | writer.writerow(['Solution', 'App', 'Size (Bytes)', 'Dtype', 'Latency (us)', 'AlgBW (GB/s)', 'BusBW (GB/s)', 'Trial ID'])
26 | 
27 | pat = re.compile(r'\s*\d+\s+\d+.*')
28 | 
29 | def get_latency(rec) -> int:
30 |     return int(rec[4])
31 | 
32 | def get_job_duration(records) -> int:
33 |     return sum(map(get_latency, records))
34 | 
35 | def work(app_color: str, trial_id, nccl_output_path) -> None:
36 |     results = []
37 |     with open(nccl_output_path, 'r') as fin:
38 |         for line in fin:
39 |             match = pat.match(line)
40 |             if match is not None:
41 |                 tokens = line.split()
42 |                 # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0']
43 |                 size = tokens[0]
44 |                 dtype = tokens[2]
45 |                 latency_us = tokens[9]
46 |                 algbw = tokens[10]
47 |                 busbw = tokens[11]
48 |                 results.append([args.solution, app_color, size, dtype, latency_us, algbw, busbw, trial_id])
49 |     return results
50 | 
51 | jobs = []
52 | 
53 | for path in glob.glob(os.path.join(OUTPUT_DIR, '*')):
54 |     tokens = path.split('.')
55 |     trial_id = tokens[-3].split('/')[-1]
56 |     color = tokens[-2]
57 |     jobs += [work(color, trial_id, path)]
58 | 
59 | # min_dura = min(map(get_job_duration, jobs))
60 | # print(f'min job duration: {min_dura}')
61 | 
62 | for results in jobs:
63 |     assert len(results) > args.strip_head + args.strip_tail, "len(results) = {}".format(len(results))
64 |     if args.strip_tail == 0:
65 |         stripped_results = results[args.strip_head:]
66 |     else:
67 |         stripped_results = results[args.strip_head:-args.strip_tail]
68 |     for rec in stripped_results:
69 |         writer.writerow(rec)
70 |     # writer.writerow(rec)
71 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting1/run_all_jobs_once.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | usage() {
 6 | 	echo "Usage: $0 <trial_id>"
 7 | }
 8 | 
 9 | if [ $# -ne 1 ]; then
10 | 	usage
11 | 	exit 1
12 | fi
13 | 
14 | trial_id=$1
15 | 
16 | OUTPUT_DIR=/tmp/nccl_setting1
17 | 
18 | $WORKDIR/run_nccl_job_small.sh blue |& tee $OUTPUT_DIR/$trial_id.blue.stdout &
19 | $WORKDIR/run_nccl_job_small.sh red |& tee $OUTPUT_DIR/$trial_id.red.stdout &
20 | 
21 | wait
22 | # tail -f /tmp/nccl_setting1_blue.stdout /tmp/nccl_setting1_red.stdout
23 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting1/run_nccl_all_jobs_multiple_times.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | usage() {
 4 | 	echo "Usage: $0 <num_iters>"
 5 | }
 6 | 
 7 | if [ $# -ne 1 ]; then
 8 | 	usage
 9 | 	exit 1
10 | fi
11 | 
12 | num_iters=$1
13 | if [ $num_iters -gt 20 ]; then
14 | 	echo "$num_iters too large"
15 | 	exit 1
16 | fi
17 | 
18 | suffix=`date +%Y%m%d.%H.%M.%S`
19 | output_dir=/tmp/nccl_setting1.${suffix}
20 | mkdir -p $output_dir
21 | unlink /tmp/nccl_setting1
22 | ln -sf $output_dir /tmp/nccl_setting1
23 | 
24 | 
25 | for i in `seq 1 $num_iters`; do
26 | 	echo Case $i
27 | 	./run_all_jobs_once.sh $i
28 | done
29 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting1/run_nccl_job_small.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | # blue or red: each job runs on two hosts with slots=2 per host
 6 | 
 7 | usage() {
 8 | 	echo "Usage: $0 <job_color>     job_color=blue|red"
 9 | }
10 | 
11 | if [ $# -ne 1 ]; then
12 | 	usage
13 | 	exit 1
14 | fi
15 | 
16 | color=$1
17 | 
18 | case $color in
19 | 	blue)
20 | 		cat > hostfile.$color <<EOF
21 | danyang-02 slots=2
22 | danyang-01 slots=2
23 | EOF
24 | 		;;
25 | 	red)
26 | 		cat > hostfile.$color <<EOF
27 | danyang-03 slots=2
28 | danyang-05 slots=2
29 | EOF
30 | 		;;
31 | 	*)
32 | 		echo "Error: job_color should be either 'blue' or 'red', got $color"
33 | 		usage
34 | 		exit 1
35 | 		;;
36 | esac
37 | 
38 | mpirun --hostfile hostfile.$color -mca pml ob1 -mca btl tcp,self -mca btl_tcp_if_include eno1 \
39 | 	-x CUDA_VISIBLE_DEVICES=0,1 \
40 | 	-x NCCL_DEBUG=INFO -x NCCL_ALGO=Ring -x NCCL_PROTO=Simple \
41 | 	-x NCCL_IB_GID_INDEX=3 -x NCCL_SOCKET_IFNAME=rdma0 \
42 | 	-x NCCL_MAX_NCHANNELS=2 -x NCCL_MIN_NCHANNELS=2 -x NCCL_IB_QPS_PER_CONNECTION=1 \
43 | 	-x NCCL_IB_TC=0 \
44 | 	-x NCCL_EPOCHS=10 \
45 | 		$WORKDIR/../build/all_reduce_perf --datatype=half -b 128M -e 128M
46 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting2/collect_nccl.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import re
 5 | import sys
 6 | import csv
 7 | import glob
 8 | import os.path
 9 | import argparse
10 | 
11 | OUTPUT_DIR = "/tmp/nccl_setting2"
12 | 
13 | parser = argparse.ArgumentParser(description='Launch a dsagent and deepscheduler')
14 | parser.add_argument('--solution', '--solution', required=True, type=str,
15 |                     help = 'Give a name to the trial, either NCCL Bad Ring|NCCL Good Ring.')
16 | parser.add_argument('--strip-head', '--strip-head', required=False, type=int, default=1,
17 |                     help = 'Omit the first few lines of the output.')
18 | parser.add_argument('--strip-tail', '--strip-tail', required=False, type=int, default=0,
19 |                     help = 'Omit the last few lines of the output.')
20 | 
21 | args = parser.parse_args()
22 | assert args.solution
23 | 
24 | writer = csv.writer(sys.stdout)
25 | writer.writerow(['Solution', 'App', 'Size (Bytes)', 'Dtype', 'Latency (us)', 'AlgBW (GB/s)', 'BusBW (GB/s)', 'Trial ID'])
26 | 
27 | pat = re.compile(r'\s*\d+\s+\d+.*')
28 | 
29 | def get_latency(rec) -> int:
30 |     return int(rec[4])
31 | 
32 | def get_job_duration(records) -> int:
33 |     return sum(map(get_latency, records))
34 | 
35 | def work(app_color: str, trial_id, nccl_output_path) -> None:
36 |     results = []
37 |     with open(nccl_output_path, 'r') as fin:
38 |         for line in fin:
39 |             match = pat.match(line)
40 |             if match is not None:
41 |                 tokens = line.split()
42 |                 # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0']
43 |                 size = tokens[0]
44 |                 dtype = tokens[2]
45 |                 latency_us = tokens[9]
46 |                 algbw = tokens[10]
47 |                 busbw = tokens[11]
48 |                 results.append([args.solution, app_color, size, dtype, latency_us, algbw, busbw, trial_id])
49 |     return results
50 | 
51 | jobs = []
52 | 
53 | for path in glob.glob(os.path.join(OUTPUT_DIR, '*')):
54 |     tokens = path.split('.')
55 |     trial_id = tokens[-3].split('/')[-1]
56 |     color = tokens[-2]
57 |     jobs += [work(color, trial_id, path)]
58 | 
59 | # min_dura = min(map(get_job_duration, jobs))
60 | # print(f'min job duration: {min_dura}')
61 | 
62 | for results in jobs:
63 |     assert len(results) > args.strip_head + args.strip_tail, "len(results) = {}".format(len(results))
64 |     if args.strip_tail == 0:
65 |         stripped_results = results[args.strip_head:]
66 |     else:
67 |         stripped_results = results[args.strip_head:-args.strip_tail]
68 |     for rec in stripped_results:
69 |         writer.writerow(rec)
70 |     # writer.writerow(rec)
71 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting2/run_all_jobs.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | usage() {
 6 | 	echo "Usage: $0 <ring_type>     ring_type=goodring|badring"
 7 | }
 8 | 
 9 | if [ $# -ne 1 ]; then
10 | 	usage
11 | 	exit 1
12 | fi
13 | 
14 | ring_type=$1
15 | # Send stdout to the log first, then point stderr at it, so each file captures both streams.
16 | $WORKDIR/run_nccl_job_blue.sh $ring_type > /tmp/nccl_setting2_blue.stdout 2>&1 &
17 | $WORKDIR/run_nccl_job_small.sh green > /tmp/nccl_setting2_green.stdout 2>&1 &
18 | $WORKDIR/run_nccl_job_small.sh red > /tmp/nccl_setting2_red.stdout 2>&1 &
19 | 
20 | tail -f /tmp/nccl_setting2_blue.stdout /tmp/nccl_setting2_green.stdout /tmp/nccl_setting2_red.stdout
21 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting2/run_all_jobs_once.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | usage() {
 6 | 	echo "Usage: $0 <ring_type> <trial_id>    ring_type=goodring|badring"
 7 | }
 8 | 
 9 | if [ $# -ne 2 ]; then
10 | 	usage
11 | 	exit 1
12 | fi
13 | 
14 | ring_type=$1
15 | trial_id=$2
16 | 
17 | OUTPUT_DIR=/tmp/nccl_setting2
18 | 
19 | $WORKDIR/run_nccl_job_blue.sh $ring_type |& tee $OUTPUT_DIR/$trial_id.blue.stdout &
20 | $WORKDIR/run_nccl_job_small.sh green  |& tee $OUTPUT_DIR/$trial_id.green.stdout &
21 | $WORKDIR/run_nccl_job_small.sh red |& tee $OUTPUT_DIR/$trial_id.red.stdout &
22 | 
23 | wait
24 | # tail -f /tmp/nccl_setting2_blue.stdout /tmp/nccl_setting2_green.stdout /tmp/nccl_setting2_red.stdout
25 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting2/run_nccl_all_jobs_multiple_times.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | usage() {
 4 | 	echo "Usage: $0 <num_iters> <ring_type>     ring_type=goodring|badring"
 5 | }
 6 | 
 7 | if [ $# -ne 2 ]; then
 8 | 	usage
 9 | 	exit 1
10 | fi
11 | 
12 | num_iters=$1
13 | if [ $num_iters -gt 20 ]; then
14 | 	echo "$num_iters too large"
15 | 	exit 1
16 | fi
17 | 
18 | shift
19 | ring_type=$1
20 | 
21 | suffix=`date +%Y%m%d.%H.%M.%S`
22 | output_dir=/tmp/nccl_setting2_${ring_type}.${suffix}
23 | mkdir -p $output_dir
24 | unlink /tmp/nccl_setting2
25 | ln -sf $output_dir /tmp/nccl_setting2
26 | 
27 | 
28 | for i in `seq 1 $num_iters`; do
29 | 	echo Case $i
30 | 	./run_all_jobs_once.sh $ring_type $i
31 | done
32 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting2/run_nccl_job_blue.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | 
 6 | usage() {
 7 | 	echo "Usage: $0 <ring_type>     ring_type=goodring|badring"
 8 | }
 9 | 
10 | if [ $# -ne 1 ]; then
11 | 	usage
12 | 	exit 1
13 | fi
14 | 
15 | ring_type=$1
16 | 
17 | case $ring_type in
18 | 	goodring)
19 | 		cat > hostfile.blue.$ring_type <<EOF
20 | danyang-02 slots=1
21 | danyang-03 slots=1
22 | danyang-01 slots=1
23 | danyang-05 slots=1
24 | EOF
25 | 		;;
26 | 	badring)
27 | 		cat > hostfile.blue.$ring_type <<EOF
28 | danyang-02 slots=1
29 | danyang-01 slots=1
30 | danyang-03 slots=1
31 | danyang-05 slots=1
32 | EOF
33 | 		;;
34 | 	*)
35 | 		echo "Error: ring_type should be either 'goodring' or 'badring', got $ring_type"
36 | 		usage
37 | 		exit 1
38 | 		;;
39 | esac
40 | 
41 | mpirun --hostfile hostfile.blue.$ring_type -mca pml ob1 -mca btl tcp,self -mca btl_tcp_if_include eno1 \
42 | 	-x CUDA_VISIBLE_DEVICES=0 \
43 | 	-x NCCL_DEBUG=INFO -x NCCL_ALGO=Ring -x NCCL_PROTO=Simple \
44 | 	-x NCCL_IB_GID_INDEX=3 -x NCCL_SOCKET_IFNAME=rdma0 \
45 | 	-x NCCL_MAX_NCHANNELS=2 -x NCCL_MIN_NCHANNELS=2 -x NCCL_IB_QPS_PER_CONNECTION=1 \
46 | 	-x NCCL_IB_TC=106 \
47 | 	-x NCCL_EPOCHS=20 \
48 | 		$WORKDIR/../build/all_reduce_perf --datatype=half -b 128M -e 128M
49 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting2/run_nccl_job_small.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | # green or red: both take 2 gpus
 6 | 
 7 | usage() {
 8 | 	echo "Usage: $0 <job_color>     job_color=green|red"
 9 | }
10 | 
11 | if [ $# -ne 1 ]; then
12 | 	usage
13 | 	exit 1
14 | fi
15 | 
16 | color=$1
17 | 
18 | case $color in
19 | 	green)
20 | 		cat > hostfile.$color <<EOF
21 | danyang-02 slots=1
22 | danyang-01 slots=1
23 | EOF
24 | 		;;
25 | 	red)
26 | 		cat > hostfile.$color <<EOF
27 | danyang-03 slots=1
28 | danyang-05 slots=1
29 | EOF
30 | 		;;
31 | 	*)
32 | 		echo "Error: job_color should be either 'green' or 'red', got $color"
33 | 		usage
34 | 		exit 1
35 | 		;;
36 | esac
37 | 
38 | mpirun --hostfile hostfile.$color -mca pml ob1 -mca btl tcp,self -mca btl_tcp_if_include eno1 \
39 | 	-x CUDA_VISIBLE_DEVICES=1 \
40 | 	-x NCCL_DEBUG=INFO -x NCCL_ALGO=Ring -x NCCL_PROTO=Simple \
41 | 	-x NCCL_IB_GID_INDEX=3 -x NCCL_SOCKET_IFNAME=rdma0 \
42 | 	-x NCCL_MAX_NCHANNELS=2 -x NCCL_MIN_NCHANNELS=2 -x NCCL_IB_QPS_PER_CONNECTION=1 \
43 | 	-x NCCL_IB_TC=66 \
44 | 	-x NCCL_EPOCHS=20 \
45 | 		$WORKDIR/../build/all_reduce_perf --datatype=half -b 128M -e 128M
46 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting3/collect_nccl.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import re
 5 | import sys
 6 | import csv
 7 | import glob
 8 | import os.path
 9 | import argparse
10 | 
11 | OUTPUT_DIR = "/tmp/nccl_setting3"
12 | 
13 | parser = argparse.ArgumentParser(description='Launch a dsagent and deepscheduler')
14 | parser.add_argument('--solution', '--solution', required=True, type=str,
15 |                     help = 'Give a name to the trial, either NCCL Bad Ring|NCCL Good Ring.')
16 | parser.add_argument('--strip-head', '--strip-head', required=False, type=int, default=1,
17 |                     help = 'Omit the first few lines of the output.')
18 | parser.add_argument('--strip-tail', '--strip-tail', required=False, type=int, default=0,
19 |                     help = 'Omit the last few lines of the output.')
20 | 
21 | args = parser.parse_args()
22 | assert args.solution
23 | 
24 | writer = csv.writer(sys.stdout)
25 | writer.writerow(['Solution', 'App', 'Size (Bytes)', 'Dtype', 'Latency (us)', 'AlgBW (GB/s)', 'BusBW (GB/s)', 'Trial ID'])
26 | 
27 | pat = re.compile(r'\s*\d+\s+\d+.*')
28 | 
29 | def get_latency(rec) -> int:
30 |     return int(rec[4])
31 | 
32 | def get_job_duration(records) -> int:
33 |     return sum(map(get_latency, records))
34 | 
35 | def work(app_color: str, trial_id, nccl_output_path) -> None:
36 |     results = []
37 |     with open(nccl_output_path, 'r') as fin:
38 |         for line in fin:
39 |             match = pat.match(line)
40 |             if match is not None:
41 |                 tokens = line.split()
42 |                 # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0']
43 |                 size = tokens[0]
44 |                 dtype = tokens[2]
45 |                 latency_us = tokens[9]
46 |                 algbw = tokens[10]
47 |                 busbw = tokens[11]
48 |                 results.append([args.solution, app_color, size, dtype, latency_us, algbw, busbw, trial_id])
49 |     return results
50 | 
51 | jobs = []
52 | 
53 | for path in glob.glob(os.path.join(OUTPUT_DIR, '*')):
54 |     tokens = path.split('.')
55 |     trial_id = tokens[-3].split('/')[-1]
56 |     color = tokens[-2]
57 |     jobs += [work(color, trial_id, path)]
58 | 
59 | # min_dura = min(map(get_job_duration, jobs))
60 | # print(f'min job duration: {min_dura}')
61 | 
62 | for results in jobs:
63 |     assert len(results) > args.strip_head + args.strip_tail, "len(results) = {}".format(len(results))
64 |     if args.strip_tail == 0:
65 |         stripped_results = results[args.strip_head:]
66 |     else:
67 |         stripped_results = results[args.strip_head:-args.strip_tail]
68 |     for rec in stripped_results:
69 |         writer.writerow(rec)
70 |     # writer.writerow(rec)
71 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting3/run_all_jobs_once.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | usage() {
 6 | 	echo "Usage: $0 <ring_type> <trial_id>    ring_type=goodring|badring"
 7 | }
 8 | 
 9 | if [ $# -ne 2 ]; then
10 | 	usage
11 | 	exit 1
12 | fi
13 | 
14 | ring_type=$1
15 | trial_id=$2
16 | 
17 | OUTPUT_DIR=/tmp/nccl_setting3
18 | 
19 | $WORKDIR/run_nccl_job_small.sh $ring_type blue |& tee $OUTPUT_DIR/$trial_id.blue.stdout &
20 | $WORKDIR/run_nccl_job_small.sh $ring_type red |& tee $OUTPUT_DIR/$trial_id.red.stdout &
21 | 
22 | wait
23 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting3/run_nccl_all_jobs_multiple_times.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | usage() {
 4 | 	echo "Usage: $0 <num_iters> <ring_type>    ring_type=goodring|badring"
 5 | }
 6 | 
 7 | if [ $# -ne 2 ]; then
 8 | 	usage
 9 | 	exit 1
10 | fi
11 | 
12 | num_iters=$1
13 | if [ $num_iters -gt 20 ]; then
14 | 	echo "$num_iters too large"
15 | 	exit 1
16 | fi
17 | 
18 | ring_type=$2
19 | 
20 | suffix=`date +%Y%m%d.%H.%M.%S`
21 | output_dir=/tmp/nccl_setting3.${suffix}
22 | mkdir -p $output_dir
23 | unlink /tmp/nccl_setting3
24 | ln -sf $output_dir /tmp/nccl_setting3
25 | 
26 | 
27 | for i in `seq 1 $num_iters`; do
28 | 	echo Case $i
29 | 	./run_all_jobs_once.sh $ring_type $i
30 | done
31 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting3/run_nccl_job_small.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | WORKDIR=`dirname $(realpath $0)`
 4 | 
 5 | usage() {
 6 | 	echo "Usage: $0 <ring_type> <job_color>    ring_type=goodring|badring, job_color=blue|red"
 7 | }
 8 | 
 9 | if [ $# -ne 2 ]; then
10 | 	usage
11 | 	exit 1
12 | fi
13 | 
14 | ring_type=$1
15 | color=$2
16 | 
17 | case $ring_type in
18 | 	goodring)
19 | 		cat > hostfile.$color.$ring_type <<EOF
20 | danyang-02 slots=1
21 | danyang-03 slots=1
22 | danyang-01 slots=1
23 | danyang-05 slots=1
24 | EOF
25 | 		;;
26 | 	badring)
27 | 		cat > hostfile.$color.$ring_type <<EOF
28 | danyang-02 slots=1
29 | danyang-01 slots=1
30 | danyang-03 slots=1
31 | danyang-05 slots=1
32 | EOF
33 | 		;;
34 | 	*)
35 | 		echo "Error: ring_type should be either 'goodring' or 'badring', got $ring_type"
36 | 		usage
37 | 		exit 1
38 | 		;;
39 | esac
40 | 
41 | case $color in
42 | 	blue)
43 | 		device_id=0
44 | 		tclass=106
45 | 		;;
46 | 	red)
47 | 		device_id=1
48 | 		tclass=66
49 | 		;;
50 | 	*)
51 | 		echo "Error: job_color should be either 'blue' or 'red', got $color"
52 | 		usage
53 | 		exit 1
54 | 		;;
55 | esac
56 | 
57 | echo device_id = $device_id
58 | echo Traffic class = $tclass
59 | 
60 | mpirun --hostfile hostfile.$color.$ring_type -mca pml ob1 -mca btl tcp,self -mca btl_tcp_if_include eno1 \
61 | 	-x CUDA_VISIBLE_DEVICES=$device_id \
62 | 	-x NCCL_DEBUG=INFO -x NCCL_ALGO=Ring -x NCCL_PROTO=Simple \
63 | 	-x NCCL_IB_GID_INDEX=3 -x NCCL_SOCKET_IFNAME=rdma0 \
64 | 	-x NCCL_MAX_NCHANNELS=2 -x NCCL_MIN_NCHANNELS=2 -x NCCL_IB_QPS_PER_CONNECTION=1 \
65 | 	-x NCCL_IB_TC=$tclass \
66 | 	-x NCCL_EPOCHS=20 \
67 | 		$WORKDIR/../build/all_reduce_perf --datatype=half -b 128M -e 128M
68 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting4/collect_nccl.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import re
 5 | import sys
 6 | import csv
 7 | import glob
 8 | import os.path
 9 | import argparse
10 | 
11 | OUTPUT_DIR = "/tmp/nccl_setting4"
12 | 
13 | parser = argparse.ArgumentParser(description='')
14 | parser.add_argument('--solution', '--solution', required=True, type=str,
15 |                     help = 'Give a name to the trial, either NCCL Bad Ring|NCCL Good Ring.')
16 | parser.add_argument('--strip-head', '--strip-head', required=False, type=int, default=1,
17 |                     help = 'Omit the first few lines of the output.')
18 | parser.add_argument('--strip-tail', '--strip-tail', required=False, type=int, default=0,
19 |                     help = 'Omit the last few lines of the output.')
20 | 
21 | args = parser.parse_args()
22 | assert args.solution
23 | 
24 | writer = csv.writer(sys.stdout)
25 | writer.writerow(['Solution', 'App', 'Size (Bytes)', 'Dtype', 'Latency (us)', 'AlgBW (GB/s)', 'BusBW (GB/s)', 'Trial ID'])
26 | 
27 | pat = re.compile(r'\s*\d+\s+\d+.*')
28 | 
29 | def get_latency(rec) -> int:
30 |     return int(rec[4])
31 | 
32 | def get_job_duration(records) -> int:
33 |     return sum(map(get_latency, records))
34 | 
35 | def work(app_color: str, trial_id, nccl_output_path) -> None:
36 |     results = []
37 |     with open(nccl_output_path, 'r') as fin:
38 |         for line in fin:
39 |             match = pat.match(line)
40 |             if match is not None:
41 |                 tokens = line.split()
42 |                 # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0']
43 |                 size = tokens[0]
44 |                 dtype = tokens[2]
45 |                 latency_us = tokens[9]
46 |                 algbw = tokens[10]
47 |                 busbw = tokens[11]
48 |                 results.append([args.solution, app_color, size, dtype, latency_us, algbw, busbw, trial_id])
49 |     return results
50 | 
51 | jobs = []
52 | 
53 | for path in glob.glob(os.path.join(OUTPUT_DIR, '*')):
54 |     tokens = path.split('.')
55 |     trial_id = tokens[-3].split('/')[-1]
56 |     color = tokens[-2]
57 |     jobs += [work(color, trial_id, path)]
58 | 
59 | # min_dura = min(map(get_job_duration, jobs))
60 | # print(f'min job duration: {min_dura}')
61 | 
62 | for results in jobs:
63 |     assert len(results) > args.strip_head + args.strip_tail, "len(results) = {}".format(len(results))
64 |     if args.strip_tail == 0:
65 |         stripped_results = results[args.strip_head:]
66 |     else:
67 |         stripped_results = results[args.strip_head:-args.strip_tail]
68 |     for rec in stripped_results:
69 |         writer.writerow(rec)
70 |     # writer.writerow(rec)
71 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting4/run_all_jobs_once.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Launch the blue, red and green NCCL jobs concurrently for one trial and
 4 | # capture each job's combined stdout/stderr in
 5 | # $OUTPUT_DIR/<trial_id>.<job>.stdout.
 6 | 
 7 | WORKDIR=$(dirname "$(realpath "$0")")
 8 | 
 9 | usage() {
10 | 	echo "Usage: $0 <trial_id>"
11 | }
12 | 
13 | if [ $# -ne 1 ]; then
14 | 	usage
15 | 	exit 1
16 | fi
17 | 
18 | trial_id=$1
19 | 
20 | OUTPUT_DIR=/tmp/nccl_setting4
21 | # tee cannot create its output file when the directory is missing, so make
22 | # sure it exists even when this script is run on its own.
23 | mkdir -p "$OUTPUT_DIR" || exit 1
24 | 
25 | for job in blue red green; do
26 | 	echo "$job"
27 | 	"$WORKDIR/run_nccl_job.sh" "$job" |& tee "$OUTPUT_DIR/$trial_id.$job.stdout" &
28 | done
29 | 
30 | # Block until all three background pipelines have finished.
31 | wait
32 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting4/run_nccl_all_jobs_multiple_times.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Run the whole set of NCCL jobs <num_iters> times (at most 20). Results go
 4 | # to a fresh timestamped directory that /tmp/nccl_setting4 is re-pointed to.
 5 | 
 6 | WORKDIR=$(dirname "$(realpath "$0")")
 7 | 
 8 | usage() {
 9 | 	echo "Usage: $0 <num_iters>"
10 | }
11 | 
12 | if [ $# -ne 1 ]; then
13 | 	usage
14 | 	exit 1
15 | fi
16 | 
17 | num_iters=$1
18 | # Reject anything that is not a non-negative integer; the numeric test below
19 | # would otherwise only print an error and fall through.
20 | case $num_iters in
21 | 	''|*[!0-9]*)
22 | 		usage
23 | 		exit 1
24 | 		;;
25 | esac
26 | if [ "$num_iters" -gt 20 ]; then
27 | 	echo "$num_iters too large"
28 | 	exit 1
29 | fi
30 | 
31 | suffix=$(date +%Y%m%d.%H.%M.%S)
32 | output_dir=/tmp/nccl_setting4.${suffix}
33 | mkdir -p "$output_dir"
34 | # -n replaces an existing symlink itself instead of creating the new link
35 | # inside the directory the old one points to; no separate unlink is needed.
36 | ln -sfn "$output_dir" /tmp/nccl_setting4
37 | 
38 | 
39 | for i in $(seq 1 "$num_iters"); do
40 | 	echo Case $i
41 | 	# Resolve the sibling script relative to this file, not the caller's cwd.
42 | 	"$WORKDIR/run_all_jobs_once.sh" "$i"
43 | done
44 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/setting4/run_nccl_job.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Run all_reduce_perf through mpirun for one of three predefined jobs
 4 | # (blue, green or red). Each job selects its own hosts, CUDA devices and
 5 | # NCCL_IB_TC traffic class.
 6 | 
 7 | WORKDIR=`dirname $(realpath $0)`
 8 | 
 9 | usage() {
10 | 	echo "Usage: $0 <job>    job=blue|green|red"
11 | }
12 | 
13 | if [ $# -ne 1 ]; then
14 | 	usage
15 | 	exit 1
16 | fi
17 | 
18 | job=$1
19 | 
20 | # Per-job settings. Each branch also writes hostfile.$job into the current
21 | # working directory; mpirun below reads it from there.
22 | case $job in
23 | 	blue)
24 | 		device_id="0,1"
25 | 		tclass=0
26 | 		num_channels=2
27 | 		cat > hostfile.$job <<EOF
28 | danyang-02 slots=2
29 | danyang-01 slots=2
30 | EOF
31 | 		;;
32 | 	green)
33 | 		# green and red use the same two hosts but different device ids and traffic classes.
34 | 		device_id=0
35 | 		tclass=106
36 | 		num_channels=2
37 | 		cat > hostfile.$job <<EOF
38 | danyang-03 slots=1
39 | danyang-05 slots=1
40 | EOF
41 | 		;;
42 | 	red)
43 | 		device_id=1
44 | 		tclass=66
45 | 		num_channels=2
46 | 		cat > hostfile.$job <<EOF
47 | danyang-03 slots=1
48 | danyang-05 slots=1
49 | EOF
50 | 		;;
51 | 	*)
52 | 		echo "Error: job should be either 'blue', 'green' or 'red', got $job"
53 | 		usage
54 | 		exit 1
55 | 		;;
56 | esac
57 | 
58 | echo device_id = $device_id
59 | echo Traffic class = $tclass
60 | 
61 | # The per-job values chosen above are handed to the benchmark as environment
62 | # variables via mpirun's -x options; the benchmark binary is resolved relative to this script.
63 | mpirun --hostfile hostfile.$job -mca pml ob1 -mca btl tcp,self -mca btl_tcp_if_include eno1 \
64 | 	-x CUDA_VISIBLE_DEVICES=$device_id \
65 | 	-x NCCL_DEBUG=INFO -x NCCL_ALGO=Ring -x NCCL_PROTO=Simple \
66 | 	-x NCCL_IB_GID_INDEX=3 -x NCCL_SOCKET_IFNAME=rdma0 \
67 | 	-x NCCL_MAX_NCHANNELS=$num_channels -x NCCL_MIN_NCHANNELS=$num_channels -x NCCL_IB_QPS_PER_CONNECTION=1 \
68 | 	-x NCCL_IB_TC=$tclass \
69 | 	-x NCCL_EPOCHS=20 \
70 | 		$WORKDIR/../build/all_reduce_perf --datatype=half -b 128M -e 128M
71 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/src/nccl1_compat.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL1_COMPAT_H
 8 | #define NCCL1_COMPAT_H
 9 | 
10 | #ifndef NCCL_MAJOR // NCCL 1.x
11 | #define NCCL_MAJOR 1
12 | #define NCCL_MINOR 0
13 | 
14 | #define ncclNumOps nccl_NUM_OPS
15 | #define ncclNumTypes nccl_NUM_TYPES
16 | 
17 | static ncclResult_t ncclGroupStart() { return ncclSuccess; }
18 | static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
19 | 
20 | #define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
21 | 
22 | static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
23 |     ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
24 |   CHECKCOUNT(count);
25 |   return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
26 | }
27 | static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
28 |     ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
29 |   CHECKCOUNT(count);
30 |   return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
31 | }
32 | static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
33 |     ncclComm_t comm, cudaStream_t stream) {
34 |   CHECKCOUNT(count);
35 |   return ncclBcast(buff, (int)count, datatype, root, comm, stream);
36 | }
37 | static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
38 |     size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
39 |     cudaStream_t stream) {
40 |   CHECKCOUNT(recvcount);
41 |   return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
42 | }
43 | static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
44 |     ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
45 |   CHECKCOUNT(sendcount);
46 |   return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
47 | }
48 | #endif
49 | 
50 | #endif
51 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/src/timer.cc:
--------------------------------------------------------------------------------
 1 | #include "timer.h"
 2 | 
 3 | // Make sure to compile this translation unit with the host compiler and not
 4 | // nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
 5 | #include <chrono>
 6 | 
 7 | namespace {
 8 |   // Nanoseconds since the steady_clock epoch.
 9 |   std::uint64_t monotonic_ns() {
10 |     const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
11 |     return std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch).count();
12 |   }
13 | }
14 | 
15 | // Record the construction time as the reference point.
16 | timer::timer() : t0(monotonic_ns()) {}
17 | 
18 | // Seconds elapsed since construction or the most recent reset().
19 | double timer::elapsed() const {
20 |   return 1.e-9*(monotonic_ns() - t0);
21 | }
22 | 
23 | // Return the seconds elapsed since the reference point and move the
24 | // reference point to now.
25 | double timer::reset() {
26 |   const std::uint64_t previous = t0;
27 |   t0 = monotonic_ns();
28 |   return 1.e-9*(t0 - previous);
29 | }
30 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/src/timer.h:
--------------------------------------------------------------------------------
 1 | #ifndef _408319ecdd5b47b28bf8f511c4fdf816
 2 | #define _408319ecdd5b47b28bf8f511c4fdf816
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | // Can't include <chrono> because of bug with gcc 10.3.0
 7 | class timer {
 8 |   std::uint64_t t0;
 9 | public:
10 |   timer();
11 |   double elapsed() const;
12 |   double reset();
13 | };
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/verifiable/Makefile:
--------------------------------------------------------------------------------
 1 | include ../../makefiles/common.mk
 2 | 
 3 | .PHONY: all clean
 4 | 
 5 | BUILDDIR := $(abspath ../../build)
 6 | NCCLDIR := $(BUILDDIR)
 7 | NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
 8 | DST_DIR := $(BUILDDIR)/test/verifiable
 9 | 
10 | all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
11 | 
12 | clean:
13 | 	rm -rf $(DST_DIR)
14 | 
15 | TEST_VERIFIABLE_SRCDIR := .
16 | TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
17 | include verifiable.mk
18 | 
19 | self_test: $(DST_DIR)/self_test
20 | 
21 | $(DST_DIR)/self_test: verifiable.cu verifiable.h
22 | 	@printf "Linking  %s\n" $@
23 | 	@mkdir -p $(DST_DIR)
24 | 	$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
25 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/verifiable/verifiable.h:
--------------------------------------------------------------------------------
 1 | #ifndef _d41d8cd98f00b204e9800998ecf8427e
 2 | #define _d41d8cd98f00b204e9800998ecf8427e
 3 | 
 4 | #include <cuda_runtime.h>
 5 | 
 6 | #include <stdint.h>
 7 | 
 8 | /* Routines for launching kernels that verify reduction results. A significant
 9 |  * feature of these routines is they carefully craft floating point input
10 |  * to produce exactly predictable output.
11 |  *
12 |  * int elt_ty: actually just a ncclDataType_t
13 |  *
14 |  * int red_op: mostly just a  ncclRedOp_t. Since PreMulSum ops are dynamically
15 |  * created, these are encoded as the value ncclNumOps and their scalar is
16 |  * assumed to be `ncclVerifiablePremulScalar(rank_me)`
17 |  *
18 |  * uint64_t seed: arbitrary 64-bits to use in seeding the random values
19 |  *
20 |  * intptr_t elt_ix0: index of first element pointed to by elts when generating
21 |  * random values. This makes it possible to generate subsequences independently
22 |  * as well as in aggregate.
23 |  *
24 |  * int rank_n: Number of contributions into the reduction. Non-reduction
25 |  * collectives like broadcast, gather, etc will always set this to one.
26 |  *
27 |  * int rank_me: Index of this contribution
28 |  */
29 | 
30 | // Use this as the local scalar for PreMulSum ops
31 | template<typename T>
32 | __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
33 |   return T(rank_me%2 == 0 ? 1.0f : 2.0f);
34 | }
35 | 
36 | // Enqueue kernel to generate data which is to be reduced.
37 | void ncclVerifiablePrepareInput(
38 |   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
39 |   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
40 | );
41 | 
42 | // Enqueue kernel to generate expected results of reduction.
43 | void ncclVerifiablePrepareExpected(
44 |   void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
45 |   uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
46 | );
47 | 
48 | // Enqueue kernel to verify reduced data matches expectation. The number of
49 | // failed elements is written to bad_elt_n which must be in cudaHost memory.
50 | // If `expected == nullptr` then the expected results are generated on-the-fly
51 | // which can be costly. Thus if you plan to run the same reduction multiple
52 | // times it is advantageous to precompute the expected values with
53 | // ncclVerifiablePrepareExpected and pass them as `expected` here.
54 | void ncclVerifiableVerify(
55 |   void const *results, void const *expected, intptr_t elt_n, int elt_ty,
56 |   int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
57 |   int64_t *bad_elt_n, cudaStream_t stream
58 | );
59 | #endif
60 | 


--------------------------------------------------------------------------------
/nccl-tests-mccs/verifiable/verifiable.mk:
--------------------------------------------------------------------------------
 1 | # Both of the following paths must be set before including this makefile:
 2 | # TEST_VERIFIABLE_SRCDIR = <points to this directory>
 3 | # TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
 4 | 
 5 | TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
 6 | TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
 7 | 
 8 | # Rebuild the object whenever the source or the header listed in TEST_VERIFIABLE_HDRS changes.
 9 | $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS)
10 | 	@printf "Compiling %s\n" $@
11 | 	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
12 | 	$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
13 | 


--------------------------------------------------------------------------------
/rust-toolchain:
--------------------------------------------------------------------------------
1 | nightly-2023-11-11


--------------------------------------------------------------------------------
/src/collectives-sys/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "collectives-sys"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 | 
8 | [build-dependencies]
9 | bindgen = "0.62"


--------------------------------------------------------------------------------
/src/collectives-sys/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path::PathBuf;
 3 | 
 4 | /// Build script: emit the link directives for `colldevice` and generate Rust
 5 | /// bindings for the headers pulled in by `wrapper.h`.
 6 | fn main() {
 7 |     // Library directory: <crate dir>/../collectives/build/device
 8 |     let mut device_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
 9 |     device_dir.pop();
10 |     device_dir.push("collectives/build/device");
11 |     let search_dir = device_dir.to_str().unwrap();
12 | 
13 |     println!("cargo:rustc-link-search={search_dir}");
14 |     println!("cargo:rustc-link-lib=colldevice");
15 |     println!("cargo:rerun-if-changed=wrapper.h");
16 |     println!("cargo:rerun-if-changed={search_dir}");
17 | 
18 |     // Arguments handed to clang, in order: include paths, then the switches
19 |     // that make it parse the header as C++11 with libc++.
20 |     let clang_args = [
21 |         "-I../collectives/include",
22 |         "-I/usr/local/cuda/include",
23 |         "-I/usr/local/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include",
24 |         "-x",
25 |         "c++",
26 |         "-std=c++11",
27 |         "-stdlib=libc++",
28 |     ];
29 |     // wrapper.h is the input header we generate bindings for.
30 |     let builder = clang_args
31 |         .iter()
32 |         .fold(bindgen::Builder::default().header("wrapper.h"), |b, arg| {
33 |             b.clang_arg(*arg)
34 |         });
35 | 
36 |     let bindings = builder
37 |         .allowlist_type("^mccsDev.*")
38 |         .allowlist_function("^mccsKernel.*")
39 |         .allowlist_var("^MCCS.*")
40 |         .default_enum_style(bindgen::EnumVariation::Rust {
41 |             non_exhaustive: false,
42 |         })
43 |         // Have cargo invalidate the crate whenever an included header changes.
44 |         .parse_callbacks(Box::new(bindgen::CargoCallbacks))
45 |         .derive_eq(true)
46 |         .derive_hash(true)
47 |         .generate()
48 |         // Panic on failure.
49 |         .expect("Unable to generate bindings");
50 | 
51 |     // The bindings end up in $OUT_DIR/bindings.rs.
52 |     let bindings_rs = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
53 |     bindings
54 |         .write_to_file(bindings_rs)
55 |         .expect("Couldn't write bindings!");
56 | }
57 | 


--------------------------------------------------------------------------------
/src/collectives-sys/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![allow(non_upper_case_globals)]
 2 | #![allow(non_camel_case_types)]
 3 | #![allow(non_snake_case)]
 4 | 
 5 | #[allow(clippy::all)]
 6 | mod binding {
 7 |     include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
 8 | }
 9 | pub use binding::*;
10 | 


--------------------------------------------------------------------------------
/src/collectives-sys/wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cuda_runtime.h"
2 | #include "devcomm.h"
3 | #include "collectives.h"
4 | 


--------------------------------------------------------------------------------
/src/collectives/Makefile:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
 3 | #
 4 | # See LICENSE.txt for license information
 5 | #
 6 | 
 7 | include makefiles/common.mk
 8 | 
 9 | BUILDDIR ?= $(abspath build)
10 | OBJDIR := $(BUILDDIR)/device
11 | 
12 | LIBSRCFILES := src/functions.cu
13 | 
14 | LIBSRCFILES += src/all_gather.cu 
15 | LIBSRCFILES += src/all_reduce.cu 
16 | 
17 | 
18 | DEPFILES   := $(patsubst src/%.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
19 | DEPENDFILES:= $(DEPFILES:%.d=%.dep)
20 | STATICLIB  := $(OBJDIR)/libcolldevice.a
21 | DEVOBJ     := $(OBJDIR)/devlink.o
22 | RULESFILE  := $(OBJDIR)/Makefile.rules
23 | 
24 | NVCUFLAGS  += -Isrc -Iinclude --compiler-options "-fPIC -fvisibility=hidden"
25 | 
26 | 
27 | all: $(STATICLIB)
28 | 
29 | # Dummy rule so that the extra dependency (%.dep) files are preserved by make
30 | all_deps: $(DEPENDFILES)
31 | 
32 | # Auto-generating the rules per op/reduction/datatype/algorithm
33 | $(RULESFILE) : gen_rules.sh
34 | 	@printf "Generating %-35s > %s\n" rules $@
35 | 	@mkdir -p $(OBJDIR)
36 | 	@CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@
37 | 
38 | -include $(RULESFILE)
39 | 
40 | LIBOBJ     := $(GENOBJS) $(OBJDIR)/functions.o 
41 | 
42 | -include $(DEPFILES)
43 | 
44 | $(STATICLIB): $(LIBOBJ) $(DEVOBJ)
45 | 	@printf "Archiving  %-35s > %s\n" objects $@
46 | 	ar cr $@ $^
47 | 
48 | # We do not want make to build *.d when running make clean.
49 | # So we only provide targets for .dep which will produce .dep and .d,
50 | # with only .d being included, and .dep keeping track of what needs to
51 | # be regenerated.
52 | $(OBJDIR)/%.dep : src/%.cu
53 | 	@mkdir -p $(OBJDIR)
54 | 	@$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
55 | 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
56 | 	@sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
57 |                 sed -e 's/^ *//' -e 's/$$/:/' >> $@
58 | 	@rm -f $@.tmp
59 | 	@cp $@ $(@:.dep=.d)
60 | 
61 | # Compiled kernels and collectives with relocatable device code ...
62 | $(OBJDIR)/functions.o : src/functions.cu $(OBJDIR)/functions.dep
63 | 	@printf "Compiling  %-35s > %s\n" $< $@
64 | 	mkdir -p `dirname $@`
65 | 	$(NVCC) $(NVCUFLAGS) -dc $< -o $@
66 | 
67 | # ... and create the device-side linked object with all those.
68 | $(DEVOBJ) : $(LIBOBJ)
69 | 	$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
70 | 
71 | # Remove every build product, including the per-op/per-type .cu copies that
72 | # the generated rules file places next to the objects listed in GENOBJS.
73 | .PHONY: all clean
74 | clean:
75 | 	rm -f $(LIBOBJ) $(GENOBJS:.o=.cu) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
76 | 


--------------------------------------------------------------------------------
/src/collectives/gen_rules.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | dir=$1
 4 | 
 5 | datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64"
 6 | if [ "$CUDA_MAJOR" -ge 11 ]
 7 | then
 8 |     datatypes+=" bf16"
 9 | fi
10 | 
11 | targets="GENOBJS := \\\\\n"
12 | 
13 | for base in {"all_gather","all_reduce"}; do
14 |   opn=0
15 |   for op in {"sum","prod"}; do
16 |     if [ "$base" = "all_gather" ] && [ "$op" != "sum" ]; then
17 |       continue
18 |     fi
19 |     dtn=0
20 |     # Order must match that of the ncclDataType_t enum
21 |     for dt in ${datatypes}; do
22 |       # Generate a unique filename for each compilation unit,
23 |       # otherwise the __nv_module_id may conflict at link time
24 |       echo "${dir}/${base}_${op}_${dt}.cu : src/${base}.cu"
25 |       echo "	@printf \"Copying    %-35s > %s\\\\n\" \$< \$@"
26 |       echo "	cp \$< \$@"
27 |       echo ""
28 |       # Compile the file
29 |       echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu src/${base}.cu ${dir}/${base}.dep"
30 | 
31 |       echo "	@printf \"Compiling  %-35s > %s\\\\n\" src/${base}.cu ${dir}/${base}_${op}_${dt}.o"
32 |       echo "	mkdir -p ${dir}"
33 |       echo "	\${NVCC} -DMCCS_OP=${opn} -DMCCS_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
34 |       echo ""
35 |       targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
36 |       dtn=$(($dtn + 1))
37 |     done
38 |     opn=$(($opn + 1))
39 |   done
40 | done
41 | echo -e "$targets"
42 | 


--------------------------------------------------------------------------------
/src/collectives/include/align.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef NCCL_ALIGN_H_
 8 | #define NCCL_ALIGN_H_
 9 | 
10 | #define DIVUP(x, y) \
11 |     (((x)+(y)-1)/(y))
12 | 
13 | #define ROUNDUP(x, y) \
14 |     (DIVUP((x), (y))*(y))
15 | 
16 | #define ALIGN_SIZE(size, align) \
17 |   size = ((size + (align) - 1) / (align)) * (align);
18 | 
19 | #if !__CUDA_ARCH__
20 |   #ifndef __host__
21 |     #define __host__
22 |   #endif
23 |   #ifndef __device__
24 |     #define __device__
25 |   #endif
26 | #endif
27 | 
28 | template<typename X, typename Y, typename Z = decltype(X()+Y())>
29 | __host__ __device__ constexpr Z divUp(X x, Y y) {
30 |   return (x+y-1)/y;
31 | }
32 | 
33 | template<typename X, typename Y, typename Z = decltype(X()+Y())>
34 | __host__ __device__ constexpr Z roundUp(X x, Y y) {
35 |   return (x+y-1) - (x+y-1)%y;
36 | }
37 | 
38 | // assumes second argument is a power of 2
39 | template<typename X, typename Z = decltype(X()+int())>
40 | __host__ __device__ constexpr Z alignUp(X x, int a) {
41 |   return (x+a-1) & Z(-a);
42 | }
43 | 
44 | #endif
45 | 


--------------------------------------------------------------------------------
/src/collectives/src/all_gather.cu:
--------------------------------------------------------------------------------
1 | #include "all_gather.h"
2 | #include "common.h"
3 | #include "collectives.h"
4 | 
5 | IMPL_COLL_C(AllGather);
6 | 


--------------------------------------------------------------------------------
/src/collectives/src/all_reduce.cu:
--------------------------------------------------------------------------------
1 | #include "all_reduce.h"
2 | #include "common.h"
3 | #include "collectives.h"
4 | 
5 | 
6 | IMPL_COLL_R(AllReduce);


--------------------------------------------------------------------------------
/src/collectives/src/op128.h:
--------------------------------------------------------------------------------
 1 | /*************************************************************************
 2 |  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * See LICENSE.txt for license information
 5 |  ************************************************************************/
 6 | 
 7 | #ifndef OP128_H_
 8 | #define OP128_H_
 9 | 
10 | inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
11 |   asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];"
12 |       : "=l"(v0), "=l"(v1) : "l"(ptr));
13 | }
14 | 
15 | inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
16 |   asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};"
17 |       :: "l"(v0), "l"(v1), "l"(ptr));
18 | }
19 | 
20 | inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
21 |   uint64_t* shmemAsmPtr;
22 |   asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr));
23 |   return shmemAsmPtr;
24 | }
25 | 
26 | inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
27 |   asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];"
28 |       : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr));
29 | }
30 | 
31 | inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
32 |   asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};"
33 |       :: "l"(v0), "l"(v1), "l"(shmemAsmPtr));
34 | }
35 | 
36 | template<typename T>
37 | inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) {
38 |   union {
39 |     uint32_t tmp4[4];
40 |     uint64_t tmp8[2];
41 |   };
42 |   if(sizeof(T) < 4) {
43 |     uint32_t *ptr4 = reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(ptr) & -uintptr_t(4));
44 |     #pragma unroll
45 |     for(int e=0; e < 4; e++) {
46 |       // Produce 4 bytes of sub-register type by reading 2 4-byte
47 |       // aligned values and shifting.
48 |       uint32_t lo, hi;
49 |       asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0));
50 |       asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1));
51 |       tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast<uintptr_t>(ptr))%4));
52 |     }
53 |   }
54 |   else if(sizeof(T) == 4) {
55 |     #pragma unroll
56 |     for(int e=0; e < 4; e++)
57 |       asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e));
58 |   }
59 |   else /*sizeof(T)==8*/ {
60 |     #pragma unroll
61 |     for(int e=0; e < 2; e++)
62 |       asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e));
63 |   }
64 |   v0 = tmp8[0];
65 |   v1 = tmp8[1];
66 | }
67 | 
68 | #endif
69 | 


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-driver-sys/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cuda-driver-sys"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | 
10 | [build-dependencies]
11 | cuda-finder = { path = "../cuda-finder" }
12 | bindgen = "0.62"


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-driver-sys/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path::PathBuf;
 3 | 
 4 | use cuda_finder::{find_cuda, find_cuda_windows};
 5 | 
 6 | /// Build script: generate Rust bindings for the CUDA driver headers listed in
 7 | /// `wrapper.h` and emit the Cargo directives needed to link against `cuda`.
 8 | fn main() {
 9 |     let bindings = bindgen::Builder::default()
10 |         .header("wrapper.h")
11 |         .allowlist_type("^CU.*")
12 |         .allowlist_type("^cuuint(32|64)_t")
13 |         .allowlist_type("^cudaError_enum")
14 |         .allowlist_type("^cu.*Complex$")
15 |         .allowlist_type("^cuda.*")
16 |         .allowlist_type("^libraryPropertyType.*")
17 |         .allowlist_var("^CU.*")
18 |         .allowlist_function("^cu.*")
19 |         .default_enum_style(bindgen::EnumVariation::Rust {
20 |             non_exhaustive: false,
21 |         })
22 |         .generate_comments(false)
23 |         .derive_default(true)
24 |         .derive_eq(true)
25 |         .derive_hash(true)
26 |         .derive_ord(true)
27 |         .clang_arg("-I/usr/local/cuda/include")
28 |         .generate()
29 |         .expect("Unable to generate bindings");
30 | 
31 |     // Write the bindings to the $OUT_DIR/bindings.rs file.
32 |     let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
33 |     bindings
34 |         .write_to_file(out_path.join("bindings.rs"))
35 |         .expect("Couldn't write bindings!");
36 | 
37 |     // Library search paths are supplied by the cuda-finder helper crate.
38 |     if cfg!(target_os = "windows") {
39 |         println!(
40 |             "cargo:rustc-link-search=native={}",
41 |             find_cuda_windows().display()
42 |         );
43 |     } else {
44 |         for path in find_cuda() {
45 |             println!("cargo:rustc-link-search=native={}", path.display());
46 |         }
47 |     };
48 | 
49 |     println!("cargo:rustc-link-lib=dylib=cuda");
50 |     println!("cargo:rerun-if-changed=build.rs");
51 |     // wrapper.h is the bindgen input. Once any rerun-if-changed directive is
52 |     // emitted Cargo only watches the listed paths, so it must be listed too.
53 |     println!("cargo:rerun-if-changed=wrapper.h");
54 |     println!("cargo:rerun-if-env-changed=CUDA_LIBRARY_PATH");
55 | }
56 | 


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-driver-sys/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![allow(non_upper_case_globals)]
 2 | #![allow(non_camel_case_types)]
 3 | #![allow(non_snake_case)]
 4 | 
 5 | #[allow(clippy::all)]
 6 | mod binding {
 7 |     include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
 8 | }
 9 | pub use binding::*;
10 | 


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-driver-sys/wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cuComplex.h"
2 | #include "cuda.h"
3 | #include "cudaProfiler.h"
4 | #include "library_types.h"
5 | #include "vector_types.h"


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-finder/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "cuda-finder"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 | 
8 | [dependencies]
9 | glob = "0.3"


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-runtime-sys/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cuda-runtime-sys"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | 
10 | [build-dependencies]
11 | cuda-finder = { path = "../cuda-finder" }
12 | bindgen = "0.62"


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-runtime-sys/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path::PathBuf;
 3 | 
 4 | use cuda_finder::{find_cuda, find_cuda_windows};
 5 | 
 6 | /// Build script: generate Rust bindings for the CUDA runtime header included
 7 | /// by `wrapper.h` and emit the Cargo directives needed to link against `cudart`.
 8 | fn main() {
 9 |     let bindings = bindgen::Builder::default()
10 |         .header("wrapper.h")
11 |         .allowlist_type("^cuda.*")
12 |         .allowlist_type("^surfaceReference")
13 |         .allowlist_type("^textureReference")
14 |         .allowlist_var("^cuda.*")
15 |         .allowlist_function("^cuda.*")
16 |         .default_enum_style(bindgen::EnumVariation::Rust {
17 |             non_exhaustive: false,
18 |         })
19 |         .generate_comments(false)
20 |         .derive_default(true)
21 |         .derive_eq(true)
22 |         .derive_hash(true)
23 |         .derive_ord(true)
24 |         .clang_arg("-I/usr/local/cuda/include")
25 |         .generate()
26 |         .expect("Unable to generate bindings");
27 | 
28 |     // Write the bindings to the $OUT_DIR/bindings.rs file.
29 |     let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
30 |     bindings
31 |         .write_to_file(out_path.join("bindings.rs"))
32 |         .expect("Couldn't write bindings!");
33 | 
34 |     // Library search paths are supplied by the cuda-finder helper crate.
35 |     if cfg!(target_os = "windows") {
36 |         println!(
37 |             "cargo:rustc-link-search=native={}",
38 |             find_cuda_windows().display()
39 |         );
40 |     } else {
41 |         for path in find_cuda() {
42 |             println!("cargo:rustc-link-search=native={}", path.display());
43 |         }
44 |     };
45 | 
46 |     println!("cargo:rustc-link-lib=dylib=cudart");
47 |     println!("cargo:rerun-if-changed=build.rs");
48 |     // wrapper.h is the bindgen input. Once any rerun-if-changed directive is
49 |     // emitted Cargo only watches the listed paths, so it must be listed too.
50 |     println!("cargo:rerun-if-changed=wrapper.h");
51 |     println!("cargo:rerun-if-env-changed=CUDA_LIBRARY_PATH");
52 | }
53 | 


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-runtime-sys/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![allow(non_upper_case_globals)]
 2 | #![allow(non_camel_case_types)]
 3 | #![allow(non_snake_case)]
 4 | 
 5 | #[allow(clippy::all)]
 6 | mod binding {
 7 |     include!(concat!(env!("OUT_DIR"), "/bindings.rs")); // written by build.rs (bindgen output)
 8 | }
 9 | pub use binding::*;
10 | 


--------------------------------------------------------------------------------
/src/cuda-sys/cuda-runtime-sys/wrapper.h:
--------------------------------------------------------------------------------
1 | #include "cuda_runtime.h"


--------------------------------------------------------------------------------
/src/cuda-sys/nvml-sys/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "nvml-sys"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | 
10 | [build-dependencies]
11 | cuda-finder = { path = "../cuda-finder" }
12 | bindgen = "0.62"


--------------------------------------------------------------------------------
/src/cuda-sys/nvml-sys/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path::PathBuf;
 3 | 
 4 | use cuda_finder::{find_cuda, find_cuda_windows};
 5 | 
 6 | /// Build script: generates Rust bindings for NVML from `wrapper.h` with
 7 | /// bindgen and emits the link directives for `nvidia-ml`.
 8 | fn main() {
 9 |     let bindings = bindgen::Builder::default()
10 |         .header("wrapper.h")
11 |         .allowlist_type("^NVML.*")
12 |         .allowlist_var("^NVML.*")
13 |         .allowlist_type("^nvml.*")
14 |         .allowlist_function("^nvml.*")
15 |         .default_enum_style(bindgen::EnumVariation::Rust {
16 |             non_exhaustive: false,
17 |         })
18 |         .generate_comments(false)
19 |         .derive_default(true)
20 |         .derive_eq(true)
21 |         .derive_hash(true)
22 |         .derive_ord(true)
23 |         .clang_arg("-I/usr/local/cuda/include")
24 |         .generate()
25 |         .expect("Unable to generate bindings");
26 | 
27 |     // Write the bindings to the $OUT_DIR/bindings.rs file.
28 |     let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
29 |     bindings
30 |         .write_to_file(out_path.join("bindings.rs"))
31 |         .expect("Couldn't write bindings!");
32 | 
33 |     // Tell rustc where to search for the NVIDIA libraries.
34 |     if cfg!(target_os = "windows") {
35 |         println!(
36 |             "cargo:rustc-link-search=native={}",
37 |             find_cuda_windows().display()
38 |         );
39 |     } else {
40 |         for path in find_cuda() {
41 |             println!("cargo:rustc-link-search=native={}", path.display());
42 |         }
43 |     }
44 | 
45 |     println!("cargo:rustc-link-lib=dylib=nvidia-ml");
46 |     println!("cargo:rerun-if-changed=build.rs");
47 |     // Once any rerun-if-changed is emitted, Cargo only watches the listed
48 |     // files, so the bindgen input header has to be listed explicitly.
49 |     println!("cargo:rerun-if-changed=wrapper.h");
50 |     println!("cargo:rerun-if-env-changed=CUDA_LIBRARY_PATH");
51 | }
52 | 


--------------------------------------------------------------------------------
/src/cuda-sys/nvml-sys/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![allow(non_upper_case_globals)]
 2 | #![allow(non_camel_case_types)]
 3 | #![allow(non_snake_case)]
 4 | 
 5 | #[allow(clippy::all)]
 6 | mod binding {
 7 |     include!(concat!(env!("OUT_DIR"), "/bindings.rs")); // written by build.rs (bindgen output)
 8 | }
 9 | pub use binding::*;
10 | 


--------------------------------------------------------------------------------
/src/cuda-sys/nvml-sys/wrapper.h:
--------------------------------------------------------------------------------
1 | #include "nvml.h"


--------------------------------------------------------------------------------
/src/experimental/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "experimental"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda-runtime-sys = { path = "../cuda-sys/cuda-runtime-sys" }
10 | nvml-sys = { path = "../cuda-sys/nvml-sys" }
11 | 


--------------------------------------------------------------------------------
/src/experimental/examples/cuda_ipc_client.rs:
--------------------------------------------------------------------------------
 1 | use std::ffi::c_void;
 2 | use std::io::Read;
 3 | use std::mem::size_of;
 4 | use std::net::TcpStream;
 5 | 
 6 | use cuda_runtime_sys::{
 7 |     cudaError, cudaIpcMemHandle_t, cudaIpcMemLazyEnablePeerAccess, cudaIpcOpenMemHandle,
 8 |     cudaMemcpy, cudaMemcpyKind,
 9 | };
10 | 
11 | const BUFFER_SIZE: usize = 1 * 1024 * 1024;
12 | 
13 | /// CUDA IPC demo client: reads a `cudaIpcMemHandle_t` from a TCP connection to
14 | /// `localhost:2042`, opens it, copies `BUFFER_SIZE` bytes from the device to
15 | /// the host and checks that every element equals 42.
16 | fn main() {
17 |     let elem_count = BUFFER_SIZE / size_of::<i32>();
18 |     let mut buf = vec![0i32; elem_count];
19 |     buf.shrink_to_fit();
20 |     assert_eq!(buf.capacity(), elem_count);
21 | 
22 |     // Receive the raw handle bytes; the connection is closed right afterwards.
23 |     let mut handle = cudaIpcMemHandle_t::default();
24 |     let mut stream = TcpStream::connect("localhost:2042").unwrap();
25 |     stream.set_nonblocking(false).unwrap();
26 |     stream.set_nodelay(true).unwrap();
27 |     let handle_bytes = unsafe {
28 |         std::slice::from_raw_parts_mut(
29 |             &mut handle as *mut _ as *mut u8,
30 |             size_of::<cudaIpcMemHandle_t>(),
31 |         )
32 |     };
33 |     stream.read_exact(handle_bytes).unwrap();
34 |     drop(stream);
35 | 
36 |     // Open the received handle to obtain a device pointer usable in this process.
37 |     let mut dev_ptr: *mut c_void = std::ptr::null_mut();
38 |     let open_status =
39 |         unsafe { cudaIpcOpenMemHandle(&mut dev_ptr, handle, cudaIpcMemLazyEnablePeerAccess) };
40 |     assert!(
41 |         open_status == cudaError::cudaSuccess,
42 |         "cudaIpcOpenMemHandle failed"
43 |     );
44 | 
45 |     let copy_status = unsafe {
46 |         cudaMemcpy(
47 |             buf.as_mut_ptr().cast(),
48 |             dev_ptr,
49 |             BUFFER_SIZE,
50 |             cudaMemcpyKind::cudaMemcpyDeviceToHost,
51 |         )
52 |     };
53 |     assert!(copy_status == cudaError::cudaSuccess, "cudaMemcpy failed");
54 | 
55 |     buf.iter()
56 |         .for_each(|x| assert_eq!(*x, 42, "CUDA IPC content mismatch"));
57 |     println!("buf={}", buf[0]);
58 | }
59 | 


--------------------------------------------------------------------------------
/src/experimental/examples/cuda_ipc_server.rs:
--------------------------------------------------------------------------------
 1 | use std::mem::size_of;
 2 | use std::net::TcpListener;
 3 | use std::{ffi::c_void, io::Write};
 4 | 
 5 | use cuda_runtime_sys::{
 6 |     cudaError, cudaIpcGetMemHandle, cudaIpcMemHandle_t, cudaMalloc, cudaMemcpy, cudaMemcpyKind,
 7 | };
 8 | 
 9 | const BUFFER_SIZE: usize = 1 * 1024 * 1024;
10 | 
11 | /// CUDA IPC demo server: fills a device buffer with 42s, exports it as a
12 | /// `cudaIpcMemHandle_t` and writes the raw handle bytes to the first TCP client
13 | /// accepted on `localhost:2042`.
14 | fn main() {
15 |     let mut dev_ptr: *mut c_void = std::ptr::null_mut();
16 |     let alloc_status = unsafe { cudaMalloc(&mut dev_ptr, BUFFER_SIZE) };
17 |     assert!(alloc_status == cudaError::cudaSuccess, "cudaMalloc failed");
18 | 
19 |     let host_data = vec![42i32; BUFFER_SIZE / size_of::<i32>()];
20 |     let copy_status = unsafe {
21 |         cudaMemcpy(
22 |             dev_ptr,
23 |             host_data.as_ptr().cast(),
24 |             BUFFER_SIZE,
25 |             cudaMemcpyKind::cudaMemcpyHostToDevice,
26 |         )
27 |     };
28 |     assert!(copy_status == cudaError::cudaSuccess, "cudaMemcpy failed");
29 | 
30 |     let mut handle = cudaIpcMemHandle_t::default();
31 |     let export_status = unsafe { cudaIpcGetMemHandle(&mut handle, dev_ptr) };
32 |     assert!(
33 |         export_status == cudaError::cudaSuccess,
34 |         "cudaIpcGetMemHandle failed"
35 |     );
36 | 
37 |     let listener = TcpListener::bind("localhost:2042").unwrap();
38 |     match listener.accept() {
39 |         Ok((mut socket, addr)) => {
40 |             let handle_bytes = unsafe {
41 |                 std::slice::from_raw_parts(
42 |                     &handle as *const _ as *const u8,
43 |                     size_of::<cudaIpcMemHandle_t>(),
44 |                 )
45 |             };
46 |             socket.write_all(handle_bytes).unwrap();
47 |             println!("new client: {addr:?}")
48 |         }
49 |         Err(e) => println!("couldn't get client: {e:?}"),
50 |     }
51 |     // Stay alive for two more seconds before the process (and its allocation) goes away.
52 |     std::thread::sleep(std::time::Duration::from_secs(2));
53 | }
54 | 


--------------------------------------------------------------------------------
/src/experimental/examples/get_hwinfo.rs:
--------------------------------------------------------------------------------
 1 | use std::ffi::CString;
 2 | 
 3 | use cuda_runtime_sys::cudaDeviceGetPCIBusId;
 4 | use nvml_sys::{nvmlDeviceGetCpuAffinity, nvmlDeviceGetHandleByPciBusId_v2, nvmlInit_v2};
 5 | 
 6 | /// Prints the CPU affinity mask that NVML reports for CUDA device 0.
 7 | ///
 8 | /// The PCI bus id of the device is queried from the CUDA runtime and then used
 9 | /// to look up the matching NVML device handle. Panics if any CUDA or NVML call
10 | /// reports an error.
11 | fn main() {
12 |     // cudaDeviceGetPCIBusId writes into this buffer, so it must be owned and
13 |     // mutable; writing through a pointer obtained from a shared `&CStr` is UB.
14 |     let mut bus_id = CString::new(b"00000000:00:00.0").unwrap().into_bytes_with_nul();
15 |     // including the null terminator
16 |     let len = bus_id.len();
17 |     let device = 0;
18 |     unsafe {
19 |         let err = cudaDeviceGetPCIBusId(bus_id.as_mut_ptr() as *mut _, len as i32, device);
20 |         assert!(
21 |             err == cuda_runtime_sys::cudaError::cudaSuccess,
22 |             "cudaDeviceGetPCIBusId failed"
23 |         );
24 | 
25 |         // NVML status codes are generated as a Rust enum; NVML_SUCCESS is 0.
26 |         assert!(nvmlInit_v2() as u32 == 0, "nvmlInit_v2 failed");
27 |         let mut handle = std::ptr::null_mut();
28 |         let status = nvmlDeviceGetHandleByPciBusId_v2(bus_id.as_ptr() as *const _, &mut handle);
29 |         assert!(status as u32 == 0, "nvmlDeviceGetHandleByPciBusId_v2 failed");
30 |         let mut cpu_set = 0u64;
31 |         let status = nvmlDeviceGetCpuAffinity(handle, 1, &mut cpu_set);
32 |         assert!(status as u32 == 0, "nvmlDeviceGetCpuAffinity failed");
33 |         println!("CPU set for device {}: {:#066b}", device, cpu_set);
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/src/experimental/src/lib.rs:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/src/gdrcopy-sys/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "gdrcopy-sys"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 | 
8 | [build-dependencies]
9 | bindgen = "0.62"


--------------------------------------------------------------------------------
/src/gdrcopy-sys/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path::PathBuf;
 3 | 
 4 | /// Build script: runs bindgen over `wrapper.h` for the `gdr_*` API and links
 5 | /// against `gdrapi`.
 6 | fn main() {
 7 |     let builder = bindgen::Builder::default()
 8 |         .header("wrapper.h")
 9 |         .allowlist_type("^gdr_.*")
10 |         .allowlist_var("GPU_.*")
11 |         .allowlist_function("^gdr_.*")
12 |         .default_enum_style(bindgen::EnumVariation::Rust {
13 |             non_exhaustive: false,
14 |         })
15 |         .generate_comments(false)
16 |         .derive_default(true)
17 |         .derive_eq(true)
18 |         .derive_hash(true)
19 |         .derive_ord(true);
20 |     // .clang_arg("-I/usr/local/cuda/include")
21 |     let bindings = builder.generate().expect("Unable to generate bindings");
22 | 
23 |     // Emit the generated code as $OUT_DIR/bindings.rs.
24 |     let out_dir = env::var("OUT_DIR").unwrap();
25 |     let bindings_rs = PathBuf::from(out_dir).join("bindings.rs");
26 |     bindings
27 |         .write_to_file(bindings_rs)
28 |         .expect("Couldn't write bindings!");
29 | 
30 |     // Cargo directives, printed in the same order as before.
31 |     for directive in [
32 |         "cargo:rustc-link-lib=dylib=gdrapi",
33 |         "cargo:rerun-if-changed=build.rs",
34 |         "cargo:rerun-if-changed=wrapper.h",
35 |     ] {
36 |         println!("{}", directive);
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/src/gdrcopy-sys/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![allow(non_upper_case_globals)]
 2 | #![allow(non_camel_case_types)]
 3 | #![allow(non_snake_case)]
 4 | 
 5 | #[allow(clippy::all)]
 6 | mod binding {
 7 |     include!(concat!(env!("OUT_DIR"), "/bindings.rs")); // written by build.rs (bindgen output)
 8 | }
 9 | pub use binding::*;
10 | 


--------------------------------------------------------------------------------
/src/gdrcopy-sys/wrapper.h:
--------------------------------------------------------------------------------
1 | #include "gdrapi.h"
2 | 


--------------------------------------------------------------------------------
/src/ibverbs/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ibverbs"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies.serde]
 9 | version = "1.0"
10 | optional = true
11 | features = ["derive"]
12 | 
13 | [features]
14 | default = ["serde"]
15 | 
16 | [dev-dependencies]
17 | bincode = "1.3"
18 | 
19 | [build-dependencies]
20 | bindgen = "0.69"


--------------------------------------------------------------------------------
/src/ibverbs/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::path::{Path, PathBuf};
 3 | use std::process::Command;
 4 | 
 5 | /// Build script: links against `ibverbs`/`mlx5`, builds the vendored rdma-core
 6 | /// tree and generates bindings for the verbs and mlx5dv APIs with bindgen.
 7 | fn main() {
 8 |     // println!("cargo:include=vendor/rdma-core/build/include");
 9 |     // println!("cargo:rustc-link-search=native=vendor/rdma-core/build/lib");
10 |     println!("cargo:rustc-link-lib=ibverbs");
11 |     println!("cargo:rustc-link-lib=mlx5");
12 |     println!("cargo:rerun-if-changed=build.rs");
13 |     // println!("cargo:rerun-if-changed=vendor/rdma-core/libibverbs/verbs.h");
14 |     println!("cargo:rerun-if-changed=wrapper.h");
15 | 
16 |     // initialize and update submodules
17 |     if Path::new(".git").is_dir() {
18 |         let status = Command::new("git")
19 |             .args(["submodule", "update", "--init"])
20 |             .status()
21 |             .expect("Failed to update submodules.");
22 |         // `status()` only errors when the process cannot be spawned; a non-zero
23 |         // exit code must be inspected explicitly or it is silently ignored.
24 |         if !status.success() {
25 |             println!("cargo:warning=`git submodule update --init` failed: {status}");
26 |         }
27 |     } else {
28 |         assert!(
29 |             Path::new("vendor/rdma-core").is_dir(),
30 |             "vendor source not included"
31 |         );
32 |     }
33 | 
34 |     // build vendor/rdma-core
35 |     let status = Command::new("bash")
36 |         .current_dir("vendor/rdma-core/")
37 |         .args(["build.sh"])
38 |         .status()
39 |         .expect("Failed to build vendor/rdma-core using build.sh");
40 |     // Reported as a warning rather than a panic so that setups which currently
41 |     // build despite a failing build.sh keep building.
42 |     if !status.success() {
43 |         println!("cargo:warning=vendor/rdma-core/build.sh failed: {status}");
44 |     }
45 | 
46 |     // generate the bindings
47 |     let bindings = bindgen::Builder::default()
48 |         .header("/usr/include/infiniband/verbs.h")
49 |         .header("wrapper.h")
50 |         .clang_arg("-Ivendor/rdma-core/build/include/")
51 |         .allowlist_function("ibv_.*")
52 |         .allowlist_type("ibv_.*")
53 |         .allowlist_function("mlx5dv_.*")
54 |         .allowlist_var("IBV_LINK_LAYER_.*")
55 |         .bitfield_enum("ibv_access_flags")
56 |         .bitfield_enum("ibv_qp_attr_mask")
57 |         .bitfield_enum("ibv_wc_flags")
58 |         .bitfield_enum("ibv_send_flags")
59 |         .bitfield_enum("ibv_port_cap_flags")
60 |         .constified_enum_module("ibv_qp_type")
61 |         .constified_enum_module("ibv_qp_state")
62 |         .constified_enum_module("ibv_port_state")
63 |         .constified_enum_module("ibv_wc_opcode")
64 |         .constified_enum_module("ibv_wr_opcode")
65 |         .constified_enum_module("ibv_wc_status")
66 |         //.constified_enum_module("IBV_WC_.*")
67 |         //.constified_enum_module("IBV_WR_.*")
68 |         //.constified_enum_module("IBV_QPS_.*")
69 |         //.constified_enum_module("IBV_PORT_.*")
70 |         .derive_default(true)
71 |         .derive_debug(true)
72 |         .prepend_enum_name(false)
73 |         .blocklist_type("ibv_wc")
74 |         .size_t_is_usize(true)
75 |         .generate()
76 |         .expect("Unable to generate bindings");
77 | 
78 |     // write the bindings to the $OUT_DIR/bindings.rs file.
79 |     let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
80 |     bindings
81 |         .write_to_file(out_path.join("bindings.rs"))
82 |         .expect("Could not write bindings");
83 | }
84 | 


--------------------------------------------------------------------------------
/src/ibverbs/wrapper.h:
--------------------------------------------------------------------------------
1 | #include <infiniband/mlx5dv.h>
2 | 


--------------------------------------------------------------------------------
/src/ipc/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ipc"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | ipc-core = { path = "core" }
10 | mccs = { path = "mccs", package = "ipc-mccs", optional = true }
11 | 
12 | [features]
13 | default = []
14 | customer = ["ipc-core/customer"]


--------------------------------------------------------------------------------
/src/ipc/core/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ipc-core"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | ipc-channel = "0.15.0"
10 | thiserror = "1.0.29"
11 | serde = { version = "1.0.130", features = ["derive"] }
12 | bincode = "1.3.3"
13 | libc = "0.2.103"
14 | shmem-ipc = "0.2.0"
15 | zerocopy = "0.3.0"
16 | memfd = "0.4.1"
17 | memmap2 = "0.5.0"
18 | uuid = "0.8.2"
19 | atomic-traits = "0.3.0"
20 | nix = "0.23.0"
21 | crossbeam = "0.8.1"
22 | unique = "0.9.1"
23 | minstant = "0.1.1"
24 | mio = "0.8.4"
25 | async-io = { version = "1.9.0", optional = true }
26 | 
27 | [features]
28 | default = []
29 | customer = ["dep:async-io"]
30 | 


--------------------------------------------------------------------------------
/src/ipc/core/src/buf.rs:
--------------------------------------------------------------------------------
 1 | //! Buffer to hold the fat pointer of a slice.
 2 | use std::slice::SliceIndex;
 3 | 
 4 | use serde::{Deserialize, Serialize};
 5 | use zerocopy::{AsBytes, FromBytes};
 6 | 
 7 | /// Position of a sub-slice inside a parent slice, expressed as a byte offset
 8 | /// and a byte length.
 9 | #[repr(C)]
10 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, FromBytes, AsBytes)]
11 | pub struct Range {
12 |     /// Distance in bytes from the start of the parent slice.
13 |     pub offset: u64,
14 |     /// Size of the sub-slice in bytes.
15 |     pub len: u64,
16 | }
17 | 
18 | impl Range {
19 |     /// Describes the part of `mr` that `range` selects.
20 |     ///
21 |     /// Panics if `range` is out of bounds for `mr`, like ordinary slice indexing.
22 |     #[inline]
23 |     pub fn new<T, R>(mr: &[T], range: R) -> Self
24 |     where
25 |         R: SliceIndex<[T], Output = [T]>,
26 |     {
27 |         let selected = range.index(mr);
28 |         let base_addr = mr.as_ptr() as u64;
29 |         let start_addr = selected.as_ptr() as u64;
30 |         Range {
31 |             offset: start_addr - base_addr,
32 |             len: std::mem::size_of_val(selected) as u64,
33 |         }
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/src/ipc/core/src/channel/flavors/concurrent.rs:
--------------------------------------------------------------------------------
1 | //! Concurrent channel, encapsulated over crossbeam channel.
2 | 
3 | pub(crate) use crossbeam::channel::{unbounded, Receiver, Sender};
4 | 
5 | /// Returns both ends of a crossbeam channel created with `unbounded`.
6 | pub(crate) fn create_channel<T>() -> (Sender<T>, Receiver<T>) {
7 |     unbounded::<T>()
8 | }
9 | 


--------------------------------------------------------------------------------
/src/ipc/core/src/channel/flavors/mod.rs:
--------------------------------------------------------------------------------
1 | pub(crate) mod concurrent;
2 | pub(crate) mod sequential;
3 | 


--------------------------------------------------------------------------------
/src/ipc/core/src/control.rs:
--------------------------------------------------------------------------------
 1 | use std::path::PathBuf;
 2 | 
 3 | pub use libc::pid_t;
 4 | use serde::{Deserialize, Serialize};
 5 | use thiserror::Error;
 6 | 
 7 | #[derive(Debug, Clone, Error, Serialize, Deserialize)]
 8 | pub enum Error {
 9 |     #[error("{0}")]
10 |     Generic(String), // free-form message; displayed verbatim via the "{0}" format above
11 | }
12 | 
13 | type IResult<T> = Result<T, Error>;
14 | 
15 | #[derive(Debug, Clone, Serialize, Deserialize)]
16 | pub enum Request {
17 |     /// New client, optionally carrying a device affinity.
18 |     NewClient(Option<i32>),
19 | }
20 | 
21 | #[derive(Debug, Clone, Serialize, Deserialize)]
22 | pub enum ResponseKind {
23 |     /// path of the engine's domain socket
24 |     NewClient(PathBuf),
25 |     /// `one_shot_name`: name of the OneShotServer
26 |     /// `wq_cap`: data path work queue capacity in bytes
27 |     /// `cq_cap`: presumably the completion queue capacity (unit not shown here; TODO confirm)
28 |     ConnectEngine {
29 |         one_shot_name: String,
30 |         wq_cap: usize,
31 |         cq_cap: usize,
32 |     },
33 | }
34 | 
35 | #[derive(Debug, Serialize, Deserialize)]
36 | pub struct Response(pub IResult<ResponseKind>); // Ok(kind) on success, Err(Error) otherwise
37 | 


--------------------------------------------------------------------------------
/src/ipc/core/src/ipc_channel.rs:
--------------------------------------------------------------------------------
 1 | //! Re-exports of some types in IPC-channel crate.
 2 | //! It also provides an IpcSenderNotify class.
 3 | use std::sync::atomic::{AtomicUsize, Ordering};
 4 | 
 5 | use serde::Serialize;
 6 | 
 7 | use crate::shmobj::ShmObject;
 8 | 
 9 | pub use ipc_channel::ipc::TryRecvError;
10 | pub(crate) use ipc_channel::ipc::{
11 |     channel, IpcError as IpcRecvError, IpcOneShotServer as OneShotServer, IpcReceiver, IpcSender,
12 | };
13 | pub(crate) use ipc_channel::Error as IpcSendError;
14 | 
15 | /// An `IpcSender` paired with a counter (held in a `ShmObject`) that is
16 | /// incremented once for every message sent successfully.
17 | pub struct IpcSenderNotify<T> {
18 |     inner: IpcSender<T>,
19 |     entries: ShmObject<AtomicUsize>,
20 | }
21 | 
22 | impl<T: Serialize> IpcSenderNotify<T> {
23 |     /// Bundles `inner` with the `entries` counter.
24 |     pub(crate) fn new(inner: IpcSender<T>, entries: ShmObject<AtomicUsize>) -> Self {
25 |         Self { inner, entries }
26 |     }
27 | 
28 |     /// Sends `data` and, only if that succeeded, adds one to `entries`.
29 |     pub(crate) fn send(&self, data: T) -> Result<(), bincode::Error> {
30 |         self.inner.send(data)?;
31 |         self.entries.fetch_add(1, Ordering::Relaxed);
32 |         Ok(())
33 |     }
34 | }
35 | 
36 | impl<T: Serialize> Clone for IpcSenderNotify<T> {
37 |     /// Clones the inner sender first, then the `entries` handle.
38 |     fn clone(&self) -> Self {
39 |         let inner = self.inner.clone();
40 |         let entries = self.entries.clone();
41 |         Self { inner, entries }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/src/ipc/core/src/shmem_ipc.rs:
--------------------------------------------------------------------------------
1 | //! Re-exports of shmem-ipc
2 | 
3 | pub use shmem_ipc::ringbuf::Error as ShmRingbufError;
4 | pub use shmem_ipc::ringbuf::{Receiver as RingReceiver, Sender as RingSender};
5 | 
6 | pub use shmem_ipc::sharedring::{Receiver as ShmReceiver, Sender as ShmSender};
7 | pub use shmem_ipc::Error as ShmIpcError;
8 | 


--------------------------------------------------------------------------------
/src/ipc/mccs/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ipc-mccs"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | ipc-core = { path = "../core" }
10 | 
11 | serde = { version = "1.0.149", features = ["derive"] }
12 | serde-big-array = { version = "0.4.1" }
13 | static_assertions = "1.1.0" 
14 | 
15 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" }


--------------------------------------------------------------------------------
/src/ipc/mccs/src/dp.rs:
--------------------------------------------------------------------------------
 1 | pub type WorkRequestSlot = [u8; 128];
 2 | pub type CompletionSlot = [u8; 64];
 3 | 
 4 | use serde::{Deserialize, Serialize};
 5 | 
 6 | use super::command::{AllGather, AllReduce};
 7 | 
 8 | #[repr(C, align(64))]
 9 | #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
10 | pub enum WorkRequest { // asserted in `mod sa` to fit into a `WorkRequestSlot`
11 |     AllReduce(AllReduce),
12 |     AllGather(AllGather),
13 | }
14 | 
15 | #[repr(C, align(64))]
16 | #[derive(Debug, Clone, Copy, Serialize, Deserialize)]
17 | pub enum WorkCompletion { // asserted in `mod sa` to fit into a `CompletionSlot`
18 |     AllReduce,
19 |     AllGather,
20 | }
21 | 
22 | mod sa { // compile-time size checks; nothing here exists at runtime
23 |     use super::*;
24 |     use static_assertions::const_assert;
25 |     use std::mem::size_of;
26 |     const_assert!(size_of::<WorkRequest>() <= size_of::<WorkRequestSlot>()); // 128-byte slot
27 |     const_assert!(size_of::<WorkCompletion>() <= size_of::<CompletionSlot>()); // 64-byte slot
28 | }
29 | 


--------------------------------------------------------------------------------
/src/ipc/mccs/src/handle.rs:
--------------------------------------------------------------------------------
 1 | use std::os::raw::c_char;
 2 | 
 3 | use cuda_runtime_sys::cudaIpcEventHandle_t;
 4 | use serde::{Deserialize, Serialize};
 5 | use serde_big_array::BigArray;
 6 | 
 7 | #[repr(C)] // presumably the `reserved` bytes of a cudaIpcMemHandle_t — confirm with users
 8 | #[derive(Debug, Clone, Serialize, Deserialize)]
 9 | pub struct CudaMemHandle(#[serde(with = "BigArray")] pub [c_char; 64usize]);
10 | 
11 | #[repr(C)] // newtype over a u64; how the value is assigned is not visible in this file
12 | #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
13 | pub struct CommunicatorHandle(pub u64);
14 | 
15 | /// The 64 opaque bytes of a CUDA IPC event handle, in a serde-friendly wrapper.
16 | #[repr(C)]
17 | #[derive(Debug, Clone, Serialize, Deserialize)]
18 | pub struct CudaEventHandle(#[serde(with = "BigArray")] pub [c_char; 64usize]);
19 | 
20 | impl From<cudaIpcEventHandle_t> for CudaEventHandle {
21 |     /// Takes over the `reserved` bytes of the CUDA handle unchanged.
22 |     fn from(value: cudaIpcEventHandle_t) -> Self {
23 |         Self(value.reserved)
24 |     }
25 | }
26 | 
27 | // Implemented as `From` instead of `Into`: the standard library's blanket impl
28 | // then provides `Into<cudaIpcEventHandle_t>` as well, so `.into()` callers are
29 | // unaffected.
30 | impl From<CudaEventHandle> for cudaIpcEventHandle_t {
31 |     /// Rebuilds the CUDA handle from the wrapped bytes.
32 |     fn from(value: CudaEventHandle) -> Self {
33 |         cudaIpcEventHandle_t { reserved: value.0 }
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/src/ipc/mccs/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod command;
2 | pub mod dp;
3 | pub mod handle;
4 | pub mod reconfig;
5 | 


--------------------------------------------------------------------------------
/src/ipc/mccs/src/reconfig.rs:
--------------------------------------------------------------------------------
 1 | use serde::{Deserialize, Serialize};
 2 | 
 3 | #[derive(Debug, Clone, Serialize, Deserialize)]
 4 | pub struct ChannelPattern {
 5 |     pub channel_id: u32,
 6 |     pub ring: Vec<usize>, // presumably the rank order of the ring — TODO confirm
 7 |     // each entry is (send_rank, recv_rank, port), i.e. (send_rank, recv_rank) -> port
 8 |     pub udp_sport: Option<Vec<(usize, usize, u16)>>,
 9 |     pub net_dev: Option<String>,
10 | }
11 | 
12 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
13 | #[repr(transparent)] // same layout as the wrapped u32
14 | pub struct CommunicatorId(pub u32);
15 | 
16 | #[derive(Debug, Clone, Serialize, Deserialize)]
17 | pub struct CommPatternReconfig {
18 |     pub communicator_id: CommunicatorId, // communicator this reconfiguration refers to
19 |     pub channels: Vec<ChannelPattern>, // channel patterns for that communicator
20 |     pub ib_traffic_class: Option<u8>, // NOTE(review): meaning of `None` is not visible here
21 | }
22 | 
23 | #[derive(Debug, Clone, Serialize, Deserialize)]
24 | pub enum ExchangeReconfigCommand {
25 |     CommPatternReconfig(Vec<CommPatternReconfig>), // batch; each item names its communicator
26 | }
27 | 


--------------------------------------------------------------------------------
/src/ipc/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub use ipc_core::*;
2 | 
3 | #[cfg(feature = "mccs")]
4 | pub use mccs;
5 | 


--------------------------------------------------------------------------------
/src/libmccs/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "libmccs"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda-runtime-sys = { path = "../cuda-sys/cuda-runtime-sys" }
10 | nvml-sys = { path = "../cuda-sys/nvml-sys" }
11 | 
12 | ipc = { path = "../ipc", features = ["mccs", "customer"] }
13 | 
14 | lazy_static = "1.4.0"
15 | thiserror = "1.0.37"
16 | serde_json = "1.0.89"
17 | libnuma = "0.0.4"


--------------------------------------------------------------------------------
/src/libmccs/src/communicator.rs:
--------------------------------------------------------------------------------
 1 | use std::net::IpAddr;
 2 | 
 3 | use cuda_runtime_sys::cudaIpcEventHandle_t;
 4 | use cuda_runtime_sys::cudaStream_t;
 5 | use cuda_runtime_sys::{cudaEventCreateWithFlags, cudaIpcGetEventHandle, cudaIpcOpenEventHandle};
 6 | use cuda_runtime_sys::{cudaEventDisableTiming, cudaEventInterprocess};
 7 | use ipc::mccs::command::{Command, CommunicatorInit, CompletionKind};
 8 | 
 9 | use crate::checked_cuda;
10 | use crate::rx_recv_impl;
11 | use crate::Error;
12 | use crate::MccsCommunicatorHandle;
13 | use crate::{MCCS_CTX, MCCS_STREAM_SYNC};
14 | 
15 | pub fn init_communicator_rank( // sends InitCommunicator, then opens the returned IPC event
16 |     unique_id: u32,
17 |     rank: usize,
18 |     num_ranks: usize,
19 |     cuda_device_idx: i32,
20 |     root_addr: IpAddr,
21 | ) -> Result<MccsCommunicatorHandle, Error> {
22 |     let init = CommunicatorInit { // arguments are passed on unchanged
23 |         id: unique_id,
24 |         rank,
25 |         num_ranks,
26 |         root_addr,
27 |         cuda_device_idx,
28 |     };
29 |     let (comm_handle, event_handle) = MCCS_CTX.with(move |ctx| {
30 |         let req = Command::InitCommunicator(init);
31 |         ctx.service.send_cmd(req)?;
32 |         rx_recv_impl!(ctx.service, CompletionKind::InitCommunicator, handles, {
33 |             Ok(handles) // the (communicator handle, event handle) pair destructured above
34 |         })
35 |     })?;
36 |     let mut event = std::ptr::null_mut();
37 |     unsafe { // open the received IPC event handle in this process
38 |         checked_cuda!(cudaIpcOpenEventHandle(&mut event, event_handle.into()));
39 |     };
40 |     let handle = MccsCommunicatorHandle {
41 |         comm_handle,
42 |         backend_event: event,
43 |     };
44 |     Ok(handle)
45 | }
46 | 
47 | pub fn register_stream(cuda_dev: i32, stream: cudaStream_t) -> Result<(), Error> {
48 |     let mut event = std::ptr::null_mut();
49 |     let mut event_handle = cudaIpcEventHandle_t::default();
50 |     unsafe { // create an interprocess event and export its IPC handle
51 |         checked_cuda!(cudaEventCreateWithFlags(
52 |             &mut event,
53 |             cudaEventInterprocess | cudaEventDisableTiming
54 |         ));
55 |         checked_cuda!(cudaIpcGetEventHandle(&mut event_handle, event));
56 |     }
57 |     MCCS_STREAM_SYNC.with_borrow_mut(|sync| {
58 |         sync.insert(stream, event); // remember the event under its stream
59 |     });
60 | 
61 |     MCCS_CTX.with(move |ctx| { // the stream is sent as its address, the event as its IPC handle
62 |         let req = Command::RegisterStream(cuda_dev, stream.addr(), event_handle.into());
63 |         ctx.service.send_cmd(req)?;
64 |         rx_recv_impl!(ctx.service, CompletionKind::RegisterStream)
65 |     })?;
66 |     Ok(())
67 | }
68 | 


--------------------------------------------------------------------------------
/src/libmccs/src/memory.rs:
--------------------------------------------------------------------------------
 1 | use std::os::raw::c_void;
 2 | 
 3 | use cuda_runtime_sys::cudaIpcMemLazyEnablePeerAccess;
 4 | use cuda_runtime_sys::cudaIpcOpenMemHandle;
 5 | use cuda_runtime_sys::{cudaError, cudaIpcMemHandle_t};
 6 | use ipc::mccs::command::{Command, CompletionKind, MccsDeviceMemoryHandle};
 7 | 
 8 | use crate::Error;
 9 | use crate::MCCS_CTX;
10 | use crate::{rx_recv_impl, DevicePtr};
11 | 
12 | pub fn cuda_malloc(device_idx: i32, size: usize) -> Result<DevicePtr, Error> {
13 |     MCCS_CTX.with(|ctx| {
14 |         let req = Command::CudaMalloc(device_idx, size); // allocation is requested from the service
15 |         ctx.service.send_cmd(req)?;
16 |         rx_recv_impl!(ctx.service, CompletionKind::CudaMalloc, result, {
17 |             let mut dev_ptr: *mut c_void = std::ptr::null_mut();
18 |             let handle = cudaIpcMemHandle_t {
19 |                 reserved: result.0 .0, // IPC handle bytes carried by the completion
20 |             };
21 |             let err = unsafe { // open the handle to get a device pointer in this process
22 |                 cudaIpcOpenMemHandle(
23 |                     &mut dev_ptr as *mut _,
24 |                     handle,
25 |                     cudaIpcMemLazyEnablePeerAccess,
26 |                 )
27 |             };
28 |             if err != cudaError::cudaSuccess {
29 |                 return Err(Error::Cuda(err)); // CUDA error code is passed to the caller
30 |             }
31 |             Ok(DevicePtr {
32 |                 ptr: dev_ptr,
33 |                 backup_mem: result.1, // second element of the completion payload, kept as-is
34 |             })
35 |         })
36 |     })
37 | }
38 | 
39 | pub fn cuda_free(_ptr: MccsDeviceMemoryHandle) { // placeholder: freeing is not implemented
40 |     todo!() // always panics when called
41 | }
42 | 


--------------------------------------------------------------------------------
/src/mccs/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "mccs"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda-driver-sys = { path = "../cuda-sys/cuda-driver-sys" }
10 | cuda-runtime-sys = { path = "../cuda-sys/cuda-runtime-sys" }
11 | nvml-sys = { path = "../cuda-sys/nvml-sys" }
12 | ipc = { path = "../ipc", features = ["mccs"] }
13 | collectives-sys = { path = "../collectives-sys" }
14 | gdrcopy-sys = { path = "../gdrcopy-sys" }
15 | ibverbs = { path = "../ibverbs" }
16 | qos-service = { path = "../qos-service" }
17 | 
18 | smol = { version = "2.0.0" }
19 | socket2 = { version = "0.5.5", features = ["all"] }
20 | anyhow = "1.0.66"
21 | bincode = "1.3.3"
22 | serde = "1.0.149"
23 | toml = "0.5.9"
24 | thiserror = "1.0.37"
25 | uuid = { version = "1.2.2", features = ["v4"] }
26 | structopt = "0.3.26"
27 | nix = { version = "0.27.1", features = ["feature", "net", "socket", "signal", "fs", "mman"] }
28 | dashmap = "5.4.0"
29 | crossbeam = "0.8.2"
30 | rand = "0.8.5"
31 | static_assertions = "1.1.0"
32 | async-trait = "0.1.64"
33 | futures = "0.3.26"
34 | memoffset = "0.8.0"
35 | log = { version = "0.4.20", features = ["max_level_trace", "release_max_level_info"] }
36 | env_logger = "0.10.0"
37 | bitflags = "2.4.1"
38 | once_cell = "1.18.0"
39 | num_enum = "0.7.1"
40 | strum = { version = "0.25", features = ["derive"] }
41 | volatile = "0.5.1"
42 | byteorder = "1.5.0"
43 | atoi = "2.0.0"
44 | bytes = "1.5.0"
45 | itertools = "0.12.0"
46 | chrono = "0.4.31"
47 | fastrand = "2.0.1"
48 | spin = "0.9.8"
49 | better-panic = "0.3.0"
50 | libnuma = "0.0.4"
51 | gcollections = "1.5.0"
52 | 


--------------------------------------------------------------------------------
/src/mccs/src/bootstrap/mod.rs:
--------------------------------------------------------------------------------
 1 | pub mod task;
 2 | 
 3 | use std::net::SocketAddr;
 4 | 
 5 | use serde::{Deserialize, Serialize};
 6 | use smol::lock::Mutex;
 7 | use smol::net::{TcpListener, TcpStream};
 8 | use thiserror::Error;
 9 | 
10 | pub use task::{bootstrap_create_root, bootstrap_root};
11 | 
12 | #[derive(Debug, Error)]
13 | pub enum BootstrapError { // bootstrap failures; the display text comes from the #[error] attributes
14 |     #[error("IO error: {0}")]
15 |     Io(#[from] std::io::Error), // `#[from]` lets `?` convert std::io::Error into this variant
16 |     #[error("Bootstrap root received inconsistent rank count of {0} vs {1}")]
17 |     NumRanksMismatch(usize, usize), // the two rank counts that disagree
18 |     #[error("Bootstrap root received duplicate check-in from rank {0}")]
19 |     DuplicatedCheckIn(usize), // rank that checked in more than once
20 |     #[error("Bootstrap root received incorrect rank number {0}")]
21 |     RankOverflow(usize), // the offending rank number
22 |     #[error("Received {0} bytes instead of {1} bytes")]
23 |     RecvSizeMismatch(u32, u32), // (received, expected) byte counts
24 |     #[error(
25 |         "Could not acquire Mutex in bootstrap state, only a single outstanding task is allowed"
26 |     )]
27 |     MutexAcquire,
28 | }
29 | 
30 | #[derive(Debug, Clone, Serialize, Deserialize)]
31 | pub struct BootstrapHandle { // serializable handle exchanged between processes
32 |     pub addr: SocketAddr, // presumably the bootstrap root's listen address -- confirm against task.rs
33 |     pub magic: u64, // NOTE(review): looks like a token identifying one bootstrap session -- confirm
34 | }
35 | 
36 | pub struct UnexpectedConn { // a connection kept in BootstrapState::unexpected_connections
37 |     pub stream: TcpStream,
38 |     pub peer: usize, // presumably the sending peer's rank -- confirm
39 |     pub tag: u32, // presumably a tag used to match the connection to a receive -- confirm
40 | }
41 | 
42 | pub struct BootstrapRing { // the two TCP streams of the bootstrap ring
43 |     pub ring_recv: TcpStream, // presumably the stream from the previous ring neighbour -- confirm
44 |     pub ring_send: TcpStream, // presumably the stream to the next ring neighbour -- confirm
45 | }
46 | 
47 | pub struct BootstrapState { // per-rank bootstrap state: listener, ring streams and peer table
48 |     pub listener: TcpListener,
49 |     pub ring: Mutex<BootstrapRing>, // async (smol) mutex around both ring streams
50 |     pub peer_addrs: Vec<SocketAddr>, // presumably indexed by rank -- confirm
51 |     // The proxy engine is expected to allow only a single outstanding recv task to
52 |     // access this field, so the Mutex should never actually be contended.
53 |     pub unexpected_connections: Mutex<Vec<UnexpectedConn>>,
54 |     pub rank: usize,
55 |     pub num_ranks: usize,
56 |     pub magic: u64, // same type as BootstrapHandle::magic
57 | }
58 | 


--------------------------------------------------------------------------------
/src/mccs/src/comm/profile.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::HashMap;
 2 | 
 3 | use crate::transport::channel::{ChannelId, PeerConnId};
 4 | use crate::transport::net::provider::RDMA_TRANSPORT;
 5 | use crate::transport::net::provider::{NetProperties, NetProvierWrap};
 6 | use crate::transport::NUM_PROTOCOLS;
 7 | 
 8 | // Communicator profile: buffer sizes and the network settings consulted by the methods below.
 9 | pub struct CommProfile {
10 |     pub buff_sizes: [usize; NUM_PROTOCOLS], // one buffer size per protocol
11 |     pub udp_sport_map: HashMap<PeerConnId, u16>, // looked up by get_udp_sport
12 |     pub channel_net_device_map: HashMap<ChannelId, String>, // value is matched as a device-name prefix in get_network_device
13 |     pub tc: Option<u8>, // returned by get_tc; presumably the IB traffic class -- confirm
14 | }
15 | 
16 | impl CommProfile {
17 |     // Returns (net_device, proxy_rank); proxy_rank is always `my_rank`.
18 |     // TODO: choose net dev that is closest to the specified GPU
19 |     // and allow admins to specify the set of allowed net devs
20 |     #[inline]
21 |     pub fn get_network_device(
22 |         &self,
23 |         channel_id: ChannelId,
24 |         my_rank: usize,
25 |         _peer_rank: usize,
26 |     ) -> (usize, usize) {
27 |         let device_count = RDMA_TRANSPORT.get_num_devices().unwrap();
28 |         // Pick the first device whose name starts with the prefix configured
29 |         // for this channel; fall back to device 0 when there is no prefix or
30 |         // no device matches it.
31 |         let chosen = self.channel_net_device_map.get(&channel_id).and_then(|wanted| {
32 |             (0..device_count).find(|&idx| {
33 |                 let info = RDMA_TRANSPORT.get_properties(idx).unwrap();
34 |                 info.name.starts_with(wanted.as_str())
35 |             })
36 |         });
37 |         (chosen.unwrap_or(0), my_rank)
38 |     }
39 | 
40 |     #[inline]
41 |     pub fn get_udp_sport(&self, peer_conn_id: &PeerConnId) -> Option<u16> {
42 |         self.udp_sport_map.get(peer_conn_id).copied() // `None` when nothing is configured for this connection
43 |     }
44 | 
45 |     #[inline]
46 |     pub fn get_tc(&self) -> Option<u8> {
47 |         self.tc // plain accessor
48 |     }
49 | 
50 |     #[inline]
51 |     pub fn check_gdr(&self, _rank: usize, _net_dev: usize, _read: bool) -> bool {
52 |         false // arguments are ignored; the answer is always `false`
53 |     }
54 | 
55 |     #[inline]
56 |     pub fn check_gdr_need_flush(&self, _rank: usize) -> bool {
57 |         false // argument is ignored; the answer is always `false`
58 |     }
59 | 
60 |     #[inline]
61 |     pub fn get_net_provider(&self) -> &'static dyn NetProvierWrap {
62 |         &RDMA_TRANSPORT // the RDMA transport is the only provider returned
63 |     }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/mccs/src/config.rs:
--------------------------------------------------------------------------------
 1 | use std::fs;
 2 | use std::net::IpAddr;
 3 | use std::path::{Path, PathBuf};
 4 | 
 5 | use serde::{Deserialize, Serialize};
 6 | 
 7 | use qos_service::QosScheduleDef;
 8 | 
 9 | use crate::transport::net::config::NetTransportConfig;
10 | use crate::transport::net::provider::RdmaTransportConfig;
11 | use crate::transport::shm::config::ShmTransportConfig;
12 | use crate::transport::NUM_PROTOCOLS;
13 | 
14 | #[derive(Debug, Clone, Serialize, Deserialize)]
15 | pub struct DefaultCommConfig { // defaults used for communicators; deserialized from the config file
16 |     #[serde(rename = "buffer_sizes")]
17 |     pub buf_sizes: [usize; NUM_PROTOCOLS], // serialized under the key `buffer_sizes`, one entry per protocol
18 |     pub channel_count: u32,
19 |     // TODO: specify number of channels and ring for each channel
20 | }
21 | 
22 | impl Default for DefaultCommConfig {
23 |     fn default() -> Self { // one channel, 4MB (1 << 22) buffer per protocol
24 |         DefaultCommConfig {
25 |             // 4MB for every protocol; the repeat form no longer hard-codes NUM_PROTOCOLS == 1
26 |             buf_sizes: [1 << 22; NUM_PROTOCOLS],
27 |             channel_count: 1,
28 |         }
29 |     }
30 | }
31 | 
32 | #[derive(Debug, Clone, Serialize, Deserialize)]
33 | pub struct ChannelPattern { // per-channel override: ring plus optional network settings
34 |     pub channel_id: u32,
35 |     pub ring: Vec<usize>, // presumably the ranks in ring order -- confirm
36 |     // (send_rank, recv_rank) -> port
37 |     pub udp_sport: Option<Vec<(usize, usize, u16)>>,
38 |     pub net_dev: Option<String>, // presumably a network device name or prefix -- confirm
39 | }
40 | 
41 | #[derive(Debug, Clone, Serialize, Deserialize)]
42 | pub struct CommPatternConfig { // pattern override for one communicator
43 |     pub communicator_id: u32,
44 |     pub channels: Vec<ChannelPattern>,
45 |     pub ib_traffic_class: Option<u8>, // optional; `None` leaves it unspecified
46 | }
47 | 
48 | #[derive(Debug, Clone, Serialize, Deserialize)]
49 | pub struct CommGlobalConfig { // transport-level settings; every section falls back to its Default when absent
50 |     #[serde(rename = "net_rdma", default)]
51 |     pub rdma_config: RdmaTransportConfig, // config key `net_rdma`
52 |     #[serde(rename = "net", default)]
53 |     pub net_config: NetTransportConfig, // config key `net`
54 |     #[serde(rename = "shm", default)]
55 |     pub shm_config: ShmTransportConfig, // config key `shm`
56 | }
57 | 
58 | impl Default for CommGlobalConfig {
59 |     fn default() -> Self { // every section takes its own type's Default
60 |         CommGlobalConfig {
61 |             rdma_config: Default::default(),
62 |             net_config: Default::default(),
63 |             shm_config: Default::default(),
64 |         }
65 |     }
66 | }
67 | 
68 | #[derive(Debug, Clone, Serialize, Deserialize)]
69 | #[serde(deny_unknown_fields)]
70 | pub struct Control { // `control` section of the config; unknown keys are rejected
71 |     pub prefix: PathBuf, // NOTE(review): presumably the directory that `path` is resolved against -- confirm
72 |     pub path: PathBuf,
73 | }
74 | 
75 | #[derive(Debug, Clone, Serialize, Deserialize)]
76 | #[serde(deny_unknown_fields)]
77 | pub struct Config { // top-level service configuration; unknown keys are rejected
78 |     pub control: Control,
79 |     #[serde(default)]
80 |     pub comm_global_config: CommGlobalConfig, // optional section, defaults via CommGlobalConfig::default
81 |     #[serde(default)]
82 |     pub comm_default_config: DefaultCommConfig, // optional section, defaults via DefaultCommConfig::default
83 |     pub addrs: Vec<IpAddr>,
84 |     pub listen_port: u16,
85 |     pub mccs_daemon_basename: String,
86 |     pub mccs_daemon_prefix: PathBuf,
87 |     pub qos_schedule: Option<QosScheduleDef>, // optional
88 |     pub comm_patterns_override: Option<Vec<CommPatternConfig>>, // optional per-communicator overrides
89 | }
90 | 
91 | impl Config {
92 |     pub fn from_path<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
93 |         // Read the whole file, then parse it as TOML; `?` converts either failure.
94 |         let raw = fs::read_to_string(path)?;
95 |         Ok(toml::from_str(raw.as_str())?)
96 |     }
97 | }
98 | 


--------------------------------------------------------------------------------
/src/mccs/src/cuda/mapped_ptr.rs:
--------------------------------------------------------------------------------
 1 | use std::fmt;
 2 | use std::num::NonZeroUsize;
 3 | 
 4 | pub struct DeviceHostPtr<T> { // a pair of raw pointers: one host-side, one device-side
 5 |     host_ptr: *const T,
 6 |     dev_ptr: *const T,
 7 | }
 8 | 
 9 | impl<T> DeviceHostPtr<T> {
10 |     #[inline] // no null check: null inputs make `addr_host`/`addr_dev` undefined behaviour
11 |     pub unsafe fn new_unchecked(ptr_host: *mut T, ptr_dev: *mut T) -> Self {
12 |         Self {
13 |             host_ptr: ptr_host as *const T,
14 |             dev_ptr: ptr_dev as *const T,
15 |         }
16 |     }
17 | 
18 |     #[inline] // checked constructor: `None` if either pointer is null
19 |     pub fn new(ptr_host: *mut T, ptr_dev: *mut T) -> Option<Self> {
20 |         if ptr_host.is_null() || ptr_dev.is_null() {
21 |             return None;
22 |         }
23 |         // SAFETY: both pointers were checked to be non-null just above.
24 |         Some(unsafe { Self::new_unchecked(ptr_host, ptr_dev) })
25 |     }
26 | 
27 |     #[must_use]
28 |     #[inline]
29 |     pub fn addr_host(&self) -> NonZeroUsize {
30 |         unsafe { NonZeroUsize::new_unchecked(self.host_ptr.addr()) } // relies on the non-null invariant
31 |     }
32 | 
33 |     #[must_use]
34 |     #[inline]
35 |     pub fn addr_dev(&self) -> NonZeroUsize {
36 |         unsafe { NonZeroUsize::new_unchecked(self.dev_ptr.addr()) } // relies on the non-null invariant
37 |     }
38 | 
39 |     #[must_use]
40 |     #[inline]
41 |     pub fn as_ptr_host(&self) -> *mut T {
42 |         self.host_ptr.cast_mut()
43 |     }
44 | 
45 |     #[must_use]
46 |     #[inline]
47 |     pub fn as_ptr_dev(&self) -> *mut T {
48 |         self.dev_ptr.cast_mut()
49 |     }
50 | 
51 |     #[inline]
52 |     pub fn cast<U>(self) -> DeviceHostPtr<U> {
53 |         let (host, dev) = (self.as_ptr_host().cast::<U>(), self.as_ptr_dev().cast::<U>());
54 |         // SAFETY: a pointer cast does not change whether either pointer is null.
55 |         unsafe { DeviceHostPtr::new_unchecked(host, dev) }
56 |     }
57 | }
58 | 
59 | impl<T> Clone for DeviceHostPtr<T> { // hand-written so no `T: Clone` bound is required
60 |     #[inline(always)]
61 |     fn clone(&self) -> Self {
62 |         *self // bitwise copy of the two pointers (the type is Copy)
63 |     }
64 | }
65 | 
66 | impl<T> Copy for DeviceHostPtr<T> {} // copying duplicates the pointers only, never the pointee
67 | 
68 | impl<T> fmt::Debug for DeviceHostPtr<T> { // prints `DeviceHostMapped(host_ptr, dev_ptr)`
69 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
70 |         f.debug_tuple("DeviceHostMapped") // note: the label differs from the type name
71 |             .field(&self.as_ptr_host())
72 |             .field(&self.as_ptr_dev())
73 |             .finish()
74 |     }
75 | }
76 | 
77 | impl<T> fmt::Pointer for DeviceHostPtr<T> { // `{:p}` produces the same tuple text as the Debug impl
78 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
79 |         f.debug_tuple("DeviceHostMapped")
80 |             .field(&self.as_ptr_host())
81 |             .field(&self.as_ptr_dev())
82 |             .finish()
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/mccs/src/cuda/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod alloc;
2 | pub mod mapped_ptr;
3 | pub mod ptr;
4 | 


--------------------------------------------------------------------------------
/src/mccs/src/cuda/ptr.rs:
--------------------------------------------------------------------------------
 1 | use std::fmt;
 2 | use std::num::NonZeroUsize;
 3 | 
 4 | #[repr(transparent)]
 5 | pub struct DeviceNonNull<T> { // wraps one raw pointer; `repr(transparent)` gives it that pointer's layout
 6 |     pointer: *const T, // expected to be non-null: `addr` relies on it
 7 | }
 8 | 
 9 | impl<T> DeviceNonNull<T> {
10 |     pub const unsafe fn new_unchecked(ptr: *mut T) -> Self {
11 |         Self { pointer: ptr as *const T } // no null check; the caller is responsible
12 |     }
13 | 
14 |     #[inline]
15 |     pub fn new(ptr: *mut T) -> Option<Self> {
16 |         if ptr.is_null() {
17 |             return None;
18 |         }
19 |         // SAFETY: `ptr` was checked to be non-null just above.
20 |         Some(unsafe { Self::new_unchecked(ptr) })
21 |     }
22 | 
23 |     #[inline]
24 |     pub fn addr(self) -> NonZeroUsize {
25 |         unsafe { NonZeroUsize::new_unchecked(self.pointer.addr()) } // relies on the non-null invariant
26 |     }
27 | 
28 |     #[inline]
29 |     pub fn with_addr(self, addr: NonZeroUsize) -> Self {
30 |         unsafe { Self::new_unchecked(self.pointer.with_addr(addr.get()).cast_mut()) }
31 |     }
32 | 
33 |     #[inline]
34 |     pub fn map_addr(self, f: impl FnOnce(NonZeroUsize) -> NonZeroUsize) -> Self {
35 |         self.with_addr(f(self.addr())) // `f` maps the current address to the new one
36 |     }
37 | 
38 |     #[inline]
39 |     pub const fn as_ptr(self) -> *mut T {
40 |         self.pointer.cast_mut()
41 |     }
42 | 
43 |     #[inline]
44 |     pub const fn cast<U>(self) -> DeviceNonNull<U> {
45 |         unsafe { DeviceNonNull::new_unchecked(self.as_ptr().cast::<U>()) }
46 |     }
47 | }
48 | 
49 | impl<T> Clone for DeviceNonNull<T> { // hand-written so no `T: Clone` bound is required
50 |     #[inline(always)]
51 |     fn clone(&self) -> Self {
52 |         *self // bitwise copy of the pointer (the type is Copy)
53 |     }
54 | }
55 | 
56 | impl<T> Copy for DeviceNonNull<T> {} // copying duplicates the pointer only, never the pointee
57 | 
58 | impl<T> fmt::Debug for DeviceNonNull<T> { // prints `DeviceNonNull(ptr)`
59 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60 |         f.debug_tuple("DeviceNonNull")
61 |             .field(&self.as_ptr())
62 |             .finish()
63 |     }
64 | }
65 | 
66 | impl<T> fmt::Pointer for DeviceNonNull<T> { // `{:p}` produces the same tuple text as the Debug impl
67 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
68 |         f.debug_tuple("DeviceNonNull")
69 |             .field(&self.as_ptr())
70 |             .finish()
71 |     }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/mccs/src/daemon/mod.rs:
--------------------------------------------------------------------------------
 1 | use thiserror::Error;
 2 | 
 3 | pub mod engine;
 4 | 
 5 | #[derive(Debug, Error)]
 6 | pub(crate) enum Error { // crate-internal error type of the daemon module
 7 |     #[error("ipc-channel TryRecvError")]
 8 |     IpcTryRecv,
 9 |     #[error("Customer error: {0}")]
10 |     Customer(#[from] ipc::Error), // `#[from]` lets `?` convert ipc::Error into this variant
11 | }
12 | 
13 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
14 | #[repr(transparent)]
15 | pub struct DaemonId(pub u32); // newtype over a u32 daemon identifier; hashable so it can key maps
16 | 


--------------------------------------------------------------------------------
/src/mccs/src/engine.rs:
--------------------------------------------------------------------------------
 1 | pub enum SchedulingMode { // hint returned by Engine::scheduling_hint
 2 |     Dedicated, // the default hint
 3 |     Compact,
 4 | }
 5 | 
 6 | pub enum EngineStatus { // result of one Engine::progress call
 7 |     Idle, // NOTE(review): presumably "no work was done this call" -- confirm with the runtime
 8 |     Progressed,
 9 |     Completed, // NOTE(review): presumably "engine is finished" -- confirm with the runtime
10 | }
11 | 
12 | pub trait Engine: Send + Unpin + 'static { // unit of work that is driven by repeated `progress` calls
13 |     fn progress(&mut self) -> EngineStatus; // advance the engine and report what happened
14 | 
15 |     fn scheduling_hint(&self) -> SchedulingMode { // overridable; defaults to Dedicated
16 |         SchedulingMode::Dedicated
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/src/mccs/src/exchange/command.rs:
--------------------------------------------------------------------------------
 1 | use std::net::SocketAddr;
 2 | 
 3 | use crate::bootstrap::BootstrapHandle;
 4 | use crate::comm::CommunicatorId;
 5 | use ipc::mccs::reconfig::CommPatternReconfig;
 6 | 
 7 | pub enum ExchangeCommand { // commands carrying a communicator id and, where needed, a handle or address
 8 |     RegisterBootstrapHandle(CommunicatorId, BootstrapHandle),
 9 |     // communicator id, root mccs exchange engine listen addr
10 |     RecvBootstrapHandle(CommunicatorId, SocketAddr),
11 |     RemoveCommunicator(CommunicatorId),
12 | }
13 | 
14 | pub enum ExchangeNotification { // NOTE(review): presumably sent back by the exchange engine -- confirm
15 |     RegisterBootstrapHandle, // carries no payload
16 |     RecvBootstrapHandle(CommunicatorId, BootstrapHandle),
17 |     CommPatternReconfig(CommPatternReconfig),
18 | }
19 | 


--------------------------------------------------------------------------------
/src/mccs/src/exchange/message.rs:
--------------------------------------------------------------------------------
 1 | use std::net::SocketAddr;
 2 | 
 3 | use serde::{Deserialize, Serialize};
 4 | 
 5 | use ipc::mccs::reconfig::ExchangeReconfigCommand;
 6 | 
 7 | use crate::bootstrap::BootstrapHandle;
 8 | use crate::comm::CommunicatorId;
 9 | 
10 | #[derive(Debug, Clone, Serialize, Deserialize)]
11 | pub enum ExchangeProxyMessage { // serializable; variant order is part of the encoding
12 |     BootstrapHandle(CommunicatorId, BootstrapHandle),
13 |     BootstrapHandleRequest(CommunicatorId, SocketAddr), // presumably the address to reply to -- confirm
14 | }
15 | 
16 | #[derive(Debug, Clone, Serialize, Deserialize)]
17 | pub enum ExchangeMessage { // top-level serializable message: proxy traffic or a reconfiguration command
18 |     ProxyMessage(ExchangeProxyMessage),
19 |     ReconfigCommand(ExchangeReconfigCommand),
20 | }
21 | 


--------------------------------------------------------------------------------
/src/mccs/src/exchange/mod.rs:
--------------------------------------------------------------------------------
 1 | pub mod command;
 2 | pub mod engine;
 3 | pub mod message;
 4 | 
 5 | use thiserror::Error;
 6 | 
 7 | #[derive(Debug, Error)]
 8 | pub enum ExchangeError { // both variants convert automatically through `?` thanks to #[from]
 9 |     #[error("IO error: {0}")]
10 |     Io(#[from] std::io::Error),
11 |     #[error("Bincode error: {0}")]
12 |     Bincode(#[from] bincode::Error),
13 | }
14 | 


--------------------------------------------------------------------------------
/src/mccs/src/lib.rs:
--------------------------------------------------------------------------------
 1 | #![feature(peer_credentials_unix_socket)]
 2 | #![feature(strict_provenance)]
 3 | #![feature(maybe_uninit_uninit_array)]
 4 | #![feature(maybe_uninit_array_assume_init)]
 5 | #![feature(int_roundings)]
 6 | #![feature(variant_count)]
 7 | #![feature(atomic_from_mut)]
 8 | #![feature(slice_ptr_get)]
 9 | #![feature(extract_if)]
10 | // todo: temporary
11 | #![allow(dead_code)]
12 | #![allow(unused)]
13 | 
14 | pub mod bootstrap;
15 | #[allow(dead_code)]
16 | pub mod comm;
17 | pub mod config;
18 | pub mod control;
19 | pub mod cuda;
20 | pub mod daemon;
21 | pub mod engine;
22 | pub mod exchange;
23 | pub mod message;
24 | pub mod pattern;
25 | pub mod proxy;
26 | pub mod registry;
27 | pub mod runtime;
28 | pub mod transport;
29 | pub mod utils;
30 | 


--------------------------------------------------------------------------------
/src/mccs/src/main.rs:
--------------------------------------------------------------------------------
 1 | use std::path::PathBuf;
 2 | use std::sync::atomic::{AtomicBool, Ordering};
 3 | 
 4 | use nix::sys::signal;
 5 | 
 6 | use anyhow::Result;
 7 | use structopt::StructOpt;
 8 | 
 9 | use mccs::config::Config;
10 | use mccs::control::Control;
11 | 
12 | use chrono::Timelike;
13 | use env_logger::fmt::Color;
14 | use std::io::Write;
15 | 
16 | #[derive(Debug, Clone, StructOpt)]
17 | #[structopt(name = "mCCS Service")]
18 | struct Opts { // command-line options of the mCCS service binary
19 |     /// mCCS config path
20 |     #[structopt(short, long, default_value = "mccs.toml")]
21 |     config: PathBuf, // loaded with Config::from_path in main
22 |     #[structopt(short, long)]
23 |     host: usize, // passed to Control::new in main
24 | }
25 | 
26 | static TERMINATE: AtomicBool = AtomicBool::new(false); // set by the SIGINT handler, handed to Control::mainloop
27 | 
28 | extern "C" fn handle_sigint(sig: i32) { // installed for SIGINT in main
29 |     assert_eq!(sig, signal::SIGINT as i32); // the handler is registered for SIGINT only
30 |     TERMINATE.store(true, Ordering::Relaxed); // only raises the flag; the main loop is given this flag
31 | }
32 | 
33 | fn main() -> Result<()> { // entry point: parse options, load config, set up logging and SIGINT, run Control
34 |     better_panic::install();
35 |     // load config
36 |     let opts = Opts::from_args();
37 |     let config = Config::from_path(opts.config)?; // a missing or invalid config file aborts startup
38 |     env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")) // filter defaults to "info" when the environment sets none
39 |         .format(|buf, record| { // line layout: [HH:MM:SS.mmm LEVEL module] message
40 |             let time = chrono::Local::now();
41 |             let style = buf
42 |                 .style()
43 |                 .set_color(Color::Black)
44 |                 .set_intense(true)
45 |                 .clone(); // style applied to the surrounding brackets
46 |             let time = format!(
47 |                 "{:02}:{:02}:{:02}.{:03}",
48 |                 time.hour() % 24,
49 |                 time.minute(),
50 |                 time.second(),
51 |                 time.timestamp_subsec_millis()
52 |             );
53 |             writeln!(
54 |                 buf,
55 |                 "{}{} {} {}{} {}",
56 |                 style.value("["),
57 |                 time,
58 |                 buf.default_styled_level(record.level()),
59 |                 record.module_path().unwrap_or(""), // empty when the record has no module path
60 |                 style.value("]"),
61 |                 record.args()
62 |             )
63 |         })
64 |         .init();
65 | 
66 |     // process Ctrl-C event
67 |     let sig_action = signal::SigAction::new(
68 |         signal::SigHandler::Handler(handle_sigint),
69 |         signal::SaFlags::empty(),
70 |         signal::SigSet::empty(),
71 |     );
72 |     unsafe { signal::sigaction(signal::SIGINT, &sig_action) }
73 |         .expect("failed to register sighandler"); // startup panics if the handler cannot be installed
74 | 
75 |     // the Control now takes over
76 |     let mut control = Control::new(config, opts.host);
77 |     log::info!("Started mCCS");
78 | 
79 |     control.mainloop(&TERMINATE) // the loop's result becomes the process result
80 | }
81 | 


--------------------------------------------------------------------------------
/src/mccs/src/message.rs:
--------------------------------------------------------------------------------
 1 | use crate::daemon::DaemonId;
 2 | use crate::proxy::command::{ProxyCommand, ProxyCompletion};
 3 | use cuda_runtime_sys::cudaStream_t;
 4 | 
 5 | use crate::transport::engine::TransportEngineId;
 6 | use crate::transport::message::{TransportEngineReply, TransportEngineRequest};
 7 | use crate::utils::duplex_chan::DuplexChannel;
 8 | 
 9 | pub enum ControlRequest { // NOTE(review): presumably requests addressed to the control plane -- confirm
10 |     NewTransportEngine(TransportEngineId),
11 | }
12 | 
13 | pub enum ControlNotification { // each variant hands over the id and duplex channel of a new component
14 |     NewDaemon {
15 |         id: DaemonId,
16 |         chan: DuplexChannel<ProxyCompletion, ProxyCommand>, // NOTE(review): direction of the two type parameters is defined by DuplexChannel -- check utils::duplex_chan
17 |     },
18 |     NewTransportEngine {
19 |         id: TransportEngineId,
20 |         chan: DuplexChannel<TransportEngineRequest, TransportEngineReply>,
21 |     },
22 | }
23 | 
24 | #[derive(Debug, Clone)]
25 | pub struct CudaStream(usize); // a cudaStream_t stored as an integer address (see the conversions below)
26 | 
27 | impl From<CudaStream> for cudaStream_t { // `From` is the idiomatic direction; `.into()` keeps working via the blanket impl
28 |     fn from(stream: CudaStream) -> cudaStream_t {
29 |         stream.0 as cudaStream_t // turn the stored address back into the raw stream handle
30 |     }
31 | }
32 | 
33 | impl From<cudaStream_t> for CudaStream { // wraps a raw stream handle
34 |     fn from(value: cudaStream_t) -> Self {
35 |         Self(value as usize) // only the numeric value of the handle is kept
36 |     }
37 | }
38 | 


--------------------------------------------------------------------------------
/src/mccs/src/pattern.rs:
--------------------------------------------------------------------------------
 1 | pub const MCCS_STEP: u32 = 8; // base step count from which the values below are derived
 2 | pub const ALLGATHER_CHUNK_STEPS: u32 = MCCS_STEP / 2; // = 4
 3 | pub const ALLREDUCE_CHUNK_STEPS: u32 = MCCS_STEP / 2; // = 4
 4 | pub const ALLGATHER_SLICE_STEPS: u32 = MCCS_STEP / 4; // = 2
 5 | pub const ALLREDUCE_SLICE_STEPS: u32 = MCCS_STEP / 4; // = 2
 6 | 
 7 | #[derive(Clone, Debug)]
 8 | pub struct RingPattern { // one rank's view of a ring
 9 |     pub prev: usize, // presumably the rank this rank receives from -- confirm
10 |     pub next: usize, // presumably the rank this rank sends to -- confirm
11 |     pub user_ranks: Vec<usize>, // NOTE(review): ordering convention is not visible here -- confirm
12 |     // rank 0's distance to my rank along the ring send path
13 |     pub index: usize,
14 | }
15 | 


--------------------------------------------------------------------------------
/src/mccs/src/proxy/command.rs:
--------------------------------------------------------------------------------
 1 | use std::net::IpAddr;
 2 | 
 3 | use crate::comm::CommunicatorId;
 4 | use ipc::mccs::handle::CudaEventHandle;
 5 | 
 6 | use super::task::{TaskDataType, TaskReduceOpType};
 7 | 
 8 | pub struct InitCommunicator { // payload of ProxyCommand::InitCommunicator
 9 |     pub communicator_id: CommunicatorId,
10 |     pub root_mccs_addr: IpAddr, // presumably the host of the root rank's mCCS service -- confirm
11 |     pub rank: usize,
12 |     pub num_ranks: usize,
13 | }
14 | 
15 | #[derive(Clone, Debug)]
16 | pub struct AllGatherRequest { // parameters of one all-gather call
17 |     pub communicator_id: CommunicatorId,
18 |     pub send_buf_addr: usize, // buffer address passed as a plain integer
19 |     pub recv_buf_addr: usize, // buffer address passed as a plain integer
20 |     pub size: usize, // NOTE(review): unit (bytes vs elements) is not visible here -- confirm
21 |     // user stream handle
22 |     pub user_stream: usize,
23 | }
24 | 
25 | #[derive(Clone, Debug)]
26 | pub struct AllReduceRequest { // parameters of one all-reduce call
27 |     pub communicator_id: CommunicatorId,
28 |     pub send_buf_addr: usize, // buffer address passed as a plain integer
29 |     pub recv_buf_addr: usize, // buffer address passed as a plain integer
30 |     pub size: usize, // NOTE(review): unit (bytes vs elements) is not visible here -- confirm
31 |     pub data_type: TaskDataType,
32 |     pub op_type: TaskReduceOpType,
33 |     // user stream handle
34 |     pub user_stream: usize,
35 | }
36 | 
37 | pub enum CollRequest { // one collective request; ProxyCommand::GroupCall carries a Vec of these
38 |     AllGather(AllGatherRequest),
39 |     AllReduce(AllReduceRequest),
40 | }
41 | 
42 | pub enum ProxyCommand { // commands; ProxyCompletion below mirrors them, except DestroyCommunicator
43 |     InitCommunicator(InitCommunicator),
44 |     // user stream and user event IPC handle
45 |     RegisterStream(usize, CudaEventHandle),
46 |     AllGather(AllGatherRequest),
47 |     AllReduce(AllReduceRequest),
48 |     GroupCall(Vec<CollRequest>), // several collectives submitted together
49 |     DestroyCommunicator(CommunicatorId),
50 | }
51 | 
52 | pub enum ProxyCompletion { // completions named after the ProxyCommand variants
53 |     InitCommunicator(CudaEventHandle), // the only completion that carries a payload
54 |     RegisterStream,
55 |     AllGather,
56 |     AllReduce,
57 |     GroupCall,
58 | }
59 | 


--------------------------------------------------------------------------------
/src/mccs/src/proxy/message.rs:
--------------------------------------------------------------------------------
1 | use crate::comm::CommunicatorId;
2 | use crate::transport::channel::PeerConnId;
3 | use crate::transport::transporter::ConnectHandle;
4 | 
5 | pub enum ProxyPeerMessage { // NOTE(review): presumably exchanged between proxy engines -- confirm
6 |     ConnectInfoExchange(CommunicatorId, PeerConnId, ConnectHandle),
7 | }
8 | 


--------------------------------------------------------------------------------
/src/mccs/src/proxy/mod.rs:
--------------------------------------------------------------------------------
 1 | use std::net::IpAddr;
 2 | 
 3 | pub mod command;
 4 | pub mod engine;
 5 | pub mod init;
 6 | pub mod message;
 7 | pub mod op;
 8 | pub mod plan;
 9 | pub mod task;
10 | 
11 | pub struct DeviceInfo { // host address, listen port and CUDA device index of one device
12 |     pub host: IpAddr,
13 |     pub listen_port: u16,
14 |     pub cuda_device_idx: i32,
15 | }
16 | 


--------------------------------------------------------------------------------
/src/mccs/src/proxy/op.rs:
--------------------------------------------------------------------------------
1 | use crate::{comm::CommunicatorId, daemon::DaemonId};
2 | 
3 | pub enum ProxyOp { // NOTE(review): presumably operations queued inside the proxy engine -- confirm
4 |     InitCommunicator(DaemonId, CommunicatorId),
5 |     RebootCommunicator(CommunicatorId),
6 |     PollCommunicatorComplete(CommunicatorId),
7 | }
8 | 


--------------------------------------------------------------------------------
/src/mccs/src/registry.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::HashMap;
 2 | use std::sync::Arc;
 3 | 
 4 | use crate::comm::CommunicatorId;
 5 | use crate::config::CommPatternConfig;
 6 | use crate::config::DefaultCommConfig;
 7 | use crate::transport::catalog::TransportCatalog;
 8 | use crate::transport::delegator::TransportDelegator;
 9 | 
10 | #[derive(Clone)]
11 | pub struct GlobalRegistry { // cloning copies the config values and shares the two Arc-held services
12 |     pub default_comm_config: DefaultCommConfig,
13 |     pub comm_pattern_override: HashMap<CommunicatorId, CommPatternConfig>, // overrides keyed by communicator id
14 |     pub transport_delegator: Arc<TransportDelegator>,
15 |     pub transport_catalog: Arc<TransportCatalog>,
16 | }
17 | 


--------------------------------------------------------------------------------
/src/mccs/src/runtime/mod.rs:
--------------------------------------------------------------------------------
 1 | pub mod affinity;
 2 | pub mod executor;
 3 | pub mod manager;
 4 | 
 5 | pub use affinity::CoreMask;
 6 | pub use executor::{Runtime, RuntimeId, RuntimeMode};
 7 | pub use manager::RuntimeManager;
 8 | 
 9 | use crate::engine::Engine;
10 | pub type EngineContainer = Box<dyn Engine>; // owned, type-erased engine
11 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/catalog.rs:
--------------------------------------------------------------------------------
 1 | use std::any::Any;
 2 | 
 3 | use dashmap::mapref::one::{MappedRef, MappedRefMut};
 4 | use dashmap::DashMap;
 5 | use thiserror::Error;
 6 | 
 7 | pub type AnyConfig = Box<dyn Any + Send + Sync>; // type-erased config value stored in the catalog
 8 | pub type ConfigRef<'a, T> = MappedRef<'a, String, AnyConfig, T>; // shared map guard downcast to `T`
 9 | pub type ConfigRefMut<'a, T> = MappedRefMut<'a, String, AnyConfig, T>; // exclusive map guard downcast to `T`
10 | 
11 | #[derive(Error, Debug)]
12 | pub enum Error { // errors returned by TransportCatalog lookups
13 |     #[error("Fail to downcast to a concrete type")]
14 |     Downcast, // the entry exists but is not of the requested type
15 |     #[error("Resources not found")]
16 |     NotFound, // no entry under the requested name
17 | }
18 | 
19 | // TODO: temporary solution for async agent setup
20 | pub struct TransportCatalog { // name -> type-erased config, held in a concurrent DashMap
21 |     config: DashMap<String, AnyConfig>,
22 | }
23 | 
24 | impl TransportCatalog {
25 |     pub fn new() -> Self { // creates an empty catalog
26 |         TransportCatalog {
27 |             config: DashMap::new(),
28 |         }
29 |     }
30 | }
31 | 
32 | impl Default for TransportCatalog {
33 |     fn default() -> Self { // same as `new()`: an empty catalog
34 |         Self::new()
35 |     }
36 | }
37 | 
38 | impl TransportCatalog {
39 |     // Stores `config` under `name`.
40 |     // The value is boxed and type-erased; readers recover the concrete
41 |     // type through `get_config` / `get_config_mut`.
42 |     pub fn register_config<T>(&self, name: String, config: T)
43 |     where
44 |         T: Any + Send + Sync,
45 |     {
46 |         self.config.insert(name, Box::new(config));
47 |     }
48 | 
49 |     // Removes the entry stored under `name`, if any.
50 |     // `T` is unused but remains part of the public signature.
51 |     pub fn remove_config<T>(&self, name: &str) {
52 |         let _removed = self.config.remove(name);
53 |     }
54 | 
55 |     // Shared, typed view of the entry stored under `name`.
56 |     // Errors: `NotFound` when there is no such entry,
57 |     // `Downcast` when the stored value is not a `T`.
58 |     pub fn get_config<T>(&self, name: &str) -> Result<ConfigRef<T>, Error>
59 |     where
60 |         T: Any + Send + Sync,
61 |     {
62 |         let entry = self.config.get(name).ok_or(Error::NotFound)?;
63 |         entry
64 |             .try_map(|any| any.downcast_ref::<T>())
65 |             .map_err(|_| Error::Downcast)
66 |     }
67 | 
68 |     // Exclusive, typed view of the entry stored under `name`.
69 |     // Errors: `NotFound` when there is no such entry,
70 |     // `Downcast` when the stored value is not a `T`.
71 |     pub fn get_config_mut<T>(&self, name: &str) -> Result<ConfigRefMut<T>, Error>
72 |     where
73 |         T: Any + Send + Sync,
74 |     {
75 |         let entry = self.config.get_mut(name).ok_or(Error::NotFound)?;
76 |         entry
77 |             .try_map(|any| any.downcast_mut::<T>())
78 |             .map_err(|_| Error::Downcast)
79 |     }
80 | }
81 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/channel.rs:
--------------------------------------------------------------------------------
 1 | use std::any::Any;
 2 | use std::collections::HashMap;
 3 | 
 4 | use super::engine::TransportEngineId;
 5 | use super::transporter::Transporter;
 6 | use super::NUM_PROTOCOLS;
 7 | use crate::cuda::ptr::DeviceNonNull;
 8 | use crate::pattern::RingPattern;
 9 | use std::fmt::Display;
10 | 
11 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
12 | pub enum ConnType { // direction of a peer connection
13 |     Send,
14 |     Recv,
15 | }
16 | 
17 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
18 | pub struct PeerConnId {
19 |     pub(crate) peer_rank: usize,
20 |     pub(crate) channel: ChannelId,
21 |     pub(crate) conn_index: u32,
22 |     pub(crate) conn_type: ConnType,
23 | }
24 | 
25 | pub struct PeerConnInfo {
26 |     pub bufs: [DeviceNonNull<u8>; NUM_PROTOCOLS],
27 |     pub head: DeviceNonNull<u64>,
28 |     pub tail: DeviceNonNull<u64>,
29 |     pub slots_size: Option<DeviceNonNull<u32>>,
30 | }
31 | 
32 | pub struct PeerConnector {
33 |     pub conn_info: PeerConnInfo,
34 |     pub transport_agent_engine: Option<TransportEngineId>,
35 |     pub transporter: &'static dyn Transporter,
36 |     pub transport_resources: Box<dyn Any>,
37 | }
38 | 
39 | pub const CHANNEL_MAX_CONNS: usize = 2;
40 | 
41 | pub struct ChannelPeerConn {
42 |     // conn_index -> PeerConnector
43 |     pub send: [Option<PeerConnector>; CHANNEL_MAX_CONNS],
44 |     // conn_index -> PeerConnector
45 |     pub recv: [Option<PeerConnector>; CHANNEL_MAX_CONNS],
46 | }
47 | 
48 | pub struct CommChannel {
49 |     // peer -> ChannelPeerConn
50 |     pub peers: HashMap<usize, ChannelPeerConn>,
51 |     pub ring: RingPattern,
52 |     pub work_queue_next_available: u32,
53 | }
54 | 
/// Numeric identifier of a channel; a thin wrapper around `u32`.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub struct ChannelId(pub u32);

impl Display for ChannelId {
    /// Prints the wrapped number.
    ///
    /// Formatting is delegated to `u32`'s own `Display` implementation, so no
    /// temporary `String` is allocated and width / fill / alignment flags
    /// supplied by the caller (e.g. `{:>4}`) are honoured.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        Display::fmt(&self.0, f)
    }
}
63 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/message.rs:
--------------------------------------------------------------------------------
 1 | use super::{
 2 |     op::TransportOp,
 3 |     transporter::{AgentMessage, TransportAgentId, Transporter},
 4 | };
 5 | 
 6 | pub enum TransportEngineRequest {
 7 |     AgentSetup(&'static dyn Transporter, TransportAgentId, AgentMessage),
 8 |     AgentConnect(&'static dyn Transporter, TransportAgentId, AgentMessage),
 9 |     AgentTransportOp(TransportAgentId, TransportOp),
10 |     AgentShutdown(TransportAgentId),
11 | }
12 | 
13 | pub enum TransportEngineReply {
14 |     AgentSetup(TransportAgentId, AgentMessage),
15 |     AgentConnect(TransportAgentId, AgentMessage),
16 |     AgentShutdown(TransportAgentId),
17 | }
18 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/meta.rs:
--------------------------------------------------------------------------------
 1 | use std::ffi::c_void;
 2 | 
 3 | use super::NUM_BUFFER_SLOTS;
 4 | const CACHE_LINE_SIZE: usize = 128;
 5 | 
 6 | #[repr(C, align(4096))]
 7 | pub struct SendBufMeta {
 8 |     pub head: u64,
 9 |     _pad1: [u8; CACHE_LINE_SIZE - std::mem::size_of::<u64>()],
10 |     _ptr_exchange: *mut c_void,
11 |     _reduce_op_arg_exchange: [u64; 2],
12 |     _pad2:
13 |         [u8; CACHE_LINE_SIZE - std::mem::size_of::<*mut c_void>() - 2 * std::mem::size_of::<u64>()],
14 |     _slots_offsets: [i32; NUM_BUFFER_SLOTS],
15 | }
16 | 
17 | static_assertions::const_assert_eq!(std::mem::size_of::<SendBufMeta>(), 4096);
18 | 
19 | impl SendBufMeta {
20 |     pub fn new() -> Self {
21 |         SendBufMeta {
22 |             head: 0,
23 |             _pad1: [0; CACHE_LINE_SIZE - std::mem::size_of::<u64>()],
24 |             _ptr_exchange: std::ptr::null_mut(),
25 |             _reduce_op_arg_exchange: [0; 2],
26 |             _pad2: [0; CACHE_LINE_SIZE
27 |                 - std::mem::size_of::<*mut c_void>()
28 |                 - 2 * std::mem::size_of::<u64>()],
29 |             _slots_offsets: [0; NUM_BUFFER_SLOTS],
30 |         }
31 |     }
32 | }
33 | 
34 | impl Default for SendBufMeta {
35 |     fn default() -> Self {
36 |         Self::new()
37 |     }
38 | }
39 | 
40 | #[repr(C, align(4096))]
41 | pub struct RecvBufMeta {
42 |     pub tail: u64,
43 |     _pad1: [u8; CACHE_LINE_SIZE - std::mem::size_of::<u64>()],
44 |     pub slots_sizes: [i32; NUM_BUFFER_SLOTS],
45 |     _slots_offsets: [i32; NUM_BUFFER_SLOTS],
46 |     _flush: i32,
47 | }
48 | 
49 | static_assertions::const_assert_eq!(std::mem::size_of::<RecvBufMeta>(), 4096);
50 | 
51 | impl Default for RecvBufMeta {
52 |     fn default() -> Self {
53 |         Self::new()
54 |     }
55 | }
56 | 
57 | impl RecvBufMeta {
58 |     pub fn new() -> Self {
59 |         RecvBufMeta {
60 |             tail: 0,
61 |             _pad1: [0; CACHE_LINE_SIZE - std::mem::size_of::<u64>()],
62 |             slots_sizes: [0; NUM_BUFFER_SLOTS],
63 |             _slots_offsets: [0; NUM_BUFFER_SLOTS],
64 |             _flush: 0,
65 |         }
66 |     }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/mod.rs:
--------------------------------------------------------------------------------
 1 | use strum::{EnumCount, EnumIter};
 2 | 
 3 | use collectives_sys::MCCS_BUFFER_SLOTS;
 4 | use collectives_sys::{MCCS_NUM_PROTOCOLS, MCCS_PROTO_SIMPLE};
 5 | 
 6 | pub mod catalog;
 7 | pub mod channel;
 8 | pub mod delegator;
 9 | pub mod engine;
10 | pub mod message;
11 | pub mod meta;
12 | pub mod net;
13 | pub mod op;
14 | pub mod queue;
15 | pub mod setup;
16 | pub mod shm;
17 | pub mod task;
18 | pub mod transporter;
19 | 
20 | use net::transporter::NetTransport;
21 | use shm::transporter::ShmTransporter;
22 | use transporter::Transporter;
23 | 
24 | pub static SHM_TRANSPORTER: ShmTransporter = ShmTransporter;
25 | pub static NET_TRANSPORTER: NetTransport = NetTransport;
26 | pub static ALL_TRANSPORTERS: &[&'static dyn Transporter] = &[&SHM_TRANSPORTER, &NET_TRANSPORTER];
27 | 
28 | pub const NUM_BUFFER_SLOTS: usize = MCCS_BUFFER_SLOTS as _;
29 | pub const NUM_PROTOCOLS: usize = MCCS_NUM_PROTOCOLS as _;
30 | 
31 | #[derive(Debug, PartialEq, Eq, Clone, Copy, EnumIter, EnumCount)]
32 | #[repr(usize)]
33 | pub enum Protocol {
34 |     Simple = MCCS_PROTO_SIMPLE as _,
35 | }
36 | 
37 | static_assertions::const_assert_eq!(std::mem::variant_count::<Protocol>(), NUM_PROTOCOLS);
38 | 
/// Default buffer size: 4 MiB (4 194 304 bytes).
pub const DEFAULT_BUFFER_SIZE: usize = 4 * 1024 * 1024;
40 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/net/config.rs:
--------------------------------------------------------------------------------
 1 | use serde::{Deserialize, Serialize};
 2 | 
 3 | #[derive(Debug, Clone, Serialize, Deserialize)]
 4 | pub struct NetTransportConfig {
 5 |     pub gdr_enable: bool,
 6 |     pub gdr_copy_sync_enable: bool,
 7 |     pub gdr_copy_flush_enable: bool,
 8 | }
 9 | 
10 | impl Default for NetTransportConfig {
11 |     fn default() -> Self {
12 |         NetTransportConfig {
13 |             gdr_enable: false,
14 |             gdr_copy_sync_enable: false,
15 |             gdr_copy_flush_enable: false,
16 |         }
17 |     }
18 | }
19 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/net/mod.rs:
--------------------------------------------------------------------------------
 1 | pub mod agent;
 2 | pub mod buffer;
 3 | pub mod config;
 4 | pub mod provider;
 5 | pub mod resources;
 6 | pub mod transporter;
 7 | 
 8 | pub use provider::RDMA_TRANSPORT;
 9 | pub use provider::{NetProperties, NetProvierWrap};
10 | pub use transporter::NET_TRANSPORT;
11 | 
12 | use thiserror::Error;
13 | 
14 | use crate::transport::transporter::ConnectHandleError;
15 | use provider::NetProviderError;
16 | 
17 | #[derive(Debug, Error)]
18 | pub enum NetTransportError {
19 |     #[error("Failed to downcast setup resources")]
20 |     DowncastSetupResources,
21 |     #[error("Failed to downcast agent reply")]
22 |     DowncastAgentReply,
23 |     #[error("Invalid agent reply")]
24 |     InvalidAgentReply,
25 |     #[error("Connection handle: {0}")]
26 |     ConnectionHandle(#[from] ConnectHandleError),
27 |     #[error("Net provider error: {0}")]
28 |     NetProvider(#[from] NetProviderError),
29 | }
30 | 
31 | #[derive(Debug, Error)]
32 | pub enum NetAgentError {
33 |     #[error("Net provider error: {0}")]
34 |     NetProvider(#[from] NetProviderError),
35 |     #[error("Ring buffer registration error: {0}")]
36 |     BufferRegistration(String),
37 |     #[error("Failed to downcast agent request")]
38 |     DowncastAgentRequest,
39 |     #[error("Failed to downcast agent resources")]
40 |     DowncastAgentResources,
41 |     #[error("Transport catalog error: {0}")]
42 |     TransportCatalog(#[from] crate::transport::catalog::Error),
43 | }
44 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/shm/config.rs:
--------------------------------------------------------------------------------
 1 | use serde::{Deserialize, Serialize};
 2 | 
 3 | #[derive(Debug, Clone, Serialize, Deserialize)]
 4 | pub struct ShmTransportConfig {
 5 |     pub locality: ShmLocality,
 6 |     #[serde(rename = "memcpy_send")]
 7 |     pub use_memcpy_send: bool,
 8 |     #[serde(rename = "memcpy_recv")]
 9 |     pub use_memcpy_recv: bool,
10 | }
11 | 
12 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
13 | pub enum ShmLocality {
14 |     Sender,
15 |     Receiver,
16 | }
17 | 
18 | impl Default for ShmTransportConfig {
19 |     fn default() -> Self {
20 |         ShmTransportConfig {
21 |             locality: ShmLocality::Sender,
22 |             use_memcpy_send: false,
23 |             use_memcpy_recv: false,
24 |         }
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/shm/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod agent;
2 | pub mod buffer;
3 | pub mod config;
4 | pub mod resources;
5 | pub mod transporter;
6 | 


--------------------------------------------------------------------------------
/src/mccs/src/transport/task.rs:
--------------------------------------------------------------------------------
1 | use super::{op::TransportOp, transporter::TransportAgentId};
2 | 
3 | pub struct TransportTask {
4 |     pub(crate) agent_id: TransportAgentId,
5 |     pub(crate) op: TransportOp,
6 | }
7 | 


--------------------------------------------------------------------------------
/src/mccs/src/utils/duplex_chan.rs:
--------------------------------------------------------------------------------
 1 | pub struct DuplexChannel<T, R> {
 2 |     pub tx: crossbeam::channel::Sender<T>,
 3 |     pub rx: crossbeam::channel::Receiver<R>,
 4 | }
 5 | 
 6 | impl<T, R> DuplexChannel<T, R> {
 7 |     pub fn new_unbound_pair() -> (DuplexChannel<T, R>, DuplexChannel<R, T>) {
 8 |         let (t_tx, t_rx) = crossbeam::channel::unbounded();
 9 |         let (r_tx, r_rx) = crossbeam::channel::unbounded();
10 |         (
11 |             DuplexChannel { tx: t_tx, rx: r_rx },
12 |             DuplexChannel { tx: r_tx, rx: t_rx },
13 |         )
14 |     }
15 | 
16 |     pub fn new_bound_pair(
17 |         t_to_r: usize,
18 |         r_to_t: usize,
19 |     ) -> (DuplexChannel<T, R>, DuplexChannel<R, T>) {
20 |         let (t_tx, t_rx) = crossbeam::channel::bounded(t_to_r);
21 |         let (r_tx, r_rx) = crossbeam::channel::bounded(r_to_t);
22 |         (
23 |             DuplexChannel { tx: t_tx, rx: r_rx },
24 |             DuplexChannel { tx: r_tx, rx: t_rx },
25 |         )
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/src/mccs/src/utils/mod.rs:
--------------------------------------------------------------------------------
 1 | pub mod duplex_chan;
 2 | pub mod gdr;
 3 | pub mod interfaces;
 4 | pub mod pool;
 5 | pub mod tcp;
 6 | 
 7 | #[macro_export]
 8 | macro_rules! cuda_warning {
 9 |     ($cuda_op:expr) => {{
10 |         let e = $cuda_op;
11 |         if e != cuda_runtime_sys::cudaError::cudaSuccess {
12 |             log::error!(
13 |                 "CUDA runtime failed with {:?} at {}:{}.",
14 |                 e,
15 |                 file!(),
16 |                 line!()
17 |             )
18 |         }
19 |     }};
20 |     ($cuda_op:expr,$ctx:expr) => {{
21 |         let e = $cuda_op;
22 |         if e != cuda_runtime_sys::cudaError::cudaSuccess {
23 |             log::error!(
24 |                 "CUDA runtime failed with {:?} at {}:{}. Context={}",
25 |                 e,
26 |                 file!(),
27 |                 line!(),
28 |                 $ctx
29 |             )
30 |         }
31 |     }};
32 | }
33 | 
/// Evaluates a CUDA driver call and logs an error (via `log::error!`) when
/// the result is not `CUDA_SUCCESS`. Execution continues either way.
///
/// * `cu_warning!(call)` logs the status together with the file and line of
///   the invocation.
/// * `cu_warning!(call, ctx)` additionally appends `Context=<ctx>`; `ctx` is
///   only evaluated when the call failed.
#[macro_export]
macro_rules! cu_warning {
    ($cu_op:expr) => {{
        let e = $cu_op;
        if e != cuda_driver_sys::CUresult::CUDA_SUCCESS {
            log::error!(
                "CUDA driver failed with {:?} at {}:{}.",
                e,
                file!(),
                line!()
            )
        }
    }};
    ($cu_op:expr, $ctx:expr) => {{
        let e = $cu_op;
        // The path separator must be `::`; a single `:` here made every
        // two-argument invocation fail to expand.
        if e != cuda_driver_sys::CUresult::CUDA_SUCCESS {
            log::error!(
                "CUDA driver failed with {:?} at {}:{}. Context={}",
                e,
                file!(),
                line!(),
                $ctx
            )
        }
    }};
}
60 | 
61 | thread_local!(pub static CU_INIT: () = (|| unsafe {
62 |     cu_warning!(cuda_driver_sys::cuInit(0));
63 |     ()
64 | })());
65 | 


--------------------------------------------------------------------------------
/src/mccs/src/utils/pool.rs:
--------------------------------------------------------------------------------
 1 | pub struct WorkPool<T> {
 2 |     pool: Vec<T>,
 3 | }
 4 | 
 5 | impl<T> WorkPool<T> {
 6 |     pub fn new() -> Self {
 7 |         WorkPool { pool: Vec::new() }
 8 |     }
 9 | }
10 | 
11 | impl<T> Default for WorkPool<T> {
12 |     fn default() -> Self {
13 |         Self::new()
14 |     }
15 | }
16 | 
17 | impl<T> WorkPool<T> {
18 |     pub fn progress<F>(&mut self, mut f: F)
19 |     where
20 |         F: FnMut(&mut T) -> bool,
21 |     {
22 |         let mut idx = 0;
23 |         while idx < self.pool.len() {
24 |             let finished = f(&mut self.pool[idx]);
25 |             if finished {
26 |                 self.pool.swap_remove(idx);
27 |             } else {
28 |                 idx += 1;
29 |             }
30 |         }
31 |     }
32 | 
33 |     pub fn enqueue(&mut self, elem: T) {
34 |         self.pool.push(elem);
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/mccs_examples/allgather_bench/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "allgather_bench"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" }
10 | libmccs = { path = "../../libmccs" }
11 | 
12 | structopt = "0.3.26"


--------------------------------------------------------------------------------
/src/mccs_examples/allgather_proto/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "allgather_proto"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" }
10 | libmccs = { path = "../../libmccs" }
11 | 
12 | structopt = "0.3.26"


--------------------------------------------------------------------------------
/src/mccs_examples/allreduce_bench/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "allreduce_bench"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | chrono = "0.4.33"
10 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" }
11 | libmccs = { path = "../../libmccs" }
12 | 
13 | structopt = "0.3.26"
14 | 


--------------------------------------------------------------------------------
/src/mccs_examples/allreduce_proto/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "allreduce_proto"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" }
10 | libmccs = { path = "../../libmccs" }
11 | 
12 | structopt = "0.3.26"


--------------------------------------------------------------------------------
/src/mccs_examples/cuda_hello/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "cuda_hello"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" }
10 | libmccs = { path = "../../libmccs" }


--------------------------------------------------------------------------------
/src/mccs_examples/cuda_hello/src/main.rs:
--------------------------------------------------------------------------------
 1 | use cuda_runtime_sys::cudaMemcpy;
 2 | use cuda_runtime_sys::{cudaError, cudaMemcpyKind};
 3 | 
 4 | use libmccs::memory::cuda_malloc;
 5 | 
 6 | const BUFFER_SIZE: usize = 1024 * 1024;
 7 | 
 8 | fn main() {
 9 |     let dev_ptr = cuda_malloc(0, BUFFER_SIZE).unwrap();
10 |     let buf = vec![42i32; BUFFER_SIZE / std::mem::size_of::<i32>()];
11 |     let err = unsafe {
12 |         cudaMemcpy(
13 |             dev_ptr.ptr,
14 |             buf.as_ptr() as *const _,
15 |             BUFFER_SIZE,
16 |             cudaMemcpyKind::cudaMemcpyHostToDevice,
17 |         )
18 |     };
19 |     if err != cudaError::cudaSuccess {
20 |         panic!("cudaMemcpy failed")
21 |     }
22 | 
23 |     println!("cudaMemcpy success");
24 | }
25 | 


--------------------------------------------------------------------------------
/src/mccs_examples/ring_config/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ring_config"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | ipc = { path = "../../ipc", features = ["mccs"] }
10 | 
11 | bincode = "1.3.3"
12 | serde = "1.0.149"
13 | byteorder = "1.5.0"
14 | structopt = "0.3.26"
15 | toml = "0.8.8"


--------------------------------------------------------------------------------
/src/mccs_examples/ring_config/src/main.rs:
--------------------------------------------------------------------------------
 1 | use std::io::Write;
 2 | use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
 3 | use std::path::{Path, PathBuf};
 4 | 
 5 | use byteorder::{ByteOrder, LittleEndian};
 6 | use serde::{Deserialize, Serialize};
 7 | use structopt::StructOpt;
 8 | 
 9 | use ipc::mccs::reconfig::{CommPatternReconfig, ExchangeReconfigCommand};
10 | 
11 | #[derive(Debug, Clone, Serialize, Deserialize)]
12 | struct Config {
13 |     mccs_addrs: Vec<IpAddr>,
14 |     mccs_port: u16,
15 |     comm_patterns_reconfig: Vec<CommPatternReconfig>,
16 | }
17 | 
18 | impl Config {
19 |     fn from_path<P: AsRef<Path>>(path: P) -> Config {
20 |         let content = std::fs::read_to_string(path).unwrap();
21 |         let config = toml::from_str(&content).unwrap();
22 |         config
23 |     }
24 | }
25 | 
26 | #[derive(Debug, Clone, StructOpt)]
27 | #[structopt(name = "Comm pattern configurator")]
28 | struct Opts {
29 |     // Path to toml traffic trace
30 |     #[structopt(long, short = "c")]
31 |     config: PathBuf,
32 | }
33 | 
34 | const EXCHANGE_MAGIC: u64 = 0x424ab9f2fc4b9d6e;
35 | 
36 | fn main() {
37 |     let opts = Opts::from_args();
38 |     let config = Config::from_path(opts.config);
39 | 
40 |     let pattern_config = config.comm_patterns_reconfig.clone();
41 |     let command = ExchangeReconfigCommand::CommPatternReconfig(pattern_config);
42 |     let encoded = bincode::serialize(&command).unwrap();
43 | 
44 |     for addr in config.mccs_addrs.iter() {
45 |         let addr = SocketAddr::new(*addr, config.mccs_port);
46 |         let mut buf = [0u8; 5];
47 |         buf[0] = 1;
48 |         LittleEndian::write_u32(&mut buf[1..], encoded.len() as u32);
49 |         let mut magic_buf = [0u8; std::mem::size_of::<u64>()];
50 |         LittleEndian::write_u64(&mut magic_buf, EXCHANGE_MAGIC);
51 |         let mut stream = TcpStream::connect(addr).unwrap();
52 |         stream.set_nodelay(true).unwrap();
53 |         stream.write_all(&magic_buf).unwrap();
54 |         stream.write_all(&buf).unwrap();
55 |         stream.write_all(encoded.as_slice()).unwrap();
56 | 
57 |         println!("Sent command to {}", addr);
58 |     }
59 | }
60 | 


--------------------------------------------------------------------------------
/src/mccs_examples/traffic_gen/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "traffic_gen"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" }
10 | libmccs = { path = "../../libmccs" }
11 | csv = "1.3.0"
12 | 
13 | structopt = "0.3.26"
14 | serde = { version = "1.0.195", features = ["derive"] }
15 | toml = "0.8.8"
16 | spin_sleep = "1.2.0"
17 | chrono = "0.4.33"
18 | 


--------------------------------------------------------------------------------
/src/mccs_tests/rdma_transport/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "rdma_transport"
3 | version = "0.1.0"
4 | edition = "2021"
5 | 
6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7 | 
8 | [dependencies]
9 | mccs = { path = "../../mccs" }


--------------------------------------------------------------------------------
/src/mccs_tests/rdma_transport/examples/client.rs:
--------------------------------------------------------------------------------
1 | fn main() {}
2 | 


--------------------------------------------------------------------------------
/src/mccs_tests/rdma_transport/examples/server.rs:
--------------------------------------------------------------------------------
1 | fn main() {}
2 | 


--------------------------------------------------------------------------------
/src/mccs_tests/rdma_transport/src/lib.rs:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/src/qos-service/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "qos-service"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | intervallum = "1.4.1"
10 | bincode = "1.3.3"
11 | serde = "1.0.149"


--------------------------------------------------------------------------------
/src/qos-service/src/lib.rs:
--------------------------------------------------------------------------------
 1 | use std::collections::HashMap;
 2 | 
 3 | use interval::interval_set::ToIntervalSet;
 4 | use interval::IntervalSet;
 5 | use serde::{Deserialize, Serialize};
 6 | 
 7 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
 8 | #[repr(transparent)]
 9 | #[serde(transparent)]
10 | pub struct CommunicatorId(pub u32);
11 | 
12 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
13 | pub enum QosMode {
14 |     Allow,
15 |     Deny,
16 | }
17 | 
18 | #[derive(Clone, Debug, Serialize, Deserialize)]
19 | pub struct QosIntervalDef {
20 |     // start and end timestamps in microseconds
21 |     pub intervals: Vec<(u64, u64)>,
22 |     pub mode: QosMode,
23 |     pub enforce_step: Option<u64>,
24 | }
25 | 
26 | #[derive(Clone, Debug, Serialize, Deserialize)]
27 | pub struct QosScheduleDef {
28 |     #[serde(deserialize_with = "deserialize_schedule")]
29 |     pub schedule: HashMap<CommunicatorId, QosIntervalDef>,
30 |     pub epoch_microsecs: u64,
31 | }
32 | 
33 | fn deserialize_schedule<'de, D>(
34 |     deserializer: D,
35 | ) -> Result<HashMap<CommunicatorId, QosIntervalDef>, D::Error>
36 | where
37 |     D: serde::Deserializer<'de>,
38 | {
39 |     let map: HashMap<String, QosIntervalDef> = Deserialize::deserialize(deserializer)?;
40 |     map.into_iter()
41 |         .map(|(k, v)| {
42 |             k.parse::<u32>()
43 |                 .map_err(serde::de::Error::custom)
44 |                 .map(|key| (CommunicatorId(key), v))
45 |         })
46 |         .collect()
47 | }
48 | 
49 | #[derive(Clone, Debug)]
50 | pub struct QosInterval {
51 |     pub intervals: IntervalSet<u64>,
52 |     pub mode: QosMode,
53 |     pub enforce_step: Option<u64>,
54 | }
55 | 
56 | impl From<QosIntervalDef> for QosInterval {
57 |     fn from(def: QosIntervalDef) -> Self {
58 |         let intervals = def.intervals.to_interval_set();
59 |         QosInterval {
60 |             intervals,
61 |             mode: def.mode,
62 |             enforce_step: def.enforce_step,
63 |         }
64 |     }
65 | }
66 | 
67 | #[derive(Clone, Debug)]
68 | pub struct QosSchedule {
69 |     pub schedule: HashMap<CommunicatorId, QosInterval>,
70 |     pub epoch_microsecs: u64,
71 | }
72 | 
73 | impl From<QosScheduleDef> for QosSchedule {
74 |     fn from(def: QosScheduleDef) -> Self {
75 |         let schedule = def
76 |             .schedule
77 |             .into_iter()
78 |             .map(|(k, v)| (k, v.into()))
79 |             .collect();
80 |         QosSchedule {
81 |             schedule,
82 |             epoch_microsecs: def.epoch_microsecs,
83 |         }
84 |     }
85 | }
86 | 


--------------------------------------------------------------------------------
/workloads/reconfig_gpt.toml:
--------------------------------------------------------------------------------
1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3
2 | communicator_id = 600
3 | cuda_devices = [0, 1, 0, 1, 0, 1, 0, 1]
4 | 
5 | [[traces]]
6 | size = 83886080
7 | type = "all_reduce"
8 | compute_interval = 6000
9 | 


--------------------------------------------------------------------------------
/workloads/setup-1_gpt_0.toml:
--------------------------------------------------------------------------------
 1 | # VGG on 4 GPUs (2 on H1, 2 on H3)
 2 | communicator_id = 100
 3 | cuda_devices = [0, 1, 0, 1]
 4 | 
 5 | [[traces]]
 6 | # 678M all-reduce
 7 | size = 711839744
 8 | type = "all_reduce"
 9 | # 1750ms compute
10 | compute_interval = 930000


--------------------------------------------------------------------------------
/workloads/setup-1_gpt_1.toml:
--------------------------------------------------------------------------------
 1 | # VGG on 4 GPUs (2 on H1, 2 on H3)
 2 | communicator_id = 101
 3 | cuda_devices = [0, 1, 0, 1]
 4 | 
 5 | [[traces]]
 6 | # 678M all-reduce
 7 | size = 711839744
 8 | type = "all_reduce"
 9 | # 1750ms compute
10 | compute_interval = 930000


--------------------------------------------------------------------------------
/workloads/setup-1_resnet_0.toml:
--------------------------------------------------------------------------------
 1 | # ResNet on 4 GPUs (2 on H0, 2 on H2)
 2 | communicator_id = 100
 3 | cuda_devices = [0, 1, 0, 1]
 4 | 
 5 | [[traces]]
 6 | # 170M all-reduce
 7 | size = 178196640
 8 | type = "all_reduce"
 9 | # 230ms compute
10 | compute_interval = 230000


--------------------------------------------------------------------------------
/workloads/setup-1_resnet_1.toml:
--------------------------------------------------------------------------------
 1 | # ResNet on 4 GPUs (2 on H0, 2 on H2)
 2 | communicator_id = 101
 3 | cuda_devices = [0, 1, 0, 1]
 4 | 
 5 | [[traces]]
 6 | # 170M all-reduce
 7 | size = 178196640
 8 | type = "all_reduce"
 9 | # 230ms compute
10 | compute_interval = 230000


--------------------------------------------------------------------------------
/workloads/setup-1_vgg_0.toml:
--------------------------------------------------------------------------------
 1 | # VGG on 2 GPUs (1 on H1, 1 on H3)
 2 | communicator_id = 100
 3 | cuda_devices = [0, 1, 0, 1]
 4 | 
 5 | [[traces]]
 6 | # 548M all-reduce
 7 | size = 574668960
 8 | type = "all_reduce"
 9 | # 310ms compute
10 | # compute_interval = 310000
11 | compute_interval = 155000
12 | 


--------------------------------------------------------------------------------
/workloads/setup-1_vgg_1.toml:
--------------------------------------------------------------------------------
 1 | # VGG on 2 GPUs (1 on H1, 1 on H3)
 2 | communicator_id = 101
 3 | cuda_devices = [0, 1, 0, 1]
 4 | 
 5 | [[traces]]
 6 | # 548M all-reduce
 7 | size = 574668960
 8 | type = "all_reduce"
 9 | # 310ms compute
10 | # compute_interval = 310000
11 | compute_interval = 155000
12 | 


--------------------------------------------------------------------------------
/workloads/setup-2_gpt_1.toml:
--------------------------------------------------------------------------------
1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3
2 | communicator_id = 201
3 | cuda_devices = [0, 0]
4 | 
5 | [[traces]]
6 | size = 83886080
7 | type = "all_reduce"
8 | compute_interval = 6000
9 | 


--------------------------------------------------------------------------------
/workloads/setup-2_gpt_2.toml:
--------------------------------------------------------------------------------
1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3
2 | communicator_id = 202
3 | cuda_devices = [0, 0]
4 | 
5 | [[traces]]
6 | size = 83886080
7 | type = "all_reduce"
8 | compute_interval = 6000
9 | 


--------------------------------------------------------------------------------
/workloads/setup-2_resnet.toml:
--------------------------------------------------------------------------------
 1 | # ResNet on 2 GPUs (1 on H0, 1 on H2)
 2 | communicator_id = 201  # NOTE(review): was the string 'bad'; an integer id is required — confirm the intended value
 3 | cuda_devices = [0, 0]
 4 | 
 5 | [[traces]]
 6 | # 170M all-reduce
 7 | size = 178196640
 8 | type = "all_reduce"
 9 | # 230ms compute
10 | compute_interval = 230000


--------------------------------------------------------------------------------
/workloads/setup-2_vgg.toml:
--------------------------------------------------------------------------------
 1 | # VGG on 2 GPUs (1 on H1, 1 on H3)
 2 | communicator_id = 200
 3 | cuda_devices = [1, 1, 1, 1]
 4 | 
 5 | [[traces]]
 6 | # 548M all-reduce
 7 | size = 574668960
 8 | type = "all_reduce"
 9 | # 310ms compute
10 | compute_interval = 160000
11 | 


--------------------------------------------------------------------------------
/workloads/setup-3_gpt_1.toml:
--------------------------------------------------------------------------------
 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3
 2 | communicator_id = 300
 3 | cuda_devices = [0, 0, 0, 0]
 4 | 
 5 | [[traces]]
 6 | # 678M all-reduce
 7 | size = 711839744
 8 | type = "all_reduce"
 9 | # 1750ms compute
10 | compute_interval = 930000


--------------------------------------------------------------------------------
/workloads/setup-3_gpt_2.toml:
--------------------------------------------------------------------------------
 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3
 2 | communicator_id = 301
 3 | cuda_devices = [1, 1, 1, 1]
 4 | 
 5 | [[traces]]
 6 | # 678M all-reduce
 7 | size = 711839744
 8 | type = "all_reduce"
 9 | # 930ms compute (this comment previously said 1750ms; the value below is 930000)
10 | compute_interval = 930000


--------------------------------------------------------------------------------
/workloads/setup-4_gpt_1.toml:
--------------------------------------------------------------------------------
1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3
2 | communicator_id = 201
3 | cuda_devices = [0, 0]
4 | 
5 | [[traces]]
6 | size = 83886080
7 | type = "all_reduce"
8 | compute_interval = 6000
9 | 


--------------------------------------------------------------------------------
/workloads/setup-4_gpt_2.toml:
--------------------------------------------------------------------------------
1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3
2 | communicator_id = 202
3 | cuda_devices = [1, 1]
4 | 
5 | [[traces]]
6 | size = 83886080
7 | type = "all_reduce"
8 | compute_interval = 6000
9 | 


--------------------------------------------------------------------------------
/workloads/setup-4_resnet_0.toml:
--------------------------------------------------------------------------------
 1 | # ResNet on 4 GPUs (2 on H0, 2 on H2)
 2 | communicator_id = 202
 3 | cuda_devices = [1, 1]
 4 | 
 5 | [[traces]]
 6 | # 170M all-reduce
 7 | size = 178196640
 8 | type = "all_reduce"
 9 | # 120ms compute (this comment previously said 230ms; the value below is 120000)
10 | compute_interval = 120000
11 | 


--------------------------------------------------------------------------------
/workloads/setup-4_vgg.toml:
--------------------------------------------------------------------------------
 1 | # VGG on 2 GPUs (1 on H1, 1 on H3)
 2 | communicator_id = 200
 3 | cuda_devices = [0, 1, 0, 1]
 4 | 
 5 | [[traces]]
 6 | # 548M all-reduce
 7 | size = 574668960
 8 | type = "all_reduce"
 9 | # 160ms compute (this comment previously said 310ms; the value below is 160000)
10 | compute_interval = 160000
11 | 


--------------------------------------------------------------------------------