├── .gitignore ├── .gitmodules ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── docs ├── overview.md ├── setup.md └── topology.png ├── eval ├── dynamic-config │ ├── dynamic-patch.toml │ ├── launch-allreduce-ring-reconfig.toml │ ├── launch-gpt-1.toml │ ├── launch-gpt-2.toml │ ├── launch-ring-reconfig.toml │ ├── launch.toml │ ├── reconfig-patch.toml │ ├── reconfig.toml │ └── setup4-trace-fair.toml ├── multi-app │ ├── .gitignore │ ├── collect_multi.py │ ├── collect_real_workload.py │ ├── ecmp-setup1.toml │ ├── ecmp-setup2.toml │ ├── ecmp-setup3.toml │ ├── ecmp-setup4.toml │ ├── flow-setup1.toml │ ├── flow-setup2.toml │ ├── flow-setup3.toml │ ├── flow-setup4.toml │ ├── gen_config.py │ ├── gen_traffic_gen_config.py │ ├── interval.py │ ├── nccl-test.patch │ ├── setup1-trace-fair.toml │ ├── setup1-trace-profile.toml │ ├── setup1-trace-qos.toml │ ├── setup2-trace-fair.toml │ ├── setup2-trace-qosv1.toml │ ├── setup2-trace-qosv2.toml │ ├── setup4-trace-ecmp-fair.toml │ ├── setup4-trace-ecmp-qosv1.toml │ ├── setup4-trace-fair.toml │ └── setup4-trace-qosv1.toml ├── plot │ ├── data │ │ └── .gitignore │ ├── multi_app │ │ ├── __init__.py │ │ ├── main.py │ │ ├── setting1.csv │ │ ├── setting2.csv │ │ ├── setting3.csv │ │ └── setting4.csv │ ├── plt_show.py │ ├── real_workload │ │ ├── jct.csv │ │ └── plot_jct.py │ └── single_app │ │ ├── __init__.py │ │ ├── allgather_4gpu.csv │ │ ├── allgather_8gpu.csv │ │ ├── allreduce_4gpu.csv │ │ ├── allreduce_8gpu.csv │ │ ├── main.py │ │ └── style.py ├── set_ecmp_hashing_algo.sh └── single-app │ ├── .gitignore │ ├── 4gpu.toml │ ├── 8gpu.toml │ ├── collect.py │ └── gen_config.py ├── justfile ├── launcher ├── Cargo.toml ├── README.md ├── benchmark │ ├── allgather.toml │ ├── alltoall-3w-1mb.toml │ └── write_lat-32b.toml ├── config.toml └── src │ ├── line_reader.rs │ ├── main.rs │ └── tee.rs ├── mccs.toml ├── nccl-tests-mccs ├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── doc │ └── PERFORMANCE.md ├── 
microbenchmark │ ├── 4gpu.toml │ ├── 8gpu.toml │ ├── collect_nccl.py │ ├── nccl_result.csv │ ├── one_click_run_nccl_all.sh │ ├── run_nccl_multiple_times.sh │ ├── run_nccl_once.sh │ └── set_ecmp_hashing_algo.sh ├── nccl-test.patch ├── setting1 │ ├── collect_nccl.py │ ├── run_all_jobs_once.sh │ ├── run_nccl_all_jobs_multiple_times.sh │ └── run_nccl_job_small.sh ├── setting2 │ ├── collect_nccl.py │ ├── run_all_jobs.sh │ ├── run_all_jobs_once.sh │ ├── run_nccl_all_jobs_multiple_times.sh │ ├── run_nccl_job_blue.sh │ └── run_nccl_job_small.sh ├── setting3 │ ├── collect_nccl.py │ ├── run_all_jobs_once.sh │ ├── run_nccl_all_jobs_multiple_times.sh │ └── run_nccl_job_small.sh ├── setting4 │ ├── collect_nccl.py │ ├── run_all_jobs_once.sh │ ├── run_nccl_all_jobs_multiple_times.sh │ └── run_nccl_job.sh ├── src │ ├── Makefile │ ├── all_gather.cu │ ├── all_reduce.cu │ ├── alltoall.cu │ ├── broadcast.cu │ ├── common.cu │ ├── common.h │ ├── gather.cu │ ├── hypercube.cu │ ├── nccl1_compat.h │ ├── reduce.cu │ ├── reduce_scatter.cu │ ├── scatter.cu │ ├── sendrecv.cu │ ├── timer.cc │ └── timer.h └── verifiable │ ├── Makefile │ ├── inexact_regress.cu │ ├── verifiable.cu │ ├── verifiable.h │ └── verifiable.mk ├── rust-toolchain ├── src ├── collectives-sys │ ├── Cargo.toml │ ├── build.rs │ ├── src │ │ └── lib.rs │ └── wrapper.h ├── collectives │ ├── Makefile │ ├── gen_rules.sh │ ├── include │ │ ├── align.h │ │ ├── collectives.h │ │ └── devcomm.h │ ├── makefiles │ │ └── common.mk │ └── src │ │ ├── all_gather.cu │ │ ├── all_gather.h │ │ ├── all_reduce.cu │ │ ├── all_reduce.h │ │ ├── common.h │ │ ├── common_kernel.h │ │ ├── functions.cu │ │ ├── op128.h │ │ ├── primitives.h │ │ ├── prims_simple.h │ │ └── reduce_kernel.h ├── cuda-sys │ ├── cuda-driver-sys │ │ ├── Cargo.toml │ │ ├── build.rs │ │ ├── src │ │ │ └── lib.rs │ │ └── wrapper.h │ ├── cuda-finder │ │ ├── Cargo.toml │ │ └── src │ │ │ └── lib.rs │ ├── cuda-runtime-sys │ │ ├── Cargo.toml │ │ ├── build.rs │ │ ├── src │ │ │ └── lib.rs │ │ 
└── wrapper.h │ └── nvml-sys │ │ ├── Cargo.toml │ │ ├── build.rs │ │ ├── src │ │ └── lib.rs │ │ └── wrapper.h ├── experimental │ ├── Cargo.toml │ ├── examples │ │ ├── cuda_ipc_client.rs │ │ ├── cuda_ipc_server.rs │ │ └── get_hwinfo.rs │ └── src │ │ └── lib.rs ├── gdrcopy-sys │ ├── Cargo.toml │ ├── build.rs │ ├── src │ │ └── lib.rs │ └── wrapper.h ├── ibverbs │ ├── Cargo.toml │ ├── build.rs │ ├── src │ │ ├── ffi.rs │ │ ├── lib.rs │ │ └── sliceindex.rs │ └── wrapper.h ├── ipc │ ├── Cargo.toml │ ├── core │ │ ├── Cargo.toml │ │ └── src │ │ │ ├── buf.rs │ │ │ ├── channel │ │ │ ├── flavors │ │ │ │ ├── concurrent.rs │ │ │ │ ├── mod.rs │ │ │ │ └── sequential.rs │ │ │ └── mod.rs │ │ │ ├── control.rs │ │ │ ├── customer.rs │ │ │ ├── ipc_channel.rs │ │ │ ├── lib.rs │ │ │ ├── service.rs │ │ │ ├── shmem_ipc.rs │ │ │ ├── shmobj.rs │ │ │ └── unix.rs │ ├── mccs │ │ ├── Cargo.toml │ │ └── src │ │ │ ├── command.rs │ │ │ ├── dp.rs │ │ │ ├── handle.rs │ │ │ ├── lib.rs │ │ │ └── reconfig.rs │ └── src │ │ └── lib.rs ├── libmccs │ ├── Cargo.toml │ └── src │ │ ├── collectives.rs │ │ ├── communicator.rs │ │ ├── lib.rs │ │ └── memory.rs ├── mccs │ ├── Cargo.toml │ └── src │ │ ├── bootstrap │ │ ├── mod.rs │ │ └── task.rs │ │ ├── comm │ │ ├── device.rs │ │ ├── mod.rs │ │ └── profile.rs │ │ ├── config.rs │ │ ├── control.rs │ │ ├── cuda │ │ ├── alloc.rs │ │ ├── mapped_ptr.rs │ │ ├── mod.rs │ │ └── ptr.rs │ │ ├── daemon │ │ ├── engine.rs │ │ └── mod.rs │ │ ├── engine.rs │ │ ├── exchange │ │ ├── command.rs │ │ ├── engine.rs │ │ ├── message.rs │ │ └── mod.rs │ │ ├── lib.rs │ │ ├── main.rs │ │ ├── message.rs │ │ ├── pattern.rs │ │ ├── proxy │ │ ├── command.rs │ │ ├── engine.rs │ │ ├── init.rs │ │ ├── message.rs │ │ ├── mod.rs │ │ ├── op.rs │ │ ├── plan.rs │ │ └── task.rs │ │ ├── registry.rs │ │ ├── runtime │ │ ├── affinity.rs │ │ ├── executor.rs │ │ ├── manager.rs │ │ └── mod.rs │ │ ├── transport │ │ ├── catalog.rs │ │ ├── channel.rs │ │ ├── delegator.rs │ │ ├── engine.rs │ │ ├── message.rs │ │ ├── 
meta.rs │ │ ├── mod.rs │ │ ├── net │ │ │ ├── agent.rs │ │ │ ├── buffer.rs │ │ │ ├── config.rs │ │ │ ├── mod.rs │ │ │ ├── provider │ │ │ │ ├── mod.rs │ │ │ │ └── rdma.rs │ │ │ ├── resources.rs │ │ │ └── transporter.rs │ │ ├── op.rs │ │ ├── queue.rs │ │ ├── setup.rs │ │ ├── shm │ │ │ ├── agent.rs │ │ │ ├── buffer.rs │ │ │ ├── config.rs │ │ │ ├── mod.rs │ │ │ ├── resources.rs │ │ │ └── transporter.rs │ │ ├── task.rs │ │ └── transporter.rs │ │ └── utils │ │ ├── duplex_chan.rs │ │ ├── gdr.rs │ │ ├── interfaces.rs │ │ ├── mod.rs │ │ ├── pool.rs │ │ └── tcp.rs ├── mccs_examples │ ├── allgather_bench │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── allgather_proto │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── allreduce_bench │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── allreduce_proto │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── cuda_hello │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ ├── ring_config │ │ ├── Cargo.toml │ │ └── src │ │ │ └── main.rs │ └── traffic_gen │ │ ├── Cargo.toml │ │ └── src │ │ └── main.rs ├── mccs_tests │ └── rdma_transport │ │ ├── Cargo.toml │ │ ├── examples │ │ ├── client.rs │ │ └── server.rs │ │ └── src │ │ └── lib.rs └── qos-service │ ├── Cargo.toml │ └── src │ └── lib.rs └── workloads ├── reconfig_gpt.toml ├── setup-1_gpt_0.toml ├── setup-1_gpt_1.toml ├── setup-1_resnet_0.toml ├── setup-1_resnet_1.toml ├── setup-1_vgg_0.toml ├── setup-1_vgg_1.toml ├── setup-2_gpt_1.toml ├── setup-2_gpt_2.toml ├── setup-2_resnet.toml ├── setup-2_vgg.toml ├── setup-3_gpt_1.toml ├── setup-3_gpt_2.toml ├── setup-4_gpt_1.toml ├── setup-4_gpt_2.toml ├── setup-4_resnet_0.toml └── setup-4_vgg.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # MPI hostfile 2 | hostfile* 3 | 4 | # Prerequisites 5 | *.d 6 | 7 | # Compiled Object files 8 | *.slo 9 | *.lo 10 | *.o 11 | *.obj 12 | 13 | # Precompiled Headers 14 | *.gch 15 | *.pch 16 | 17 | # Compiled Dynamic libraries 18 | *.so 19 | 
*.dylib 20 | *.dll 21 | 22 | # Fortran module files 23 | *.mod 24 | *.smod 25 | 26 | # Compiled Static libraries 27 | *.lai 28 | *.la 29 | *.a 30 | *.lib 31 | 32 | # Executables 33 | *.exe 34 | *.out 35 | *.app 36 | 37 | 38 | # build 39 | build/ 40 | # clangd 41 | .clangd/ 42 | .cache/clangd/ 43 | compile_commands.json 44 | 45 | # vim session file 46 | Session.vim 47 | # vscode 48 | *.workspace 49 | *.code-workspace 50 | .vscode/ 51 | 52 | 53 | # Added by cargo 54 | 55 | /target 56 | 57 | # macOS 58 | # General 59 | .DS_Store 60 | .AppleDouble 61 | .LSOverride 62 | 63 | # Icon must end with two \r 64 | Icon 65 | 66 | 67 | # Thumbnails 68 | ._* 69 | 70 | # Jetbrains IDE 71 | .idea/ 72 | cmake-build-*/ 73 | 74 | # Private folder for development 75 | /_private 76 | # Prerequisites 77 | *.d 78 | 79 | # Compiled Object files 80 | *.slo 81 | *.lo 82 | *.o 83 | *.obj 84 | 85 | # Precompiled Headers 86 | *.gch 87 | *.pch 88 | 89 | # Compiled Dynamic libraries 90 | *.so 91 | *.dylib 92 | *.dll 93 | 94 | # Fortran module files 95 | *.mod 96 | *.smod 97 | 98 | # Compiled Static libraries 99 | *.lai 100 | *.la 101 | *.a 102 | *.lib 103 | 104 | # Executables 105 | *.exe 106 | *.out 107 | *.app 108 | 109 | 110 | # build 111 | build/ 112 | # clangd 113 | .clangd/ 114 | .cache/clangd/ 115 | compile_commands.json 116 | 117 | # vim session file 118 | Session.vim 119 | # vscode 120 | *.workspace 121 | *.code-workspace 122 | .vscode/ 123 | 124 | 125 | # Added by cargo 126 | 127 | /target 128 | 129 | # macOS 130 | # General 131 | .DS_Store 132 | .AppleDouble 133 | .LSOverride 134 | 135 | # Icon must end with two \r 136 | Icon 137 | 138 | 139 | # Thumbnails 140 | ._* 141 | 142 | # Jetbrains IDE 143 | .idea/ 144 | cmake-build-*/ 145 | 146 | # Private folder for development 147 | /_private 148 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule 
"src/ibverbs/vendor/rdma-core"] 2 | path = src/ibverbs/vendor/rdma-core 3 | url = https://github.com/linux-rdma/rdma-core.git 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | 4 | members = [ 5 | "src/cuda-sys/cuda-driver-sys", 6 | "src/cuda-sys/cuda-runtime-sys", 7 | "src/cuda-sys/nvml-sys", 8 | "src/cuda-sys/cuda-finder", 9 | "src/gdrcopy-sys", 10 | "src/collectives-sys", 11 | "src/ibverbs", 12 | "src/experimental", 13 | "src/ipc", 14 | "src/ipc/core", 15 | "src/ipc/mccs", 16 | "src/libmccs", 17 | "src/mccs", 18 | "src/mccs_examples/cuda_hello", 19 | "src/mccs_examples/allgather_proto", 20 | "src/mccs_examples/allreduce_proto", 21 | "src/mccs_examples/allgather_bench", 22 | "src/mccs_examples/allreduce_bench", 23 | "src/mccs_examples/ring_config", 24 | "src/mccs_tests/rdma_transport", 25 | "src/qos-service", 26 | "src/mccs_examples/traffic_gen", 27 | "launcher", 28 | ] 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | run: 2 | cargo build --release 3 | RUST_LOG=INFO ./target/release/mccs 4 | 5 | build-all: 6 | make -j -C src/collectives 7 | cargo build --release -------------------------------------------------------------------------------- /docs/setup.md: -------------------------------------------------------------------------------- 1 | # MCCS Evaluation Setup 2 | 3 | ## Hardware Setup 4 | Topology Setup 5 | 6 | As shown in the figure, in our evaluation, we have four nodes in our testbed, each equipped with 2 NVIDIA RTX 3090 GPUs and a 100 Gbps Mellanox ConnectX-5 NIC. Using a single 100 Gbps Mellanox SN2100 switch, we emulate a spine-leaf topology with 2 leaf switches and 2 spine switches through self-wiring. 
Four nodes are placed under two racks, where each rack corresponds to a leaf switch. The links between the switches are limited to 50 Gbps, while the links between each host and the leaf switches are limited to 100 Gbps. On each host, we use IB traffic classes (TCs) and rate-limit each TC to emulate two 50 Gbps virtual NICs (one per GPU). 7 | 8 | ## System Environment 9 | - NVIDIA GPU drivers and CUDA must be installed. Our code is tested with CUDA 12.1. 10 | - The GDRCopy library must be installed; it can be found here: https://github.com/NVIDIA/gdrcopy 11 | - Mellanox OFED drivers must be installed; they can be found here: https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/ 12 | - Rust version `nightly-2023-11-11`. Follow https://www.rust-lang.org/tools/install to install `rustup`. `rustup` will automatically install the version configured in the `rust-toolchain` file. The `Cargo.lock` file records the exact version we used for each Rust dependency. -------------------------------------------------------------------------------- /docs/topology.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phoenix-dataplane/mCCS/86a25c71a9b050bd7b8a1a1f08fa116470492ce9/docs/topology.png -------------------------------------------------------------------------------- /eval/dynamic-config/dynamic-patch.toml: -------------------------------------------------------------------------------- 1 | mccs_addrs = [ 2 | "192.168.211.2", 3 | "192.168.211.34", 4 | "192.168.211.66", 5 | "192.168.211.162", 6 | ] 7 | mccs_port = 5000 8 | 9 | [[comm_patterns_reconfig]] 10 | communicator_id = 200 11 | channels = [ 12 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 13 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 14 | ] 15 | 16 | [[comm_patterns_reconfig]] 17 | communicator_id = 201 18 | channels = [ 19 | { 
channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 20 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 21 | ] 22 | 23 | 24 | [[comm_patterns_reconfig]] 25 | communicator_id = 202 26 | channels = [ 27 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 28 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 29 | ] -------------------------------------------------------------------------------- /eval/dynamic-config/launch-allreduce-ring-reconfig.toml: -------------------------------------------------------------------------------- 1 | name = "8gpu-dynamic-allreduce" 2 | group = "8gpu-dynamic-allreduce" 3 | [[worker]] 4 | host = "danyang-01" 5 | bin = "mccs" 6 | args = "--host 1 --config eval/dynamic-config/reconfig.toml" 7 | weak = true 8 | dependencies = [] 9 | 10 | [[worker]] 11 | host = "danyang-02" 12 | bin = "mccs" 13 | args = "--host 2 --config eval/dynamic-config/reconfig.toml" 14 | weak = true 15 | dependencies = [] 16 | 17 | [[worker]] 18 | host = "danyang-03" 19 | bin = "mccs" 20 | args = "--host 3 --config eval/dynamic-config/reconfig.toml" 21 | weak = true 22 | dependencies = [] 23 | 24 | [[worker]] 25 | host = "danyang-05" 26 | bin = "mccs" 27 | args = "--host 5 --config eval/dynamic-config/reconfig.toml" 28 | weak = true 29 | dependencies = [] 30 | 31 | [[worker]] 32 | host = "danyang-02" 33 | bin = "allreduce_bench" 34 | args = "--root-addr 192.168.211.34 --rank 0 --num-ranks 8 --cuda-device-idx 0 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4" 35 | dependencies = [ 0, 1, 2, 3,] 36 | 37 | [[worker]] 38 | host = "danyang-02" 39 | bin = "allreduce_bench" 40 | args = "--root-addr 192.168.211.34 --rank 1 --num-ranks 8 --cuda-device-idx 1 --size 134217728 --communicator 600 --round 1 --size-in-byte --name 
reconfig-allreduce --epoch 4" 41 | dependencies = [ 0, 1, 2, 3,] 42 | 43 | [[worker]] 44 | host = "danyang-03" 45 | bin = "allreduce_bench" 46 | args = "--root-addr 192.168.211.34 --rank 2 --num-ranks 8 --cuda-device-idx 0 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4" 47 | dependencies = [ 0, 1, 2, 3,] 48 | 49 | [[worker]] 50 | host = "danyang-03" 51 | bin = "allreduce_bench" 52 | args = "--root-addr 192.168.211.34 --rank 3 --num-ranks 8 --cuda-device-idx 1 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4" 53 | dependencies = [ 0, 1, 2, 3,] 54 | 55 | [[worker]] 56 | host = "danyang-01" 57 | bin = "allreduce_bench" 58 | args = "--root-addr 192.168.211.34 --rank 4 --num-ranks 8 --cuda-device-idx 0 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4" 59 | dependencies = [ 0, 1, 2, 3,] 60 | 61 | [[worker]] 62 | host = "danyang-01" 63 | bin = "allreduce_bench" 64 | args = "--root-addr 192.168.211.34 --rank 5 --num-ranks 8 --cuda-device-idx 1 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4" 65 | dependencies = [ 0, 1, 2, 3,] 66 | 67 | [[worker]] 68 | host = "danyang-05" 69 | bin = "allreduce_bench" 70 | args = "--root-addr 192.168.211.34 --rank 6 --num-ranks 8 --cuda-device-idx 0 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4" 71 | dependencies = [ 0, 1, 2, 3,] 72 | 73 | [[worker]] 74 | host = "danyang-05" 75 | bin = "allreduce_bench" 76 | args = "--root-addr 192.168.211.34 --rank 7 --num-ranks 8 --cuda-device-idx 1 --size 134217728 --communicator 600 --round 1 --size-in-byte --name reconfig-allreduce --epoch 4" 77 | dependencies = [ 0, 1, 2, 3,] 78 | 79 | -------------------------------------------------------------------------------- /eval/dynamic-config/launch-gpt-1.toml: 
-------------------------------------------------------------------------------- 1 | name = "setup4-dynamic-gpt-1" 2 | group = "setup4-dynamic-gpt-1" 3 | [[worker]] 4 | host = "danyang-03" 5 | bin = "traffic_gen" 6 | args = "--root-addr 192.168.211.66 --rank 0 --iters 40001 --config workloads/setup-4_gpt_1.toml --verbose --name gpt_1" 7 | dependencies = [] 8 | 9 | [[worker]] 10 | host = "danyang-05" 11 | bin = "traffic_gen" 12 | args = "--root-addr 192.168.211.66 --rank 1 --iters 40001 --config workloads/setup-4_gpt_1.toml --verbose --name gpt_1" 13 | dependencies = [] 14 | 15 | -------------------------------------------------------------------------------- /eval/dynamic-config/launch-gpt-2.toml: -------------------------------------------------------------------------------- 1 | name = "setup4-dynamic-gpt-2" 2 | group = "setup4-dynamic-gpt-2" 3 | [[worker]] 4 | host = "danyang-03" 5 | bin = "traffic_gen" 6 | args = "--root-addr 192.168.211.66 --rank 0 --iters 40001 --config workloads/setup-4_gpt_2.toml --verbose --name gpt_2" 7 | dependencies = [] 8 | 9 | [[worker]] 10 | host = "danyang-05" 11 | bin = "traffic_gen" 12 | args = "--root-addr 192.168.211.66 --rank 1 --iters 40001 --config workloads/setup-4_gpt_2.toml --verbose --name gpt_2" 13 | dependencies = [] 14 | 15 | -------------------------------------------------------------------------------- /eval/dynamic-config/launch-ring-reconfig.toml: -------------------------------------------------------------------------------- 1 | name = "setup4-dynamic" 2 | group = "setup4-dynamic" 3 | [[worker]] 4 | host = "danyang-01" 5 | bin = "mccs" 6 | args = "--host 1 --config eval/dynamic-config/reconfig.toml" 7 | weak = true 8 | dependencies = [] 9 | 10 | [[worker]] 11 | host = "danyang-02" 12 | bin = "mccs" 13 | args = "--host 2 --config eval/dynamic-config/reconfig.toml" 14 | weak = true 15 | dependencies = [] 16 | 17 | [[worker]] 18 | host = "danyang-03" 19 | bin = "mccs" 20 | args = "--host 3 --config 
eval/dynamic-config/reconfig.toml" 21 | weak = true 22 | dependencies = [] 23 | 24 | [[worker]] 25 | host = "danyang-05" 26 | bin = "mccs" 27 | args = "--host 5 --config eval/dynamic-config/reconfig.toml" 28 | weak = true 29 | dependencies = [] 30 | 31 | [[worker]] 32 | host = "danyang-02" 33 | bin = "traffic_gen" 34 | args = "--root-addr 192.168.211.34 --rank 0 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt" 35 | dependencies = [ 0, 1, 2, 3,] 36 | 37 | [[worker]] 38 | host = "danyang-02" 39 | bin = "traffic_gen" 40 | args = "--root-addr 192.168.211.34 --rank 1 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt" 41 | dependencies = [ 0, 1, 2, 3,] 42 | 43 | [[worker]] 44 | host = "danyang-03" 45 | bin = "traffic_gen" 46 | args = "--root-addr 192.168.211.34 --rank 2 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt" 47 | dependencies = [ 0, 1, 2, 3,] 48 | 49 | [[worker]] 50 | host = "danyang-03" 51 | bin = "traffic_gen" 52 | args = "--root-addr 192.168.211.34 --rank 3 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt" 53 | dependencies = [ 0, 1, 2, 3,] 54 | 55 | [[worker]] 56 | host = "danyang-01" 57 | bin = "traffic_gen" 58 | args = "--root-addr 192.168.211.34 --rank 4 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt" 59 | dependencies = [ 0, 1, 2, 3,] 60 | 61 | [[worker]] 62 | host = "danyang-01" 63 | bin = "traffic_gen" 64 | args = "--root-addr 192.168.211.34 --rank 5 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt" 65 | dependencies = [ 0, 1, 2, 3,] 66 | 67 | [[worker]] 68 | host = "danyang-05" 69 | bin = "traffic_gen" 70 | args = "--root-addr 192.168.211.34 --rank 6 --iters 5001 --config workloads/reconfig_gpt.toml --verbose --name gpt" 71 | dependencies = [ 0, 1, 2, 3,] 72 | 73 | [[worker]] 74 | host = "danyang-05" 75 | bin = "traffic_gen" 76 | args = "--root-addr 192.168.211.34 --rank 7 --iters 5001 --config 
workloads/reconfig_gpt.toml --verbose --name gpt" 77 | dependencies = [ 0, 1, 2, 3,] 78 | 79 | -------------------------------------------------------------------------------- /eval/dynamic-config/launch.toml: -------------------------------------------------------------------------------- 1 | name = "setup4-dynamic" 2 | group = "setup4-dynamic" 3 | [[worker]] 4 | host = "danyang-01" 5 | bin = "mccs" 6 | args = "--host 1 --config eval/dynamic-config/setup4-trace-fair.toml" 7 | weak = true 8 | dependencies = [] 9 | 10 | [[worker]] 11 | host = "danyang-02" 12 | bin = "mccs" 13 | args = "--host 2 --config eval/dynamic-config/setup4-trace-fair.toml" 14 | weak = true 15 | dependencies = [] 16 | 17 | [[worker]] 18 | host = "danyang-03" 19 | bin = "mccs" 20 | args = "--host 3 --config eval/dynamic-config/setup4-trace-fair.toml" 21 | weak = true 22 | dependencies = [] 23 | 24 | [[worker]] 25 | host = "danyang-05" 26 | bin = "mccs" 27 | args = "--host 5 --config eval/dynamic-config/setup4-trace-fair.toml" 28 | weak = true 29 | dependencies = [] 30 | 31 | [[worker]] 32 | host = "danyang-02" 33 | bin = "traffic_gen" 34 | args = "--root-addr 192.168.211.34 --rank 0 --iters 5001 --config workloads/setup-4_vgg.toml --verbose --name vgg" 35 | dependencies = [ 0, 1, 2, 3,] 36 | 37 | [[worker]] 38 | host = "danyang-02" 39 | bin = "traffic_gen" 40 | args = "--root-addr 192.168.211.34 --rank 1 --iters 5001 --config workloads/setup-4_vgg.toml --verbose --name vgg" 41 | dependencies = [ 0, 1, 2, 3,] 42 | 43 | [[worker]] 44 | host = "danyang-01" 45 | bin = "traffic_gen" 46 | args = "--root-addr 192.168.211.34 --rank 2 --iters 5001 --config workloads/setup-4_vgg.toml --verbose --name vgg" 47 | dependencies = [ 0, 1, 2, 3,] 48 | 49 | [[worker]] 50 | host = "danyang-01" 51 | bin = "traffic_gen" 52 | args = "--root-addr 192.168.211.34 --rank 3 --iters 5001 --config workloads/setup-4_vgg.toml --verbose --name vgg" 53 | dependencies = [ 0, 1, 2, 3,] 54 | 55 | 
-------------------------------------------------------------------------------- /eval/dynamic-config/reconfig-patch.toml: -------------------------------------------------------------------------------- 1 | mccs_addrs = [ 2 | "192.168.211.2", 3 | "192.168.211.34", 4 | "192.168.211.66", 5 | "192.168.211.162", 6 | ] 7 | mccs_port = 5000 8 | 9 | [[comm_patterns_reconfig]] 10 | communicator_id = 600 11 | channels = [ 12 | { channel_id = 0, ring = [7, 6, 5, 4, 3, 2, 1, 0], udp_sport = [[4, 3, 49200], [0,7, 49200]], net_dev = "mlx5_0" }, 13 | { channel_id = 1, ring = [7, 6, 5, 4, 3, 2, 1, 0], udp_sport = [[4, 3, 49202], [0,7, 49202]], net_dev = "mlx5_0" }, 14 | ] 15 | ib_traffic_class = 0 -------------------------------------------------------------------------------- /eval/dynamic-config/reconfig.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 
49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 600 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49200], [7, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49202], [7, 0, 49202]], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 0 -------------------------------------------------------------------------------- /eval/multi-app/.gitignore: -------------------------------------------------------------------------------- 1 | output/ -------------------------------------------------------------------------------- /eval/multi-app/collect_real_workload.py: -------------------------------------------------------------------------------- 1 | import subprocess as sb 2 | data = sb.getoutput('grep --recursive "Rank 0: run time" /tmp').split('\n') 3 | # filter with /tmp/ at the beginning, and remove the first 5 characters 4 | data = [i[5:] for i in data if i.find('/tmp/')==0] 5 | txt = ['setting,job,jct'] 6 | for i in data: 7 | setting = '-'.join(i.split('/')[1].split('-')[2:]) 8 | if setting.find('ecmp-qosv2')==-1: 9 | app = i.split('[')[1].split(']')[0] 10 | time = i.split(': ')[-1].split(' ')[0] 11 | txt.append(f'{setting},{app},{time}') 12 | with open('../plot/data/real_workload.csv', 'w') as f: 13 | f.write('\n'.join(txt)) 14 | -------------------------------------------------------------------------------- /eval/multi-app/ecmp-setup1.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | 
channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 81 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 0 58 | 59 | 60 | [[comm_patterns_override]] 61 | communicator_id = 82 62 | channels = [ 63 | { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 64 | { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 65 | ] 66 | ib_traffic_class = 0 67 | 68 | -------------------------------------------------------------------------------- /eval/multi-app/ecmp-setup2.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | 
use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 81 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 66 58 | 59 | 60 | [[comm_patterns_override]] 61 | communicator_id = 82 62 | channels = [ 63 | { channel_id = 0, ring = [0, 1], net_dev = "mlx5_0" }, 64 | { channel_id = 1, ring = [0, 1], net_dev = "mlx5_0" }, 65 | ] 66 | ib_traffic_class = 106 67 | 68 | [[comm_patterns_override]] 69 | communicator_id = 83 70 | channels = [ 71 | { channel_id = 0, ring = [0, 1], net_dev = "mlx5_0" }, 72 | { channel_id = 1, ring = [0, 1], net_dev = "mlx5_0" }, 73 | ] 74 | ib_traffic_class = 106 -------------------------------------------------------------------------------- /eval/multi-app/ecmp-setup3.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | 
retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 81 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 106 58 | 59 | 60 | [[comm_patterns_override]] 61 | communicator_id = 82 62 | channels = [ 63 | { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 64 | { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 65 | ] 66 | ib_traffic_class = 66 67 | 68 | -------------------------------------------------------------------------------- /eval/multi-app/ecmp-setup4.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = 
false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 81 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 66 58 | 59 | 60 | [[comm_patterns_override]] 61 | communicator_id = 82 62 | channels = [ 63 | { channel_id = 0, ring = [0, 1], net_dev = "mlx5_0" }, 64 | { channel_id = 1, ring = [0, 1], net_dev = "mlx5_0" }, 65 | ] 66 | ib_traffic_class = 106 67 | 68 | [[comm_patterns_override]] 69 | communicator_id = 83 70 | channels = [ 71 | { channel_id = 0, ring = [0, 1], net_dev = "mlx5_0" }, 72 | { channel_id = 1, ring = [0, 1], net_dev = "mlx5_0" }, 73 | ] 74 | ib_traffic_class = 106 -------------------------------------------------------------------------------- /eval/multi-app/flow-setup1.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 
8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 81 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 0 58 | 59 | 60 | [[comm_patterns_override]] 61 | communicator_id = 82 62 | channels = [ 63 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 64 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 65 | ] 66 | ib_traffic_class = 0 67 | 68 | -------------------------------------------------------------------------------- /eval/multi-app/flow-setup2.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | 
pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 81 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 66 58 | 59 | 60 | [[comm_patterns_override]] 61 | communicator_id = 82 62 | channels = [ 63 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 64 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 65 | ] 66 | ib_traffic_class = 106 67 | 68 | [[comm_patterns_override]] 69 | communicator_id = 83 70 | channels = [ 71 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 72 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 73 | ] 74 | ib_traffic_class = 106 -------------------------------------------------------------------------------- /eval/multi-app/flow-setup3.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count 
= 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 81 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 106 58 | 59 | 60 | [[comm_patterns_override]] 61 | communicator_id = 82 62 | channels = [ 63 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 64 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 65 | ] 66 | ib_traffic_class = 66 67 | 68 | -------------------------------------------------------------------------------- /eval/multi-app/flow-setup4.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 
| 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 81 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 56 | ] 57 | ib_traffic_class = 66 58 | 59 | 60 | [[comm_patterns_override]] 61 | communicator_id = 82 62 | channels = [ 63 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 64 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 65 | ] 66 | ib_traffic_class = 106 67 | 68 | [[comm_patterns_override]] 69 | communicator_id = 83 70 | channels = [ 71 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 72 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 73 | ] 74 | ib_traffic_class = 106 -------------------------------------------------------------------------------- /eval/multi-app/nccl-test.patch: -------------------------------------------------------------------------------- 1 | # nccl branch: v2.13.8 2 | diff --git a/src/Makefile b/src/Makefile 3 | index 393de8e..ba21a59 100644 4 | --- a/src/Makefile 5 | +++ b/src/Makefile 6 | 
@@ -59,7 +59,7 @@ endif 7 | BUILDDIR ?= ../build 8 | ifneq ($(NCCL_HOME), "") 9 | NVCUFLAGS += -I$(NCCL_HOME)/include/ 10 | -NVLDFLAGS += -L$(NCCL_HOME)/lib 11 | +NVLDFLAGS += -L$(NCCL_HOME)/lib -Xlinker=-rpath,$(NCCL_HOME)/lib 12 | endif 13 | 14 | ifeq ($(MPI), 1) 15 | diff --git a/src/common.cu b/src/common.cu 16 | index 8588047..c91461b 100644 17 | --- a/src/common.cu 18 | +++ b/src/common.cu 19 | @@ -595,14 +595,21 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* 20 | TESTCHECK(completeColl(args)); 21 | 22 | // Benchmark 23 | + int epochs = 1; 24 | + char* epochs_str = getenv("NCCL_EPOCHS"); 25 | + if (epochs_str) { 26 | + epochs = std::stoi(epochs_str); 27 | + } 28 | for (size_t size = args->minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? size*args->stepfactor : size+args->stepbytes)) { 29 | setupArgs(size, type, args); 30 | char rootName[100]; 31 | sprintf(rootName, "%6i", root); 32 | - PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); 33 | - TESTCHECK(BenchTime(args, type, op, root, 0)); 34 | - TESTCHECK(BenchTime(args, type, op, root, 1)); 35 | - PRINT("\n"); 36 | + for (int i = 0; i < epochs; i++) { 37 | + PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); 38 | + TESTCHECK(BenchTime(args, type, op, root, 0)); 39 | + TESTCHECK(BenchTime(args, type, op, root, 1)); 40 | + PRINT("\n"); 41 | + } 42 | } 43 | return testSuccess; 44 | } 45 | -------------------------------------------------------------------------------- /eval/multi-app/setup1-trace-fair.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | 
"192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | # magic number: 49200 & 49202 49 | 50 | [[comm_patterns_override]] 51 | communicator_id = 100 52 | channels = [ 53 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 54 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 55 | ] 56 | ib_traffic_class = 0 57 | 58 | [[comm_patterns_override]] 59 | communicator_id = 101 60 | channels = [ 61 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 62 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 63 | ] 64 | ib_traffic_class = 0 -------------------------------------------------------------------------------- /eval/multi-app/setup1-trace-profile.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 
10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | # magic number: 49200 & 49202 49 | 50 | [[comm_patterns_override]] 51 | communicator_id = 100 52 | channels = [ 53 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 54 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 55 | ] 56 | ib_traffic_class = 0 57 | 58 | [[comm_patterns_override]] 59 | communicator_id = 101 60 | channels = [ 61 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 62 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 63 | ] 64 | ib_traffic_class = 0 -------------------------------------------------------------------------------- /eval/multi-app/setup2-trace-fair.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | 
"192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | # magic number: 49200 & 49202 49 | 50 | [[comm_patterns_override]] 51 | communicator_id = 200 52 | channels = [ 53 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 54 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 55 | ] 56 | ib_traffic_class = 106 57 | 58 | [[comm_patterns_override]] 59 | communicator_id = 201 60 | channels = [ 61 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 62 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 63 | ] 64 | ib_traffic_class = 66 65 | 66 | 67 | [[comm_patterns_override]] 68 | communicator_id = 202 69 | channels = [ 70 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 71 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 72 | ] 73 | ib_traffic_class = 66 -------------------------------------------------------------------------------- /eval/multi-app/setup2-trace-qosv1.toml: 
-------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | # magic number: 49200 & 49202 49 | 50 | [[comm_patterns_override]] 51 | communicator_id = 200 52 | channels = [ 53 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 54 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 55 | # { channel_id = 2, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 56 | # { channel_id = 3, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 57 | # { channel_id = 4, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 58 | # { channel_id = 5, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 59 | # { channel_id = 6, ring = [0, 1, 2, 3], udp_sport = [[1, 
2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 60 | # { channel_id = 7, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 61 | ] 62 | ib_traffic_class = 106 63 | 64 | [[comm_patterns_override]] 65 | communicator_id = 201 66 | channels = [ 67 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 68 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 69 | ] 70 | ib_traffic_class = 66 71 | 72 | 73 | [[comm_patterns_override]] 74 | communicator_id = 202 75 | channels = [ 76 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 77 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 78 | ] 79 | ib_traffic_class = 66 -------------------------------------------------------------------------------- /eval/multi-app/setup4-trace-ecmp-fair.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | 
gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | # magic number: 49200 & 49202 49 | 50 | [[comm_patterns_override]] 51 | communicator_id = 200 52 | channels = [ 53 | { channel_id = 0, ring = [0, 1, 2, 3],net_dev = "mlx5_0" }, 54 | { channel_id = 1, ring = [0, 1, 2, 3],net_dev = "mlx5_0" }, 55 | ] 56 | ib_traffic_class = 0 57 | 58 | [[comm_patterns_override]] 59 | communicator_id = 201 60 | channels = [ 61 | { channel_id = 0, ring = [0, 1],net_dev = "mlx5_0" }, 62 | { channel_id = 1, ring = [0, 1],net_dev = "mlx5_0" }, 63 | ] 64 | ib_traffic_class = 106 65 | 66 | 67 | [[comm_patterns_override]] 68 | communicator_id = 202 69 | channels = [ 70 | { channel_id = 0, ring = [0, 1],net_dev = "mlx5_0" }, 71 | { channel_id = 1, ring = [0, 1],net_dev = "mlx5_0" }, 72 | ] 73 | ib_traffic_class = 66 -------------------------------------------------------------------------------- /eval/multi-app/setup4-trace-ecmp-qosv1.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | 
gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | # magic number: 49200 & 49202 49 | 50 | [[comm_patterns_override]] 51 | communicator_id = 200 52 | channels = [ 53 | { channel_id = 0, ring = [0, 1, 2, 3],net_dev = "mlx5_0" }, 54 | { channel_id = 1, ring = [0, 1, 2, 3],net_dev = "mlx5_0" }, 55 | ] 56 | ib_traffic_class = 0 57 | 58 | [[comm_patterns_override]] 59 | communicator_id = 201 60 | channels = [ 61 | { channel_id = 0, ring = [0, 1],net_dev = "mlx5_0" }, 62 | { channel_id = 1, ring = [0, 1],net_dev = "mlx5_0" }, 63 | ] 64 | ib_traffic_class = 106 65 | 66 | 67 | [[comm_patterns_override]] 68 | communicator_id = 202 69 | channels = [ 70 | { channel_id = 0, ring = [0, 1],net_dev = "mlx5_0" }, 71 | { channel_id = 1, ring = [0, 1],net_dev = "mlx5_0" }, 72 | ] 73 | ib_traffic_class = 66 -------------------------------------------------------------------------------- /eval/multi-app/setup4-trace-fair.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 
| gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | # magic number: 49200 & 49202 49 | 50 | [[comm_patterns_override]] 51 | communicator_id = 200 52 | channels = [ 53 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 54 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 55 | ] 56 | ib_traffic_class = 0 57 | 58 | [[comm_patterns_override]] 59 | communicator_id = 201 60 | channels = [ 61 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 62 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 63 | ] 64 | ib_traffic_class = 106 65 | 66 | 67 | [[comm_patterns_override]] 68 | communicator_id = 202 69 | channels = [ 70 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 71 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 72 | ] 73 | ib_traffic_class = 66 -------------------------------------------------------------------------------- /eval/multi-app/setup4-trace-qosv1.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout 
= 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | # magic number: 49200 & 49202 49 | 50 | [[comm_patterns_override]] 51 | communicator_id = 200 52 | channels = [ 53 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 54 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 55 | ] 56 | ib_traffic_class = 0 57 | 58 | [[comm_patterns_override]] 59 | communicator_id = 201 60 | channels = [ 61 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 62 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 63 | ] 64 | ib_traffic_class = 106 65 | 66 | 67 | [[comm_patterns_override]] 68 | communicator_id = 202 69 | channels = [ 70 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 71 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 72 | ] 73 | ib_traffic_class = 66 -------------------------------------------------------------------------------- /eval/plot/data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /eval/plot/multi_app/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/phoenix-dataplane/mCCS/86a25c71a9b050bd7b8a1a1f08fa116470492ce9/eval/plot/multi_app/__init__.py -------------------------------------------------------------------------------- /eval/plot/plt_show.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | 4 | def is_notebook() -> bool: 5 | try: 6 | shell = get_ipython().__class__.__name__ 7 | # print(shell) 8 | if shell == 'ZMQInteractiveShell': 9 | return True # Jupyter notebook or qtconsole 10 | elif shell == 'TerminalInteractiveShell': 11 | return False # Terminal running IPython 12 | else: 13 | return False # Other type (?) 14 | except NameError: 15 | return False # Probably standard Python interpreter 16 | 17 | def plt_show(): 18 | if is_notebook(): 19 | plt.show() 20 | -------------------------------------------------------------------------------- /eval/plot/single_app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/phoenix-dataplane/mCCS/86a25c71a9b050bd7b8a1a1f08fa116470492ce9/eval/plot/single_app/__init__.py -------------------------------------------------------------------------------- /eval/set_ecmp_hashing_algo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage() { 4 | echo "Usage: $0: [everything|source-port]" 5 | } 6 | 7 | if [ $# -ne 1 ]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | algo=$1 13 | 14 | case $algo in 15 | everything) 16 | algo_args="source-destination-mac source-destination-ip source-destination-port l3-protocol l2-protocol flow-label" 17 | ;; 18 | source-port) 19 | algo_args="source-port" 20 | ;; 21 | *) 22 | echo "Error: algo should be either 'everything' or 'source-port', got $algo" 23 | usage 24 | exit 1 25 | ;; 26 | esac 27 | 28 | ssh danyang-01 \ 29 | "ssh -oKexAlgorithms=+diffie-hellman-group14-sha1 
danyang@danyang-mellanox-switch.cs.duke.edu \ 30 | cli -h '\"enable\" \"config terminal\" \"port-channel load-balance ethernet $algo_args\" \"show interfaces port-channel load-balance\"'" 31 | 32 | # cli -h '\"enable\" \"show lldp remote\"'" 33 | -------------------------------------------------------------------------------- /eval/single-app/.gitignore: -------------------------------------------------------------------------------- 1 | output/ -------------------------------------------------------------------------------- /eval/single-app/4gpu.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 106 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 114 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 
49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 56 | ] 57 | 58 | 59 | [[comm_patterns_override]] 60 | communicator_id = 137 61 | channels = [ 62 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 63 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 64 | ] 65 | 66 | 67 | -------------------------------------------------------------------------------- /eval/single-app/8gpu.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 137 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49200], [7, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49202], [7, 0, 49202]], net_dev = 
"mlx5_0" }, 56 | ] 57 | 58 | -------------------------------------------------------------------------------- /launcher/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "launcher" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | ansi_term = "0.12.1" 10 | anyhow = "1.0.57" 11 | bytes = "1.1.0" 12 | chrono = "0.4.19" 13 | env_logger = "0.9.0" 14 | lazy_static = "1.4.0" 15 | log = "0.4.17" 16 | nix = { version = "0.24.1", default-features = false, features = ["signal"] } 17 | serde = { version = "1.0.137", features = ["derive"] } 18 | shellexpand = "2.1.0" 19 | structopt = "0.3.26" 20 | tokio-anyfd = "0.2.0" 21 | toml = "0.5.9" 22 | walkdir = "2.3.2" 23 | 24 | [[bin]] 25 | name = "launcher" 26 | path = "src/main.rs" 27 | -------------------------------------------------------------------------------- /launcher/README.md: -------------------------------------------------------------------------------- 1 | # Phoenix benchmark suites 2 | 3 | There is a set of defined benchmark configurations under `benchmark/`. 4 | To use this launcher, you currently need to first edit `config.toml`, 5 | and then run the launcher within this directory (the same directory as this 6 | README). You need to at least update the `workdir` to point to the project's 7 | path on your file system. To run Phoenix examples, the `workdir` should 8 | be set to the path of the Phoenix project. To run mRPC examples, the `workdir` 9 | must be set to point to the path of `phoenix/experimental/mrpc`. 10 | You also need to have ssh connections to the worker machines 11 | specified in the benchmark configurations. 
12 | 13 | ``` 14 | $ cargo rr --bin launcher -- --help 15 | Finished release [optimized + debuginfo] target(s) in 0.41s 16 | Running `target/release/launcher --help` 17 | [2023-03-02 11:46:08.073007 INFO benchmark/src/main.rs:632] env_logger initialized 18 | launcher 0.1.0 19 | Launcher of the benchmark suite. 20 | 21 | USAGE: 22 | launcher [FLAGS] [OPTIONS] --benchmark 23 | 24 | FLAGS: 25 | --debug Run with debug mode (cargo build without --release) 26 | --dry-run Dry-run. Use this option to check the configs 27 | -h, --help Prints help information 28 | --logical-and kill all threads if any thread ends 29 | -s, --silent Do out print to stdout 30 | -V, --version Prints version information 31 | 32 | OPTIONS: 33 | -b, --benchmark Run a single benchmark task 34 | -c, --configfile configfile [default: config.toml] 35 | --timeout Timeout in seconds, 0 means infinity. Can be overwritten by specific case 36 | configs [default: 60] 37 | -g, --group Run a benchmark group 38 | -o, --output-dir Output directory of log files 39 | ``` 40 | 41 | To start with the basic connectivity, first update `benchmark/rpc_hello.toml`. Then run (make sure `workdir` is correct) 42 | ``` 43 | $ cargo rr --bin launcher -- --benchmark benchmark/rpc_hello.toml 44 | ``` 45 | 46 | To test the latency for mRPC, you can run (make sure `workdir` is 47 | correct). 48 | ``` 49 | $ cargo rr --bin launcher -- -o /tmp/output --benchmark benchmark/rpc_bench_latency/rpc_bench_latency_64b.toml 50 | ``` 51 | 52 | Similarly, to test the bandwidth or RPC rate, you can run 53 | ``` 54 | $ cargo rr --bin launcher -- -o /tmp/output --benchmark benchmark/rpc_bench_tput/rpc_bench_tput_1mb.toml 55 | or 56 | $ cargo rr --bin launcher -- -o /tmp/output --benchmark benchmark/rpc_bench_rate/rpc_bench_tput_32b_4c.toml 57 | ``` 58 | 59 | You can also specify a __benchmark group__ and run a group of tests. 60 | For more information, please read the commandline usage and the 61 | benchmark configuration files. 
62 | -------------------------------------------------------------------------------- /launcher/benchmark/allgather.toml: -------------------------------------------------------------------------------- 1 | name = "benchmark/allgather" 2 | group = "allgather" 3 | 4 | [[worker]] 5 | host = "danyang-03" 6 | bin = "mccs" 7 | args = "--host 3" 8 | dependencies = [] 9 | weak = true 10 | 11 | [[worker]] 12 | host = "danyang-01" 13 | bin = "mccs" 14 | args = "--host 1" 15 | dependencies = [] 16 | weak = true 17 | 18 | [[worker]] 19 | host = "danyang-03" 20 | bin = "allgather_bench" 21 | args = "--root-addr 192.168.211.66 --rank 0 --num-ranks 2 --cuda-device-idx 0 --size 128 --communicator 114 --round 20" 22 | dependencies = [0, 1] 23 | 24 | [[worker]] 25 | host = "danyang-01" 26 | bin = "allgather_bench" 27 | args = "--root-addr 192.168.211.66 --rank 1 --num-ranks 2 --cuda-device-idx 0 --size 128 --communicator 114 --round 20" 28 | dependencies = [0, 1] -------------------------------------------------------------------------------- /launcher/benchmark/alltoall-3w-1mb.toml: -------------------------------------------------------------------------------- 1 | name = "benchmark/alltoall-3w-1mb" 2 | description = "Run bandwidth benchmark for all-to-all traffic pattern" 3 | group = "alltoall" 4 | 5 | [[worker]] 6 | host = "danyang-06" 7 | bin = "alltoall" 8 | args = "st --hosts rdma0.danyang-06,rdma0.danyang-05,rdma0.danyang-04 -m 1024000 -p 4000" 9 | dependencies = [] 10 | 11 | [[worker]] 12 | host = "danyang-05" 13 | bin = "alltoall" 14 | args = "st --hosts rdma0.danyang-06,rdma0.danyang-05,rdma0.danyang-04 -m 1024000 -p 4000" 15 | dependencies = [0] 16 | 17 | [[worker]] 18 | host = "danyang-04" 19 | bin = "alltoall" 20 | args = "st --hosts rdma0.danyang-06,rdma0.danyang-05,rdma0.danyang-04 -m 1024000 -p 4000" 21 | dependencies = [0, 1] 22 | -------------------------------------------------------------------------------- /launcher/benchmark/write_lat-32b.toml: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /launcher/config.toml: -------------------------------------------------------------------------------- 1 | workdir = "~/nfs/mCCS" 2 | 3 | [env] 4 | RUST_BACKTRACE = "1" 5 | RUST_LOG_STYLE = "never" 6 | CARGO_TERM_COLOR = "never" -------------------------------------------------------------------------------- /launcher/src/tee.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::io::{Read, Write}; 3 | use std::os::unix::io::{AsRawFd, RawFd}; 4 | /// A writer that discards every byte while reporting the whole buffer as written. 5 | pub struct DevNull; 6 | 7 | impl Write for DevNull { 8 | #[inline] 9 | fn write(&mut self, buf: &[u8]) -> io::Result<usize> { 10 | Ok(buf.len()) 11 | } 12 | 13 | #[inline] 14 | fn flush(&mut self) -> io::Result<()> { 15 | Ok(()) 16 | } 17 | } 18 | 19 | /// An adapter for readers whose inputs 20 | /// are written to a "tee"'d writer 21 | pub struct TeeReader<R: Read, W: Write> { 22 | reader: R, 23 | writer: W, 24 | } 25 | 26 | impl<R: Read + AsRawFd, W: Write> AsRawFd for TeeReader<R, W> { 27 | fn as_raw_fd(&self) -> RawFd { 28 | self.reader.as_raw_fd() 29 | } 30 | } 31 | 32 | impl<R: Read, W: Write> TeeReader<R, W> { 33 | /// Returns a TeeReader which can be used as Read whose 34 | /// reads delegate bytes read to the provided reader and write to the provided 35 | /// writer. The write operation must complete before the read completes. 
36 | /// 37 | /// Errors reported by the write operation will be interpreted as errors for the read 38 | pub fn new(reader: R, writer: W) -> TeeReader<R, W> { 39 | TeeReader { reader, writer } 40 | } 41 | } 42 | // Each successful read is mirrored into the writer (via write_all) before the byte count is returned. 43 | impl<R: Read, W: Write> Read for TeeReader<R, W> { 44 | fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { 45 | let n = self.reader.read(buf)?; 46 | self.writer.write_all(&buf[..n])?; 47 | Ok(n) 48 | } 49 | } 50 | 51 | #[cfg(test)] 52 | mod tests { 53 | use super::*; 54 | use std::io::Read; 55 | 56 | #[test] 57 | fn tee() { 58 | let mut reader = "It's over 9000!".as_bytes(); 59 | let mut teeout = Vec::new(); 60 | let mut stdout = Vec::new(); 61 | { 62 | let mut tee = TeeReader::new(&mut reader, &mut teeout); 63 | let _ = tee.read_to_end(&mut stdout); 64 | } 65 | assert_eq!(teeout, stdout); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /mccs.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | 
gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | 
memcpy_send = false 46 | memcpy_recv = false 47 | 48 | [qos_schedule] 49 | epoch_microsecs = 85000 50 | 51 | [qos_schedule.schedule.99] 52 | intervals = [[0, 35000]] 53 | mode = "Allow" 54 | 55 | [qos_schedule.schedule.100] 56 | intervals = [[35000, 69000]] 57 | mode = "Allow" 58 | 59 | # magic number: 49200 & 49202 60 | 61 | [[comm_patterns_override]] 62 | communicator_id = 114 63 | channels = [ 64 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 65 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], net_dev = "mlx5_0" }, 66 | ] 67 | 68 | 69 | [[comm_patterns_override]] 70 | communicator_id = 137 71 | channels = [ 72 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49153], [3, 0, 49155]], net_dev = "mlx5_0" }, 73 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49152], [3, 0, 49154]], net_dev = "mlx5_0" }, 74 | ] 75 | 76 | 77 | [[comm_patterns_override]] 78 | communicator_id = 138 79 | channels = [ 80 | { channel_id = 0, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49200], [7, 0, 49200]], net_dev = "mlx5_0" }, 81 | { channel_id = 1, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49202], [7, 0, 49202]], net_dev = "mlx5_0" }, 82 | ] 83 | 84 | 85 | [[comm_patterns_override]] 86 | communicator_id = 46 87 | channels = [ 88 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 89 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 90 | ] 91 | 92 | -------------------------------------------------------------------------------- /nccl-tests-mccs/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /nccl-tests-mccs/LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | 
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | * Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of NVIDIA CORPORATION, nor the names of their 13 | contributors may be used to endorse or promote products derived 14 | from this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | -------------------------------------------------------------------------------- /nccl-tests-mccs/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENCE.txt for license information 5 | # 6 | 7 | BUILDDIR ?= build 8 | override BUILDDIR := $(abspath $(BUILDDIR)) 9 | 10 | .PHONY: all clean 11 | 12 | default: src.build 13 | 14 | TARGETS=src 15 | 16 | all: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.build: 20 | ${MAKE} -C $* build BUILDDIR=${BUILDDIR} 21 | 22 | %.clean: 23 | ${MAKE} -C $* clean BUILDDIR=${BUILDDIR} 24 | -------------------------------------------------------------------------------- /nccl-tests-mccs/microbenchmark/4gpu.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 106 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 114 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1], udp_sport = [[0, 1, 49200], [1, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1], udp_sport = [[0, 1, 49202], [1, 0, 49202]], 
net_dev = "mlx5_0" }, 56 | ] 57 | 58 | 59 | [[comm_patterns_override]] 60 | communicator_id = 137 61 | channels = [ 62 | { channel_id = 0, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49200], [3, 0, 49200]], net_dev = "mlx5_0" }, 63 | { channel_id = 1, ring = [0, 1, 2, 3], udp_sport = [[1, 2, 49202], [3, 0, 49202]], net_dev = "mlx5_0" }, 64 | ] 65 | 66 | 67 | -------------------------------------------------------------------------------- /nccl-tests-mccs/microbenchmark/8gpu.toml: -------------------------------------------------------------------------------- 1 | mccs_daemon_basename = "mccs-deamon" 2 | mccs_daemon_prefix = "/tmp/mccs-${USER}" 3 | addrs = [ 4 | "0.0.0.0", 5 | "192.168.211.2", 6 | "192.168.211.34", 7 | "192.168.211.66", 8 | "192.168.211.130", 9 | "192.168.211.162", 10 | "192.168.211.195", 11 | ] 12 | listen_port = 5000 13 | 14 | [control] 15 | prefix = "/tmp/mccs-${USER}" 16 | path = "control.sock" 17 | 18 | [comm_default_config] 19 | buffer_sizes = [4194304] 20 | channel_count = 2 21 | 22 | [comm_global_config] 23 | [comm_global_config.net_rdma] 24 | gid_index = 3 25 | qps_per_conn = 1 26 | timeout = 18 27 | retry_count = 7 28 | pkey = 0 29 | use_inline = false 30 | service_level = 0 31 | traffic_class = 0 32 | adaptive_routing = false 33 | ar_threshold = 8192 34 | pci_relaxed_ordering = false 35 | gdr_flush_disable = true 36 | socket_if_prefix = "rdma" 37 | 38 | [comm_global_config.net] 39 | gdr_enable = false 40 | gdr_copy_sync_enable = false 41 | gdr_copy_flush_enable = false 42 | 43 | [comm_global_config.shm] 44 | locality = "Sender" 45 | memcpy_send = false 46 | memcpy_recv = false 47 | 48 | 49 | # magic number: 49200 & 49202 50 | 51 | [[comm_patterns_override]] 52 | communicator_id = 137 53 | channels = [ 54 | { channel_id = 0, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49200], [7, 0, 49200]], net_dev = "mlx5_0" }, 55 | { channel_id = 1, ring = [0, 1, 2, 3, 4, 5, 6, 7], udp_sport = [[3, 4, 49202], [7, 0, 49202]], net_dev = "mlx5_0" }, 
56 | ] 57 | 58 | -------------------------------------------------------------------------------- /nccl-tests-mccs/microbenchmark/collect_nccl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | import sys 6 | import csv 7 | import glob 8 | import os.path 9 | import argparse 10 | 11 | OUTPUT_DIR = "/tmp/nccl_single_app" 12 | 13 | parser = argparse.ArgumentParser(description='') 14 | parser.add_argument('--app', '--app', type=str, 15 | help = 'The app of the trial, either allgather or allreduce') 16 | parser.add_argument('--num-gpus', '--num-gpus', type=str, 17 | help = 'The number of gpus to match, either 1 or 2') 18 | 19 | args = parser.parse_args() 20 | assert args.app 21 | assert args.num_gpus 22 | 23 | nccl_results = glob.glob(os.path.join(OUTPUT_DIR, "*{}_*_{}.stdout".format(args.app, args.num_gpus))) 24 | 25 | writer = csv.writer(sys.stdout) 26 | writer.writerow(['Solution', 'App', 'Size (Bytes)', 'Dtype', 'Latency (us)', 'AlgBW (GB/s)', 'BusBW (GB/s)']) 27 | 28 | pat = re.compile(r'\s*\d+\s+\d+.*') 29 | for path in nccl_results: 30 | with open(path, 'r') as fin: 31 | solution = path.split('/')[-1].split('_')[-2] 32 | for line in fin: 33 | match = pat.match(line) 34 | if match is not None: 35 | # print(line.split()) 36 | tokens = line.split() 37 | # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0'] 38 | size = tokens[0] 39 | dtype = tokens[2] 40 | latency_us = tokens[9] 41 | algbw = tokens[10] 42 | busbw = tokens[11] 43 | writer.writerow([solution, args.app, size, dtype, latency_us, algbw, busbw]) 44 | -------------------------------------------------------------------------------- /nccl-tests-mccs/microbenchmark/nccl_result.csv: -------------------------------------------------------------------------------- 1 | Solution,App,Size (Bytes),Dtype,Latency (us),AlgBW (GB/s),BusBW (GB/s) 2 | NCCL Bad 
Ring,allgather,32768,float,46.59,0.70,0.53 3 | NCCL Bad Ring,allgather,131072,float,72.53,1.81,1.36 4 | NCCL Bad Ring,allgather,524288,float,138.6,3.78,2.84 5 | NCCL Bad Ring,allgather,2097152,float,455.7,4.60,3.45 6 | NCCL Bad Ring,allgather,8388608,float,1782.5,4.71,3.53 7 | NCCL Bad Ring,allgather,33554432,float,7152.5,4.69,3.52 8 | NCCL Bad Ring,allgather,134217728,float,28699,4.68,3.51 9 | NCCL Bad Ring,allgather,536870912,float,113508,4.73,3.55 10 | NCCL Bad Ring,allgather,32768,float,45.54,0.72,0.54 11 | NCCL Bad Ring,allgather,131072,float,66.93,1.96,1.47 12 | NCCL Bad Ring,allgather,524288,float,120.6,4.35,3.26 13 | NCCL Bad Ring,allgather,2097152,float,347.0,6.04,4.53 14 | NCCL Bad Ring,allgather,8388608,float,1256.1,6.68,5.01 15 | NCCL Bad Ring,allgather,33554432,float,4878.2,6.88,5.16 16 | NCCL Bad Ring,allgather,134217728,float,18866,7.11,5.34 17 | NCCL Bad Ring,allgather,536870912,float,73930,7.26,5.45 18 | NCCL Bad Ring,allgather,32768,float,45.99,0.71,0.53 19 | NCCL Bad Ring,allgather,131072,float,71.36,1.84,1.38 20 | NCCL Bad Ring,allgather,524288,float,135.5,3.87,2.90 21 | NCCL Bad Ring,allgather,2097152,float,450.6,4.65,3.49 22 | NCCL Bad Ring,allgather,8388608,float,1783.7,4.70,3.53 23 | NCCL Bad Ring,allgather,33554432,float,7360.0,4.56,3.42 24 | NCCL Bad Ring,allgather,134217728,float,28552,4.70,3.53 25 | NCCL Bad Ring,allgather,536870912,float,112582,4.77,3.58 26 | NCCL Bad Ring,allgather,32768,float,45.05,0.73,0.55 27 | NCCL Bad Ring,allgather,131072,float,67.81,1.93,1.45 28 | NCCL Bad Ring,allgather,524288,float,121.7,4.31,3.23 29 | NCCL Bad Ring,allgather,2097152,float,346.4,6.05,4.54 30 | NCCL Bad Ring,allgather,8388608,float,1250.8,6.71,5.03 31 | NCCL Bad Ring,allgather,33554432,float,4820.9,6.96,5.22 32 | NCCL Bad Ring,allgather,134217728,float,18698,7.18,5.38 33 | NCCL Bad Ring,allgather,536870912,float,73600,7.29,5.47 34 | NCCL Bad Ring,allgather,32768,float,45.12,0.73,0.54 35 | NCCL Bad Ring,allgather,131072,float,68.43,1.92,1.44 36 
| NCCL Bad Ring,allgather,524288,float,131.1,4.00,3.00 37 | NCCL Bad Ring,allgather,2097152,float,424.5,4.94,3.71 38 | NCCL Bad Ring,allgather,8388608,float,1803.3,4.65,3.49 39 | NCCL Bad Ring,allgather,33554432,float,7326.4,4.58,3.43 40 | NCCL Bad Ring,allgather,134217728,float,28911,4.64,3.48 41 | NCCL Bad Ring,allgather,536870912,float,114558,4.69,3.51 42 | -------------------------------------------------------------------------------- /nccl-tests-mccs/microbenchmark/one_click_run_nccl_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage() { 4 | echo "Usage: $0 " 5 | } 6 | 7 | if [ $# -ne 1 ]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | num_iters=$1 13 | if [ $num_iters -gt 20 ]; then 14 | echo "$num_iters too large" 15 | exit 1 16 | fi 17 | 18 | suffix=`date +%Y%m%d.%H.%M.%S` 19 | # output_dir=/tmp/${app}_${ring_type}_${num_gpus}gpus.${suffix} 20 | output_dir=/tmp/nccl_single_app.${suffix} 21 | mkdir -p $output_dir 22 | unlink /tmp/nccl_single_app 23 | ln -sf $output_dir /tmp/nccl_single_app 24 | 25 | ./run_nccl_multiple_times.sh $num_iters 1 badring allgather 26 | ./run_nccl_multiple_times.sh $num_iters 1 goodring allgather 27 | ./run_nccl_multiple_times.sh $num_iters 2 badring allgather 28 | ./run_nccl_multiple_times.sh $num_iters 2 goodring allgather 29 | 30 | ./run_nccl_multiple_times.sh $num_iters 1 badring allreduce 31 | ./run_nccl_multiple_times.sh $num_iters 1 goodring allreduce 32 | ./run_nccl_multiple_times.sh $num_iters 2 badring allreduce 33 | ./run_nccl_multiple_times.sh $num_iters 2 goodring allreduce 34 | -------------------------------------------------------------------------------- /nccl-tests-mccs/microbenchmark/run_nccl_multiple_times.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage() { 4 | echo "Usage: $0 num_gpus=1|2, ring_type=goodring|badring, app=allgather|allreduce" 5 | } 6 | 7 | if [ $# 
-ne 4 ]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | num_iters=$1 13 | if [ $num_iters -gt 20 ]; then 14 | echo "$num_iters too large" 15 | exit 1 16 | fi 17 | 18 | shift 19 | num_gpus=$1 20 | ring_type=$2 21 | app=$3 22 | 23 | output_dir=/tmp/nccl_single_app 24 | 25 | for i in `seq 1 $num_iters`; do 26 | echo Case $i 27 | ./run_nccl_once.sh $num_gpus $ring_type $app |& tee $output_dir/${i}_${app}_${ring_type}_${num_gpus}.stdout 28 | done 29 | -------------------------------------------------------------------------------- /nccl-tests-mccs/microbenchmark/run_nccl_once.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | usage() { 6 | echo "Usage: $0 num_gpus=1|2, ring_type=goodring|badring, app=allgather|allreduce" 7 | } 8 | 9 | if [ $# -ne 3 ]; then 10 | usage 11 | exit 1 12 | fi 13 | 14 | num_gpus=$1 15 | ring_type=$2 16 | app_type=$3 17 | 18 | case $num_gpus in 19 | 1) 20 | tclass=106 21 | ;; 22 | 2) 23 | tclass=0 24 | ;; 25 | *) 26 | echo "Error: num_gpus should be either '1' or '2', got $num_gpus" 27 | usage 28 | exit 1 29 | ;; 30 | esac 31 | 32 | echo "Traffic class=$tclass" 33 | 34 | case $ring_type in 35 | goodring) 36 | cat > hostfile.$ring_type < hostfile.$ring_type <minbytes; size<=args->maxbytes; size = ((args->stepfactor > 1) ? 
size*args->stepfactor : size+args->stepbytes)) { 28 | setupArgs(size, type, args); 29 | char rootName[100]; 30 | sprintf(rootName, "%6i", root); 31 | - PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); 32 | - TESTCHECK(BenchTime(args, type, op, root, 0)); 33 | - TESTCHECK(BenchTime(args, type, op, root, 1)); 34 | - PRINT("\n"); 35 | + for (int i = 0; i < epochs; i++) { 36 | + PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName); 37 | + TESTCHECK(BenchTime(args, type, op, root, 0)); 38 | + TESTCHECK(BenchTime(args, type, op, root, 1)); 39 | + PRINT("\n"); 40 | + } 41 | } 42 | return testSuccess; 43 | } 44 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting1/collect_nccl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import re 5 | import sys 6 | import csv 7 | import glob 8 | import os.path 9 | import argparse 10 | 11 | OUTPUT_DIR = "/tmp/nccl_setting1" 12 | 13 | parser = argparse.ArgumentParser(description='Launch a dsagent and deepscheduler') 14 | parser.add_argument('--solution', '--solution', required=True, type=str, 15 | help = 'Give a name to the trial, either NCCL Bad Ring|NCCL Good Ring.') 16 | parser.add_argument('--strip-head', '--strip-head', required=False, type=int, default=1, 17 | help = 'Omit the first few lines of the output.') 18 | parser.add_argument('--strip-tail', '--strip-tail', required=False, type=int, default=0, 19 | help = 'Omit the last few lines of the output.') 20 | 21 | args = parser.parse_args() 22 | assert args.solution 23 | 24 | writer = csv.writer(sys.stdout) 25 | writer.writerow(['Solution', 'App', 'Size (Bytes)', 'Dtype', 'Latency (us)', 'AlgBW (GB/s)', 'BusBW (GB/s)', 'Trial ID']) 26 | 27 | pat = re.compile(r'\s*\d+\s+\d+.*') 
28 | 29 | def get_latency(rec) -> int: 30 | return int(rec[4]) 31 | 32 | def get_job_duration(records) -> int: 33 | return sum(map(get_latency, records)) 34 | 35 | def work(app_color: str, trial_id, nccl_output_path) -> None: 36 | results = [] 37 | with open(nccl_output_path, 'r') as fin: 38 | for line in fin: 39 | match = pat.match(line) 40 | if match is not None: 41 | tokens = line.split() 42 | # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0'] 43 | size = tokens[0] 44 | dtype = tokens[2] 45 | latency_us = tokens[9] 46 | algbw = tokens[10] 47 | busbw = tokens[11] 48 | results.append([args.solution, app_color, size, dtype, latency_us, algbw, busbw, trial_id]) 49 | return results 50 | 51 | jobs = [] 52 | 53 | for path in glob.glob(os.path.join(OUTPUT_DIR, '*')): 54 | tokens = path.split('.') 55 | trial_id = tokens[-3].split('/')[-1] 56 | color = tokens[-2] 57 | jobs += [work(color, trial_id, path)] 58 | 59 | # min_dura = min(map(get_job_duration, jobs)) 60 | # print(f'min job duration: {min_dura}') 61 | 62 | for results in jobs: 63 | assert len(results) > args.strip_head + args.strip_tail, "len(results) = {}".format(len(results)) 64 | if args.strip_tail == 0: 65 | stripped_results = results[args.strip_head:] 66 | else: 67 | stripped_results = results[args.strip_head:-args.strip_tail] 68 | for rec in stripped_results: 69 | writer.writerow(rec) 70 | # writer.writerow(rec) 71 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting1/run_all_jobs_once.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | usage() { 6 | echo "Usage: $0 " 7 | } 8 | 9 | if [ $# -ne 1 ]; then 10 | usage 11 | exit 1 12 | fi 13 | 14 | trial_id=$1 15 | 16 | OUTPUT_DIR=/tmp/nccl_setting1 17 | 18 | $WORKDIR/run_nccl_job_small.sh blue |& tee $OUTPUT_DIR/$trial_id.blue.stdout & 19 | 
$WORKDIR/run_nccl_job_small.sh red |& tee $OUTPUT_DIR/$trial_id.red.stdout & 20 | 21 | wait 22 | # tail -f /tmp/nccl_setting1_blue.stdout /tmp/nccl_setting1_red.stdout 23 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting1/run_nccl_all_jobs_multiple_times.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage() { 4 | echo "Usage: $0 " 5 | } 6 | 7 | if [ $# -ne 1 ]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | num_iters=$1 13 | if [ $num_iters -gt 20 ]; then 14 | echo "$num_iters too large" 15 | exit 1 16 | fi 17 | 18 | suffix=`date +%Y%m%d.%H.%M.%S` 19 | output_dir=/tmp/nccl_setting1.${suffix} 20 | mkdir -p $output_dir 21 | unlink /tmp/nccl_setting1 22 | ln -sf $output_dir /tmp/nccl_setting1 23 | 24 | 25 | for i in `seq 1 $num_iters`; do 26 | echo Case $i 27 | ./run_all_jobs_once.sh $i 28 | done 29 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting1/run_nccl_job_small.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | # green or red: both take 2 gpus 6 | 7 | usage() { 8 | echo "Usage: $0 ring_type=green|red" 9 | } 10 | 11 | if [ $# -ne 1 ]; then 12 | usage 13 | exit 1 14 | fi 15 | 16 | color=$1 17 | 18 | case $color in 19 | blue) 20 | cat > hostfile.$color < hostfile.$color < int: 30 | return int(rec[4]) 31 | 32 | def get_job_duration(records) -> int: 33 | return sum(map(get_latency, records)) 34 | 35 | def work(app_color: str, trial_id, nccl_output_path) -> None: 36 | results = [] 37 | with open(nccl_output_path, 'r') as fin: 38 | for line in fin: 39 | match = pat.match(line) 40 | if match is not None: 41 | tokens = line.split() 42 | # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0'] 43 | size = tokens[0] 44 | dtype = 
tokens[2] 45 | latency_us = tokens[9] 46 | algbw = tokens[10] 47 | busbw = tokens[11] 48 | results.append([args.solution, app_color, size, dtype, latency_us, algbw, busbw, trial_id]) 49 | return results 50 | 51 | jobs = [] 52 | 53 | for path in glob.glob(os.path.join(OUTPUT_DIR, '*')): 54 | tokens = path.split('.') 55 | trial_id = tokens[-3].split('/')[-1] 56 | color = tokens[-2] 57 | jobs += [work(color, trial_id, path)] 58 | 59 | # min_dura = min(map(get_job_duration, jobs)) 60 | # print(f'min job duration: {min_dura}') 61 | 62 | for results in jobs: 63 | assert len(results) > args.strip_head + args.strip_tail, "len(results) = {}".format(len(results)) 64 | if args.strip_tail == 0: 65 | stripped_results = results[args.strip_head:] 66 | else: 67 | stripped_results = results[args.strip_head:-args.strip_tail] 68 | for rec in stripped_results: 69 | writer.writerow(rec) 70 | # writer.writerow(rec) 71 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting2/run_all_jobs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | usage() { 6 | echo "Usage: $0 ring_type=goodring|badring" 7 | } 8 | 9 | if [ $# -ne 1 ]; then 10 | usage 11 | exit 1 12 | fi 13 | 14 | ring_type=$1 15 | # Launch the three jobs in the background; redirect stdout to the log first and then duplicate stderr onto it, so both streams land in the file followed by tail below. 16 | $WORKDIR/run_nccl_job_blue.sh $ring_type > /tmp/nccl_setting2_blue.stdout 2>&1 & 17 | $WORKDIR/run_nccl_job_small.sh green > /tmp/nccl_setting2_green.stdout 2>&1 & 18 | $WORKDIR/run_nccl_job_small.sh red > /tmp/nccl_setting2_red.stdout 2>&1 & 19 | 20 | tail -f /tmp/nccl_setting2_blue.stdout /tmp/nccl_setting2_green.stdout /tmp/nccl_setting2_red.stdout 21 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting2/run_all_jobs_once.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | usage() { 
6 | echo "Usage: $0 ring_type=goodring|badring" 7 | } 8 | 9 | if [ $# -ne 2 ]; then 10 | usage 11 | exit 1 12 | fi 13 | 14 | ring_type=$1 15 | trial_id=$2 16 | 17 | OUTPUT_DIR=/tmp/nccl_setting2 18 | 19 | $WORKDIR/run_nccl_job_blue.sh $ring_type |& tee $OUTPUT_DIR/$trial_id.blue.stdout & 20 | $WORKDIR/run_nccl_job_small.sh green |& tee $OUTPUT_DIR/$trial_id.green.stdout & 21 | $WORKDIR/run_nccl_job_small.sh red |& tee $OUTPUT_DIR/$trial_id.red.stdout & 22 | 23 | wait 24 | # tail -f /tmp/nccl_setting2_blue.stdout /tmp/nccl_setting2_green.stdout /tmp/nccl_setting2_red.stdout 25 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting2/run_nccl_all_jobs_multiple_times.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage() { 4 | echo "Usage: $0 ring_type=goodring|badring" 5 | } 6 | 7 | if [ $# -ne 2 ]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | num_iters=$1 13 | if [ $num_iters -gt 20 ]; then 14 | echo "$num_iters too large" 15 | exit 1 16 | fi 17 | 18 | shift 19 | ring_type=$1 20 | 21 | suffix=`date +%Y%m%d.%H.%M.%S` 22 | output_dir=/tmp/nccl_setting2_${ring_type}.${suffix} 23 | mkdir -p $output_dir 24 | unlink /tmp/nccl_setting2 25 | ln -sf $output_dir /tmp/nccl_setting2 26 | 27 | 28 | for i in `seq 1 $num_iters`; do 29 | echo Case $i 30 | ./run_all_jobs_once.sh $ring_type $i 31 | done 32 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting2/run_nccl_job_blue.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | 6 | usage() { 7 | echo "Usage: $0 ring_type=goodring|badring" 8 | } 9 | 10 | if [ $# -ne 1 ]; then 11 | usage 12 | exit 1 13 | fi 14 | 15 | ring_type=$1 16 | 17 | case $ring_type in 18 | goodring) 19 | cat > hostfile.blue.$ring_type < hostfile.blue.$ring_type < ring_type=green|red" 9 
| } 10 | 11 | if [ $# -ne 1 ]; then 12 | usage 13 | exit 1 14 | fi 15 | 16 | color=$1 17 | 18 | case $color in 19 | green) 20 | cat > hostfile.$color < hostfile.$color < int: 30 | return int(rec[4]) 31 | 32 | def get_job_duration(records) -> int: 33 | return sum(map(get_latency, records)) 34 | 35 | def work(app_color: str, trial_id, nccl_output_path) -> None: 36 | results = [] 37 | with open(nccl_output_path, 'r') as fin: 38 | for line in fin: 39 | match = pat.match(line) 40 | if match is not None: 41 | tokens = line.split() 42 | # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0'] 43 | size = tokens[0] 44 | dtype = tokens[2] 45 | latency_us = tokens[9] 46 | algbw = tokens[10] 47 | busbw = tokens[11] 48 | results.append([args.solution, app_color, size, dtype, latency_us, algbw, busbw, trial_id]) 49 | return results 50 | 51 | jobs = [] 52 | 53 | for path in glob.glob(os.path.join(OUTPUT_DIR, '*')): 54 | tokens = path.split('.') 55 | trial_id = tokens[-3].split('/')[-1] 56 | color = tokens[-2] 57 | jobs += [work(color, trial_id, path)] 58 | 59 | # min_dura = min(map(get_job_duration, jobs)) 60 | # print(f'min job duration: {min_dura}') 61 | 62 | for results in jobs: 63 | assert len(results) > args.strip_head + args.strip_tail, "len(results) = {}".format(len(results)) 64 | if args.strip_tail == 0: 65 | stripped_results = results[args.strip_head:] 66 | else: 67 | stripped_results = results[args.strip_head:-args.strip_tail] 68 | for rec in stripped_results: 69 | writer.writerow(rec) 70 | # writer.writerow(rec) 71 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting3/run_all_jobs_once.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | usage() { 6 | echo "Usage: $0 ring_type=goodring|badring" 7 | } 8 | 9 | if [ $# -ne 2 ]; then 10 | usage 11 | exit 1 12 
| fi 13 | 14 | ring_type=$1 15 | trial_id=$2 16 | 17 | OUTPUT_DIR=/tmp/nccl_setting3 18 | 19 | $WORKDIR/run_nccl_job_small.sh $ring_type blue |& tee $OUTPUT_DIR/$trial_id.blue.stdout & 20 | $WORKDIR/run_nccl_job_small.sh $ring_type red |& tee $OUTPUT_DIR/$trial_id.red.stdout & 21 | 22 | wait 23 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting3/run_nccl_all_jobs_multiple_times.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | usage() { 4 | echo "Usage: $0 ring_type=goodring|badring" 5 | } 6 | 7 | if [ $# -ne 2 ]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | num_iters=$1 13 | if [ $num_iters -gt 20 ]; then 14 | echo "$num_iters too large" 15 | exit 1 16 | fi 17 | 18 | ring_type=$2 19 | 20 | suffix=`date +%Y%m%d.%H.%M.%S` 21 | output_dir=/tmp/nccl_setting3.${suffix} 22 | mkdir -p $output_dir 23 | unlink /tmp/nccl_setting3 24 | ln -sf $output_dir /tmp/nccl_setting3 25 | 26 | 27 | for i in `seq 1 $num_iters`; do 28 | echo Case $i 29 | ./run_all_jobs_once.sh $ring_type $i 30 | done 31 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting3/run_nccl_job_small.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | usage() { 6 | echo "Usage: $0 ring_type=goodring|badring, job_color=blue|red" 7 | } 8 | 9 | if [ $# -ne 2 ]; then 10 | usage 11 | exit 1 12 | fi 13 | 14 | ring_type=$1 15 | color=$2 16 | 17 | case $ring_type in 18 | goodring) 19 | cat > hostfile.$color.$ring_type < hostfile.$color.$ring_type < int: 30 | return int(rec[4]) 31 | 32 | def get_job_duration(records) -> int: 33 | return sum(map(get_latency, records)) 34 | 35 | def work(app_color: str, trial_id, nccl_output_path) -> None: 36 | results = [] 37 | with open(nccl_output_path, 'r') as fin: 38 | for line in fin: 39 | match = 
pat.match(line) 40 | if match is not None: 41 | tokens = line.split() 42 | # ['536870912', '33554432', 'float', 'none', '-1', '115671', '4.64', '3.48', '0', '114558', '4.69', '3.51', '0'] 43 | size = tokens[0] 44 | dtype = tokens[2] 45 | latency_us = tokens[9] 46 | algbw = tokens[10] 47 | busbw = tokens[11] 48 | results.append([args.solution, app_color, size, dtype, latency_us, algbw, busbw, trial_id]) 49 | return results 50 | 51 | jobs = [] 52 | 53 | for path in glob.glob(os.path.join(OUTPUT_DIR, '*')): 54 | tokens = path.split('.') 55 | trial_id = tokens[-3].split('/')[-1] 56 | color = tokens[-2] 57 | jobs += [work(color, trial_id, path)] 58 | 59 | # min_dura = min(map(get_job_duration, jobs)) 60 | # print(f'min job duration: {min_dura}') 61 | 62 | for results in jobs: 63 | assert len(results) > args.strip_head + args.strip_tail, "len(results) = {}".format(len(results)) 64 | if args.strip_tail == 0: 65 | stripped_results = results[args.strip_head:] 66 | else: 67 | stripped_results = results[args.strip_head:-args.strip_tail] 68 | for rec in stripped_results: 69 | writer.writerow(rec) 70 | # writer.writerow(rec) 71 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting4/run_all_jobs_once.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | usage() { 6 | echo "Usage: $0 " 7 | } 8 | 9 | if [ $# -ne 1 ]; then 10 | usage 11 | exit 1 12 | fi 13 | 14 | trial_id=$1 15 | 16 | OUTPUT_DIR=/tmp/nccl_setting4 17 | 18 | for job in {blue,red,green}; do 19 | echo $job 20 | $WORKDIR/run_nccl_job.sh $job |& tee $OUTPUT_DIR/$trial_id.$job.stdout & 21 | done 22 | 23 | wait 24 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting4/run_nccl_all_jobs_multiple_times.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 
| usage() { 4 | echo "Usage: $0 " 5 | } 6 | 7 | if [ $# -ne 1 ]; then 8 | usage 9 | exit 1 10 | fi 11 | 12 | num_iters=$1 13 | if [ $num_iters -gt 20 ]; then 14 | echo "$num_iters too large" 15 | exit 1 16 | fi 17 | 18 | suffix=`date +%Y%m%d.%H.%M.%S` 19 | output_dir=/tmp/nccl_setting4.${suffix} 20 | mkdir -p $output_dir 21 | unlink /tmp/nccl_setting4 22 | ln -sf $output_dir /tmp/nccl_setting4 23 | 24 | 25 | for i in `seq 1 $num_iters`; do 26 | echo Case $i 27 | ./run_all_jobs_once.sh $i 28 | done 29 | -------------------------------------------------------------------------------- /nccl-tests-mccs/setting4/run_nccl_job.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | WORKDIR=`dirname $(realpath $0)` 4 | 5 | usage() { 6 | echo "Usage: $0 job=blue|green|red" 7 | } 8 | 9 | if [ $# -ne 1 ]; then 10 | usage 11 | exit 1 12 | fi 13 | 14 | job=$1 15 | 16 | case $job in 17 | blue) 18 | device_id="0,1" 19 | tclass=0 20 | num_channels=2 21 | cat > hostfile.$job < hostfile.$job < hostfile.$job < INT_MAX) return ncclInvalidArgument; 21 | 22 | static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, 23 | ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { 24 | CHECKCOUNT(count); 25 | return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream); 26 | } 27 | static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, 28 | ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { 29 | CHECKCOUNT(count); 30 | return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream); 31 | } 32 | static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, 33 | ncclComm_t comm, cudaStream_t stream) { 34 | CHECKCOUNT(count); 35 | return ncclBcast(buff, (int)count, datatype, root, comm, stream); 36 | } 37 | static ncclResult_t 
ncclReduceScatter(const void* sendbuff, void* recvbuff, 38 | size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, 39 | cudaStream_t stream) { 40 | CHECKCOUNT(recvcount); 41 | return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream); 42 | } 43 | static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, 44 | ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { 45 | CHECKCOUNT(sendcount); 46 | return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream); 47 | } 48 | #endif 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /nccl-tests-mccs/src/timer.cc: -------------------------------------------------------------------------------- 1 | #include "timer.h" 2 | 3 | // Make sure to compile this translation unit with the host compiler and not 4 | // nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0 5 | #include 6 | 7 | namespace { 8 | std::uint64_t now() { 9 | using clock = std::chrono::steady_clock; 10 | return std::chrono::duration_cast(clock::now().time_since_epoch()).count(); 11 | } 12 | } 13 | 14 | timer::timer() { 15 | t0 = now(); 16 | } 17 | 18 | double timer::elapsed() const { 19 | std::uint64_t t1 = now(); 20 | return 1.e-9*(t1 - t0); 21 | } 22 | 23 | double timer::reset() { 24 | std::uint64_t t1 = now(); 25 | double ans = 1.e-9*(t1 - t0); 26 | t0 = t1; 27 | return ans; 28 | } 29 | -------------------------------------------------------------------------------- /nccl-tests-mccs/src/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef _408319ecdd5b47b28bf8f511c4fdf816 2 | #define _408319ecdd5b47b28bf8f511c4fdf816 3 | 4 | #include 5 | 6 | // Can't include because of bug with gcc 10.3.0 7 | class timer { 8 | std::uint64_t t0; 9 | public: 10 | timer(); 11 | double elapsed() const; 12 | double reset(); 13 | }; 14 | 15 | #endif 16 | 
-------------------------------------------------------------------------------- /nccl-tests-mccs/verifiable/Makefile: -------------------------------------------------------------------------------- 1 | include ../../makefiles/common.mk 2 | 3 | .PHONY: all clean 4 | 5 | BUILDDIR := $(abspath ../../build) 6 | NCCLDIR := $(BUILDDIR) 7 | NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include 8 | DST_DIR := $(BUILDDIR)/test/verifiable 9 | 10 | all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o 11 | 12 | clean: 13 | rm -rf $(DST_DIR) 14 | 15 | TEST_VERIFIABLE_SRCDIR := . 16 | TEST_VERIFIABLE_BUILDDIR := $(DST_DIR) 17 | include verifiable.mk 18 | 19 | self_test: $(DST_DIR)/self_test 20 | 21 | $(DST_DIR)/self_test: verifiable.cu verifiable.h 22 | @printf "Linking %s\n" $@ 23 | @mkdir -p $(DST_DIR) 24 | $(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS) 25 | -------------------------------------------------------------------------------- /nccl-tests-mccs/verifiable/verifiable.h: -------------------------------------------------------------------------------- 1 | #ifndef _d41d8cd98f00b204e9800998ecf8427e 2 | #define _d41d8cd98f00b204e9800998ecf8427e 3 | 4 | #include 5 | 6 | #include 7 | 8 | /* Routines for launching kernels that verify reduction results. A significant 9 | * feature of these routines is they carefully craft floating point input 10 | * to produce exactly predictable output. 11 | * 12 | * int elt_ty: actually just a ncclDataType_t 13 | * 14 | * int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically 15 | * created, these are encoded as the value ncclNumOps and their scalar is 16 | * assumed to be `ncclVerifiablePremulScalar(rank_me)` 17 | * 18 | * uint64_t seed: arbitrary 64-bits to use in seeding the random values 19 | * 20 | * intptr_t elt_ix0: index of first element pointed to by elts when generating 21 | * random values. This makes it possible to generate subsequences independently 22 | * as well as in aggregate. 
23 | * 24 | * int rank_n: Number of contributions into the reduction. Non-reduction 25 | * collectives like broadcast, gather, etc will always set this to one. 26 | * 27 | * int rank_me: Index of this contribution 28 | */ 29 | 30 | // Use this as the local scalar for PreMulSum ops 31 | template 32 | __host__ __device__ T ncclVerifiablePremulScalar(int rank_me) { 33 | return T(rank_me%2 == 0 ? 1.0f : 2.0f); 34 | } 35 | 36 | // Enqueue kernel to generate data which is to be reduced. 37 | void ncclVerifiablePrepareInput( 38 | void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me, 39 | uint64_t seed, intptr_t elt_ix0, cudaStream_t stream 40 | ); 41 | 42 | // Enqueue kernel to generate expected results of reduction. 43 | void ncclVerifiablePrepareExpected( 44 | void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, 45 | uint64_t seed, intptr_t elt_ix0, cudaStream_t stream 46 | ); 47 | 48 | // Enqueue kernel to verify reduced data matches expectation. The number of 49 | // failed elements is written to bad_elt_n which must be in cudaHost memory. 50 | // If `expected == nullptr` then the expected results are generated on-the-fly 51 | // which can be costly. Thus if you plan to run the same reduction multiple 52 | // times it is advantageous to precompute the expected values with 53 | // ncclVerifiablePrepareExpected and pass them as `expected` here. 
54 | void ncclVerifiableVerify( 55 | void const *results, void const *expected, intptr_t elt_n, int elt_ty, 56 | int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0, 57 | int64_t *bad_elt_n, cudaStream_t stream 58 | ); 59 | #endif 60 | -------------------------------------------------------------------------------- /nccl-tests-mccs/verifiable/verifiable.mk: -------------------------------------------------------------------------------- 1 | # We require both of the following paths to be set upon including this makefile 2 | # TEST_VERIFIABLE_SRCDIR = 3 | # TEST_VERIFIABLE_BUILDDIR = 4 | 5 | TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h 6 | TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o 7 | # Rebuild the object whenever verifiable.cu or the header listed in TEST_VERIFIABLE_HDRS changes. 8 | $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS) 9 | @printf "Compiling %s\n" $@ 10 | @mkdir -p $(TEST_VERIFIABLE_BUILDDIR) 11 | $(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu 12 | -------------------------------------------------------------------------------- /rust-toolchain: -------------------------------------------------------------------------------- 1 | nightly-2023-11-11 -------------------------------------------------------------------------------- /src/collectives-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "collectives-sys" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [build-dependencies] 9 | bindgen = "0.62" -------------------------------------------------------------------------------- /src/collectives-sys/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | 4 | fn main() { 5 | let dir = env!("CARGO_MANIFEST_DIR"); 6 | let mut build_dir = PathBuf::from(dir); 7 | 
build_dir.pop(); 8 | build_dir.push("collectives/build/device"); 9 | 10 | println!( 11 | "cargo:rustc-link-search={}", 12 | build_dir.as_os_str().to_str().unwrap() 13 | ); 14 | println!("cargo:rustc-link-lib=colldevice"); 15 | println!("cargo:rerun-if-changed=wrapper.h"); 16 | println!( 17 | "cargo:rerun-if-changed={}", 18 | build_dir.as_os_str().to_str().unwrap() 19 | ); 20 | 21 | let bindings = bindgen::Builder::default() 22 | // The input header we would like to generate 23 | // bindings for. 24 | .header("wrapper.h") 25 | .clang_arg("-I../collectives/include") 26 | .clang_arg("-I/usr/local/cuda/include") 27 | .clang_arg("-I/usr/local/cuda/targets/x86_64-linux/include/cuda/std/detail/libcxx/include") 28 | .clang_arg("-x") 29 | .clang_arg("c++") 30 | .clang_arg("-std=c++11") 31 | .clang_arg("-stdlib=libc++") 32 | .allowlist_type("^mccsDev.*") 33 | .allowlist_function("^mccsKernel.*") 34 | .allowlist_var("^MCCS.*") 35 | .default_enum_style(bindgen::EnumVariation::Rust { 36 | non_exhaustive: false, 37 | }) 38 | // Tell cargo to invalidate the built crate whenever any of the 39 | // included header files changed. 40 | .parse_callbacks(Box::new(bindgen::CargoCallbacks)) 41 | .derive_eq(true) 42 | .derive_hash(true) 43 | // Finish the builder and generate the bindings. 44 | .generate() 45 | // Unwrap the Result and panic on failure. 46 | .expect("Unable to generate bindings"); 47 | 48 | // Write the bindings to the $OUT_DIR/bindings.rs file. 
49 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 50 | bindings 51 | .write_to_file(out_path.join("bindings.rs")) 52 | .expect("Couldn't write bindings!"); 53 | } 54 | -------------------------------------------------------------------------------- /src/collectives-sys/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | 5 | #[allow(clippy::all)] 6 | mod binding { 7 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 8 | } 9 | pub use binding::*; 10 | -------------------------------------------------------------------------------- /src/collectives-sys/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" 2 | #include "devcomm.h" 3 | #include "collectives.h" 4 | -------------------------------------------------------------------------------- /src/collectives/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENSE.txt for license information 5 | # 6 | 7 | include makefiles/common.mk 8 | 9 | BUILDDIR ?= $(abspath build) 10 | OBJDIR := $(BUILDDIR)/device 11 | 12 | LIBSRCFILES := src/functions.cu 13 | 14 | LIBSRCFILES += src/all_gather.cu 15 | LIBSRCFILES += src/all_reduce.cu 16 | 17 | 18 | DEPFILES := $(patsubst src/%.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) 19 | DEPENDFILES:= $(DEPFILES:%.d=%.dep) 20 | STATICLIB := $(OBJDIR)/libcolldevice.a 21 | DEVOBJ := $(OBJDIR)/devlink.o 22 | RULESFILE := $(OBJDIR)/Makefile.rules 23 | 24 | NVCUFLAGS += -Isrc -Iinclude --compiler-options "-fPIC -fvisibility=hidden" 25 | 26 | 27 | all: $(STATICLIB) 28 | 29 | # Dummy rule so that the extra dependency (%.dep) files are preserved by make 30 | all_deps: $(DEPENDFILES) 31 | 32 | # Auto-generating the rules per op/reduction/datatype/algorithm 33 | $(RULESFILE) : gen_rules.sh 34 | @printf "Generating %-35s > %s\n" rules $@ 35 | @mkdir -p $(OBJDIR) 36 | @CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@ 37 | 38 | -include $(RULESFILE) 39 | 40 | LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o 41 | 42 | -include $(DEPFILES) 43 | 44 | $(STATICLIB): $(LIBOBJ) $(DEVOBJ) 45 | @printf "Archiving %-35s > %s\n" objects $@ 46 | ar cr $@ $^ 47 | 48 | # We do not want make to build *.d when running make clean. 49 | # So we only provide targets for .dep which will produce .dep and .d, 50 | # with only .d being included, and .dep keeping track of what needs to 51 | # be regenerated. 52 | $(OBJDIR)/%.dep : src/%.cu 53 | @mkdir -p $(OBJDIR) 54 | @$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp 55 | @sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@ 56 | @sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \ 57 | sed -e 's/^ *//' -e 's/$$/:/' >> $@ 58 | @rm -f $@.tmp 59 | @cp $@ $(@:.dep=.d) 60 | 61 | # Compiled kernels and collectives with relocatable device code ... 
62 | $(OBJDIR)/functions.o : src/functions.cu $(OBJDIR)/functions.dep 63 | @printf "Compiling %-35s > %s\n" $< $@ 64 | mkdir -p `dirname $@` 65 | $(NVCC) $(NVCUFLAGS) -dc $< -o $@ 66 | 67 | # ... and create the device-side linked object with all those. 68 | $(DEVOBJ) : $(LIBOBJ) 69 | $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ 70 | 71 | clean: 72 | rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB) 73 | -------------------------------------------------------------------------------- /src/collectives/gen_rules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dir=$1 4 | 5 | datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64" 6 | if [ "$CUDA_MAJOR" -ge 11 ] 7 | then 8 | datatypes+=" bf16" 9 | fi 10 | 11 | targets="GENOBJS := \\\\\n" 12 | 13 | for base in {"all_gather","all_reduce"}; do 14 | opn=0 15 | for op in {"sum","prod"}; do 16 | if [ "$base" = "all_gather" ] && [ "$op" != "sum" ]; then 17 | continue 18 | fi 19 | dtn=0 20 | # Order must match that of the ncclDataType_t enum 21 | for dt in ${datatypes}; do 22 | # Generate a unique filename for each compilation unit, 23 | # otherwise the __nv_module_id may conflict at link time 24 | echo "${dir}/${base}_${op}_${dt}.cu : src/${base}.cu" 25 | echo " @printf \"Copying %-35s > %s\\\\n\" \$< \$@" 26 | echo " cp \$< \$@" 27 | echo "" 28 | # Compile the file 29 | echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu src/${base}.cu ${dir}/${base}.dep" 30 | 31 | echo " @printf \"Compiling %-35s > %s\\\\n\" src/${base}.cu ${dir}/${base}_${op}_${dt}.o" 32 | echo " mkdir -p ${dir}" 33 | echo " \${NVCC} -DMCCS_OP=${opn} -DMCCS_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@" 34 | echo "" 35 | targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" 36 | dtn=$(($dtn + 1)) 37 | done 38 | opn=$(($opn + 1)) 39 | done 40 | done 41 | echo -e "$targets" 42 | -------------------------------------------------------------------------------- 
/src/collectives/include/align.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_ALIGN_H_ 8 | #define NCCL_ALIGN_H_ 9 | 10 | #define DIVUP(x, y) \ 11 | (((x)+(y)-1)/(y)) 12 | 13 | #define ROUNDUP(x, y) \ 14 | (DIVUP((x), (y))*(y)) 15 | 16 | #define ALIGN_SIZE(size, align) \ 17 | size = ((size + (align) - 1) / (align)) * (align); 18 | 19 | #if !__CUDA_ARCH__ 20 | #ifndef __host__ 21 | #define __host__ 22 | #endif 23 | #ifndef __device__ 24 | #define __device__ 25 | #endif 26 | #endif 27 | 28 | template 29 | __host__ __device__ constexpr Z divUp(X x, Y y) { 30 | return (x+y-1)/y; 31 | } 32 | 33 | template 34 | __host__ __device__ constexpr Z roundUp(X x, Y y) { 35 | return (x+y-1) - (x+y-1)%y; 36 | } 37 | 38 | // assumes second argument is a power of 2 39 | template 40 | __host__ __device__ constexpr Z alignUp(X x, int a) { 41 | return (x+a-1) & Z(-a); 42 | } 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /src/collectives/src/all_gather.cu: -------------------------------------------------------------------------------- 1 | #include "all_gather.h" 2 | #include "common.h" 3 | #include "collectives.h" 4 | 5 | IMPL_COLL_C(AllGather); 6 | -------------------------------------------------------------------------------- /src/collectives/src/all_reduce.cu: -------------------------------------------------------------------------------- 1 | #include "all_reduce.h" 2 | #include "common.h" 3 | #include "collectives.h" 4 | 5 | 6 | IMPL_COLL_R(AllReduce); -------------------------------------------------------------------------------- /src/collectives/src/op128.h: 
-------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See LICENSE.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef OP128_H_ 8 | #define OP128_H_ 9 | 10 | inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) { 11 | asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" 12 | : "=l"(v0), "=l"(v1) : "l"(ptr)); 13 | } 14 | 15 | inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) { 16 | asm volatile("st.volatile.global.v2.u64 [%2], {%0,%1};" 17 | :: "l"(v0), "l"(v1), "l"(ptr)); 18 | } 19 | 20 | inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) { 21 | uint64_t* shmemAsmPtr; 22 | asm volatile("cvta.to.shared.u64 %0, %1;" : "=l"(shmemAsmPtr) : "l"(shmemGenericPtr)); 23 | return shmemAsmPtr; 24 | } 25 | 26 | inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) { 27 | asm volatile("ld.volatile.shared.v2.u64 {%0,%1}, [%2];" 28 | : "=l"(v0), "=l"(v1) : "l"(shmemAsmPtr)); 29 | } 30 | 31 | inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) { 32 | asm volatile("st.volatile.shared.v2.u64 [%2], {%0,%1};" 33 | :: "l"(v0), "l"(v1), "l"(shmemAsmPtr)); 34 | } 35 | 36 | template 37 | inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) { 38 | union { 39 | uint32_t tmp4[4]; 40 | uint64_t tmp8[2]; 41 | }; 42 | if(sizeof(T) < 4) { 43 | uint32_t *ptr4 = reinterpret_cast(reinterpret_cast(ptr) & -uintptr_t(4)); 44 | #pragma unroll 45 | for(int e=0; e < 4; e++) { 46 | // Produce 4 bytes of sub-register type by reading 2 4-byte 47 | // aligned values and shifting. 
48 | uint32_t lo, hi; 49 | asm("ld.shared.b32 %0,[%1];" : "=r"(lo) : "l"(ptr4+e+0)); 50 | asm("ld.shared.b32 %0,[%1];" : "=r"(hi) : "l"(ptr4+e+1)); 51 | tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast(ptr))%4)); 52 | } 53 | } 54 | else if(sizeof(T) == 4) { 55 | #pragma unroll 56 | for(int e=0; e < 4; e++) 57 | asm("ld.shared.b32 %0,[%1];" : "=r"(tmp4[e]) : "l"(ptr+e)); 58 | } 59 | else /*sizeof(T)==8*/ { 60 | #pragma unroll 61 | for(int e=0; e < 2; e++) 62 | asm("ld.shared.b64 %0,[%1];" : "=l"(tmp8[e]) : "l"(ptr+e)); 63 | } 64 | v0 = tmp8[0]; 65 | v1 = tmp8[1]; 66 | } 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /src/cuda-sys/cuda-driver-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cuda-driver-sys" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | 10 | [build-dependencies] 11 | cuda-finder = { path = "../cuda-finder" } 12 | bindgen = "0.62" -------------------------------------------------------------------------------- /src/cuda-sys/cuda-driver-sys/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | 4 | use cuda_finder::{find_cuda, find_cuda_windows}; 5 | 6 | fn main() { 7 | let bindings = bindgen::Builder::default() 8 | .header("wrapper.h") 9 | .allowlist_type("^CU.*") 10 | .allowlist_type("^cuuint(32|64)_t") 11 | .allowlist_type("^cudaError_enum") 12 | .allowlist_type("^cu.*Complex$") 13 | .allowlist_type("^cuda.*") 14 | .allowlist_type("^libraryPropertyType.*") 15 | .allowlist_var("^CU.*") 16 | .allowlist_function("^cu.*") 17 | .default_enum_style(bindgen::EnumVariation::Rust { 18 | non_exhaustive: false, 19 | }) 20 | .generate_comments(false) 21 | .derive_default(true) 22 | .derive_eq(true) 23 | 
.derive_hash(true) 24 | .derive_ord(true) 25 | .clang_arg("-I/usr/local/cuda/include") 26 | .generate() 27 | .expect("Unable to generate bindings"); 28 | 29 | // Write the bindings to the $OUT_DIR/bindings.rs file. 30 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 31 | bindings 32 | .write_to_file(out_path.join("bindings.rs")) 33 | .expect("Couldn't write bindings!"); 34 | 35 | if cfg!(target_os = "windows") { 36 | println!( 37 | "cargo:rustc-link-search=native={}", 38 | find_cuda_windows().display() 39 | ); 40 | } else { 41 | for path in find_cuda() { 42 | println!("cargo:rustc-link-search=native={}", path.display()); 43 | } 44 | }; 45 | 46 | println!("cargo:rustc-link-lib=dylib=cuda"); 47 | println!("cargo:rerun-if-changed=build.rs"); 48 | println!("cargo:rerun-if-env-changed=CUDA_LIBRARY_PATH"); 49 | } 50 | -------------------------------------------------------------------------------- /src/cuda-sys/cuda-driver-sys/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | 5 | #[allow(clippy::all)] 6 | mod binding { 7 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 8 | } 9 | pub use binding::*; 10 | -------------------------------------------------------------------------------- /src/cuda-sys/cuda-driver-sys/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "cuComplex.h" 2 | #include "cuda.h" 3 | #include "cudaProfiler.h" 4 | #include "library_types.h" 5 | #include "vector_types.h" -------------------------------------------------------------------------------- /src/cuda-sys/cuda-finder/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cuda-finder" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | glob = "0.3" -------------------------------------------------------------------------------- /src/cuda-sys/cuda-runtime-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cuda-runtime-sys" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | 10 | [build-dependencies] 11 | cuda-finder = { path = "../cuda-finder" } 12 | bindgen = "0.62" -------------------------------------------------------------------------------- /src/cuda-sys/cuda-runtime-sys/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | 4 | use cuda_finder::{find_cuda, find_cuda_windows}; 5 | 6 | fn main() { 7 | let bindings = bindgen::Builder::default() 8 | .header("wrapper.h") 9 | .allowlist_type("^cuda.*") 10 | .allowlist_type("^surfaceReference") 11 | .allowlist_type("^textureReference") 12 | .allowlist_var("^cuda.*") 13 | .allowlist_function("^cuda.*") 14 | .default_enum_style(bindgen::EnumVariation::Rust { 15 | non_exhaustive: false, 16 | }) 17 | .generate_comments(false) 18 | .derive_default(true) 19 | .derive_eq(true) 20 | .derive_hash(true) 21 | .derive_ord(true) 22 | .clang_arg("-I/usr/local/cuda/include") 23 | .generate() 24 | .expect("Unable to generate bindings"); 25 | 26 | // Write the bindings to the $OUT_DIR/bindings.rs file. 
27 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 28 | bindings 29 | .write_to_file(out_path.join("bindings.rs")) 30 | .expect("Couldn't write bindings!"); 31 | 32 | if cfg!(target_os = "windows") { 33 | println!( 34 | "cargo:rustc-link-search=native={}", 35 | find_cuda_windows().display() 36 | ); 37 | } else { 38 | for path in find_cuda() { 39 | println!("cargo:rustc-link-search=native={}", path.display()); 40 | } 41 | }; 42 | 43 | println!("cargo:rustc-link-lib=dylib=cudart"); 44 | println!("cargo:rerun-if-changed=build.rs"); 45 | println!("cargo:rerun-if-env-changed=CUDA_LIBRARY_PATH"); 46 | } 47 | -------------------------------------------------------------------------------- /src/cuda-sys/cuda-runtime-sys/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | 5 | #[allow(clippy::all)] 6 | mod binding { 7 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 8 | } 9 | pub use binding::*; 10 | -------------------------------------------------------------------------------- /src/cuda-sys/cuda-runtime-sys/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "cuda_runtime.h" -------------------------------------------------------------------------------- /src/cuda-sys/nvml-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "nvml-sys" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | 10 | [build-dependencies] 11 | cuda-finder = { path = "../cuda-finder" } 12 | bindgen = "0.62" -------------------------------------------------------------------------------- /src/cuda-sys/nvml-sys/build.rs: 
-------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 | 4 | use cuda_finder::{find_cuda, find_cuda_windows}; 5 | 6 | fn main() { 7 | let bindings = bindgen::Builder::default() 8 | .header("wrapper.h") 9 | .allowlist_type("^NVML.*") 10 | .allowlist_var("^NVML.*") 11 | .allowlist_type("^nvml.*") 12 | .allowlist_function("^nvml.*") 13 | .default_enum_style(bindgen::EnumVariation::Rust { 14 | non_exhaustive: false, 15 | }) 16 | .generate_comments(false) 17 | .derive_default(true) 18 | .derive_eq(true) 19 | .derive_hash(true) 20 | .derive_ord(true) 21 | .clang_arg("-I/usr/local/cuda/include") 22 | .generate() 23 | .expect("Unable to generate bindings"); 24 | 25 | // Write the bindings to the $OUT_DIR/bindings.rs file. 26 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 27 | bindings 28 | .write_to_file(out_path.join("bindings.rs")) 29 | .expect("Couldn't write bindings!"); 30 | 31 | if cfg!(target_os = "windows") { 32 | println!( 33 | "cargo:rustc-link-search=native={}", 34 | find_cuda_windows().display() 35 | ); 36 | } else { 37 | for path in find_cuda() { 38 | println!("cargo:rustc-link-search=native={}", path.display()); 39 | } 40 | }; 41 | 42 | println!("cargo:rustc-link-lib=dylib=nvidia-ml"); 43 | println!("cargo:rerun-if-changed=build.rs"); 44 | println!("cargo:rerun-if-env-changed=CUDA_LIBRARY_PATH"); 45 | } 46 | -------------------------------------------------------------------------------- /src/cuda-sys/nvml-sys/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | 5 | #[allow(clippy::all)] 6 | mod binding { 7 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 8 | } 9 | pub use binding::*; 10 | -------------------------------------------------------------------------------- /src/cuda-sys/nvml-sys/wrapper.h: 
-------------------------------------------------------------------------------- 1 | #include "nvml.h" -------------------------------------------------------------------------------- /src/experimental/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "experimental" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | cuda-runtime-sys = { path = "../cuda-sys/cuda-runtime-sys" } 10 | nvml-sys = { path = "../cuda-sys/nvml-sys" } 11 | -------------------------------------------------------------------------------- /src/experimental/examples/cuda_ipc_client.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::c_void; 2 | use std::io::Read; 3 | use std::mem::size_of; 4 | use std::net::TcpStream; 5 | 6 | use cuda_runtime_sys::{ 7 | cudaError, cudaIpcMemHandle_t, cudaIpcMemLazyEnablePeerAccess, cudaIpcOpenMemHandle, 8 | cudaMemcpy, cudaMemcpyKind, 9 | }; 10 | 11 | const BUFFER_SIZE: usize = 1 * 1024 * 1024; 12 | 13 | fn main() { 14 | let mut buf = vec![0i32; BUFFER_SIZE / size_of::()]; 15 | buf.shrink_to_fit(); 16 | assert_eq!(buf.capacity(), BUFFER_SIZE / size_of::()); 17 | 18 | let mut handle = cudaIpcMemHandle_t::default(); 19 | { 20 | let mut stream = TcpStream::connect("localhost:2042").unwrap(); 21 | stream.set_nonblocking(false).unwrap(); 22 | stream.set_nodelay(true).unwrap(); 23 | stream 24 | .read_exact(unsafe { 25 | std::slice::from_raw_parts_mut( 26 | &mut handle as *mut _ as *mut u8, 27 | size_of::(), 28 | ) 29 | }) 30 | .unwrap(); 31 | } 32 | let mut dev_ptr: *mut c_void = std::ptr::null_mut(); 33 | let err = unsafe { 34 | cudaIpcOpenMemHandle( 35 | &mut dev_ptr as *mut _, 36 | handle, 37 | cudaIpcMemLazyEnablePeerAccess, 38 | ) 39 | }; 40 | if err != cudaError::cudaSuccess { 41 | panic!("cudaIpcOpenMemHandle failed") 42 | 
} 43 | let err = unsafe { 44 | cudaMemcpy( 45 | buf.as_mut_ptr() as *mut _, 46 | dev_ptr, 47 | BUFFER_SIZE, 48 | cudaMemcpyKind::cudaMemcpyDeviceToHost, 49 | ) 50 | }; 51 | if err != cudaError::cudaSuccess { 52 | panic!("cudaMemcpy failed") 53 | } 54 | 55 | for x in buf.iter() { 56 | assert_eq!(*x, 42, "CUDA IPC content mismatch"); 57 | } 58 | println!("buf={}", buf[0]); 59 | } 60 | -------------------------------------------------------------------------------- /src/experimental/examples/cuda_ipc_server.rs: -------------------------------------------------------------------------------- 1 | use std::mem::size_of; 2 | use std::net::TcpListener; 3 | use std::{ffi::c_void, io::Write}; 4 | 5 | use cuda_runtime_sys::{ 6 | cudaError, cudaIpcGetMemHandle, cudaIpcMemHandle_t, cudaMalloc, cudaMemcpy, cudaMemcpyKind, 7 | }; 8 | 9 | const BUFFER_SIZE: usize = 1 * 1024 * 1024; 10 | 11 | fn main() { 12 | let mut dev_ptr: *mut c_void = std::ptr::null_mut(); 13 | let err = unsafe { cudaMalloc(&mut dev_ptr as *mut _, BUFFER_SIZE) }; 14 | if err != cudaError::cudaSuccess { 15 | panic!("cudaMalloc failed") 16 | } 17 | 18 | let buf = vec![42i32; BUFFER_SIZE / size_of::()]; 19 | let err = unsafe { 20 | cudaMemcpy( 21 | dev_ptr, 22 | buf.as_ptr() as *const _, 23 | BUFFER_SIZE, 24 | cudaMemcpyKind::cudaMemcpyHostToDevice, 25 | ) 26 | }; 27 | if err != cudaError::cudaSuccess { 28 | panic!("cudaMemcpy failed") 29 | } 30 | 31 | let mut handle = cudaIpcMemHandle_t::default(); 32 | let err = unsafe { cudaIpcGetMemHandle(&mut handle as *mut _, dev_ptr) }; 33 | if err != cudaError::cudaSuccess { 34 | panic!("cudaIpcGetMemHandle failed") 35 | } 36 | 37 | let listener = TcpListener::bind("localhost:2042").unwrap(); 38 | match listener.accept() { 39 | Ok((mut socket, addr)) => { 40 | socket 41 | .write_all(unsafe { 42 | std::slice::from_raw_parts( 43 | &handle as *const _ as *const u8, 44 | size_of::(), 45 | ) 46 | }) 47 | .unwrap(); 48 | println!("new client: {addr:?}") 49 | } 50 | Err(e) => 
println!("couldn't get client: {e:?}"), 51 | } 52 | std::thread::sleep(std::time::Duration::from_secs(2)); 53 | } 54 | -------------------------------------------------------------------------------- /src/experimental/examples/get_hwinfo.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::CString; 2 | 3 | use cuda_runtime_sys::cudaDeviceGetPCIBusId; 4 | use nvml_sys::{nvmlDeviceGetCpuAffinity, nvmlDeviceGetHandleByPciBusId_v2, nvmlInit_v2}; 5 | 6 | fn main() { 7 | let bus_id = CString::new(b"00000000:00:00.0").unwrap(); 8 | let raw_bus_id = bus_id.as_c_str(); 9 | // including the null terminator 10 | let len = raw_bus_id.to_bytes().len() + 1; 11 | let device = 0; 12 | unsafe { 13 | cudaDeviceGetPCIBusId(raw_bus_id.as_ptr() as *mut _, len as i32, device); 14 | let mut handle = std::ptr::null_mut(); 15 | nvmlInit_v2(); 16 | nvmlDeviceGetHandleByPciBusId_v2(raw_bus_id.as_ptr() as *mut _, &mut handle); 17 | let mut cpu_set = 0u64; 18 | nvmlDeviceGetCpuAffinity(handle, 1, &mut cpu_set); 19 | println!("CPU set for device {}: {:#066b}", device, cpu_set); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/experimental/src/lib.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/gdrcopy-sys/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "gdrcopy-sys" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [build-dependencies] 9 | bindgen = "0.62" -------------------------------------------------------------------------------- /src/gdrcopy-sys/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::PathBuf; 3 
| 4 | fn main() { 5 | let bindings = bindgen::Builder::default() 6 | .header("wrapper.h") 7 | .allowlist_type("^gdr_.*") 8 | .allowlist_var("GPU_.*") 9 | .allowlist_function("^gdr_.*") 10 | .default_enum_style(bindgen::EnumVariation::Rust { 11 | non_exhaustive: false, 12 | }) 13 | .generate_comments(false) 14 | .derive_default(true) 15 | .derive_eq(true) 16 | .derive_hash(true) 17 | .derive_ord(true) 18 | // .clang_arg("-I/usr/local/cuda/include") 19 | .generate() 20 | .expect("Unable to generate bindings"); 21 | 22 | // Write the bindings to the $OUT_DIR/bindings.rs file. 23 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 24 | bindings 25 | .write_to_file(out_path.join("bindings.rs")) 26 | .expect("Couldn't write bindings!"); 27 | 28 | println!("cargo:rustc-link-lib=dylib=gdrapi"); 29 | println!("cargo:rerun-if-changed=build.rs"); 30 | println!("cargo:rerun-if-changed=wrapper.h"); 31 | } 32 | -------------------------------------------------------------------------------- /src/gdrcopy-sys/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(non_upper_case_globals)] 2 | #![allow(non_camel_case_types)] 3 | #![allow(non_snake_case)] 4 | 5 | #[allow(clippy::all)] 6 | mod binding { 7 | include!(concat!(env!("OUT_DIR"), "/bindings.rs")); 8 | } 9 | pub use binding::*; 10 | -------------------------------------------------------------------------------- /src/gdrcopy-sys/wrapper.h: -------------------------------------------------------------------------------- 1 | #include "gdrapi.h" 2 | -------------------------------------------------------------------------------- /src/ibverbs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ibverbs" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies.serde] 9 | version = "1.0" 10 | 
optional = true 11 | features = ["derive"] 12 | 13 | [features] 14 | default = ["serde"] 15 | 16 | [dev-dependencies] 17 | bincode = "1.3" 18 | 19 | [build-dependencies] 20 | bindgen = "0.69" -------------------------------------------------------------------------------- /src/ibverbs/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::{Path, PathBuf}; 3 | use std::process::Command; 4 | 5 | fn main() { 6 | // println!("cargo:include=vendor/rdma-core/build/include"); 7 | // println!("cargo:rustc-link-search=native=vendor/rdma-core/build/lib"); 8 | println!("cargo:rustc-link-lib=ibverbs"); 9 | println!("cargo:rustc-link-lib=mlx5"); 10 | println!("cargo:rerun-if-changed=build.rs"); 11 | // println!("cargo:rerun-if-changed=vendor/rdma-core/libibverbs/verbs.h"); 12 | println!("cargo:rerun-if-changed=wrapper.h"); 13 | 14 | // initialize and update submodules 15 | if Path::new(".git").is_dir() { 16 | Command::new("git") 17 | .args(&["submodule", "update", "--init"]) 18 | .status() 19 | .expect("Failed to update submodules."); 20 | } else { 21 | assert!( 22 | Path::new("vendor/rdma-core").is_dir(), 23 | "vendor source not included" 24 | ); 25 | } 26 | 27 | // build vendor/rdma-core 28 | Command::new("bash") 29 | .current_dir("vendor/rdma-core/") 30 | .args(&["build.sh"]) 31 | .status() 32 | .expect("Failed to build vendor/rdma-core using build.sh"); 33 | 34 | // generate the bindings 35 | let bindings = bindgen::Builder::default() 36 | .header("/usr/include/infiniband/verbs.h") 37 | .header("wrapper.h") 38 | .clang_arg("-Ivendor/rdma-core/build/include/") 39 | .allowlist_function("ibv_.*") 40 | .allowlist_type("ibv_.*") 41 | .allowlist_function("mlx5dv_.*") 42 | .allowlist_var("IBV_LINK_LAYER_.*") 43 | .bitfield_enum("ibv_access_flags") 44 | .bitfield_enum("ibv_qp_attr_mask") 45 | .bitfield_enum("ibv_wc_flags") 46 | .bitfield_enum("ibv_send_flags") 47 | .bitfield_enum("ibv_port_cap_flags") 48 | 
.constified_enum_module("ibv_qp_type") 49 | .constified_enum_module("ibv_qp_state") 50 | .constified_enum_module("ibv_port_state") 51 | .constified_enum_module("ibv_wc_opcode") 52 | .constified_enum_module("ibv_wr_opcode") 53 | .constified_enum_module("ibv_wc_status") 54 | //.constified_enum_module("IBV_WC_.*") 55 | //.constified_enum_module("IBV_WR_.*") 56 | //.constified_enum_module("IBV_QPS_.*") 57 | //.constified_enum_module("IBV_PORT_.*") 58 | .derive_default(true) 59 | .derive_debug(true) 60 | .prepend_enum_name(false) 61 | .blocklist_type("ibv_wc") 62 | .size_t_is_usize(true) 63 | .generate() 64 | .expect("Unable to generate bindings"); 65 | 66 | // write the bindings to the $OUT_DIR/bindings.rs file. 67 | let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); 68 | bindings 69 | .write_to_file(out_path.join("bindings.rs")) 70 | .expect("Could not write bindings"); 71 | } 72 | -------------------------------------------------------------------------------- /src/ibverbs/wrapper.h: -------------------------------------------------------------------------------- 1 | #include 2 | -------------------------------------------------------------------------------- /src/ipc/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ipc" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | ipc-core = { path = "core" } 10 | mccs = { path = "mccs", package = "ipc-mccs", optional = true } 11 | 12 | [features] 13 | default = [] 14 | customer = ["ipc-core/customer"] -------------------------------------------------------------------------------- /src/ipc/core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ipc-core" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | ipc-channel = "0.15.0" 10 | thiserror = "1.0.29" 11 | serde = { version = "1.0.130", features = ["derive"] } 12 | bincode = "1.3.3" 13 | libc = "0.2.103" 14 | shmem-ipc = "0.2.0" 15 | zerocopy = "0.3.0" 16 | memfd = "0.4.1" 17 | memmap2 = "0.5.0" 18 | uuid = "0.8.2" 19 | atomic-traits = "0.3.0" 20 | nix = "0.23.0" 21 | crossbeam = "0.8.1" 22 | unique = "0.9.1" 23 | minstant = "0.1.1" 24 | mio = "0.8.4" 25 | async-io = { version = "1.9.0", optional = true } 26 | 27 | [features] 28 | default = [] 29 | customer = ["dep:async-io"] 30 | -------------------------------------------------------------------------------- /src/ipc/core/src/buf.rs: -------------------------------------------------------------------------------- 1 | //! Buffer to hold the fat pointer of a slice. 2 | use std::slice::SliceIndex; 3 | 4 | use serde::{Deserialize, Serialize}; 5 | use zerocopy::{AsBytes, FromBytes}; 6 | 7 | #[repr(C)] 8 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, FromBytes, AsBytes)] 9 | pub struct Range { 10 | pub offset: u64, 11 | pub len: u64, 12 | } 13 | 14 | impl Range { 15 | #[inline] 16 | pub fn new(mr: &[T], range: R) -> Self 17 | where 18 | R: SliceIndex<[T], Output = [T]>, 19 | { 20 | let buffer = range.index(mr); 21 | let r1 = mr.as_ptr_range(); 22 | let r2 = buffer.as_ptr_range(); 23 | Range { 24 | offset: (r2.start as u64 - r1.start as u64), 25 | len: (r2.end as u64 - r2.start as u64), 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/ipc/core/src/channel/flavors/concurrent.rs: -------------------------------------------------------------------------------- 1 | //! Concurrent channel, encapsulated over crossbeam channel. 
/// Payload variants of a control-plane response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ResponseKind {
    /// path of the engine's domain socket
    NewClient(PathBuf),
    /// Parameters the client needs to connect to an engine.
    ConnectEngine {
        /// name of the OneShotServer
        one_shot_name: String,
        /// data path work queue capacity in bytes
        wq_cap: usize,
        // NOTE(review): presumably the data path completion queue capacity,
        // in the same unit as `wq_cap` — confirm against the producer.
        cq_cap: usize,
    },
}
3 | use std::sync::atomic::{AtomicUsize, Ordering}; 4 | 5 | use serde::Serialize; 6 | 7 | use crate::shmobj::ShmObject; 8 | 9 | pub use ipc_channel::ipc::TryRecvError; 10 | pub(crate) use ipc_channel::ipc::{ 11 | channel, IpcError as IpcRecvError, IpcOneShotServer as OneShotServer, IpcReceiver, IpcSender, 12 | }; 13 | pub(crate) use ipc_channel::Error as IpcSendError; 14 | 15 | pub struct IpcSenderNotify { 16 | inner: IpcSender, 17 | entries: ShmObject, 18 | } 19 | 20 | impl IpcSenderNotify { 21 | pub(crate) fn new(inner: IpcSender, entries: ShmObject) -> Self { 22 | IpcSenderNotify { inner, entries } 23 | } 24 | 25 | pub(crate) fn send(&self, data: T) -> Result<(), bincode::Error> { 26 | self.inner.send(data)?; 27 | self.entries.fetch_add(1, Ordering::Relaxed); 28 | Ok(()) 29 | } 30 | } 31 | 32 | impl Clone for IpcSenderNotify 33 | where 34 | T: Serialize, 35 | { 36 | fn clone(&self) -> IpcSenderNotify { 37 | IpcSenderNotify { 38 | inner: self.inner.clone(), 39 | entries: self.entries.clone(), 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/ipc/core/src/shmem_ipc.rs: -------------------------------------------------------------------------------- 1 | //! 
Re-exports of shmem-ipc 2 | 3 | pub use shmem_ipc::ringbuf::Error as ShmRingbufError; 4 | pub use shmem_ipc::ringbuf::{Receiver as RingReceiver, Sender as RingSender}; 5 | 6 | pub use shmem_ipc::sharedring::{Receiver as ShmReceiver, Sender as ShmSender}; 7 | pub use shmem_ipc::Error as ShmIpcError; 8 | -------------------------------------------------------------------------------- /src/ipc/mccs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ipc-mccs" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | ipc-core = { path = "../core" } 10 | 11 | serde = { version = "1.0.149", features = ["derive"] } 12 | serde-big-array = { version = "0.4.1" } 13 | static_assertions = "1.1.0" 14 | 15 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" } -------------------------------------------------------------------------------- /src/ipc/mccs/src/dp.rs: -------------------------------------------------------------------------------- 1 | pub type WorkRequestSlot = [u8; 128]; 2 | pub type CompletionSlot = [u8; 64]; 3 | 4 | use serde::{Deserialize, Serialize}; 5 | 6 | use super::command::{AllGather, AllReduce}; 7 | 8 | #[repr(C, align(64))] 9 | #[derive(Debug, Clone, Copy, Serialize, Deserialize)] 10 | pub enum WorkRequest { 11 | AllReduce(AllReduce), 12 | AllGather(AllGather), 13 | } 14 | 15 | #[repr(C, align(64))] 16 | #[derive(Debug, Clone, Copy, Serialize, Deserialize)] 17 | pub enum WorkCompletion { 18 | AllReduce, 19 | AllGather, 20 | } 21 | 22 | mod sa { 23 | use super::*; 24 | use static_assertions::const_assert; 25 | use std::mem::size_of; 26 | const_assert!(size_of::() <= size_of::()); 27 | const_assert!(size_of::() <= size_of::()); 28 | } 29 | -------------------------------------------------------------------------------- /src/ipc/mccs/src/handle.rs: 
-------------------------------------------------------------------------------- 1 | use std::os::raw::c_char; 2 | 3 | use cuda_runtime_sys::cudaIpcEventHandle_t; 4 | use serde::{Deserialize, Serialize}; 5 | use serde_big_array::BigArray; 6 | 7 | #[repr(C)] 8 | #[derive(Debug, Clone, Serialize, Deserialize)] 9 | pub struct CudaMemHandle(#[serde(with = "BigArray")] pub [c_char; 64usize]); 10 | 11 | #[repr(C)] 12 | #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)] 13 | pub struct CommunicatorHandle(pub u64); 14 | 15 | #[repr(C)] 16 | #[derive(Debug, Clone, Serialize, Deserialize)] 17 | pub struct CudaEventHandle(#[serde(with = "BigArray")] pub [c_char; 64usize]); 18 | 19 | impl From for CudaEventHandle { 20 | fn from(value: cudaIpcEventHandle_t) -> Self { 21 | Self(value.reserved) 22 | } 23 | } 24 | 25 | impl Into for CudaEventHandle { 26 | fn into(self) -> cudaIpcEventHandle_t { 27 | cudaIpcEventHandle_t { reserved: self.0 } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/ipc/mccs/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod command; 2 | pub mod dp; 3 | pub mod handle; 4 | pub mod reconfig; 5 | -------------------------------------------------------------------------------- /src/ipc/mccs/src/reconfig.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Debug, Clone, Serialize, Deserialize)] 4 | pub struct ChannelPattern { 5 | pub channel_id: u32, 6 | pub ring: Vec, 7 | // (send_rank, recv_rank) -> port 8 | pub udp_sport: Option>, 9 | pub net_dev: Option, 10 | } 11 | 12 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] 13 | #[repr(transparent)] 14 | pub struct CommunicatorId(pub u32); 15 | 16 | #[derive(Debug, Clone, Serialize, Deserialize)] 17 | pub struct CommPatternReconfig { 18 | pub communicator_id: 
CommunicatorId, 19 | pub channels: Vec, 20 | pub ib_traffic_class: Option, 21 | } 22 | 23 | #[derive(Debug, Clone, Serialize, Deserialize)] 24 | pub enum ExchangeReconfigCommand { 25 | CommPatternReconfig(Vec), 26 | } 27 | -------------------------------------------------------------------------------- /src/ipc/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub use ipc_core::*; 2 | 3 | #[cfg(feature = "mccs")] 4 | pub use mccs; 5 | -------------------------------------------------------------------------------- /src/libmccs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "libmccs" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | cuda-runtime-sys = { path = "../cuda-sys/cuda-runtime-sys" } 10 | nvml-sys = { path = "../cuda-sys/nvml-sys" } 11 | 12 | ipc = { path = "../ipc", features = ["mccs", "customer"] } 13 | 14 | lazy_static = "1.4.0" 15 | thiserror = "1.0.37" 16 | serde_json = "1.0.89" 17 | libnuma = "0.0.4" -------------------------------------------------------------------------------- /src/libmccs/src/communicator.rs: -------------------------------------------------------------------------------- 1 | use std::net::IpAddr; 2 | 3 | use cuda_runtime_sys::cudaIpcEventHandle_t; 4 | use cuda_runtime_sys::cudaStream_t; 5 | use cuda_runtime_sys::{cudaEventCreateWithFlags, cudaIpcGetEventHandle, cudaIpcOpenEventHandle}; 6 | use cuda_runtime_sys::{cudaEventDisableTiming, cudaEventInterprocess}; 7 | use ipc::mccs::command::{Command, CommunicatorInit, CompletionKind}; 8 | 9 | use crate::checked_cuda; 10 | use crate::rx_recv_impl; 11 | use crate::Error; 12 | use crate::MccsCommunicatorHandle; 13 | use crate::{MCCS_CTX, MCCS_STREAM_SYNC}; 14 | 15 | pub fn init_communicator_rank( 16 | unique_id: u32, 17 | rank: usize, 18 | 
num_ranks: usize, 19 | cuda_device_idx: i32, 20 | root_addr: IpAddr, 21 | ) -> Result { 22 | let init = CommunicatorInit { 23 | id: unique_id, 24 | rank, 25 | num_ranks, 26 | root_addr, 27 | cuda_device_idx, 28 | }; 29 | let (comm_handle, event_handle) = MCCS_CTX.with(move |ctx| { 30 | let req = Command::InitCommunicator(init); 31 | ctx.service.send_cmd(req)?; 32 | rx_recv_impl!(ctx.service, CompletionKind::InitCommunicator, handles, { 33 | Ok(handles) 34 | }) 35 | })?; 36 | let mut event = std::ptr::null_mut(); 37 | unsafe { 38 | checked_cuda!(cudaIpcOpenEventHandle(&mut event, event_handle.into())); 39 | }; 40 | let handle = MccsCommunicatorHandle { 41 | comm_handle, 42 | backend_event: event, 43 | }; 44 | Ok(handle) 45 | } 46 | 47 | pub fn register_stream(cuda_dev: i32, stream: cudaStream_t) -> Result<(), Error> { 48 | let mut event = std::ptr::null_mut(); 49 | let mut event_handle = cudaIpcEventHandle_t::default(); 50 | unsafe { 51 | checked_cuda!(cudaEventCreateWithFlags( 52 | &mut event, 53 | cudaEventInterprocess | cudaEventDisableTiming 54 | )); 55 | checked_cuda!(cudaIpcGetEventHandle(&mut event_handle, event)); 56 | } 57 | MCCS_STREAM_SYNC.with_borrow_mut(|sync| { 58 | sync.insert(stream, event); 59 | }); 60 | 61 | MCCS_CTX.with(move |ctx| { 62 | let req = Command::RegisterStream(cuda_dev, stream.addr(), event_handle.into()); 63 | ctx.service.send_cmd(req)?; 64 | rx_recv_impl!(ctx.service, CompletionKind::RegisterStream) 65 | })?; 66 | Ok(()) 67 | } 68 | -------------------------------------------------------------------------------- /src/libmccs/src/memory.rs: -------------------------------------------------------------------------------- 1 | use std::os::raw::c_void; 2 | 3 | use cuda_runtime_sys::cudaIpcMemLazyEnablePeerAccess; 4 | use cuda_runtime_sys::cudaIpcOpenMemHandle; 5 | use cuda_runtime_sys::{cudaError, cudaIpcMemHandle_t}; 6 | use ipc::mccs::command::{Command, CompletionKind, MccsDeviceMemoryHandle}; 7 | 8 | use crate::Error; 9 | use 
crate::MCCS_CTX; 10 | use crate::{rx_recv_impl, DevicePtr}; 11 | 12 | pub fn cuda_malloc(device_idx: i32, size: usize) -> Result { 13 | MCCS_CTX.with(|ctx| { 14 | let req = Command::CudaMalloc(device_idx, size); 15 | ctx.service.send_cmd(req)?; 16 | rx_recv_impl!(ctx.service, CompletionKind::CudaMalloc, result, { 17 | let mut dev_ptr: *mut c_void = std::ptr::null_mut(); 18 | let handle = cudaIpcMemHandle_t { 19 | reserved: result.0 .0, 20 | }; 21 | let err = unsafe { 22 | cudaIpcOpenMemHandle( 23 | &mut dev_ptr as *mut _, 24 | handle, 25 | cudaIpcMemLazyEnablePeerAccess, 26 | ) 27 | }; 28 | if err != cudaError::cudaSuccess { 29 | return Err(Error::Cuda(err)); 30 | } 31 | Ok(DevicePtr { 32 | ptr: dev_ptr, 33 | backup_mem: result.1, 34 | }) 35 | }) 36 | }) 37 | } 38 | 39 | pub fn cuda_free(_ptr: MccsDeviceMemoryHandle) { 40 | todo!() 41 | } 42 | -------------------------------------------------------------------------------- /src/mccs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "mccs" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | cuda-driver-sys = { path = "../cuda-sys/cuda-driver-sys" } 10 | cuda-runtime-sys = { path = "../cuda-sys/cuda-runtime-sys" } 11 | nvml-sys = { path = "../cuda-sys/nvml-sys" } 12 | ipc = { path = "../ipc", features = ["mccs"] } 13 | collectives-sys = { path = "../collectives-sys" } 14 | gdrcopy-sys = { path = "../gdrcopy-sys" } 15 | ibverbs = { path = "../ibverbs" } 16 | qos-service = { path = "../qos-service" } 17 | 18 | smol = { version = "2.0.0" } 19 | socket2 = { version = "0.5.5", features = ["all"] } 20 | anyhow = "1.0.66" 21 | bincode = "1.3.3" 22 | serde = "1.0.149" 23 | toml = "0.5.9" 24 | thiserror = "1.0.37" 25 | uuid = { version = "1.2.2", features = ["v4"] } 26 | structopt = "0.3.26" 27 | nix = { version = "0.27.1", features = 
["feature", "net", "socket", "signal", "fs", "mman"] } 28 | dashmap = "5.4.0" 29 | crossbeam = "0.8.2" 30 | rand = "0.8.5" 31 | static_assertions = "1.1.0" 32 | async-trait = "0.1.64" 33 | futures = "0.3.26" 34 | memoffset = "0.8.0" 35 | log = { version = "0.4.20", features = ["max_level_trace", "release_max_level_info"] } 36 | env_logger = "0.10.0" 37 | bitflags = "2.4.1" 38 | once_cell = "1.18.0" 39 | num_enum = "0.7.1" 40 | strum = { version = "0.25", features = ["derive"] } 41 | volatile = "0.5.1" 42 | byteorder = "1.5.0" 43 | atoi = "2.0.0" 44 | bytes = "1.5.0" 45 | itertools = "0.12.0" 46 | chrono = "0.4.31" 47 | fastrand = "2.0.1" 48 | spin = "0.9.8" 49 | better-panic = "0.3.0" 50 | libnuma = "0.0.4" 51 | gcollections = "1.5.0" 52 | -------------------------------------------------------------------------------- /src/mccs/src/bootstrap/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod task; 2 | 3 | use std::net::SocketAddr; 4 | 5 | use serde::{Deserialize, Serialize}; 6 | use smol::lock::Mutex; 7 | use smol::net::{TcpListener, TcpStream}; 8 | use thiserror::Error; 9 | 10 | pub use task::{bootstrap_create_root, bootstrap_root}; 11 | 12 | #[derive(Debug, Error)] 13 | pub enum BootstrapError { 14 | #[error("IO error: {0}")] 15 | Io(#[from] std::io::Error), 16 | #[error("Bootstrap root received inconsistent rank count of {0} vs {1}")] 17 | NumRanksMismatch(usize, usize), 18 | #[error("Bootstrap root received duplicate check-in from rank {0}")] 19 | DuplicatedCheckIn(usize), 20 | #[error("Bootstrap root received incorrect rank number {0}")] 21 | RankOverflow(usize), 22 | #[error("Received {0} bytes instead of {1} bytes")] 23 | RecvSizeMismatch(u32, u32), 24 | #[error( 25 | "Could not acquire Mutex in bootstrap state, only a single outstanding task is allowed" 26 | )] 27 | MutexAcquire, 28 | } 29 | 30 | #[derive(Debug, Clone, Serialize, Deserialize)] 31 | pub struct BootstrapHandle { 32 | pub addr: SocketAddr, 33 
| pub magic: u64, 34 | } 35 | 36 | pub struct UnexpectedConn { 37 | pub stream: TcpStream, 38 | pub peer: usize, 39 | pub tag: u32, 40 | } 41 | 42 | pub struct BootstrapRing { 43 | pub ring_recv: TcpStream, 44 | pub ring_send: TcpStream, 45 | } 46 | 47 | pub struct BootstrapState { 48 | pub listener: TcpListener, 49 | pub ring: Mutex, 50 | pub peer_addrs: Vec, 51 | // Mutex is not necessary as proxy engine will ensure that 52 | // only a single outstanding recv task will access this field 53 | pub unexpected_connections: Mutex>, 54 | pub rank: usize, 55 | pub num_ranks: usize, 56 | pub magic: u64, 57 | } 58 | -------------------------------------------------------------------------------- /src/mccs/src/comm/profile.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use crate::transport::channel::{ChannelId, PeerConnId}; 4 | use crate::transport::net::provider::RDMA_TRANSPORT; 5 | use crate::transport::net::provider::{NetProperties, NetProvierWrap}; 6 | use crate::transport::NUM_PROTOCOLS; 7 | 8 | // comm profile, setting and 9 | pub struct CommProfile { 10 | pub buff_sizes: [usize; NUM_PROTOCOLS], 11 | pub udp_sport_map: HashMap, 12 | pub channel_net_device_map: HashMap, 13 | pub tc: Option, 14 | } 15 | 16 | impl CommProfile { 17 | // (net_device, proxy_rank) 18 | // TODO: choose net dev that is closest to the specified GPU 19 | // and allow admins to specify the set of allowed net devs 20 | #[inline] 21 | pub fn get_network_device( 22 | &self, 23 | channel_id: ChannelId, 24 | my_rank: usize, 25 | _peer_rank: usize, 26 | ) -> (usize, usize) { 27 | let prefix = self.channel_net_device_map.get(&channel_id); 28 | let num_devices = RDMA_TRANSPORT.get_num_devices().unwrap(); 29 | if let Some(prefix) = prefix { 30 | for dev in 0..num_devices { 31 | let props = RDMA_TRANSPORT.get_properties(dev).unwrap(); 32 | if props.name.starts_with(prefix) { 33 | return (dev, my_rank); 34 | } 35 | } 36 | } 37 
| (0, my_rank) 38 | } 39 | 40 | #[inline] 41 | pub fn get_udp_sport(&self, peer_conn_id: &PeerConnId) -> Option { 42 | self.udp_sport_map.get(peer_conn_id).copied() 43 | } 44 | 45 | #[inline] 46 | pub fn get_tc(&self) -> Option { 47 | self.tc 48 | } 49 | 50 | #[inline] 51 | pub fn check_gdr(&self, _rank: usize, _net_dev: usize, _read: bool) -> bool { 52 | false 53 | } 54 | 55 | #[inline] 56 | pub fn check_gdr_need_flush(&self, _rank: usize) -> bool { 57 | false 58 | } 59 | 60 | #[inline] 61 | pub fn get_net_provider(&self) -> &'static dyn NetProvierWrap { 62 | &RDMA_TRANSPORT 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/mccs/src/config.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | use std::net::IpAddr; 3 | use std::path::{Path, PathBuf}; 4 | 5 | use serde::{Deserialize, Serialize}; 6 | 7 | use qos_service::QosScheduleDef; 8 | 9 | use crate::transport::net::config::NetTransportConfig; 10 | use crate::transport::net::provider::RdmaTransportConfig; 11 | use crate::transport::shm::config::ShmTransportConfig; 12 | use crate::transport::NUM_PROTOCOLS; 13 | 14 | #[derive(Debug, Clone, Serialize, Deserialize)] 15 | pub struct DefaultCommConfig { 16 | #[serde(rename = "buffer_sizes")] 17 | pub buf_sizes: [usize; NUM_PROTOCOLS], 18 | pub channel_count: u32, 19 | // TODO: specify number of channels and ring for each channel 20 | } 21 | 22 | impl Default for DefaultCommConfig { 23 | fn default() -> Self { 24 | DefaultCommConfig { 25 | // 4MB 26 | buf_sizes: [1 << 22], 27 | channel_count: 1, 28 | } 29 | } 30 | } 31 | 32 | #[derive(Debug, Clone, Serialize, Deserialize)] 33 | pub struct ChannelPattern { 34 | pub channel_id: u32, 35 | pub ring: Vec, 36 | // (send_rank, recv_rank) -> port 37 | pub udp_sport: Option>, 38 | pub net_dev: Option, 39 | } 40 | 41 | #[derive(Debug, Clone, Serialize, Deserialize)] 42 | pub struct CommPatternConfig { 43 | pub 
communicator_id: u32, 44 | pub channels: Vec, 45 | pub ib_traffic_class: Option, 46 | } 47 | 48 | #[derive(Debug, Clone, Serialize, Deserialize)] 49 | pub struct CommGlobalConfig { 50 | #[serde(rename = "net_rdma", default)] 51 | pub rdma_config: RdmaTransportConfig, 52 | #[serde(rename = "net", default)] 53 | pub net_config: NetTransportConfig, 54 | #[serde(rename = "shm", default)] 55 | pub shm_config: ShmTransportConfig, 56 | } 57 | 58 | impl Default for CommGlobalConfig { 59 | fn default() -> Self { 60 | CommGlobalConfig { 61 | rdma_config: Default::default(), 62 | net_config: Default::default(), 63 | shm_config: Default::default(), 64 | } 65 | } 66 | } 67 | 68 | #[derive(Debug, Clone, Serialize, Deserialize)] 69 | #[serde(deny_unknown_fields)] 70 | pub struct Control { 71 | pub prefix: PathBuf, 72 | pub path: PathBuf, 73 | } 74 | 75 | #[derive(Debug, Clone, Serialize, Deserialize)] 76 | #[serde(deny_unknown_fields)] 77 | pub struct Config { 78 | pub control: Control, 79 | #[serde(default)] 80 | pub comm_global_config: CommGlobalConfig, 81 | #[serde(default)] 82 | pub comm_default_config: DefaultCommConfig, 83 | pub addrs: Vec, 84 | pub listen_port: u16, 85 | pub mccs_daemon_basename: String, 86 | pub mccs_daemon_prefix: PathBuf, 87 | pub qos_schedule: Option, 88 | pub comm_patterns_override: Option>, 89 | } 90 | 91 | impl Config { 92 | pub fn from_path>(path: P) -> anyhow::Result { 93 | let content = fs::read_to_string(path)?; 94 | let config = toml::from_str(&content)?; 95 | Ok(config) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/mccs/src/cuda/mapped_ptr.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::num::NonZeroUsize; 3 | 4 | pub struct DeviceHostPtr { 5 | host_ptr: *const T, 6 | dev_ptr: *const T, 7 | } 8 | 9 | impl DeviceHostPtr { 10 | #[inline] 11 | pub unsafe fn new_unchecked(ptr_host: *mut T, ptr_dev: *mut T) -> Self { 12 | 
DeviceHostPtr { 13 | host_ptr: ptr_host, 14 | dev_ptr: ptr_dev, 15 | } 16 | } 17 | 18 | #[inline] 19 | pub fn new(ptr_host: *mut T, ptr_dev: *mut T) -> Option { 20 | if !ptr_host.is_null() && !ptr_dev.is_null() { 21 | Some(unsafe { Self::new_unchecked(ptr_host, ptr_dev) }) 22 | } else { 23 | None 24 | } 25 | } 26 | 27 | #[must_use] 28 | #[inline] 29 | pub fn addr_host(&self) -> NonZeroUsize { 30 | unsafe { NonZeroUsize::new_unchecked(self.host_ptr.addr()) } 31 | } 32 | 33 | #[must_use] 34 | #[inline] 35 | pub fn addr_dev(&self) -> NonZeroUsize { 36 | unsafe { NonZeroUsize::new_unchecked(self.dev_ptr.addr()) } 37 | } 38 | 39 | #[must_use] 40 | #[inline] 41 | pub fn as_ptr_host(&self) -> *mut T { 42 | self.host_ptr as *mut T 43 | } 44 | 45 | #[must_use] 46 | #[inline] 47 | pub fn as_ptr_dev(&self) -> *mut T { 48 | self.dev_ptr as *mut T 49 | } 50 | 51 | #[inline] 52 | pub fn cast(self) -> DeviceHostPtr { 53 | unsafe { 54 | DeviceHostPtr::new_unchecked(self.as_ptr_host() as *mut U, self.as_ptr_dev() as *mut U) 55 | } 56 | } 57 | } 58 | 59 | impl Clone for DeviceHostPtr { 60 | #[inline(always)] 61 | fn clone(&self) -> Self { 62 | *self 63 | } 64 | } 65 | 66 | impl Copy for DeviceHostPtr {} 67 | 68 | impl fmt::Debug for DeviceHostPtr { 69 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 70 | f.debug_tuple("DeviceHostMapped") 71 | .field(&self.as_ptr_host()) 72 | .field(&self.as_ptr_dev()) 73 | .finish() 74 | } 75 | } 76 | 77 | impl fmt::Pointer for DeviceHostPtr { 78 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 79 | f.debug_tuple("DeviceHostMapped") 80 | .field(&self.as_ptr_host()) 81 | .field(&self.as_ptr_dev()) 82 | .finish() 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/mccs/src/cuda/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod alloc; 2 | pub mod mapped_ptr; 3 | pub mod ptr; 4 | 
// --------------------------------------------------------------------------------
// /src/mccs/src/cuda/ptr.rs:
// --------------------------------------------------------------------------------
use std::fmt;
use std::num::NonZeroUsize;

/// A raw pointer wrapper that is expected to be non-null.
///
/// The wrapper stores a `*const T` and is `#[repr(transparent)]`, so it has
/// the same layout as the raw pointer it wraps. None of the methods in this
/// file dereference the pointer; they only inspect or rewrite its address.
// NOTE(review): the name suggests the pointee lives in device (GPU) memory,
// but nothing in this file enforces or depends on that -- confirm with callers.
#[repr(transparent)]
pub struct DeviceNonNull<T> {
    pointer: *const T,
}

impl<T> DeviceNonNull<T> {
    /// Wraps `ptr` without checking it for null.
    ///
    /// # Safety
    ///
    /// `ptr` must be non-null. [`DeviceNonNull::addr`] builds a
    /// `NonZeroUsize` from the address with `new_unchecked`, which is
    /// undefined behaviour for a zero address.
    pub const unsafe fn new_unchecked(ptr: *mut T) -> Self {
        DeviceNonNull { pointer: ptr as _ }
    }

    /// Wraps `ptr`, returning `None` when it is null.
    #[inline]
    pub fn new(ptr: *mut T) -> Option<Self> {
        if ptr.is_null() {
            None
        } else {
            // SAFETY: `ptr` was just checked to be non-null.
            Some(unsafe { Self::new_unchecked(ptr) })
        }
    }

    /// Returns the address of the pointer as a non-zero integer.
    #[inline]
    pub fn addr(self) -> NonZeroUsize {
        // SAFETY: every constructor requires (or checks) a non-null pointer,
        // so the address is never zero.
        unsafe { NonZeroUsize::new_unchecked(self.pointer.addr()) }
    }

    /// Returns a pointer with the given address, built from `self` through
    /// the raw pointer `with_addr` method.
    #[inline]
    pub fn with_addr(self, addr: NonZeroUsize) -> Self {
        // SAFETY: `addr` is a `NonZeroUsize`, so the resulting pointer is non-null.
        unsafe { Self::new_unchecked(self.pointer.with_addr(addr.get()) as *mut _) }
    }

    /// Applies `f` to the current address and returns the pointer with the
    /// resulting address (see [`DeviceNonNull::with_addr`]).
    #[inline]
    pub fn map_addr(self, f: impl FnOnce(NonZeroUsize) -> NonZeroUsize) -> Self {
        self.with_addr(f(self.addr()))
    }

    /// Returns the wrapped pointer as a `*mut T`.
    #[inline]
    pub const fn as_ptr(self) -> *mut T {
        self.pointer as *mut T
    }

    /// Reinterprets the pointer as pointing to a `U`; the address is unchanged.
    #[inline]
    pub const fn cast<U>(self) -> DeviceNonNull<U> {
        // SAFETY: a cast does not change the address, which is already non-null.
        unsafe { DeviceNonNull::new_unchecked(self.as_ptr() as *mut U) }
    }
}

// `Clone`/`Copy` are written by hand so that they hold for every `T`;
// a derive would add a `T: Clone` / `T: Copy` bound.
impl<T> Clone for DeviceNonNull<T> {
    #[inline(always)]
    fn clone(&self) -> Self {
        *self
    }
}

impl<T> Copy for DeviceNonNull<T> {}

impl<T> fmt::Debug for DeviceNonNull<T> {
    /// Formats as the tuple `DeviceNonNull(<pointer>)`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_tuple("DeviceNonNull")
            .field(&self.as_ptr())
            .finish()
    }
}

impl<T> fmt::Pointer for DeviceNonNull<T> {
    /// Produces the same text as the `Debug` implementation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Debug::fmt(self, f)
    }
}
// --------------------------------------------------------------------------------
/src/mccs/src/daemon/mod.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | pub mod engine; 4 | 5 | #[derive(Debug, Error)] 6 | pub(crate) enum Error { 7 | #[error("ipc-channel TryRecvError")] 8 | IpcTryRecv, 9 | #[error("Customer error: {0}")] 10 | Customer(#[from] ipc::Error), 11 | } 12 | 13 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] 14 | #[repr(transparent)] 15 | pub struct DaemonId(pub u32); 16 | -------------------------------------------------------------------------------- /src/mccs/src/engine.rs: -------------------------------------------------------------------------------- 1 | pub enum SchedulingMode { 2 | Dedicated, 3 | Compact, 4 | } 5 | 6 | pub enum EngineStatus { 7 | Idle, 8 | Progressed, 9 | Completed, 10 | } 11 | 12 | pub trait Engine: Send + Unpin + 'static { 13 | fn progress(&mut self) -> EngineStatus; 14 | 15 | fn scheduling_hint(&self) -> SchedulingMode { 16 | SchedulingMode::Dedicated 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/mccs/src/exchange/command.rs: -------------------------------------------------------------------------------- 1 | use std::net::SocketAddr; 2 | 3 | use crate::bootstrap::BootstrapHandle; 4 | use crate::comm::CommunicatorId; 5 | use ipc::mccs::reconfig::CommPatternReconfig; 6 | 7 | pub enum ExchangeCommand { 8 | RegisterBootstrapHandle(CommunicatorId, BootstrapHandle), 9 | // communicator id, root mccs exchange engine listen addr 10 | RecvBootstrapHandle(CommunicatorId, SocketAddr), 11 | RemoveCommunicator(CommunicatorId), 12 | } 13 | 14 | pub enum ExchangeNotification { 15 | RegisterBootstrapHandle, 16 | RecvBootstrapHandle(CommunicatorId, BootstrapHandle), 17 | CommPatternReconfig(CommPatternReconfig), 18 | } 19 | -------------------------------------------------------------------------------- /src/mccs/src/exchange/message.rs: 
-------------------------------------------------------------------------------- 1 | use std::net::SocketAddr; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use ipc::mccs::reconfig::ExchangeReconfigCommand; 6 | 7 | use crate::bootstrap::BootstrapHandle; 8 | use crate::comm::CommunicatorId; 9 | 10 | #[derive(Debug, Clone, Serialize, Deserialize)] 11 | pub enum ExchangeProxyMessage { 12 | BootstrapHandle(CommunicatorId, BootstrapHandle), 13 | BootstrapHandleRequest(CommunicatorId, SocketAddr), 14 | } 15 | 16 | #[derive(Debug, Clone, Serialize, Deserialize)] 17 | pub enum ExchangeMessage { 18 | ProxyMessage(ExchangeProxyMessage), 19 | ReconfigCommand(ExchangeReconfigCommand), 20 | } 21 | -------------------------------------------------------------------------------- /src/mccs/src/exchange/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod command; 2 | pub mod engine; 3 | pub mod message; 4 | 5 | use thiserror::Error; 6 | 7 | #[derive(Debug, Error)] 8 | pub enum ExchangeError { 9 | #[error("IO error: {0}")] 10 | Io(#[from] std::io::Error), 11 | #[error("Bincode error: {0}")] 12 | Bincode(#[from] bincode::Error), 13 | } 14 | -------------------------------------------------------------------------------- /src/mccs/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![feature(peer_credentials_unix_socket)] 2 | #![feature(strict_provenance)] 3 | #![feature(maybe_uninit_uninit_array)] 4 | #![feature(maybe_uninit_array_assume_init)] 5 | #![feature(int_roundings)] 6 | #![feature(variant_count)] 7 | #![feature(atomic_from_mut)] 8 | #![feature(slice_ptr_get)] 9 | #![feature(extract_if)] 10 | // todo: temporary 11 | #![allow(dead_code)] 12 | #![allow(unused)] 13 | 14 | pub mod bootstrap; 15 | #[allow(dead_code)] 16 | pub mod comm; 17 | pub mod config; 18 | pub mod control; 19 | pub mod cuda; 20 | pub mod daemon; 21 | pub mod engine; 22 | pub mod exchange; 23 | pub mod 
message; 24 | pub mod pattern; 25 | pub mod proxy; 26 | pub mod registry; 27 | pub mod runtime; 28 | pub mod transport; 29 | pub mod utils; 30 | -------------------------------------------------------------------------------- /src/mccs/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | use std::sync::atomic::{AtomicBool, Ordering}; 3 | 4 | use nix::sys::signal; 5 | 6 | use anyhow::Result; 7 | use structopt::StructOpt; 8 | 9 | use mccs::config::Config; 10 | use mccs::control::Control; 11 | 12 | use chrono::Timelike; 13 | use env_logger::fmt::Color; 14 | use std::io::Write; 15 | 16 | #[derive(Debug, Clone, StructOpt)] 17 | #[structopt(name = "mCCS Service")] 18 | struct Opts { 19 | /// Phoenix config path 20 | #[structopt(short, long, default_value = "mccs.toml")] 21 | config: PathBuf, 22 | #[structopt(short, long)] 23 | host: usize, 24 | } 25 | 26 | static TERMINATE: AtomicBool = AtomicBool::new(false); 27 | 28 | extern "C" fn handle_sigint(sig: i32) { 29 | assert_eq!(sig, signal::SIGINT as i32); 30 | TERMINATE.store(true, Ordering::Relaxed); 31 | } 32 | 33 | fn main() -> Result<()> { 34 | better_panic::install(); 35 | // load config 36 | let opts = Opts::from_args(); 37 | let config = Config::from_path(opts.config)?; 38 | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")) 39 | .format(|buf, record| { 40 | let time = chrono::Local::now(); 41 | let style = buf 42 | .style() 43 | .set_color(Color::Black) 44 | .set_intense(true) 45 | .clone(); 46 | let time = format!( 47 | "{:02}:{:02}:{:02}.{:03}", 48 | time.hour() % 24, 49 | time.minute(), 50 | time.second(), 51 | time.timestamp_subsec_millis() 52 | ); 53 | writeln!( 54 | buf, 55 | "{}{} {} {}{} {}", 56 | style.value("["), 57 | time, 58 | buf.default_styled_level(record.level()), 59 | record.module_path().unwrap_or(""), 60 | style.value("]"), 61 | record.args() 62 | ) 63 | }) 64 | .init(); 65 | 66 | // process 
Ctrl-C event 67 | let sig_action = signal::SigAction::new( 68 | signal::SigHandler::Handler(handle_sigint), 69 | signal::SaFlags::empty(), 70 | signal::SigSet::empty(), 71 | ); 72 | unsafe { signal::sigaction(signal::SIGINT, &sig_action) } 73 | .expect("failed to register sighandler"); 74 | 75 | // the Control now takes over 76 | let mut control = Control::new(config, opts.host); 77 | log::info!("Started mCCS"); 78 | 79 | control.mainloop(&TERMINATE) 80 | } 81 | -------------------------------------------------------------------------------- /src/mccs/src/message.rs: -------------------------------------------------------------------------------- 1 | use crate::daemon::DaemonId; 2 | use crate::proxy::command::{ProxyCommand, ProxyCompletion}; 3 | use cuda_runtime_sys::cudaStream_t; 4 | 5 | use crate::transport::engine::TransportEngineId; 6 | use crate::transport::message::{TransportEngineReply, TransportEngineRequest}; 7 | use crate::utils::duplex_chan::DuplexChannel; 8 | 9 | pub enum ControlRequest { 10 | NewTransportEngine(TransportEngineId), 11 | } 12 | 13 | pub enum ControlNotification { 14 | NewDaemon { 15 | id: DaemonId, 16 | chan: DuplexChannel, 17 | }, 18 | NewTransportEngine { 19 | id: TransportEngineId, 20 | chan: DuplexChannel, 21 | }, 22 | } 23 | 24 | #[derive(Debug, Clone)] 25 | pub struct CudaStream(usize); 26 | 27 | impl Into for CudaStream { 28 | fn into(self) -> cudaStream_t { 29 | self.0 as cudaStream_t 30 | } 31 | } 32 | 33 | impl From for CudaStream { 34 | fn from(value: cudaStream_t) -> Self { 35 | Self(value as usize) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/mccs/src/pattern.rs: -------------------------------------------------------------------------------- 1 | pub const MCCS_STEP: u32 = 8; 2 | pub const ALLGATHER_CHUNK_STEPS: u32 = MCCS_STEP / 2; 3 | pub const ALLREDUCE_CHUNK_STEPS: u32 = MCCS_STEP / 2; 4 | pub const ALLGATHER_SLICE_STEPS: u32 = MCCS_STEP / 4; 5 | pub const 
ALLREDUCE_SLICE_STEPS: u32 = MCCS_STEP / 4; 6 | 7 | #[derive(Clone, Debug)] 8 | pub struct RingPattern { 9 | pub prev: usize, 10 | pub next: usize, 11 | pub user_ranks: Vec, 12 | // rank 0's distance to my rank along the ring send path 13 | pub index: usize, 14 | } 15 | -------------------------------------------------------------------------------- /src/mccs/src/proxy/command.rs: -------------------------------------------------------------------------------- 1 | use std::net::IpAddr; 2 | 3 | use crate::comm::CommunicatorId; 4 | use ipc::mccs::handle::CudaEventHandle; 5 | 6 | use super::task::{TaskDataType, TaskReduceOpType}; 7 | 8 | pub struct InitCommunicator { 9 | pub communicator_id: CommunicatorId, 10 | pub root_mccs_addr: IpAddr, 11 | pub rank: usize, 12 | pub num_ranks: usize, 13 | } 14 | 15 | #[derive(Clone, Debug)] 16 | pub struct AllGatherRequest { 17 | pub communicator_id: CommunicatorId, 18 | pub send_buf_addr: usize, 19 | pub recv_buf_addr: usize, 20 | pub size: usize, 21 | // user stream handle 22 | pub user_stream: usize, 23 | } 24 | 25 | #[derive(Clone, Debug)] 26 | pub struct AllReduceRequest { 27 | pub communicator_id: CommunicatorId, 28 | pub send_buf_addr: usize, 29 | pub recv_buf_addr: usize, 30 | pub size: usize, 31 | pub data_type: TaskDataType, 32 | pub op_type: TaskReduceOpType, 33 | // user stream handle 34 | pub user_stream: usize, 35 | } 36 | 37 | pub enum CollRequest { 38 | AllGather(AllGatherRequest), 39 | AllReduce(AllReduceRequest), 40 | } 41 | 42 | pub enum ProxyCommand { 43 | InitCommunicator(InitCommunicator), 44 | // user stream and user event IPC handle 45 | RegisterStream(usize, CudaEventHandle), 46 | AllGather(AllGatherRequest), 47 | AllReduce(AllReduceRequest), 48 | GroupCall(Vec), 49 | DestroyCommunicator(CommunicatorId), 50 | } 51 | 52 | pub enum ProxyCompletion { 53 | InitCommunicator(CudaEventHandle), 54 | RegisterStream, 55 | AllGather, 56 | AllReduce, 57 | GroupCall, 58 | } 59 | 
-------------------------------------------------------------------------------- /src/mccs/src/proxy/message.rs: -------------------------------------------------------------------------------- 1 | use crate::comm::CommunicatorId; 2 | use crate::transport::channel::PeerConnId; 3 | use crate::transport::transporter::ConnectHandle; 4 | 5 | pub enum ProxyPeerMessage { 6 | ConnectInfoExchange(CommunicatorId, PeerConnId, ConnectHandle), 7 | } 8 | -------------------------------------------------------------------------------- /src/mccs/src/proxy/mod.rs: -------------------------------------------------------------------------------- 1 | use std::net::IpAddr; 2 | 3 | pub mod command; 4 | pub mod engine; 5 | pub mod init; 6 | pub mod message; 7 | pub mod op; 8 | pub mod plan; 9 | pub mod task; 10 | 11 | pub struct DeviceInfo { 12 | pub host: IpAddr, 13 | pub listen_port: u16, 14 | pub cuda_device_idx: i32, 15 | } 16 | -------------------------------------------------------------------------------- /src/mccs/src/proxy/op.rs: -------------------------------------------------------------------------------- 1 | use crate::{comm::CommunicatorId, daemon::DaemonId}; 2 | 3 | pub enum ProxyOp { 4 | InitCommunicator(DaemonId, CommunicatorId), 5 | RebootCommunicator(CommunicatorId), 6 | PollCommunicatorComplete(CommunicatorId), 7 | } 8 | -------------------------------------------------------------------------------- /src/mccs/src/registry.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::Arc; 3 | 4 | use crate::comm::CommunicatorId; 5 | use crate::config::CommPatternConfig; 6 | use crate::config::DefaultCommConfig; 7 | use crate::transport::catalog::TransportCatalog; 8 | use crate::transport::delegator::TransportDelegator; 9 | 10 | #[derive(Clone)] 11 | pub struct GlobalRegistry { 12 | pub default_comm_config: DefaultCommConfig, 13 | pub comm_pattern_override: HashMap, 14 | pub 
transport_delegator: Arc, 15 | pub transport_catalog: Arc, 16 | } 17 | -------------------------------------------------------------------------------- /src/mccs/src/runtime/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod affinity; 2 | pub mod executor; 3 | pub mod manager; 4 | 5 | pub use affinity::CoreMask; 6 | pub use executor::{Runtime, RuntimeId, RuntimeMode}; 7 | pub use manager::RuntimeManager; 8 | 9 | use crate::engine::Engine; 10 | pub type EngineContainer = Box; 11 | -------------------------------------------------------------------------------- /src/mccs/src/transport/catalog.rs: -------------------------------------------------------------------------------- 1 | use std::any::Any; 2 | 3 | use dashmap::mapref::one::{MappedRef, MappedRefMut}; 4 | use dashmap::DashMap; 5 | use thiserror::Error; 6 | 7 | pub type AnyConfig = Box; 8 | pub type ConfigRef<'a, T> = MappedRef<'a, String, AnyConfig, T>; 9 | pub type ConfigRefMut<'a, T> = MappedRefMut<'a, String, AnyConfig, T>; 10 | 11 | #[derive(Error, Debug)] 12 | pub enum Error { 13 | #[error("Fail to downcast to a concrete type")] 14 | Downcast, 15 | #[error("Resources not found")] 16 | NotFound, 17 | } 18 | 19 | // TODO: temporary solution for async agent setup 20 | pub struct TransportCatalog { 21 | config: DashMap, 22 | } 23 | 24 | impl TransportCatalog { 25 | pub fn new() -> Self { 26 | TransportCatalog { 27 | config: DashMap::new(), 28 | } 29 | } 30 | } 31 | 32 | impl Default for TransportCatalog { 33 | fn default() -> Self { 34 | Self::new() 35 | } 36 | } 37 | 38 | impl TransportCatalog { 39 | pub fn register_config(&self, name: String, config: T) 40 | where 41 | T: Any + Send + Sync, 42 | { 43 | let boxed = Box::new(config); 44 | self.config.insert(name, boxed); 45 | } 46 | 47 | pub fn remove_config(&self, name: &str) { 48 | self.config.remove(name); 49 | } 50 | 51 | pub fn get_config(&self, name: &str) -> Result, Error> 52 | where 53 | T: Any + Send 
+ Sync, 54 | { 55 | let config = self.config.get(name); 56 | if let Some(entry) = config { 57 | let concrete = entry 58 | .try_map(|x| x.downcast_ref::()) 59 | .map_err(|_| Error::Downcast)?; 60 | Ok(concrete) 61 | } else { 62 | Err(Error::NotFound) 63 | } 64 | } 65 | 66 | pub fn get_config_mut(&self, name: &str) -> Result, Error> 67 | where 68 | T: Any + Send + Sync, 69 | { 70 | let config = self.config.get_mut(name); 71 | if let Some(entry) = config { 72 | let concrete = entry 73 | .try_map(|x| x.downcast_mut::()) 74 | .map_err(|_| Error::Downcast)?; 75 | Ok(concrete) 76 | } else { 77 | Err(Error::NotFound) 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/mccs/src/transport/channel.rs: -------------------------------------------------------------------------------- 1 | use std::any::Any; 2 | use std::collections::HashMap; 3 | 4 | use super::engine::TransportEngineId; 5 | use super::transporter::Transporter; 6 | use super::NUM_PROTOCOLS; 7 | use crate::cuda::ptr::DeviceNonNull; 8 | use crate::pattern::RingPattern; 9 | use std::fmt::Display; 10 | 11 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] 12 | pub enum ConnType { 13 | Send, 14 | Recv, 15 | } 16 | 17 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] 18 | pub struct PeerConnId { 19 | pub(crate) peer_rank: usize, 20 | pub(crate) channel: ChannelId, 21 | pub(crate) conn_index: u32, 22 | pub(crate) conn_type: ConnType, 23 | } 24 | 25 | pub struct PeerConnInfo { 26 | pub bufs: [DeviceNonNull; NUM_PROTOCOLS], 27 | pub head: DeviceNonNull, 28 | pub tail: DeviceNonNull, 29 | pub slots_size: Option>, 30 | } 31 | 32 | pub struct PeerConnector { 33 | pub conn_info: PeerConnInfo, 34 | pub transport_agent_engine: Option, 35 | pub transporter: &'static dyn Transporter, 36 | pub transport_resources: Box, 37 | } 38 | 39 | pub const CHANNEL_MAX_CONNS: usize = 2; 40 | 41 | pub struct ChannelPeerConn { 42 | // conn_index -> PeerConnector 43 | pub send: 
[Option; CHANNEL_MAX_CONNS], 44 | // conn_index -> PeerConnector 45 | pub recv: [Option; CHANNEL_MAX_CONNS], 46 | } 47 | 48 | pub struct CommChannel { 49 | // peer -> ChannelPeerConn 50 | pub peers: HashMap, 51 | pub ring: RingPattern, 52 | pub work_queue_next_available: u32, 53 | } 54 | 55 | #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] 56 | pub struct ChannelId(pub u32); 57 | 58 | impl Display for ChannelId { 59 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 60 | f.write_str(self.0.to_string().as_str()) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/mccs/src/transport/message.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | op::TransportOp, 3 | transporter::{AgentMessage, TransportAgentId, Transporter}, 4 | }; 5 | 6 | pub enum TransportEngineRequest { 7 | AgentSetup(&'static dyn Transporter, TransportAgentId, AgentMessage), 8 | AgentConnect(&'static dyn Transporter, TransportAgentId, AgentMessage), 9 | AgentTransportOp(TransportAgentId, TransportOp), 10 | AgentShutdown(TransportAgentId), 11 | } 12 | 13 | pub enum TransportEngineReply { 14 | AgentSetup(TransportAgentId, AgentMessage), 15 | AgentConnect(TransportAgentId, AgentMessage), 16 | AgentShutdown(TransportAgentId), 17 | } 18 | -------------------------------------------------------------------------------- /src/mccs/src/transport/meta.rs: -------------------------------------------------------------------------------- 1 | use std::ffi::c_void; 2 | 3 | use super::NUM_BUFFER_SLOTS; 4 | const CACHE_LINE_SIZE: usize = 128; 5 | 6 | #[repr(C, align(4096))] 7 | pub struct SendBufMeta { 8 | pub head: u64, 9 | _pad1: [u8; CACHE_LINE_SIZE - std::mem::size_of::()], 10 | _ptr_exchange: *mut c_void, 11 | _reduce_op_arg_exchange: [u64; 2], 12 | _pad2: 13 | [u8; CACHE_LINE_SIZE - std::mem::size_of::<*mut c_void>() - 2 * std::mem::size_of::()], 14 | _slots_offsets: 
[i32; NUM_BUFFER_SLOTS], 15 | } 16 | 17 | static_assertions::const_assert_eq!(std::mem::size_of::(), 4096); 18 | 19 | impl SendBufMeta { 20 | pub fn new() -> Self { 21 | SendBufMeta { 22 | head: 0, 23 | _pad1: [0; CACHE_LINE_SIZE - std::mem::size_of::()], 24 | _ptr_exchange: std::ptr::null_mut(), 25 | _reduce_op_arg_exchange: [0; 2], 26 | _pad2: [0; CACHE_LINE_SIZE 27 | - std::mem::size_of::<*mut c_void>() 28 | - 2 * std::mem::size_of::()], 29 | _slots_offsets: [0; NUM_BUFFER_SLOTS], 30 | } 31 | } 32 | } 33 | 34 | impl Default for SendBufMeta { 35 | fn default() -> Self { 36 | Self::new() 37 | } 38 | } 39 | 40 | #[repr(C, align(4096))] 41 | pub struct RecvBufMeta { 42 | pub tail: u64, 43 | _pad1: [u8; CACHE_LINE_SIZE - std::mem::size_of::()], 44 | pub slots_sizes: [i32; NUM_BUFFER_SLOTS], 45 | _slots_offsets: [i32; NUM_BUFFER_SLOTS], 46 | _flush: i32, 47 | } 48 | 49 | static_assertions::const_assert_eq!(std::mem::size_of::(), 4096); 50 | 51 | impl Default for RecvBufMeta { 52 | fn default() -> Self { 53 | Self::new() 54 | } 55 | } 56 | 57 | impl RecvBufMeta { 58 | pub fn new() -> Self { 59 | RecvBufMeta { 60 | tail: 0, 61 | _pad1: [0; CACHE_LINE_SIZE - std::mem::size_of::()], 62 | slots_sizes: [0; NUM_BUFFER_SLOTS], 63 | _slots_offsets: [0; NUM_BUFFER_SLOTS], 64 | _flush: 0, 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/mccs/src/transport/mod.rs: -------------------------------------------------------------------------------- 1 | use strum::{EnumCount, EnumIter}; 2 | 3 | use collectives_sys::MCCS_BUFFER_SLOTS; 4 | use collectives_sys::{MCCS_NUM_PROTOCOLS, MCCS_PROTO_SIMPLE}; 5 | 6 | pub mod catalog; 7 | pub mod channel; 8 | pub mod delegator; 9 | pub mod engine; 10 | pub mod message; 11 | pub mod meta; 12 | pub mod net; 13 | pub mod op; 14 | pub mod queue; 15 | pub mod setup; 16 | pub mod shm; 17 | pub mod task; 18 | pub mod transporter; 19 | 20 | use net::transporter::NetTransport; 21 | use 
shm::transporter::ShmTransporter; 22 | use transporter::Transporter; 23 | 24 | pub static SHM_TRANSPORTER: ShmTransporter = ShmTransporter; 25 | pub static NET_TRANSPORTER: NetTransport = NetTransport; 26 | pub static ALL_TRANSPORTERS: &[&'static dyn Transporter] = &[&SHM_TRANSPORTER, &NET_TRANSPORTER]; 27 | 28 | pub const NUM_BUFFER_SLOTS: usize = MCCS_BUFFER_SLOTS as _; 29 | pub const NUM_PROTOCOLS: usize = MCCS_NUM_PROTOCOLS as _; 30 | 31 | #[derive(Debug, PartialEq, Eq, Clone, Copy, EnumIter, EnumCount)] 32 | #[repr(usize)] 33 | pub enum Protocol { 34 | Simple = MCCS_PROTO_SIMPLE as _, 35 | } 36 | 37 | static_assertions::const_assert_eq!(std::mem::variant_count::(), NUM_PROTOCOLS); 38 | 39 | pub const DEFAULT_BUFFER_SIZE: usize = 1 << 22; 40 | -------------------------------------------------------------------------------- /src/mccs/src/transport/net/config.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Debug, Clone, Serialize, Deserialize)] 4 | pub struct NetTransportConfig { 5 | pub gdr_enable: bool, 6 | pub gdr_copy_sync_enable: bool, 7 | pub gdr_copy_flush_enable: bool, 8 | } 9 | 10 | impl Default for NetTransportConfig { 11 | fn default() -> Self { 12 | NetTransportConfig { 13 | gdr_enable: false, 14 | gdr_copy_sync_enable: false, 15 | gdr_copy_flush_enable: false, 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/mccs/src/transport/net/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod agent; 2 | pub mod buffer; 3 | pub mod config; 4 | pub mod provider; 5 | pub mod resources; 6 | pub mod transporter; 7 | 8 | pub use provider::RDMA_TRANSPORT; 9 | pub use provider::{NetProperties, NetProvierWrap}; 10 | pub use transporter::NET_TRANSPORT; 11 | 12 | use thiserror::Error; 13 | 14 | use crate::transport::transporter::ConnectHandleError; 15 | use 
provider::NetProviderError; 16 | 17 | #[derive(Debug, Error)] 18 | pub enum NetTransportError { 19 | #[error("Failed to downcast setup resources")] 20 | DowncastSetupResources, 21 | #[error("Failed to downcast agent reply")] 22 | DowncastAgentReply, 23 | #[error("Invalid agent reply")] 24 | InvalidAgentReply, 25 | #[error("Connection handle: {0}")] 26 | ConnectionHandle(#[from] ConnectHandleError), 27 | #[error("Net provider error: {0}")] 28 | NetProvider(#[from] NetProviderError), 29 | } 30 | 31 | #[derive(Debug, Error)] 32 | pub enum NetAgentError { 33 | #[error("Net provider error: {0}")] 34 | NetProvider(#[from] NetProviderError), 35 | #[error("Ring buffer registration error: {0}")] 36 | BufferRegistration(String), 37 | #[error("Failed to downcast agent request")] 38 | DowncastAgentRequest, 39 | #[error("Failed to downcast agent resources")] 40 | DowncastAgentResources, 41 | #[error("Transport catalog error: {0}")] 42 | TransportCatalog(#[from] crate::transport::catalog::Error), 43 | } 44 | -------------------------------------------------------------------------------- /src/mccs/src/transport/shm/config.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Debug, Clone, Serialize, Deserialize)] 4 | pub struct ShmTransportConfig { 5 | pub locality: ShmLocality, 6 | #[serde(rename = "memcpy_send")] 7 | pub use_memcpy_send: bool, 8 | #[serde(rename = "memcpy_recv")] 9 | pub use_memcpy_recv: bool, 10 | } 11 | 12 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] 13 | pub enum ShmLocality { 14 | Sender, 15 | Receiver, 16 | } 17 | 18 | impl Default for ShmTransportConfig { 19 | fn default() -> Self { 20 | ShmTransportConfig { 21 | locality: ShmLocality::Sender, 22 | use_memcpy_send: false, 23 | use_memcpy_recv: false, 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/mccs/src/transport/shm/mod.rs: 
-------------------------------------------------------------------------------- 1 | pub mod agent; 2 | pub mod buffer; 3 | pub mod config; 4 | pub mod resources; 5 | pub mod transporter; 6 | -------------------------------------------------------------------------------- /src/mccs/src/transport/task.rs: -------------------------------------------------------------------------------- 1 | use super::{op::TransportOp, transporter::TransportAgentId}; 2 | 3 | pub struct TransportTask { 4 | pub(crate) agent_id: TransportAgentId, 5 | pub(crate) op: TransportOp, 6 | } 7 | -------------------------------------------------------------------------------- /src/mccs/src/utils/duplex_chan.rs: -------------------------------------------------------------------------------- 1 | pub struct DuplexChannel { 2 | pub tx: crossbeam::channel::Sender, 3 | pub rx: crossbeam::channel::Receiver, 4 | } 5 | 6 | impl DuplexChannel { 7 | pub fn new_unbound_pair() -> (DuplexChannel, DuplexChannel) { 8 | let (t_tx, t_rx) = crossbeam::channel::unbounded(); 9 | let (r_tx, r_rx) = crossbeam::channel::unbounded(); 10 | ( 11 | DuplexChannel { tx: t_tx, rx: r_rx }, 12 | DuplexChannel { tx: r_tx, rx: t_rx }, 13 | ) 14 | } 15 | 16 | pub fn new_bound_pair( 17 | t_to_r: usize, 18 | r_to_t: usize, 19 | ) -> (DuplexChannel, DuplexChannel) { 20 | let (t_tx, t_rx) = crossbeam::channel::bounded(t_to_r); 21 | let (r_tx, r_rx) = crossbeam::channel::bounded(r_to_t); 22 | ( 23 | DuplexChannel { tx: t_tx, rx: r_rx }, 24 | DuplexChannel { tx: r_tx, rx: t_rx }, 25 | ) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/mccs/src/utils/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod duplex_chan; 2 | pub mod gdr; 3 | pub mod interfaces; 4 | pub mod pool; 5 | pub mod tcp; 6 | 7 | #[macro_export] 8 | macro_rules! 
cuda_warning { 9 | ($cuda_op:expr) => {{ 10 | let e = $cuda_op; 11 | if e != cuda_runtime_sys::cudaError::cudaSuccess { 12 | log::error!( 13 | "CUDA runtime failed with {:?} at {}:{}.", 14 | e, 15 | file!(), 16 | line!() 17 | ) 18 | } 19 | }}; 20 | ($cuda_op:expr,$ctx:expr) => {{ 21 | let e = $cuda_op; 22 | if e != cuda_runtime_sys::cudaError::cudaSuccess { 23 | log::error!( 24 | "CUDA runtime failed with {:?} at {}:{}. Context={}", 25 | e, 26 | file!(), 27 | line!(), 28 | $ctx 29 | ) 30 | } 31 | }}; 32 | } 33 | 34 | #[macro_export] 35 | macro_rules! cu_warning { 36 | ($cu_op:expr) => {{ 37 | let e = $cu_op; 38 | if e != cuda_driver_sys::CUresult::CUDA_SUCCESS { 39 | log::error!( 40 | "CUDA driver failed with {:?} at {}:{}.", 41 | e, 42 | file!(), 43 | line!() 44 | ) 45 | } 46 | }}; 47 | ($cu_op:expr,$ctx:expr) => {{ 48 | let e = $cu_op; 49 | if e != cuda_driver_sys: CUresult::CUDA_SUCCESS { 50 | log::error!( 51 | "CUDA driver failed with {:?} at {}:{}. Context={}", 52 | e, 53 | file!(), 54 | line!(), 55 | $ctx 56 | ) 57 | } 58 | }}; 59 | } 60 | 61 | thread_local!(pub static CU_INIT: () = (|| unsafe { 62 | cu_warning!(cuda_driver_sys::cuInit(0)); 63 | () 64 | })()); 65 | -------------------------------------------------------------------------------- /src/mccs/src/utils/pool.rs: -------------------------------------------------------------------------------- 1 | pub struct WorkPool { 2 | pool: Vec, 3 | } 4 | 5 | impl WorkPool { 6 | pub fn new() -> Self { 7 | WorkPool { pool: Vec::new() } 8 | } 9 | } 10 | 11 | impl Default for WorkPool { 12 | fn default() -> Self { 13 | Self::new() 14 | } 15 | } 16 | 17 | impl WorkPool { 18 | pub fn progress(&mut self, mut f: F) 19 | where 20 | F: FnMut(&mut T) -> bool, 21 | { 22 | let mut idx = 0; 23 | while idx < self.pool.len() { 24 | let finished = f(&mut self.pool[idx]); 25 | if finished { 26 | self.pool.swap_remove(idx); 27 | } else { 28 | idx += 1; 29 | } 30 | } 31 | } 32 | 33 | pub fn enqueue(&mut self, elem: T) { 34 | 
self.pool.push(elem); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/mccs_examples/allgather_bench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "allgather_bench" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" } 10 | libmccs = { path = "../../libmccs" } 11 | 12 | structopt = "0.3.26" -------------------------------------------------------------------------------- /src/mccs_examples/allgather_proto/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "allgather_proto" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" } 10 | libmccs = { path = "../../libmccs" } 11 | 12 | structopt = "0.3.26" -------------------------------------------------------------------------------- /src/mccs_examples/allreduce_bench/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "allreduce_bench" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | chrono = "0.4.33" 10 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" } 11 | libmccs = { path = "../../libmccs" } 12 | 13 | structopt = "0.3.26" 14 | -------------------------------------------------------------------------------- /src/mccs_examples/allreduce_proto/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = 
"allreduce_proto" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" } 10 | libmccs = { path = "../../libmccs" } 11 | 12 | structopt = "0.3.26" -------------------------------------------------------------------------------- /src/mccs_examples/cuda_hello/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cuda_hello" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" } 10 | libmccs = { path = "../../libmccs" } -------------------------------------------------------------------------------- /src/mccs_examples/cuda_hello/src/main.rs: -------------------------------------------------------------------------------- 1 | use cuda_runtime_sys::cudaMemcpy; 2 | use cuda_runtime_sys::{cudaError, cudaMemcpyKind}; 3 | 4 | use libmccs::memory::cuda_malloc; 5 | 6 | const BUFFER_SIZE: usize = 1024 * 1024; 7 | 8 | fn main() { 9 | let dev_ptr = cuda_malloc(0, BUFFER_SIZE).unwrap(); 10 | let buf = vec![42i32; BUFFER_SIZE / std::mem::size_of::()]; 11 | let err = unsafe { 12 | cudaMemcpy( 13 | dev_ptr.ptr, 14 | buf.as_ptr() as *const _, 15 | BUFFER_SIZE, 16 | cudaMemcpyKind::cudaMemcpyHostToDevice, 17 | ) 18 | }; 19 | if err != cudaError::cudaSuccess { 20 | panic!("cudaMemcpy failed") 21 | } 22 | 23 | println!("cudaMemcpy success"); 24 | } 25 | -------------------------------------------------------------------------------- /src/mccs_examples/ring_config/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ring_config" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and 
their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | ipc = { path = "../../ipc", features = ["mccs"] } 10 | 11 | bincode = "1.3.3" 12 | serde = "1.0.149" 13 | byteorder = "1.5.0" 14 | structopt = "0.3.26" 15 | toml = "0.8.8" -------------------------------------------------------------------------------- /src/mccs_examples/ring_config/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream}; 3 | use std::path::{Path, PathBuf}; 4 | 5 | use byteorder::{ByteOrder, LittleEndian}; 6 | use serde::{Deserialize, Serialize}; 7 | use structopt::StructOpt; 8 | 9 | use ipc::mccs::reconfig::{CommPatternReconfig, ExchangeReconfigCommand}; 10 | 11 | #[derive(Debug, Clone, Serialize, Deserialize)] 12 | struct Config { 13 | mccs_addrs: Vec, 14 | mccs_port: u16, 15 | comm_patterns_reconfig: Vec, 16 | } 17 | 18 | impl Config { 19 | fn from_path>(path: P) -> Config { 20 | let content = std::fs::read_to_string(path).unwrap(); 21 | let config = toml::from_str(&content).unwrap(); 22 | config 23 | } 24 | } 25 | 26 | #[derive(Debug, Clone, StructOpt)] 27 | #[structopt(name = "Comm pattern configurator")] 28 | struct Opts { 29 | // Path to toml traffic trace 30 | #[structopt(long, short = "c")] 31 | config: PathBuf, 32 | } 33 | 34 | const EXCHANGE_MAGIC: u64 = 0x424ab9f2fc4b9d6e; 35 | 36 | fn main() { 37 | let opts = Opts::from_args(); 38 | let config = Config::from_path(opts.config); 39 | 40 | let pattern_config = config.comm_patterns_reconfig.clone(); 41 | let command = ExchangeReconfigCommand::CommPatternReconfig(pattern_config); 42 | let encoded = bincode::serialize(&command).unwrap(); 43 | 44 | for addr in config.mccs_addrs.iter() { 45 | let addr = SocketAddr::new(*addr, config.mccs_port); 46 | let mut buf = [0u8; 5]; 47 | buf[0] = 1; 48 | LittleEndian::write_u32(&mut buf[1..], encoded.len() as u32); 49 | let mut 
magic_buf = [0u8; std::mem::size_of::()]; 50 | LittleEndian::write_u64(&mut magic_buf, EXCHANGE_MAGIC); 51 | let mut stream = TcpStream::connect(addr).unwrap(); 52 | stream.set_nodelay(true).unwrap(); 53 | stream.write_all(&magic_buf).unwrap(); 54 | stream.write_all(&buf).unwrap(); 55 | stream.write_all(encoded.as_slice()).unwrap(); 56 | 57 | println!("Sent command to {}", addr); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/mccs_examples/traffic_gen/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "traffic_gen" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | cuda-runtime-sys = { path = "../../cuda-sys/cuda-runtime-sys" } 10 | libmccs = { path = "../../libmccs" } 11 | csv = "1.3.0" 12 | 13 | structopt = "0.3.26" 14 | serde = { version = "1.0.195", features = ["derive"] } 15 | toml = "0.8.8" 16 | spin_sleep = "1.2.0" 17 | chrono = "0.4.33" 18 | -------------------------------------------------------------------------------- /src/mccs_tests/rdma_transport/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rdma_transport" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | mccs = { path = "../../mccs" } -------------------------------------------------------------------------------- /src/mccs_tests/rdma_transport/examples/client.rs: -------------------------------------------------------------------------------- 1 | fn main() {} 2 | -------------------------------------------------------------------------------- /src/mccs_tests/rdma_transport/examples/server.rs: 
-------------------------------------------------------------------------------- 1 | fn main() {} 2 | -------------------------------------------------------------------------------- /src/mccs_tests/rdma_transport/src/lib.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/qos-service/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "qos-service" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | intervallum = "1.4.1" 10 | bincode = "1.3.3" 11 | serde = "1.0.149" -------------------------------------------------------------------------------- /src/qos-service/src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use interval::interval_set::ToIntervalSet; 4 | use interval::IntervalSet; 5 | use serde::{Deserialize, Serialize}; 6 | 7 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)] 8 | #[repr(transparent)] 9 | #[serde(transparent)] 10 | pub struct CommunicatorId(pub u32); 11 | 12 | #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] 13 | pub enum QosMode { 14 | Allow, 15 | Deny, 16 | } 17 | 18 | #[derive(Clone, Debug, Serialize, Deserialize)] 19 | pub struct QosIntervalDef { 20 | // start and end timestamps in microseconds 21 | pub intervals: Vec<(u64, u64)>, 22 | pub mode: QosMode, 23 | pub enforce_step: Option, 24 | } 25 | 26 | #[derive(Clone, Debug, Serialize, Deserialize)] 27 | pub struct QosScheduleDef { 28 | #[serde(deserialize_with = "deserialize_schedule")] 29 | pub schedule: HashMap, 30 | pub epoch_microsecs: u64, 31 | } 32 | 33 | fn deserialize_schedule<'de, D>( 34 | deserializer: D, 35 | ) -> Result, D::Error> 36 | 
where 37 | D: serde::Deserializer<'de>, 38 | { 39 | let map: HashMap = Deserialize::deserialize(deserializer)?; 40 | map.into_iter() 41 | .map(|(k, v)| { 42 | k.parse::() 43 | .map_err(serde::de::Error::custom) 44 | .map(|key| (CommunicatorId(key), v)) 45 | }) 46 | .collect() 47 | } 48 | 49 | #[derive(Clone, Debug)] 50 | pub struct QosInterval { 51 | pub intervals: IntervalSet, 52 | pub mode: QosMode, 53 | pub enforce_step: Option, 54 | } 55 | 56 | impl From for QosInterval { 57 | fn from(def: QosIntervalDef) -> Self { 58 | let intervals = def.intervals.to_interval_set(); 59 | QosInterval { 60 | intervals, 61 | mode: def.mode, 62 | enforce_step: def.enforce_step, 63 | } 64 | } 65 | } 66 | 67 | #[derive(Clone, Debug)] 68 | pub struct QosSchedule { 69 | pub schedule: HashMap, 70 | pub epoch_microsecs: u64, 71 | } 72 | 73 | impl From for QosSchedule { 74 | fn from(def: QosScheduleDef) -> Self { 75 | let schedule = def 76 | .schedule 77 | .into_iter() 78 | .map(|(k, v)| (k, v.into())) 79 | .collect(); 80 | QosSchedule { 81 | schedule, 82 | epoch_microsecs: def.epoch_microsecs, 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /workloads/reconfig_gpt.toml: -------------------------------------------------------------------------------- 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3 2 | communicator_id = 600 3 | cuda_devices = [0, 1, 0, 1, 0, 1, 0, 1] 4 | 5 | [[traces]] 6 | size = 83886080 7 | type = "all_reduce" 8 | compute_interval = 6000 9 | -------------------------------------------------------------------------------- /workloads/setup-1_gpt_0.toml: -------------------------------------------------------------------------------- 1 | # VGG on 4 GPUs (2 on H1, 2 on H3) 2 | communicator_id = 100 3 | cuda_devices = [0, 1, 0, 1] 4 | 5 | [[traces]] 6 | # 678M all-reduce 7 | size = 711839744 8 | type = "all_reduce" 9 | # 1750ms compute 10 | compute_interval = 930000 
-------------------------------------------------------------------------------- /workloads/setup-1_gpt_1.toml: -------------------------------------------------------------------------------- 1 | # VGG on 4 GPUs (2 on H1, 2 on H3) 2 | communicator_id = 101 3 | cuda_devices = [0, 1, 0, 1] 4 | 5 | [[traces]] 6 | # 678M all-reduce 7 | size = 711839744 8 | type = "all_reduce" 9 | # 1750ms compute 10 | compute_interval = 930000 -------------------------------------------------------------------------------- /workloads/setup-1_resnet_0.toml: -------------------------------------------------------------------------------- 1 | # ResNet on 4 GPUs (2 on H0, 2 on H2) 2 | communicator_id = 100 3 | cuda_devices = [0, 1, 0, 1] 4 | 5 | [[traces]] 6 | # 170M all-reduce 7 | size = 178196640 8 | type = "all_reduce" 9 | # 230ms compute 10 | compute_interval = 230000 -------------------------------------------------------------------------------- /workloads/setup-1_resnet_1.toml: -------------------------------------------------------------------------------- 1 | # ResNet on 4 GPUs (2 on H0, 2 on H2) 2 | communicator_id = 101 3 | cuda_devices = [0, 1, 0, 1] 4 | 5 | [[traces]] 6 | # 170M all-reduce 7 | size = 178196640 8 | type = "all_reduce" 9 | # 230ms compute 10 | compute_interval = 230000 -------------------------------------------------------------------------------- /workloads/setup-1_vgg_0.toml: -------------------------------------------------------------------------------- 1 | # VGG on 2 GPUs (1 on H1, 1 on H3) 2 | communicator_id = 100 3 | cuda_devices = [0, 1, 0, 1] 4 | 5 | [[traces]] 6 | # 548M all-reduce 7 | size = 574668960 8 | type = "all_reduce" 9 | # 310ms compute 10 | # compute_interval = 310000 11 | compute_interval = 155000 12 | -------------------------------------------------------------------------------- /workloads/setup-1_vgg_1.toml: -------------------------------------------------------------------------------- 1 | # VGG on 2 GPUs (1 on H1, 1 on H3) 2 | 
communicator_id = 101 3 | cuda_devices = [0, 1, 0, 1] 4 | 5 | [[traces]] 6 | # 548M all-reduce 7 | size = 574668960 8 | type = "all_reduce" 9 | # 310ms compute 10 | # compute_interval = 310000 11 | compute_interval = 155000 12 | -------------------------------------------------------------------------------- /workloads/setup-2_gpt_1.toml: -------------------------------------------------------------------------------- 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3 2 | communicator_id = 201 3 | cuda_devices = [0, 0] 4 | 5 | [[traces]] 6 | size = 83886080 7 | type = "all_reduce" 8 | compute_interval = 6000 9 | -------------------------------------------------------------------------------- /workloads/setup-2_gpt_2.toml: -------------------------------------------------------------------------------- 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3 2 | communicator_id = 202 3 | cuda_devices = [0, 0] 4 | 5 | [[traces]] 6 | size = 83886080 7 | type = "all_reduce" 8 | compute_interval = 6000 9 | -------------------------------------------------------------------------------- /workloads/setup-2_resnet.toml: -------------------------------------------------------------------------------- 1 | # ResNet on 2 GPUs (1 on H0, 1 on H2) 2 | communicator_id = 203 # NOTE(review): must be an integer like every other workload; 203 chosen as unused within setup-2 - confirm against the matching eval/launch config 3 | cuda_devices = [0, 0] 4 | 5 | [[traces]] 6 | # 170M all-reduce 7 | size = 178196640 8 | type = "all_reduce" 9 | # 230ms compute 10 | compute_interval = 230000 -------------------------------------------------------------------------------- /workloads/setup-2_vgg.toml: -------------------------------------------------------------------------------- 1 | # VGG on 2 GPUs (1 on H1, 1 on H3) 2 | communicator_id = 200 3 | cuda_devices = [1, 1, 1, 1] 4 | 5 | [[traces]] 6 | # 548M all-reduce 7 | size = 574668960 8 | type = "all_reduce" 9 | # 310ms compute 10 | compute_interval = 160000 11 | -------------------------------------------------------------------------------- /workloads/setup-3_gpt_1.toml: 
-------------------------------------------------------------------------------- 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3 2 | communicator_id = 300 3 | cuda_devices = [0, 0, 0, 0] 4 | 5 | [[traces]] 6 | # 678M all-reduce 7 | size = 711839744 8 | type = "all_reduce" 9 | # 1750ms compute 10 | compute_interval = 930000 -------------------------------------------------------------------------------- /workloads/setup-3_gpt_2.toml: -------------------------------------------------------------------------------- 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3 2 | communicator_id = 301 3 | cuda_devices = [1, 1, 1, 1] 4 | 5 | [[traces]] 6 | # 678M all-reduce 7 | size = 711839744 8 | type = "all_reduce" 9 | # 1750ms compute 10 | compute_interval = 930000 -------------------------------------------------------------------------------- /workloads/setup-4_gpt_1.toml: -------------------------------------------------------------------------------- 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3 2 | communicator_id = 201 3 | cuda_devices = [0, 0] 4 | 5 | [[traces]] 6 | size = 83886080 7 | type = "all_reduce" 8 | compute_interval = 6000 9 | -------------------------------------------------------------------------------- /workloads/setup-4_gpt_2.toml: -------------------------------------------------------------------------------- 1 | # GPT-350M 4 GPUs on H0, H1, H2 and H3 2 | communicator_id = 202 3 | cuda_devices = [1, 1] 4 | 5 | [[traces]] 6 | size = 83886080 7 | type = "all_reduce" 8 | compute_interval = 6000 9 | -------------------------------------------------------------------------------- /workloads/setup-4_resnet_0.toml: -------------------------------------------------------------------------------- 1 | # ResNet on 4 GPUs (2 on H0, 2 on H2) 2 | communicator_id = 202 3 | cuda_devices = [1, 1] 4 | 5 | [[traces]] 6 | # 170M all-reduce 7 | size = 178196640 8 | type = "all_reduce" 9 | # 230ms compute 10 | compute_interval = 120000 11 | 
-------------------------------------------------------------------------------- /workloads/setup-4_vgg.toml: -------------------------------------------------------------------------------- 1 | # VGG on 2 GPUs (1 on H1, 1 on H3) 2 | communicator_id = 200 3 | cuda_devices = [0, 1, 0, 1] 4 | 5 | [[traces]] 6 | # 548M all-reduce 7 | size = 574668960 8 | type = "all_reduce" 9 | # 310ms compute 10 | compute_interval = 160000 11 | --------------------------------------------------------------------------------