├── .gitignore ├── .gitmodules ├── LICENSE ├── Makefile ├── README.md ├── benchmark ├── README.md ├── benchmark_all2all.py ├── benchmark_plan.py ├── plot_results.py ├── plots │ ├── dgx1_all2all.pdf │ ├── dgx1_quad_all2all.pdf │ ├── dgx1_quad_scatter.pdf │ ├── dgx1_scatter.pdf │ ├── p100_quad_all2all.pdf │ └── p100_quad_scatter.pdf └── results │ ├── dgx1 │ ├── all2all │ │ ├── direct.csv │ │ ├── opt.csv │ │ ├── opt_1chunk.csv │ │ ├── rings.csv │ │ └── symm.csv │ ├── all2all_async │ │ ├── direct.csv │ │ ├── opt.csv │ │ ├── opt_1chunk.csv │ │ ├── rings.csv │ │ └── symm.csv │ ├── bisection.csv │ ├── gather │ │ ├── direct.csv │ │ ├── opt.csv │ │ ├── opt_1chunk.csv │ │ ├── rings.csv │ │ └── symm.csv │ ├── nvlink_bandwidth.csv │ ├── nvlink_latency.csv │ └── scatter │ │ ├── direct.csv │ │ ├── opt.csv │ │ ├── opt_1chunk.csv │ │ ├── rings.csv │ │ └── symm.csv │ ├── dgx1_quad │ ├── all2all │ │ ├── direct_p100.csv │ │ ├── opt_p100.csv │ │ └── rings_p100.csv │ ├── all2all_async │ │ ├── direct_p100.csv │ │ ├── opt.csv │ │ ├── opt_p100.csv │ │ └── rings_p100.csv │ ├── gather │ │ ├── direct_p100.csv │ │ ├── opt.csv │ │ ├── opt_p100.csv │ │ └── rings_p100.csv │ └── scatter │ │ ├── direct_p100.csv │ │ ├── opt.csv │ │ ├── opt_p100.csv │ │ └── rings_p100.csv │ ├── old │ ├── p100_quad │ │ ├── all2all │ │ │ └── rings.csv │ │ ├── all2all_async │ │ │ └── rings.csv │ │ ├── gather │ │ │ └── rings.csv │ │ └── scatter │ │ │ └── rings.csv │ └── v100_quad │ │ ├── all2all │ │ ├── direct.csv │ │ ├── opt.csv │ │ └── rings.csv │ │ ├── all2all_async │ │ ├── direct.csv │ │ ├── opt.csv │ │ └── rings.csv │ │ ├── gather │ │ ├── direct.csv │ │ ├── opt.csv │ │ └── rings.csv │ │ └── scatter │ │ ├── direct.csv │ │ ├── opt.csv │ │ └── rings.csv │ ├── p100_quad │ ├── all2all │ │ ├── direct.csv │ │ ├── opt.csv │ │ └── rings.csv │ ├── all2all_async │ │ ├── direct.csv │ │ ├── opt.csv │ │ └── rings.csv │ ├── gather │ │ ├── direct.csv │ │ ├── opt.csv │ │ └── rings.csv │ └── scatter │ │ ├── direct.csv │ │ ├── 
opt.csv │ │ └── rings.csv │ └── throughput.txt ├── execute.cu ├── executor.cuh ├── include ├── gossip.cuh ├── gossip │ ├── all_to_all.cuh │ ├── all_to_all_async.cuh │ ├── all_to_all_plan.hpp │ ├── broadcast.cuh │ ├── broadcast_plan.hpp │ ├── common.cuh │ ├── config.h │ ├── context.cuh │ ├── error_checking.hpp │ ├── gather.cuh │ ├── gather_plan.hpp │ ├── memory_manager.cuh │ ├── multisplit.cuh │ ├── point_to_point.cuh │ ├── scatter.cuh │ ├── scatter_plan.hpp │ ├── transfer_plan.hpp │ └── utils.cuh ├── json.hpp ├── plan_parser.cpp └── plan_parser.hpp ├── plans ├── 8v100_switched │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx1_direct │ ├── all2all_plan.json │ ├── broadcast_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx1_opt │ ├── all2all_plan.json │ ├── broadcast_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx1_opt_1chunk │ ├── all2all_plan.json │ ├── broadcast_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx1_quad_opt │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx1_quad_opt2 │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx1_rings │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx1_symm │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx2_direct │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── dgx2_opt │ └── all2all_plan.json ├── p100_quad_direct │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── p100_quad_opt │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── p100_quad_rings │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── v100_quad_opt │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json └── v100_quad_rings │ ├── all2all_plan.json │ ├── gather_plan.json │ └── scatter_plan.json ├── scripts ├── dgx1_topology.txt ├── plan_from_rings.py ├── 
plan_from_topology_asynch.py ├── plan_from_topology_synch.py └── topology_parser.py └── simulate.cu /.gitignore: -------------------------------------------------------------------------------- 1 | /build* 2 | *.exe 3 | *.o 4 | execute 5 | simulate -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "include/clipp"] 2 | path = include/clipp 3 | url = https://github.com/muellan/clipp.git 4 | branch = master 5 | [submodule "include/hpc_helpers"] 6 | path = include/hpc_helpers 7 | url = https://gitlab.rlp.net/pararch/hpc_helpers 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Parallel and Distributed Architectures 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | NVCCGENCODE = \ 3 | -gencode arch=compute_60,code=sm_60 \ 4 | -gencode arch=compute_70,code=sm_70 5 | 6 | NVCCFLAGS = $(NVCCGENCODE) -O3 -std=c++11 --expt-extended-lambda -Xcompiler="-fopenmp" -Wreorder -lineinfo 7 | 8 | HEADERS = include/gossip.cuh \ 9 | include/gossip/all_to_all_async.cuh \ 10 | include/gossip/all_to_all_plan.hpp \ 11 | include/gossip/all_to_all.cuh \ 12 | include/gossip/broadcast_plan.hpp \ 13 | include/gossip/broadcast.cuh \ 14 | include/gossip/common.cuh \ 15 | include/gossip/context.cuh \ 16 | include/gossip/error_checking.hpp \ 17 | include/gossip/gather_plan.hpp \ 18 | include/gossip/gather.cuh \ 19 | include/gossip/memory_manager.cuh \ 20 | include/gossip/multisplit.cuh \ 21 | include/gossip/point_to_point.cuh \ 22 | include/gossip/scatter_plan.hpp \ 23 | include/gossip/scatter.cuh \ 24 | include/gossip/transfer_plan.hpp 25 | 26 | BUILD_DIR = build 27 | 28 | 29 | .PHONY: all clean 30 | 31 | all: execute simulate 32 | 33 | 34 | execute: $(BUILD_DIR) $(BUILD_DIR)/plan_parser.o $(BUILD_DIR)/execute.o 35 | $(NVCC) $(NVCCFLAGS) $(BUILD_DIR)/plan_parser.o $(BUILD_DIR)/execute.o -o execute 36 | 37 | $(BUILD_DIR)/execute.o: execute.cu executor.cuh $(HEADERS) include/plan_parser.hpp 38 | $(NVCC) $(NVCCFLAGS) -c execute.cu -o $(BUILD_DIR)/execute.o 39 | 40 | 41 | simulate: $(BUILD_DIR) $(BUILD_DIR)/plan_parser.o $(BUILD_DIR)/simulate.o 42 | $(NVCC) $(NVCCFLAGS) $(BUILD_DIR)/plan_parser.o $(BUILD_DIR)/simulate.o -o simulate 43 | 44 | $(BUILD_DIR)/simulate.o: simulate.cu executor.cuh 
$(HEADERS) include/plan_parser.hpp 45 | $(NVCC) $(NVCCFLAGS) -c simulate.cu -o $(BUILD_DIR)/simulate.o 46 | 47 | 48 | $(BUILD_DIR): 49 | mkdir $(BUILD_DIR) 50 | 51 | $(BUILD_DIR)/plan_parser.o: include/plan_parser.cpp include/plan_parser.hpp 52 | $(NVCC) $(NVCCFLAGS) -c include/plan_parser.cpp -o $(BUILD_DIR)/plan_parser.o 53 | 54 | 55 | clean: 56 | rm -rf $(BUILD_DIR) 57 | rm -rf execute 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `gossip`: Efficient Communication Primitives for Multi-GPU Systems 2 | 3 | Gossip supports scatter, gather and all-to-all communication. To execute one of the communication primitives a transfer plan is needed. Use the provided [scripts](scripts) to generate optimized plans for your specific NVLink topology. The [plans directory](plans) contains optimized plans for typical 4 GPU configurations ([P100](plans/p100_quad_opt/) and [V100](plans/v100_quad_opt/)) as well as 8 GPU [DGX-1 Volta](plans/dgx1_opt). If no transfer plan is provided gossip will fall back to the default strategy using direct transfers between GPUs. 4 | 5 | Gossip was presented at [ICPP '19](https://dl.acm.org/citation.cfm?id=3337889). 6 | 7 | 8 | ## Using gossip 9 | 10 | To use gossip clone this repository and check out the submodule *hpc_helpers* by calling `git submodule update --init include/hpc_helpers`. Include the header [gossip.cuh](include/gossip.cuh) in your project which provides all communication primitives. To parse transfer plans make use of the [plan parser](include/plan_parser.hpp) which can be compiled as a separate unit like in the example [Makefile](Makefile). 11 | 12 | 13 | ## Examples 14 | 15 | The example [execute.cu](execute.cu) executes gossip's communication primitives on uniformly distributed random numbers. The data is first split into a number of chunks corresponding to the number of GPUs (multisplit). 
The chunk sizes are displayed as a partition table (row=source GPU, column=target GPU). Then the data is transferred between the GPUs according to the provided transfer plan. At the end it validates whether all data reached the correct destinations. 16 | 17 | The example [simulate.cu](simulate.cu) allows running the multi-GPU example above simulated on a single GPU. 18 | 19 | ### Build example 20 | 21 | Compile the example using the provided [Makefile](Makefile) by calling `git submodule update --init && make`. 22 | 23 | Requirements: 24 | 25 | - CUDA >= 9.2 26 | - GNU g++ >= 5.5 compatible with your CUDA version 27 | - Python >= 3.0 including 28 | - Matplotlib 29 | - NumPy 30 | 31 | ### Run example 32 | 33 | ```bash 34 | ./execute (all2all|all2all_async) <plan> [--size <size>] [--memory-factor <factor>] 35 | 36 | ./execute scatter_gather <plan> <plan> [--size <size>] [--memory-factor <factor>] 37 | ``` 38 | 39 | Use `./simulate` instead of `./execute` if you want to simulate the example on a single GPU. 40 | 41 | Mandatory: 42 | 43 | - Choose all2all (double buffered), all2all_async or scatter_gather mode 44 | - Provide path(s) to transfer plan(s) (one for all2all, two for scatter+gather) 45 | 46 | Optional: 47 | 48 | - Choose data size (2^\<size\> 64-bit elements per GPU) (default: 28) 49 | - Choose memory factor (account for random transfer sizes) (default: 1.5) 50 | 51 | 52 | ## Benchmark 53 | 54 | For benchmark scripts and results see the [benchmark directory](benchmark). 55 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark Suite 2 | 3 | ## ```benchmark_plan.py``` 4 | ### Benchmark a Transfer Plan 5 | 6 | #### Prerequisites 7 | Compile the project using the provided Makefile by simply calling ```git submodule update --init && make``` in the base directory. 
8 | 9 | You will need: 10 | 11 | - CUDA >= 9.2 12 | - GNU g++ >= 6.3.0 and compatible with your CUDA version 13 | - Python >= 3.0 with 14 | - Matplotlib 15 | - NumPy 16 | #### Usage 17 | ``` 18 | usage: benchmark_plan.py [-h] [--dir DIR] [--repeats REPEATS] 19 | [--maxsize MAXSIZE] [--minsize MINSIZE] 20 | type plan 21 | 22 | positional arguments: 23 | type collective type 24 | (all2all|all2all_async|scatter_gather|broadcast) 25 | plan JSON which specifies the communication strategy 26 | 27 | optional arguments: 28 | -h, --help show this help message and exit 29 | --dir DIR output directory (default: .) 30 | --repeats REPEATS number of repeated executions (default: 5) 31 | --maxsize MAXSIZE maximum overall amount of data to be communicated (bytes 32 | log2) (default: 28) 33 | --minsize MINSIZE minimum overall amount of data to be communicated (bytes 34 | log2) (default: 12) 35 | ``` 36 | 37 | __Note: Only supports ```all2all``` and ```all2all_async``` at the moment. Will be fixed soon...__ 38 | 39 | #### Result 40 | 41 | A ```csv``` file with the following format: 42 | 43 | ``` 44 | <size [bytes]>,<runtime 1 [ms]>,<runtime 2 [ms]>,... 45 | <size [bytes]>,<runtime 1 [ms]>,<runtime 2 [ms]>,... 46 | <size [bytes]>,<runtime 1 [ms]>,<runtime 2 [ms]>,... 47 | ... 48 | ``` 49 | 50 | ------ 51 | ## ```benchmark_all2all.py``` 52 | ### Benchmark and Plot All2All Performance on DGX-1 or DGX-2 Topologies 53 | 54 | #### Prerequisites 55 | Compile the project using the provided Makefile by simply calling ```make``` in the base directory. 56 | #### Usage 57 | This script runs a complete benchmark of all relevant all2all configurations on either DGX-1 or DGX-2 and automatically generates the corresponding visualizations. 58 | 59 | ``` 60 | usage: benchmark_all2all.py [-h] [--dir DIR] [--python PYTHON] arch 61 | 62 | positional arguments: 63 | arch architecture (dgx1|dgx2) 64 | 65 | optional arguments: 66 | -h, --help show this help message and exit 67 | --dir DIR output directory (default: .) 
68 | --python PYTHON python interpreter (default: python3) 69 | ``` 70 | 71 | __Note: Takes approximately 22 minutes on a DGX-1 (Volta).__ 72 | #### Result 73 | Benchmark files (```csv```) and plots (```pdf```) for either DGX-1 or DGX-2 topology. 74 | 75 | ------ 76 | ## ```plot_results.py``` 77 | ### Plot Benchmark Results 78 | 79 | #### Prerequisites 80 | You have already generated one or more ```csv``` files using ```benchmark_plan.py``` or ```benchmark_all2all.py```. 81 | #### Usage 82 | ``` 83 | usage: plot_results.py [-h] [--output OUTPUT] [--reduction REDUCTION] 84 | csvs [csvs ...] 85 | 86 | positional arguments: 87 | csvs csv traces to be included in the plot e.g. 88 | <filepath> or <filepath>%<label> or <filepath>%<label>%<color> 89 | 90 | optional arguments: 91 | -h, --help show this help message and exit 92 | --output OUTPUT, -o OUTPUT 93 | output file (default: show plot) 94 | --reduction REDUCTION, -r REDUCTION 95 | reduction operation of multiple runs 96 | (median|mean|min|max) (default: median) 97 | ``` 98 | 99 | __Note: ```%``` denotes the separator between filepath, label and color e.g. ```mypath/myplan.csv%mytracelabel%red```.__ 100 | 101 | #### Result 102 | A graph generated by ```matplotlib``` showing the collective's achieved throughput (either displayed or saved). 
103 | 104 | -------------------------------------------------------------------------------- /benchmark/benchmark_all2all.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import subprocess 3 | import argparse 4 | import os 5 | import signal 6 | from pathlib import Path 7 | 8 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 9 | parser.add_argument("arch", type=str, help="architecture (dgx1|dgx2)") 10 | parser.add_argument("--dir", type=str, help="output directory", default=".") 11 | parser.add_argument("--python", type=str, help="python interpreter", default="python3") 12 | args = parser.parse_args() 13 | 14 | if args.dir.endswith('/'): 15 | args.dir = args.dir[:-1] 16 | 17 | # args.arch has to be one of the following options 18 | types = ["dgx1", "dgx2"] 19 | assert(args.arch in types) 20 | 21 | base_path = str(Path(__file__).parent.parent.resolve()) 22 | 23 | # relevant plans for DGX-1 topology 24 | dgx1_plans = [ 25 | base_path + "/plans/dgx1_direct/all2all_plan.json", 26 | base_path + "/plans/dgx1_opt/all2all_plan.json", 27 | base_path + "/plans/dgx1_rings/all2all_plan.json" 28 | ] 29 | 30 | # relevant plans for DGX-2 topology 31 | dgx2_plans = [ 32 | base_path + "/plans/dgx2_direct/all2all_plan.json", 33 | base_path + "/plans/dgx2_opt/all2all_plan.json" 34 | ] 35 | 36 | plans = {"dgx1": dgx1_plans, "dgx2": dgx2_plans} 37 | 38 | # mkdir 39 | if not os.path.exists(args.dir): 40 | os.makedirs(args.dir) 41 | 42 | # make sure subprocesses return at SIGINT 43 | def signal_handler(signal, frame): 44 | sys.exit(0) 45 | signal.signal(signal.SIGINT, signal_handler) 46 | 47 | for i, plan in enumerate(plans[args.arch]): 48 | print("PROGRESS: plan " + str(i+1) + "/" + str(len(plans[args.arch]))) 49 | 50 | plan_label = plan.split('/')[-2] 51 | 52 | print("\tall2all") 53 | p1 = subprocess.Popen([args.python, base_path + "/benchmark/benchmark_plan.py", "all2all", plan, "-o", 
args.dir + "/" + plan_label + "_all2all.csv"]).wait() 54 | print("\tall2all_async") 55 | p2 = subprocess.Popen([args.python, base_path + "/benchmark/benchmark_plan.py", "all2all_async", plan, "-o", args.dir + "/" + plan_label + "_all2all_async.csv"]).wait() 56 | 57 | if p1 or p2: 58 | print("ERROR: subprocess terminated with non-zero exit code") 59 | sys.exit(1) 60 | 61 | 62 | subprocess.Popen([args.python, base_path + "/benchmark/plot_results.py", "-o", args.dir + "/" + args.arch + "_all2all_benchmark.pdf", args.dir + "/*"]).wait() -------------------------------------------------------------------------------- /benchmark/benchmark_plan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import subprocess 3 | import argparse 4 | import os 5 | from pathlib import Path 6 | 7 | def absolute_path(path): 8 | return str(Path(path).resolve()) 9 | 10 | # check if output shows no errors 11 | def valid(output): 12 | for line in output.decode().split('\n'): 13 | if 'error' in line.lower(): 14 | return False 15 | return True 16 | 17 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 18 | parser.add_argument("type", type=str, help="collective type (all2all|all2all_async|scatter_gather|broadcast)") 19 | parser.add_argument("plan", type=str, nargs='+', help="JSON which specifies the communication strategy\n\ 20 | scatter_gather requires one plan for each collective") 21 | parser.add_argument("--output", "-o", type=str, help="output file", default="benchmark.csv") 22 | parser.add_argument("--repeats", "-r", type=int, help="number of repeated executions", default=3) 23 | parser.add_argument("--maxsize", type=int, help="maximum overall amount of data to be communicated (bytes log2)", default=28) 24 | parser.add_argument("--minsize", type=int, help="minimum overall amount of data to be communicated (bytes log2)", default=12) 25 | args = parser.parse_args() 26 | 27 | # args.type has to be one 
of the following options 28 | types = ["all2all", "all2all_async", "scatter_gather", "broadcast"] 29 | assert(args.type in types) 30 | assert(args.output.endswith(".csv")) 31 | if args.type == "scatter_gather": 32 | assert(len(args.plan) >= 2) 33 | 34 | base_path = str(Path(__file__).parent.parent.resolve()) 35 | 36 | # execute collective for a range of data sizes 37 | sizes = range(args.minsize, args.maxsize) # max: dgxv1=28, dgx1v2=29, dgx2=30 38 | 39 | # calls the collective 40 | exe = base_path + "/execute" 41 | # output is a csv file, where each line has the form ,,,... 42 | out_csv = "" 43 | # extract filename to use as output file descriptor 44 | plans = [absolute_path(plan) for plan in args.plan] 45 | 46 | # main loop over data sizes 47 | for i, s in enumerate(sizes): 48 | print("\tPROGRESS: size " + str(i+1) + "/" + str(len(sizes))) 49 | 50 | # secondary loop over repeats 51 | for r in range(args.repeats): 52 | print("\t\tPROGRESS: repeat " + str(r+1) + "/" + str(args.repeats)) 53 | 54 | # execute collective 55 | if args.type == "scatter_gather": 56 | out = subprocess.check_output([exe, args.type, plans[0], plans[1], "--size", str(s)]) 57 | else: 58 | out = subprocess.check_output([exe, args.type, plans[0], "--size", str(s)]) 59 | 60 | # process result and extract runtime 61 | for line in out.decode().split('\n'): 62 | if r == 0: 63 | # add data size [bytes] as first columns to csv 64 | if line.startswith('INFO') and line.endswith("(" + args.type + ")"): 65 | out_csv += str(line.split(' ')[1]) 66 | 67 | # add runtime [ms] as subsequent column(s) to csv 68 | if args.type == "scatter_gather": 69 | # only measure scatter since both operations perform the same 70 | if line.startswith('TIMING') and line.endswith("(" + "scatter" + ")"): 71 | out_csv += "," + str(float(line.split(' ')[1])) 72 | else: 73 | if line.startswith('TIMING') and line.endswith("(" + args.type + ")"): 74 | out_csv += "," + str(float(line.split(' ')[1])) 75 | 76 | # check if output shows 
any errors 77 | if(not valid(out)): 78 | print("ERROR at size=" + str(s) + " repeat=" + str(r+1)) 79 | print(out) 80 | sys.exit(1) 81 | 82 | if(i < len(sizes)-1): 83 | out_csv += '\n' 84 | 85 | # write output to csv 86 | with open(args.output, "w+") as f: 87 | f.write(out_csv) 88 | 89 | print("done!") 90 | -------------------------------------------------------------------------------- /benchmark/plot_results.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import csv 4 | import glob 5 | import argparse 6 | import itertools 7 | import matplotlib 8 | matplotlib.use('Agg') 9 | import matplotlib.pyplot as plt 10 | plt.style.use('seaborn-paper') 11 | 12 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 13 | parser.add_argument("csvs", nargs='+', type=str, help="csv traces to be included in the plot e.g. or or ",) 14 | parser.add_argument("--output", "-o", type=str, help="output file", default="show plot") 15 | parser.add_argument("--reduction", "-r", type=str, help="reduction operation of multiple runs (median|mean|min|max)", default="median") 16 | args = parser.parse_args() 17 | 18 | # args.reduction has to be one of the following options 19 | reductions = {"median": np.median, "mean": np.mean, "min": np.min, "max": np.max} 20 | assert(args.reduction in reductions) 21 | 22 | def handle_args(args): 23 | ret_filenames = [] 24 | ret_labels = [] 25 | ret_colors = [] 26 | 27 | for a in args: 28 | splits = a.split('%') 29 | filenames = [f for f in glob.glob(splits[0]) if f.endswith(".csv")] 30 | labels = [f.split('/')[-1].split(".")[:-1][0] for f in filenames] 31 | colors = [None] * len(filenames) 32 | 33 | if len(filenames) is 1: 34 | if(len(splits) > 1): 35 | labels = [splits[1]] 36 | 37 | if(len(splits) > 2): 38 | colors = [splits[2]] 39 | 40 | ret_filenames.extend(filenames) 41 | ret_labels.extend(labels) 42 | ret_colors.extend(colors) 43 | 44 | return 
zip(ret_filenames, ret_labels, ret_colors) 45 | 46 | # convert csv to trace by reducing multiple runs to a scalars 47 | def csv_to_trace(filename, reduce=np.median): 48 | xs, ys = [], [] 49 | with open(filename, 'r') as file: 50 | csv_file = csv.reader(file, delimiter=',') 51 | 52 | for row in csv_file: 53 | xs.append(int(row[0])) 54 | y = reduce([float(x) for x in row[1:]]) 55 | ys.append(y) 56 | return xs, ys 57 | 58 | def bandwidth(bs, ts): 59 | return [((float(b))/1024**3)/(t/1000.0) for b, t in zip(bs, ts)] 60 | 61 | fig, ax = plt.subplots() 62 | ax.set_xscale("log", basex=2, nonposx='clip') 63 | #ax.set_yscale("log", basey=10, nonposy='clip') 64 | ax.grid() 65 | 66 | config = handle_args(args.csvs) 67 | 68 | for c in config: 69 | xs, ys = csv_to_trace(c[0], reductions[args.reduction]) 70 | bw = bandwidth(xs, ys) 71 | print("trace: " + str(c) + ", peak troughput: " + str(int(np.max(bw))) + " GB/s") 72 | 73 | ax.scatter(xs, bw, c=c[2]) 74 | ax.plot(xs, bw, label=c[1], c=c[2]) 75 | 76 | plt.xlabel("Input Size [Bytes]") 77 | plt.ylabel("Throughput [GB/s]") 78 | 79 | plt.legend() 80 | 81 | # either show plot or save as file 82 | if args.output == "show plot": 83 | plt.show() 84 | print("plot displayed") 85 | else: 86 | plt.savefig(args.output, transparent=True, bbox_inches='tight') 87 | print("plot saved as " + args.output) 88 | -------------------------------------------------------------------------------- /benchmark/plots/dgx1_all2all.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Funatiq/gossip/dca28648a77eb58507d3b5c7ccbbb14005113e55/benchmark/plots/dgx1_all2all.pdf -------------------------------------------------------------------------------- /benchmark/plots/dgx1_quad_all2all.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Funatiq/gossip/dca28648a77eb58507d3b5c7ccbbb14005113e55/benchmark/plots/dgx1_quad_all2all.pdf -------------------------------------------------------------------------------- /benchmark/plots/dgx1_quad_scatter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Funatiq/gossip/dca28648a77eb58507d3b5c7ccbbb14005113e55/benchmark/plots/dgx1_quad_scatter.pdf -------------------------------------------------------------------------------- /benchmark/plots/dgx1_scatter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Funatiq/gossip/dca28648a77eb58507d3b5c7ccbbb14005113e55/benchmark/plots/dgx1_scatter.pdf -------------------------------------------------------------------------------- /benchmark/plots/p100_quad_all2all.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Funatiq/gossip/dca28648a77eb58507d3b5c7ccbbb14005113e55/benchmark/plots/p100_quad_all2all.pdf -------------------------------------------------------------------------------- /benchmark/plots/p100_quad_scatter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Funatiq/gossip/dca28648a77eb58507d3b5c7ccbbb14005113e55/benchmark/plots/p100_quad_scatter.pdf -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all/direct.csv: -------------------------------------------------------------------------------- 1 | 262144,0.915456,0.999424,0.914432,1.30253,0.883712 2 | 524288,0.878592,0.86528,0.886784,0.881664,0.941056 3 | 1048576,1.16326,1.02502,0.902144,0.676864,0.88064 4 | 2097152,1.01888,0.871424,0.978944,0.888832,0.902144 5 | 4194304,0.888864,0.900096,0.9472,0.88576,0.905216 6 | 8388608,0.792576,1.152,0.89088,1.28205,0.908288 7 | 
16777216,0.913408,0.958464,0.9216,0.883712,1.00864 8 | 33554432,1.05165,0.984064,1.24826,0.982016,0.991232 9 | 67108864,1.59949,1.71622,1.74694,1.63635,1.90976 10 | 134217728,2.80474,2.6665,2.87744,2.65421,2.7136 11 | 268435456,4.85683,4.89165,4.9449,5.10464,5.0729 12 | 536870912,9.49043,9.59181,9.53856,9.67168,9.48531 13 | 1073741824,18.263,19.1119,18.3337,17.6609,18.1719 14 | 2147483648,36.396,35.8318,35.7478,35.8318,36.4124 15 | 4294967296,69.2603,70.9786,71.9544,71.1721,71.9391 16 | 8589934592,139.609,139.768,139.698,143.099,138.984 17 | 17179869184,335.789,347.782,348.554,317.863,324.573 18 | 34359738368,855.176,852.287,854.037,850.333,853.642 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all/opt.csv: -------------------------------------------------------------------------------- 1 | 262144,1.40493,1.39776,1.64352,1.38138,1.4295 2 | 524288,1.35885,1.56672,1.35373,1.66093,1.36704 3 | 1048576,1.35168,1.35373,1.34861,1.57798,1.2759 4 | 2097152,1.39878,1.2544,1.36704,1.64352,1.34758 5 | 4194304,1.57389,1.34656,1.38035,1.42336,1.792 6 | 8388608,1.34451,1.71622,1.23392,1.35782,1.3824 7 | 16777216,1.72954,1.32198,1.6343,1.29638,1.67014 8 | 33554432,1.53293,1.16429,1.16429,1.1264,1.4121 9 | 67108864,1.4848,1.12128,1.24314,1.11309,1.39981 10 | 134217728,1.26464,1.22573,1.22266,1.21549,1.60051 11 | 268435456,1.50016,1.53293,1.47968,1.61382,1.52269 12 | 536870912,2.24973,2.21491,1.98963,2.00294,1.98861 13 | 1073741824,3.10067,3.20614,3.13754,3.05562,3.02182 14 | 2147483648,5.20704,5.19066,5.54496,5.12922,5.67091 15 | 4294967296,9.35424,9.29075,9.20269,9.38701,9.28154 16 | 8589934592,17.6476,17.5913,17.4643,17.5708,17.5411 17 | 17179869184,33.9712,33.9395,33.9384,33.964,33.921 18 | 34359738368,67.0843,67.1826,66.9747,67.0331,66.9716 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all/opt_1chunk.csv: 
-------------------------------------------------------------------------------- 1 | 262144,1.09158,1.16634,1.00045,1.02093,1.01376 2 | 524288,1.15507,1.15302,0.791552,1.03219,1.05779 3 | 1048576,1.06803,1.04448,0.856064,1.10694,1.20218 4 | 2097152,1.03731,1.08032,1.06291,1.03629,0.920576 5 | 4194304,1.08646,1.03219,0.73728,1.05574,1.19194 6 | 8388608,1.23187,1.14995,1.24109,1.05472,1.08954 7 | 16777216,0.95232,0.971776,0.961536,0.984064,0.9216 8 | 33554432,0.894976,0.953344,0.856064,0.8192,0.902144 9 | 67108864,1.00762,0.932864,0.874496,1.32915,0.89088 10 | 134217728,1.20422,3.20205,1.04038,0.963584,1.09363 11 | 268435456,1.21037,1.20115,1.22982,1.17146,1.5913 12 | 536870912,1.70701,1.72134,1.71315,1.71827,1.85242 13 | 1073741824,2.81498,2.90099,2.84774,2.86106,2.86003 14 | 2147483648,5.0473,4.95104,5.00429,4.97664,5.02886 15 | 4294967296,9.58157,9.4167,9.44947,9.55187,9.44435 16 | 8589934592,18.2221,18.2794,18.2057,18.2313,18.2487 17 | 17179869184,35.9137,35.9219,35.969,35.9117,35.9895 18 | 34359738368,71.166,71.1598,71.2847,71.1823,71.166 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all/rings.csv: -------------------------------------------------------------------------------- 1 | 262144,9.33786,9.35834,9.20269,9.0409,9.06035 2 | 524288,10.0823,9.37574,10.6004,8.9856,9.05626 3 | 1048576,9.15456,9.74746,9.28256,9.21498,9.87648 4 | 2097152,9.07162,9.12179,8.95693,8.81152,9.81299 5 | 4194304,9.00915,9.12691,9.68397,9.68192,9.09107 6 | 8388608,9.3655,9.33683,9.29382,9.99526,9.84371 7 | 16777216,10.1253,9.02349,8.78694,9.7536,10.455 8 | 33554432,8.91392,9.23955,9.2457,9.06445,9.29382 9 | 67108864,9.29587,9.18733,9.07776,9.11565,8.78899 10 | 134217728,9.6256,9.03475,9.07674,9.25798,9.1689 11 | 268435456,12.0289,12.2788,12.3658,12.1416,12.0955 12 | 536870912,13.2035,13.0263,13.2168,13.1041,13.226 13 | 1073741824,14.4773,14.165,14.1722,14.4005,14.5224 14 | 2147483648,16.3901,16.7434,16.5878,16.9062,16.0584 
15 | 4294967296,21.1036,21.0555,21.8542,21.8583,21.2316 16 | 8589934592,34.9655,35.1181,36.2977,35.0433,35.1857 17 | 17179869184,64.9646,64.8468,64.8151,64.7762,64.9646 18 | 34359738368,124.867,124.874,124.879,124.874,124.812 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all/symm.csv: -------------------------------------------------------------------------------- 1 | 262144,0.991232,0.77312,1.0537,1.04755,1.09978 2 | 524288,1.14688,1.10694,1.0199,1.04858,1.15098 3 | 1048576,1.05062,1.06496,1.04653,1.07827,1.09978 4 | 2097152,1.09056,1.09978,1.03322,1.03629,1.06394 5 | 4194304,1.16531,1.12845,1.05882,1.04346,1.01478 6 | 8388608,1.01376,1.05165,1.15507,1.03731,1.16122 7 | 16777216,0.949248,0.943104,1.12742,0.955392,0.973824 8 | 33554432,0.846848,0.779264,1.03014,0.850944,0.847872 9 | 67108864,0.95744,1.0455,0.946176,0.940032,0.954368 10 | 134217728,1.25133,1.37933,1.20934,1.20832,1.21344 11 | 268435456,1.664,1.6681,1.73466,1.78483,1.65478 12 | 536870912,2.55795,2.57126,2.78528,2.56922,2.57638 13 | 1073741824,4.33152,4.33254,4.33357,4.54554,4.3735 14 | 2147483648,7.91245,7.89197,7.9319,8.01485,7.9104 15 | 4294967296,15.2259,15.2125,15.2044,15.2351,15.1982 16 | 8589934592,29.6827,29.6735,29.7155,29.8291,29.7452 17 | 17179869184,58.7868,58.9117,58.9763,58.8349,58.8268 18 | 34359738368,116.709,116.737,116.739,116.889,116.706 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all_async/direct.csv: -------------------------------------------------------------------------------- 1 | 262144,0.838656,0.765952,0.812032,0.933888,0.83456 2 | 524288,0.897024,0.866304,0.883712,0.8192,1.09875 3 | 1048576,0.951296,0.868352,0.837632,1.06291,0.80384 4 | 2097152,0.82432,0.836608,0.852992,0.872448,0.833536 5 | 4194304,0.831488,0.812032,1.06496,0.80384,0.882688 6 | 8388608,0.9984,0.949248,0.813056,0.929792,0.96256 7 | 
16777216,0.871424,0.96976,0.822272,0.813056,0.851968 8 | 33554432,0.994304,1.00659,1.32198,0.969728,0.937984 9 | 67108864,1.65274,1.91795,1.66912,1.67936,1.78483 10 | 134217728,2.86515,2.64192,2.78016,2.61018,2.63475 11 | 268435456,4.99507,5.02477,4.90291,5.11898,4.86298 12 | 536870912,9.43514,9.22522,9.23136,9.2631,9.63686 13 | 1073741824,17.8596,17.8002,18.1484,18.4402,18.0214 14 | 2147483648,35.755,35.4785,35.627,35.8676,36.4585 15 | 4294967296,69.5347,69.2367,69.4938,69.9761,69.9576 16 | 8589934592,139.436,141.026,141.508,139.304,140.373 17 | 17179869184,335.121,346.063,344.26,337.485,336.11 18 | 34359738368,858.075,883.971,874.616,878.392,875.381 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all_async/opt.csv: -------------------------------------------------------------------------------- 1 | 262144,1.15814,1.50835,1.56262,1.13254,1.23597 2 | 524288,1.27693,1.14381,1.44896,1.14586,1.14074 3 | 1048576,1.14483,1.14688,1.12845,1.28717,1.30662 4 | 2097152,1.14688,1.30662,1.1735,1.3312,1.30048 5 | 4194304,1.13254,1.11002,1.11309,1.11104,1.09158 6 | 8388608,1.0281,1.30048,1.16122,1.09568,1.18477 7 | 16777216,1.04858,1.024,1.29126,1.07008,1.31584 8 | 33554432,1.13459,1.38138,0.989184,0.970752,1.4807 9 | 67108864,1.06803,0.99328,0.95232,0.919552,0.99328 10 | 134217728,0.948224,0.945152,0.944128,0.924672,0.956416 11 | 268435456,1.26669,1.0711,1.25542,1.20627,1.02707 12 | 536870912,1.60666,1.47456,1.55136,1.4848,1.44077 13 | 1073741824,2.42074,2.41254,2.41254,2.47603,2.40845 14 | 2147483648,4.29875,4.29261,4.30899,4.31206,4.33254 15 | 4294967296,8.09165,8.04864,8.19405,8.09267,8.07322 16 | 8589934592,15.8556,15.6908,15.7112,15.7921,15.785 17 | 17179869184,30.7118,30.7128,30.6852,30.6627,30.6954 18 | 34359738368,60.9751,60.8287,60.7683,60.9321,60.7631 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all_async/opt_1chunk.csv: 
-------------------------------------------------------------------------------- 1 | 262144,0.903168,0.858112,0.745472,0.835584,0.98304 2 | 524288,1.08237,0.914432,0.856064,0.91648,0.835584 3 | 1048576,0.909312,0.914432,0.884736,0.83968,0.903168 4 | 2097152,0.823296,0.888832,0.825344,0.861184,0.886784 5 | 4194304,0.903168,0.832512,0.818176,0.887808,1.37421 6 | 8388608,0.900096,0.910336,0.992256,0.928768,0.763904 7 | 16777216,0.801792,0.945152,0.823296,0.862208,0.935936 8 | 33554432,0.748544,0.761856,0.712704,0.726016,0.759808 9 | 67108864,0.697344,0.722944,0.668672,1.11104,0.668672 10 | 134217728,0.797696,0.741376,0.866304,0.735232,0.71168 11 | 268435456,0.903168,0.88064,0.924672,1.11206,0.918528 12 | 536870912,1.45306,1.4121,1.408,1.408,1.45715 13 | 1073741824,2.46067,2.4873,2.49037,2.49754,2.4873 14 | 2147483648,4.63462,4.69606,4.62336,4.72474,4.60595 15 | 4294967296,8.89856,8.84122,8.85043,8.91085,8.84531 16 | 8589934592,17.5391,17.3722,17.3609,17.324,17.4787 17 | 17179869184,34.4125,34.391,34.3572,34.3716,34.3941 18 | 34359738368,68.3295,68.3295,68.1882,68.182,68.31 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all_async/rings.csv: -------------------------------------------------------------------------------- 1 | 262144,3.30854,2.86822,3.328,4.11238,3.6864 2 | 524288,3.65568,3.38125,3.61574,3.57274,3.2983 3 | 1048576,3.27578,3.36896,3.80826,3.25325,3.57478 4 | 2097152,3.21536,3.87379,3.24506,2.99418,3.74682 5 | 4194304,3.23482,4.03149,3.73248,3.55021,3.25734 6 | 8388608,3.57683,3.70688,3.55021,3.93421,3.69869 7 | 16777216,3.65158,3.21331,3.26861,3.78675,3.57478 8 | 33554432,3.14675,3.22765,3.1785,3.17747,3.23174 9 | 67108864,3.50208,3.21536,3.19181,3.05971,2.97472 10 | 134217728,3.32493,3.49184,3.13958,3.23174,3.07814 11 | 268435456,3.10989,3.28397,3.14061,3.72429,3.03411 12 | 536870912,3.06381,3.09453,3.04435,3.07712,3.01773 13 | 1073741824,3.43245,3.41402,3.36384,3.42835,3.28192 14 | 
2147483648,4.70426,4.91622,4.72678,4.95514,4.89165 15 | 4294967296,8.32,8.31795,8.35789,8.36198,8.29645 16 | 8589934592,15.8853,15.9048,15.7983,15.9601,15.9201 17 | 17179869184,30.8879,30.9504,31.0272,30.9279,30.9484 18 | 34359738368,61.2076,61.1809,61.2721,61.0662,61.0642 -------------------------------------------------------------------------------- /benchmark/results/dgx1/all2all_async/symm.csv: -------------------------------------------------------------------------------- 1 | 262144,0.848896,0.845824,0.822272,0.825344,0.852992 2 | 524288,0.841728,0.840704,0.882688,0.878592,0.8448 3 | 1048576,0.821248,0.809984,0.828416,0.843776,0.832512 4 | 2097152,0.828416,0.825344,0.815104,0.847872,0.846848 5 | 4194304,0.826368,0.866304,0.835584,0.811008,0.815104 6 | 8388608,0.806912,0.823296,0.845824,0.83456,0.83456 7 | 16777216,0.801792,0.76288,0.811008,0.81408,0.91648 8 | 33554432,0.67072,0.666624,0.688128,0.684032,0.672768 9 | 67108864,0.731136,0.654336,0.858112,0.649216,0.771072 10 | 134217728,0.845824,0.87552,0.862208,0.847872,0.848896 11 | 268435456,1.2544,1.24826,1.24928,1.43258,1.24518 12 | 536870912,2.0265,2.03776,2.19034,2.16269,2.05312 13 | 1073741824,3.61882,3.61165,3.62086,3.64442,3.88096 14 | 2147483648,6.9161,6.94886,6.89664,6.80653,6.94784 15 | 4294967296,13.0826,13.1185,13.0724,13.0744,13.0724 16 | 8589934592,26.0086,25.8755,25.9011,25.9318,25.8284 17 | 17179869184,51.0433,50.9757,51.0904,51.0444,51.0218 18 | 34359738368,101.46,101.401,101.499,101.64,101.389 -------------------------------------------------------------------------------- /benchmark/results/dgx1/bisection.csv: -------------------------------------------------------------------------------- 1 | 262144,0.016,0.012832,0.012608,0.012608,0.013408,0.011968,0.013568,0.012224,0.012608,0.011456 2 | 524288,0.017216,0.012704,0.012544,0.023744,0.012,0.011776,0.015328,0.012256,0.011392,0.012096 3 | 1048576,0.016896,0.013664,0.012512,0.0136,0.014336,0.0136,0.013632,0.013504,0.013504,0.013088 4 | 
2097152,0.01664,0.015552,0.02304,0.0152,0.023104,0.015424,0.014432,0.023648,0.014016,0.014208 5 | 4194304,0.028096,0.023968,0.023104,0.025984,0.023488,0.024192,0.022432,0.022944,0.02256,0.022368 6 | 8388608,0.028896,0.02656,0.033376,0.02528,0.028352,0.024672,0.0256,0.025568,0.025568,0.025216 7 | 16777216,0.042176,0.040608,0.040608,0.047296,0.039584,0.047456,0.048224,0.0472,0.047008,0.048224 8 | 33554432,0.077696,0.073984,0.07264,0.073024,0.071776,0.071744,0.071776,0.071776,0.07168,0.07184 9 | 67108864,0.128096,0.12656,0.134336,0.12608,0.133536,0.133152,0.133888,0.133344,0.133248,0.133312 10 | 134217728,0.250176,0.245888,0.245248,0.24384,0.24352,0.244864,0.24384,0.243648,0.24384,0.243552 11 | 268435456,0.474656,0.480096,0.477408,0.4768,0.477856,0.476672,0.477152,0.476864,0.476736,0.478048 12 | 536870912,0.938336,0.93472,0.9328,0.933536,0.934144,0.934304,0.932512,0.93376,0.933504,0.934528 13 | 1073741824,1.86067,1.85546,1.85578,1.85453,1.85546,1.85558,1.8551,1.8543,1.85523,1.85533 14 | 2147483648,3.7025,3.70006,3.69971,3.69862,3.69856,3.69866,3.69891,3.69984,3.6993,3.6993 15 | 4294967296,7.38931,7.38816,7.39085,7.38902,7.38669,7.38595,7.3865,7.38784,7.3927,7.38586 16 | 8589934592,14.7702,14.7716,14.764,14.7692,14.7646,14.7695,14.7727,14.7654,14.7666,14.7694 17 | 17179869184,29.5181,29.5181,29.5208,29.5162,29.5141,29.5176,29.5176,29.5149,29.5158,29.5221 18 | 34359738368,59.0192,59.0248,59.0166,59.0221,59.0404,59.0176,59.0203,59.0163,59.0167,59.0187 -------------------------------------------------------------------------------- /benchmark/results/dgx1/gather/direct.csv: -------------------------------------------------------------------------------- 1 | 32768,0.211968,0.228352,0.197632,0.202752,0.197632 2 | 65536,0.19456,0.19968,0.20992,0.201728,0.185344 3 | 131072,0.24576,0.227328,0.228352,0.183296,0.219136 4 | 262144,0.188416,0.19968,0.185344,0.227328,0.181248 5 | 524288,0.233472,0.198656,0.297984,0.192512,0.231424 6 | 
1048576,0.224256,0.229376,0.237568,0.232448,0.279552 7 | 2097152,0.21504,0.211968,0.258048,0.190464,0.211968 8 | 4194304,0.274432,0.305152,0.30208,0.287744,0.287744 9 | 8388608,0.46592,0.520192,0.49152,0.490496,0.46592 10 | 16777216,0.884736,0.8704,0.922624,0.887808,0.8704 11 | 33554432,1.46227,1.46022,1.46534,1.48582,1.47558 12 | 67108864,2.64909,2.64909,2.66957,2.66752,2.66445 13 | 134217728,5.19475,5.20397,5.30534,5.13638,5.15482 14 | 268435456,9.91744,9.95123,10.0577,9.90822,10.0731 15 | 536870912,19.3178,19.5,19.4222,19.3034,19.3772 16 | 1073741824,38.5085,38.5812,38.2986,38.5546,38.4614 17 | 2147483648,77.3489,76.503,76.5461,76.885,76.5041 18 | 4294967296,153.001,153.459,152.787,152.927,153.691 -------------------------------------------------------------------------------- /benchmark/results/dgx1/gather/opt.csv: -------------------------------------------------------------------------------- 1 | 32768,0.555008,0.534528,0.462848,0.500736,0.4608 2 | 65536,0.652288,0.463872,0.664576,0.494592,0.499712 3 | 131072,0.475136,0.478208,0.53248,0.531456,2.46682 4 | 262144,0.488448,0.474112,0.7168,0.512,0.456704 5 | 524288,0.467968,0.546816,0.473088,0.472064,0.602112 6 | 1048576,0.642048,0.73728,0.75264,0.531456,0.699392 7 | 2097152,0.444416,0.466944,0.503808,0.488448,0.514048 8 | 4194304,0.525312,0.528384,0.485376,0.504832,0.485376 9 | 8388608,0.524288,0.483328,0.494592,0.483328,0.50176 10 | 16777216,0.468992,0.461824,0.550912,0.479232,0.505856 11 | 33554432,0.472064,0.492544,0.470016,0.548864,0.47616 12 | 67108864,0.590848,0.585728,0.576512,0.580608,0.585728 13 | 134217728,0.98816,0.985088,0.98816,0.996352,0.987136 14 | 268435456,1.80122,1.80224,1.81555,1.79814,1.84934 15 | 536870912,3.42733,3.43859,3.42528,3.42835,3.42835 16 | 1073741824,6.68877,6.68365,6.68262,6.68058,6.68058 17 | 2147483648,13.2014,13.1994,13.1942,13.2076,13.2188 18 | 4294967296,26.2175,26.2226,26.2431,26.2175,26.2236 -------------------------------------------------------------------------------- 
/benchmark/results/dgx1/gather/opt_1chunk.csv: -------------------------------------------------------------------------------- 1 | 32768,0.263168,0.259072,0.224256,0.214016,0.2304 2 | 65536,0.304128,0.269312,0.214016,0.207872,0.217088 3 | 131072,0.290816,0.259072,0.260096,0.178176,0.263168 4 | 262144,0.212992,0.218112,0.222208,0.224256,0.187392 5 | 524288,0.218112,0.211968,0.211968,0.262144,0.207872 6 | 1048576,0.214016,0.207872,0.211968,0.306176,0.31744 7 | 2097152,0.18944,0.26624,0.22016,0.206848,0.280576 8 | 4194304,0.249856,0.241664,0.249856,0.258048,0.183296 9 | 8388608,0.21504,0.24064,0.303104,0.287744,0.2304 10 | 16777216,0.288768,0.282624,0.346112,0.285696,0.282624 11 | 33554432,0.41472,0.422912,0.41984,0.433152,0.422912 12 | 67108864,0.689152,0.714752,0.684032,0.690176,0.719872 13 | 134217728,1.20832,1.23085,1.2073,1.21958,1.20525 14 | 268435456,2.25382,2.28147,2.26406,2.25587,2.26099 15 | 536870912,4.34176,4.33766,4.33254,4.33562,4.33357 16 | 1073741824,8.51763,8.5248,8.54733,8.54221,8.53094 17 | 2147483648,16.9605,16.9329,16.9298,16.9114,16.9144 18 | 4294967296,33.6978,33.6957,33.7091,33.6998,33.7029 -------------------------------------------------------------------------------- /benchmark/results/dgx1/gather/rings.csv: -------------------------------------------------------------------------------- 1 | 32768,0.646144,0.50688,0.512,0.668672,0.54272 2 | 65536,0.733184,0.510976,0.512,0.479232,0.47104 3 | 131072,0.565248,0.485376,0.52224,0.533504,0.477184 4 | 262144,0.493568,0.49152,0.620544,0.551936,0.647168 5 | 524288,0.487424,0.770048,0.529408,0.83456,0.452608 6 | 1048576,0.600064,0.497664,0.607232,0.49152,0.699392 7 | 2097152,0.515072,0.472064,0.47104,0.503808,0.559104 8 | 4194304,0.488448,0.493568,0.48128,0.4608,0.49664 9 | 8388608,0.509952,0.45568,0.48128,0.452608,0.451584 10 | 16777216,0.56832,0.57856,0.472064,0.520192,0.458752 11 | 33554432,0.524288,0.528384,0.576512,0.631808,0.52736 12 | 67108864,0.70656,0.688128,0.67584,0.669696,0.668672 13 | 
134217728,1.10387,1.11206,1.09875,1.09056,1.08954 14 | 268435456,1.90771,1.90259,1.90157,1.91795,1.90157 15 | 536870912,3.52256,3.52768,3.53485,3.53894,3.51539 16 | 1073741824,6.78707,6.79731,6.78195,6.8137,6.79219 17 | 2147483648,13.2977,13.3468,13.3581,13.2905,13.2987 18 | 4294967296,26.4131,26.3107,26.3117,26.3117,26.3066 -------------------------------------------------------------------------------- /benchmark/results/dgx1/gather/symm.csv: -------------------------------------------------------------------------------- 1 | 32768,0.227328,0.24064,0.200704,0.19968,0.191488 2 | 65536,0.207872,0.196608,0.232448,0.200704,0.207872 3 | 131072,0.2048,0.197632,0.19968,0.19968,0.232448 4 | 262144,0.197632,0.193536,0.203776,0.197632,0.203776 5 | 524288,0.242688,0.193536,0.236544,0.191488,0.192512 6 | 1048576,0.238592,0.18944,0.23552,0.19968,0.190464 7 | 2097152,0.211968,0.19456,0.192512,0.187392,0.192512 8 | 4194304,0.212992,0.22016,0.180224,0.203776,0.19968 9 | 8388608,0.22528,0.227328,0.228352,0.2232,0.226304 10 | 16777216,0.310272,0.329728,0.309248,0.311296,0.3072 11 | 33554432,0.487424,0.4864,0.487424,0.488448,0.493568 12 | 67108864,0.840704,0.837632,0.837632,0.837632,0.841728 13 | 134217728,1.536,1.54624,1.52883,1.53907,1.5401 14 | 268435456,2.93376,2.95936,2.93171,2.93274,2.93478 15 | 536870912,5.7385,5.75488,5.7385,5.73747,5.73747 16 | 1073741824,11.3347,11.2947,11.2937,11.307,11.3172 17 | 2147483648,22.4604,22.4584,22.4778,22.4778,22.4799 18 | 4294967296,44.7969,44.7918,44.7846,44.799,44.7918 -------------------------------------------------------------------------------- /benchmark/results/dgx1/nvlink_bandwidth.csv: -------------------------------------------------------------------------------- 1 | 2048,0.012928,0.012256,0.013184,0.013248,0.013536,0.011552,0.013408,0.01328,0.013248,0.01328 2 | 4096,0.015744,0.014176,0.013408,0.013312,0.011808,0.011296,0.011488,0.01136,0.013376,0.012736 3 | 
8192,0.014368,0.013152,0.011808,0.013632,0.01088,0.011456,0.01344,0.012672,0.013408,0.013472 4 | 16384,0.013792,0.012512,0.013888,0.0136,0.012288,0.01264,0.013568,0.013664,0.010976,0.01344 5 | 32768,0.014624,0.012928,0.013632,0.012352,0.013184,0.012736,0.012544,0.0136,0.02272,0.02272 6 | 65536,0.014976,0.01504,0.013888,0.021952,0.013376,0.022048,0.0224,0.0224,0.022592,0.013632 7 | 131072,0.0176,0.017248,0.023392,0.024128,0.022336,0.022656,0.02272,0.022592,0.022432,0.02272 8 | 262144,0.027808,0.024256,0.022528,0.022496,0.02416,0.022752,0.0224,0.022656,0.0224,0.022752 9 | 524288,0.039392,0.037856,0.036256,0.03488,0.034784,0.034624,0.034784,0.03376,0.03472,0.034976 10 | 1048576,0.05568,0.061088,0.060864,0.059456,0.066336,0.0592,0.059456,0.059552,0.059392,0.058912 11 | 2097152,0.101248,0.098784,0.098816,0.098304,0.098144,0.097952,0.09872,0.098528,0.098336,0.098528 12 | 4194304,0.1888,0.185408,0.184416,0.183232,0.184544,0.184,0.184576,0.18416,0.184576,0.185184 13 | 8388608,0.35968,0.357824,0.358112,0.357728,0.365312,0.357664,0.356864,0.356928,0.35696,0.356896 14 | 16777216,0.70416,0.703296,0.710432,0.710272,0.710112,0.710272,0.710976,0.702208,0.70992,0.710528 15 | 33554432,1.3953,1.40166,1.3992,1.3991,1.39907,1.39914,1.39926,1.3992,1.39786,1.39936 16 | 67108864,2.78099,2.77789,2.78355,2.77811,2.77814,2.77824,2.77683,2.77683,2.78486,2.77738 17 | 134217728,5.54442,5.54397,5.54666,5.54413,5.54346,5.54288,5.54326,5.54237,5.54282,5.54253 18 | 268435456,11.0783,11.0771,11.082,11.0832,11.0824,11.0823,11.0774,11.0789,11.0818,11.0784 19 | 536870912,22.1474,22.1437,22.1463,22.1429,22.1397,22.1426,22.1426,22.1435,22.1416,22.1423 20 | 1073741824,44.2743,44.2709,44.2749,44.2674,44.2688,44.2679,44.2652,44.2737,44.297,44.2679 21 | 2147483648,88.5248,88.526,88.5256,88.5184,88.5184,88.5187,88.5199,88.5186,88.5203,88.5195 22 | 4294967296,177.029,177.026,177.025,177.028,177.023,177.024,177.022,177.027,177.029,177.025 23 | 
8589934592,355.663,355.661,355.659,355.658,355.68,355.66,355.662,355.66,355.664,355.658 24 | -------------------------------------------------------------------------------- /benchmark/results/dgx1/nvlink_latency.csv: -------------------------------------------------------------------------------- 1 | 1,0.011968 -------------------------------------------------------------------------------- /benchmark/results/dgx1/scatter/direct.csv: -------------------------------------------------------------------------------- 1 | 32768,0.234496,0.229376,0.23552,0.238592,0.231424 2 | 65536,0.231424,0.229376,0.2304,0.231424,0.304128 3 | 131072,0.267264,0.226304,0.229376,0.229376,0.22528 4 | 262144,0.234496,0.231424,0.249856,0.229376,0.23552 5 | 524288,0.239616,0.252928,0.254976,0.246784,0.243712 6 | 1048576,0.326656,0.265216,0.257024,0.251904,0.2816 7 | 2097152,0.292864,0.356352,0.28672,0.2816,0.287744 8 | 4194304,0.369664,0.428032,0.408576,0.381952,0.371712 9 | 8388608,0.54784,0.587776,0.63488,0.557056,0.581632 10 | 16777216,0.801792,0.811008,0.923648,0.815104,0.88064 11 | 33554432,1.37216,1.31891,1.32506,1.38445,1.35578 12 | 67108864,2.39206,2.40333,2.40026,2.38387,2.41254 13 | 134217728,4.55578,4.58138,4.5609,4.59366,4.5568 14 | 268435456,8.86374,8.87398,8.96205,8.92416,8.99277 15 | 536870912,18.0142,17.4449,17.366,17.324,17.3865 16 | 1073741824,34.601,34.603,34.6563,34.5364,34.3204 17 | 2147483648,69.63,68.5353,68.7053,69.1732,68.4237 18 | 4294967296,136.455,136.261,136.449,136.718,136.502 -------------------------------------------------------------------------------- /benchmark/results/dgx1/scatter/opt.csv: -------------------------------------------------------------------------------- 1 | 32768,0.6912,0.694272,0.71168,0.694272,0.698368 2 | 65536,0.75776,0.714752,0.806912,0.7168,0.713728 3 | 131072,0.705536,0.681984,0.693248,0.617472,0.715776 4 | 262144,0.704512,0.809984,0.703488,0.695296,0.714752 5 | 524288,0.739328,0.700416,0.677888,0.69632,0.740352 6 | 
1048576,0.534528,0.688128,0.722944,0.68608,0.731136 7 | 2097152,0.666624,0.712736,0.548864,0.688128,0.796672 8 | 4194304,0.589824,0.536576,0.661504,0.615424,0.6144 9 | 8388608,0.561152,0.540672,0.592896,0.548864,0.61952 10 | 16777216,0.466944,0.453632,0.513024,0.464896,0.477184 11 | 33554432,0.531456,0.539648,0.468992,0.539648,0.472064 12 | 67108864,0.687104,0.668672,0.65536,0.65536,0.659456 13 | 134217728,1.0967,1.08237,1.09158,1.09875,1.08646 14 | 268435456,1.8985,1.8985,1.91386,1.8944,1.95277 15 | 536870912,3.57683,3.58093,3.56557,3.57171,3.57786 16 | 1073741824,6.92531,6.90995,6.91302,6.91917,6.90074 17 | 2147483648,13.5854,13.5905,13.5782,13.5956,13.5834 18 | 4294967296,26.9353,26.9476,26.965,26.9384,26.9373 -------------------------------------------------------------------------------- /benchmark/results/dgx1/scatter/opt_1chunk.csv: -------------------------------------------------------------------------------- 1 | 32768,0.27136,0.310272,0.275456,0.264192,0.287744 2 | 65536,0.300032,0.26112,0.269312,0.34304,0.268288 3 | 131072,0.288768,0.247808,0.260096,0.241664,0.270336 4 | 262144,0.265216,0.263168,0.280576,0.264192,0.190464 5 | 524288,0.265216,0.273408,0.263168,0.272384,0.34304 6 | 1048576,0.273408,0.264192,0.269312,0.29696,0.340992 7 | 2097152,0.377856,0.29184,0.268288,0.263168,0.278528 8 | 4194304,0.27136,0.27648,0.273408,0.282624,0.32256 9 | 8388608,0.233472,0.236544,0.31744,0.254976,0.242688 10 | 16777216,0.293888,0.278528,0.301056,0.283648,0.280576 11 | 33554432,0.417792,0.422912,0.417792,0.415744,0.428032 12 | 67108864,0.684032,0.681984,0.68096,0.683008,0.687104 13 | 134217728,1.24621,1.22573,1.21549,1.21651,1.22061 14 | 268435456,2.27533,2.29786,2.26918,2.27123,2.2825 15 | 536870912,4.41139,4.41446,4.41856,4.40934,4.41651 16 | 1073741824,8.64768,8.6569,8.68454,8.66202,8.65178 17 | 2147483648,17.2073,17.1684,17.1807,17.1643,17.1633 18 | 4294967296,34.2682,34.2682,34.2682,34.26,34.2938 
-------------------------------------------------------------------------------- /benchmark/results/dgx1/scatter/rings.csv: -------------------------------------------------------------------------------- 1 | 32768,0.965632,0.728064,0.769024,0.718848,0.715776 2 | 65536,0.910336,0.72192,0.785408,0.72192,0.724992 3 | 131072,0.7168,0.521216,0.72704,0.712704,0.73216 4 | 262144,0.714752,0.722944,0.715776,0.746496,0.799744 5 | 524288,0.695296,0.930816,0.781312,0.876544,0.745472 6 | 1048576,0.735232,0.805888,0.754688,0.637952,0.764928 7 | 2097152,0.594944,0.684032,0.676864,0.6656,0.73728 8 | 4194304,0.652288,0.621568,0.690176,0.649216,0.642048 9 | 8388608,0.621568,0.582656,0.580608,0.566272,0.565248 10 | 16777216,0.688128,0.704512,0.521216,0.576512,0.616448 11 | 33554432,0.57856,0.590848,0.58368,0.601088,0.57344 12 | 67108864,0.676864,0.756736,0.652288,0.649216,0.646144 13 | 134217728,1.04243,1.06291,1.04653,1.03936,1.05267 14 | 268435456,1.87392,1.86675,1.86573,1.86675,1.86573 15 | 536870912,3.55021,3.54202,3.54714,3.54918,3.53894 16 | 1073741824,6.87923,6.87616,6.87104,6.87002,6.87411 17 | 2147483648,13.5578,13.5496,13.5506,13.5475,13.5608 18 | 4294967296,26.8513,26.8544,26.8616,26.8534,26.8524 -------------------------------------------------------------------------------- /benchmark/results/dgx1/scatter/symm.csv: -------------------------------------------------------------------------------- 1 | 32768,0.244736,0.234496,0.256,0.236544,0.246784 2 | 65536,0.236544,0.23552,0.295968,0.257024,0.23552 3 | 131072,0.232448,0.232448,0.23552,0.2304,0.237568 4 | 262144,0.23552,0.233472,0.234496,0.236544,0.236544 5 | 524288,0.238592,0.241664,0.236544,0.23552,0.24064 6 | 1048576,0.272384,0.241664,0.23552,0.242688,0.23552 7 | 2097152,0.241664,0.236544,0.247808,0.234496,0.238592 8 | 4194304,0.242688,0.242688,0.224256,0.237568,0.238592 9 | 8388608,0.23552,0.239616,0.239616,0.24064,0.24064 10 | 16777216,0.309248,0.321536,0.3072,0.3072,0.309248 11 | 
33554432,0.484352,0.485376,0.483328,0.485376,0.492544 12 | 67108864,0.838656,0.861184,0.838656,0.859136,0.841728 13 | 134217728,1.54419,1.55341,1.54317,1.54214,1.57901 14 | 268435456,2.95014,2.94707,2.98086,2.95117,2.95219 15 | 536870912,5.7815,5.79994,5.77843,5.77434,5.77638 16 | 1073741824,11.3981,11.391,11.3879,11.3981,11.3951 17 | 2147483648,22.6468,22.656,22.6642,22.6632,22.7082 18 | 4294967296,45.2465,45.2628,45.2465,45.2577,45.2465 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/all2all/direct_p100.csv: -------------------------------------------------------------------------------- 1 | 131072,0.15936,0.154272,0.15472,0.154912,0.156896 2 | 262144,0.154368,0.155488,0.15952,0.155264,0.157152 3 | 524288,0.154976,0.155872,0.15536,0.153152,0.16048 4 | 1048576,0.155264,0.155552,0.153984,0.157216,0.157408 5 | 2097152,0.15792,0.156352,0.15248,0.155808,0.15424 6 | 4194304,0.155776,0.157568,0.155712,0.15664,0.15888 7 | 8388608,0.159648,0.156608,0.158464,0.159072,0.158432 8 | 16777216,0.170528,0.175712,0.17392,0.174816,0.1752 9 | 33554432,0.20112,0.202304,0.200576,0.199744,0.203584 10 | 67108864,0.29984,0.300704,0.29792,0.301888,0.301216 11 | 134217728,0.555904,0.5568,0.554912,0.55472,0.554112 12 | 268435456,0.88288,0.890304,0.88352,0.886624,0.885824 13 | 536870912,1.60342,1.6071,1.60438,1.60554,1.60413 14 | 1073741824,3.07344,3.04538,3.04861,3.05616,3.04874 15 | 2147483648,5.87782,5.87894,5.88205,5.87856,5.87715 16 | 4294967296,11.692,11.6962,11.692,11.6801,11.6848 17 | 8589934592,23.2075,23.1988,23.205,23.2105,23.2049 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/all2all/opt_p100.csv: -------------------------------------------------------------------------------- 1 | 131072,0.163264,0.155264,0.153984,0.15488,0.158048 2 | 262144,0.156704,0.154656,0.155808,0.155968,0.157536 3 | 524288,0.15504,0.156064,0.155136,0.155424,0.157472 4 | 
1048576,0.157472,0.154752,0.156672,0.153216,0.154304 5 | 2097152,0.151872,0.157504,0.154624,0.161312,0.156928 6 | 4194304,0.159264,0.15824,0.158656,0.157568,0.1568 7 | 8388608,0.155872,0.155616,0.161248,0.158784,0.158176 8 | 16777216,0.175904,0.172256,0.174432,0.172992,0.174144 9 | 33554432,0.205248,0.203904,0.237216,0.201856,0.216736 10 | 67108864,0.300832,0.299648,0.297056,0.299616,0.300256 11 | 134217728,0.556096,0.556832,0.579712,0.559904,0.556256 12 | 268435456,0.887936,0.882848,0.890176,0.882656,0.88416 13 | 536870912,1.62893,1.59773,1.60554,1.60122,1.63616 14 | 1073741824,3.05187,3.07104,3.05267,3.05136,3.05117 15 | 2147483648,5.87974,5.87827,5.89722,5.86637,5.88298 16 | 4294967296,11.6802,11.6992,11.6975,11.6809,11.6812 17 | 8589934592,23.214,23.2404,23.1909,23.2037,23.203 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/all2all/rings_p100.csv: -------------------------------------------------------------------------------- 1 | 131072,0.692288,0.683808,0.684704,0.684384,0.68512 2 | 262144,0.6952,0.691008,0.71216,0.7064,0.684096 3 | 524288,0.684064,0.714752,0.681472,0.702016,0.680512 4 | 1048576,0.681504,0.737632,0.681248,0.686912,0.684736 5 | 2097152,0.679904,0.697568,0.699328,0.68128,0.703488 6 | 4194304,0.69008,0.712,0.685824,0.683008,0.689088 7 | 8388608,0.68496,0.69024,0.689504,0.722592,0.705152 8 | 16777216,0.602624,0.61632,0.679808,0.621536,0.613984 9 | 33554432,0.681472,0.633632,0.627968,0.635872,0.635008 10 | 67108864,0.78592,0.779488,0.786432,0.79824,0.78416 11 | 134217728,1.12227,1.0825,1.07485,1.07558,1.0839 12 | 268435456,1.41706,1.43658,1.4415,1.42269,1.42643 13 | 536870912,2.21194,2.22659,2.20717,2.21827,2.21251 14 | 1073741824,3.86669,3.87149,3.8632,3.86144,3.90179 15 | 2147483648,7.29773,7.29405,7.29043,7.30576,7.27686 16 | 4294967296,14.1246,14.1004,14.1215,14.117,14.1063 17 | 8589934592,27.791,27.8366,27.775,27.8098,27.7988 
-------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/all2all_async/direct_p100.csv: -------------------------------------------------------------------------------- 1 | 131072,0.153504,0.209344,0.150848,0.1536,0.150976 2 | 262144,0.15008,0.154496,0.154048,0.151648,0.150496 3 | 524288,0.150208,0.152672,0.152768,0.148864,0.15456 4 | 1048576,0.150848,0.16944,0.167904,0.153664,0.152864 5 | 2097152,0.152864,0.149216,0.150112,0.148288,1.02474 6 | 4194304,0.15664,0.154144,0.15216,0.157248,0.154336 7 | 8388608,0.215136,0.2616,0.261824,0.188032,0.159712 8 | 16777216,0.252512,0.263712,0.206944,0.260192,0.289856 9 | 33554432,0.242944,0.222816,0.22976,0.231424,0.230016 10 | 67108864,0.329344,0.323392,0.322624,0.32256,0.327264 11 | 134217728,0.51536,0.499168,0.495168,0.516736,0.498176 12 | 268435456,0.854432,0.8528,0.854752,0.854176,0.854592 13 | 536870912,1.57299,1.57834,1.57312,1.57293,1.5736 14 | 1073741824,3.01875,3.01978,3.0168,3.01763,3.03587 15 | 2147483648,5.9031,5.90061,5.90214,5.89725,5.8977 16 | 4294967296,11.7143,11.7127,11.7188,11.7308,11.7005 17 | 8589934592,23.2306,23.2238,23.2259,23.2097,23.2219 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/all2all_async/opt.csv: -------------------------------------------------------------------------------- 1 | 131072,0.574848,0.378976,0.541728,0.364352,0.36816 2 | 262144,0.514528,0.535968,0.52672,0.533536,0.3496 3 | 524288,0.517024,0.53056,0.527584,0.520672,0.520128 4 | 1048576,0.497472,0.488128,0.515648,0.520704,0.347968 5 | 2097152,0.8184,1.71197,0.519968,0.43824,0.517792 6 | 4194304,0.493024,0.361248,0.577504,0.296448,0.470336 7 | 8388608,0.302592,0.52448,0.597312,0.52064,0.527104 8 | 16777216,0.373536,0.580416,0.5216,0.433792,0.458912 9 | 33554432,0.695424,0.495904,0.442176,0.49472,0.450656 10 | 67108864,0.504448,0.471904,0.511424,0.53248,0.576736 11 | 
134217728,0.462816,0.46624,0.464832,0.491584,0.488992 12 | 268435456,0.741536,0.74208,0.745696,0.743232,0.737696 13 | 536870912,1.3088,1.31418,1.3111,1.31178,1.30845 14 | 1073741824,2.46474,2.45875,2.46016,2.46224,2.46611 15 | 2147483648,4.77104,4.77469,4.80768,4.77046,4.7713 16 | 4294967296,9.44547,9.46317,9.42237,9.45677,9.4545 17 | 8589934592,18.7224,18.7274,18.7331,18.7235,18.7368 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/all2all_async/opt_p100.csv: -------------------------------------------------------------------------------- 1 | 131072,0.208608,0.1488,0.14864,0.147616,0.150976 2 | 262144,0.152096,0.15088,0.185856,0.150336,0.152064 3 | 524288,0.149664,0.15136,0.172608,0.151616,0.151328 4 | 1048576,0.152256,0.153312,0.154144,0.148896,0.147488 5 | 2097152,0.149248,0.151008,0.186976,0.16016,0.149088 6 | 4194304,0.15296,0.151712,0.158048,0.1528,0.1496 7 | 8388608,0.261696,0.1512,0.266496,0.263616,0.265696 8 | 16777216,0.195072,0.259776,0.193312,0.192704,0.256704 9 | 33554432,0.238816,0.224192,0.24464,0.22544,0.264448 10 | 67108864,0.326816,0.328672,0.3456,0.32496,0.32464 11 | 134217728,0.496096,0.52064,0.501184,0.498976,0.50272 12 | 268435456,0.85632,0.858592,0.860096,0.8568,0.860992 13 | 536870912,1.57078,1.56762,1.57283,1.57427,1.56982 14 | 1073741824,3.01242,3.01594,3.02835,3.01482,3.02 15 | 2147483648,5.90742,5.92045,5.8937,5.89894,5.89856 16 | 4294967296,11.706,11.7178,11.7214,11.7499,11.7123 17 | 8589934592,23.2351,23.2222,23.2163,23.2284,23.2164 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/all2all_async/rings_p100.csv: -------------------------------------------------------------------------------- 1 | 131072,0.441824,0.392384,0.410912,0.39088,0.394464 2 | 262144,0.449536,0.388256,0.402656,0.45136,0.387872 3 | 524288,0.403968,0.39536,0.390048,0.416352,0.390656 4 | 1048576,0.3928,0.410688,0.431072,0.389952,0.401312 5 | 
2097152,0.39584,0.396736,0.39248,0.392192,0.40256 6 | 4194304,0.437248,0.40496,0.422336,0.397632,0.39904 7 | 8388608,0.405984,0.506144,0.394112,0.401344,0.397408 8 | 16777216,0.425792,0.439008,0.450112,0.481728,0.4448 9 | 33554432,0.382816,0.372896,0.371776,0.379264,0.37296 10 | 67108864,0.448128,0.451456,0.49456,0.452512,0.450144 11 | 134217728,0.567936,0.569024,0.565024,0.576832,0.57328 12 | 268435456,0.918656,0.929536,0.91824,0.917376,0.91552 13 | 536870912,1.62128,1.62243,1.6177,1.61763,1.6193 14 | 1073741824,3.04266,3.0415,3.04442,3.04291,3.04093 15 | 2147483648,5.98192,5.96861,5.96822,5.96486,5.96461 16 | 4294967296,11.7316,11.7252,11.736,11.723,11.7275 17 | 8589934592,23.2471,23.2332,23.2473,23.245,23.2519 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/gather/direct_p100.csv: -------------------------------------------------------------------------------- 1 | 32768,0.069184,0.067392,0.065024,0.091168,0.067744 2 | 65536,0.066848,0.066528,0.065568,0.0696,0.066528 3 | 131072,0.066656,0.066432,0.065888,0.066784,0.067168 4 | 262144,0.07248,0.069696,0.084672,0.067168,0.067264 5 | 524288,0.067168,0.069248,0.066368,0.06688,0.070976 6 | 1048576,0.0704,0.069344,0.068,0.070272,0.069248 7 | 2097152,0.07328,0.072832,0.073568,0.073408,0.0736 8 | 4194304,0.098528,0.098368,0.098688,0.098624,0.097728 9 | 8388608,0.13328,0.13344,0.133472,0.135616,0.134144 10 | 16777216,0.219968,0.219712,0.218944,0.218816,0.219424 11 | 33554432,0.396416,0.397728,0.412832,0.395968,0.397088 12 | 67108864,0.746528,0.745408,0.74656,0.746592,0.744736 13 | 134217728,1.44637,1.45158,1.44909,1.44714,1.44698 14 | 268435456,2.84506,2.84429,2.84474,2.84493,2.84573 15 | 536870912,5.64675,5.64627,5.64557,5.64592,5.6455 16 | 1073741824,11.239,11.2402,11.2407,11.2403,11.2404 17 | 2147483648,22.427,22.4235,22.425,22.4252,22.4238 -------------------------------------------------------------------------------- 
/benchmark/results/dgx1_quad/gather/opt.csv: -------------------------------------------------------------------------------- 1 | 32768,0.148352,0.141984,0.142496,0.15792,0.164448 2 | 65536,0.160064,0.143168,0.141984,0.16064,0.14544 3 | 131072,0.143584,0.143136,0.146272,0.177696,0.137856 4 | 262144,0.161184,0.141056,0.145088,0.164128,0.144128 5 | 524288,0.155648,0.153088,0.116576,0.159616,0.157504 6 | 1048576,0.153952,0.147904,0.150944,0.15152,0.148 7 | 2097152,0.146976,0.158112,0.151104,0.146592,0.176576 8 | 4194304,0.145152,0.149792,0.167328,0.146272,0.149056 9 | 8388608,0.136864,0.132896,0.134048,0.13392,0.133312 10 | 16777216,0.162816,0.162528,0.16224,0.16208,0.162624 11 | 33554432,0.269248,0.298048,0.271904,0.269408,0.26992 12 | 67108864,0.480064,0.4808,0.480576,0.479808,0.480416 13 | 134217728,0.89792,0.895872,0.900224,0.897088,0.898784 14 | 268435456,1.74595,1.73133,1.73603,1.73168,1.72998 15 | 536870912,3.40541,3.40534,3.40781,3.41146,3.40496 16 | 1073741824,6.77418,6.77494,6.7743,6.77408,6.77414 17 | 2147483648,13.4703,13.4692,13.4707,13.469,13.4693 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/gather/opt_p100.csv: -------------------------------------------------------------------------------- 1 | 32768,0.141344,0.133408,0.13216,0.130784,0.129344 2 | 65536,0.130528,0.130304,0.132448,0.131168,0.12896 3 | 131072,0.101216,0.131744,0.101824,0.144768,0.131648 4 | 262144,0.13152,0.131872,0.132448,0.132768,0.129248 5 | 524288,0.194112,0.13872,0.157152,0.143296,0.131136 6 | 1048576,0.141184,0.139744,0.138464,0.140608,0.141696 7 | 2097152,0.178016,0.140448,0.138976,0.1384,0.138144 8 | 4194304,0.143392,0.1464,0.142464,0.149984,0.1416 9 | 8388608,0.149472,0.13584,0.138304,0.137536,0.136896 10 | 16777216,0.201824,0.202208,0.204096,0.202496,0.200896 11 | 33554432,0.335488,0.3352,0.335008,0.33536,0.332864 12 | 67108864,0.598208,0.600736,0.59824,0.601792,0.634944 13 | 
134217728,1.12214,1.12406,1.12413,1.12125,1.1255 14 | 268435456,2.16912,2.16678,2.1664,2.16829,2.16573 15 | 536870912,4.25299,4.25475,4.25046,4.2543,4.25837 16 | 1073741824,8.45866,8.45939,8.45709,8.45949,8.47763 17 | 2147483648,16.8331,16.8317,16.8335,16.8343,16.83 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/gather/rings_p100.csv: -------------------------------------------------------------------------------- 1 | 32768,0.133376,0.126496,0.148576,0.132512,0.130112 2 | 65536,0.129792,0.132032,0.132224,0.133184,0.141984 3 | 131072,0.956352,0.134496,0.131232,0.132128,0.128704 4 | 262144,0.131776,0.135968,1.01302,0.130688,0.13136 5 | 524288,0.130368,0.131648,0.130048,1.32032,0.130368 6 | 1048576,0.141472,0.144704,0.156128,0.13952,0.14672 7 | 2097152,0.138144,0.135456,0.1408,0.144192,0.142592 8 | 4194304,0.144128,0.140064,0.139072,0.14208,0.140032 9 | 8388608,0.139328,0.136992,0.13744,0.139232,0.13616 10 | 16777216,0.204832,0.20208,0.203008,0.202528,0.202112 11 | 33554432,0.337184,0.33536,0.336064,0.337952,0.34208 12 | 67108864,0.602016,0.601408,0.601024,0.600672,0.600576 13 | 134217728,1.12435,1.12496,1.12166,1.1249,1.1249 14 | 268435456,2.16346,2.18304,2.16336,2.16653,2.16525 15 | 536870912,4.2703,4.2695,4.27069,4.28605,4.26845 16 | 1073741824,8.4751,8.47773,8.47475,8.4769,8.47408 17 | 2147483648,16.8277,16.8291,16.8316,16.8307,16.8276 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/scatter/direct_p100.csv: -------------------------------------------------------------------------------- 1 | 32768,0.080448,0.070752,0.086912,0.095904,0.071072 2 | 65536,0.071456,0.073248,0.071296,0.072544,0.072064 3 | 131072,0.072864,0.070944,0.071168,0.070336,0.072992 4 | 262144,0.089408,0.072128,0.09728,0.072288,0.071488 5 | 524288,0.074368,0.081504,0.073728,0.078528,0.081728 6 | 1048576,0.081952,0.084288,0.080128,0.08384,0.081568 7 | 
2097152,0.088288,0.088256,0.089312,0.089888,0.092544 8 | 4194304,0.113408,0.119296,0.115296,0.114272,0.114944 9 | 8388608,0.161248,0.161504,0.160992,0.16032,0.162208 10 | 16777216,0.232928,0.231968,0.2312,0.232352,0.232896 11 | 33554432,0.413216,0.413248,0.452448,0.413312,0.411808 12 | 67108864,0.770688,0.772,0.772096,0.776064,0.770912 13 | 134217728,1.4752,1.4769,1.48013,1.47171,1.47744 14 | 268435456,2.8776,2.8815,2.8792,2.88147,2.8807 15 | 536870912,5.69597,5.71427,5.70803,5.70477,5.70461 16 | 1073741824,11.3276,11.3304,11.3304,11.3263,11.3218 17 | 2147483648,22.5756,22.5827,22.569,22.5676,22.5692 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/scatter/opt.csv: -------------------------------------------------------------------------------- 1 | 32768,0.166592,0.148672,0.150624,0.165888,0.152096 2 | 65536,0.199648,0.156608,0.154656,0.1488,0.183808 3 | 131072,0.156512,0.149184,0.151776,0.153696,0.154752 4 | 262144,0.154144,0.151104,0.152544,0.148544,0.149216 5 | 524288,0.15776,0.160224,0.150816,0.166784,0.168288 6 | 1048576,0.15968,0.15856,0.158528,0.160832,0.169856 7 | 2097152,0.156256,0.156768,0.179776,0.155744,0.15712 8 | 4194304,0.156576,0.155904,0.211424,0.159328,0.157408 9 | 8388608,0.170912,0.162208,0.19712,0.166656,0.175584 10 | 16777216,0.169728,0.169504,0.16928,0.168064,0.168064 11 | 33554432,0.271744,0.272832,0.275648,0.272992,0.311136 12 | 67108864,0.517312,0.497536,0.495712,0.49744,0.496864 13 | 134217728,0.932704,0.931776,0.93088,0.929984,0.930304 14 | 268435456,1.78544,1.78326,1.78346,1.7785,1.78493 15 | 536870912,3.50237,3.5113,3.52198,3.51795,3.51123 16 | 1073741824,6.92362,6.91424,6.91062,6.92038,6.92694 17 | 2147483648,13.804,13.8088,13.8092,13.8014,13.8215 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/scatter/opt_p100.csv: -------------------------------------------------------------------------------- 1 | 
32768,0.145536,0.135712,0.136704,0.135808,0.134912 2 | 65536,0.148032,0.134208,0.134496,0.133824,0.139712 3 | 131072,0.10384,0.134688,0.126176,0.174784,0.134976 4 | 262144,0.137312,0.135424,0.134944,0.137504,0.133408 5 | 524288,0.176512,0.149952,0.144928,0.147136,0.14144 6 | 1048576,0.147104,0.14432,0.148192,0.143712,0.14288 7 | 2097152,0.15152,0.14416,0.149376,0.163808,0.14368 8 | 4194304,0.175104,0.151808,0.1472,0.155072,0.145344 9 | 8388608,0.17184,0.171328,0.173152,0.184512,0.173856 10 | 16777216,0.221568,0.220864,0.2232,0.221984,0.221504 11 | 33554432,0.35648,0.358432,0.356,0.356064,0.354816 12 | 67108864,0.637184,0.638016,0.633056,0.663392,0.635808 13 | 134217728,1.16963,1.16662,1.16544,1.16643,1.16502 14 | 268435456,2.2337,2.23309,2.23453,2.23341,2.23245 15 | 536870912,4.36224,4.35597,4.36333,4.36413,4.35187 16 | 1073741824,8.61776,8.61894,8.61427,8.65021,8.63078 17 | 2147483648,17.1707,17.1509,17.1593,17.1518,17.1474 -------------------------------------------------------------------------------- /benchmark/results/dgx1_quad/scatter/rings_p100.csv: -------------------------------------------------------------------------------- 1 | 32768,0.170336,0.13568,0.139552,0.138496,0.1392 2 | 65536,0.139008,0.139328,0.141728,0.2008,0.160352 3 | 131072,0.139872,0.139488,0.139328,0.13856,0.141664 4 | 262144,0.138176,0.144736,0.136672,0.138912,0.13856 5 | 524288,0.15776,0.141184,0.178912,0.138848,0.13792 6 | 1048576,0.150432,0.156544,0.156224,0.150848,0.147872 7 | 2097152,0.17104,0.142464,0.152512,0.152384,0.148544 8 | 4194304,0.158688,0.155456,0.155456,0.154752,0.157376 9 | 8388608,0.196896,0.18112,0.179168,0.181888,0.180352 10 | 16777216,0.213504,0.212256,0.213408,0.212544,0.212768 11 | 33554432,0.374272,0.34672,0.34912,0.348,0.348704 12 | 67108864,0.627296,0.626752,0.626016,0.627072,0.626208 13 | 134217728,1.16272,1.16429,1.16131,1.16333,1.15901 14 | 268435456,2.22051,2.22714,2.21747,2.23014,2.22336 15 | 536870912,4.35872,4.36592,4.35376,4.36362,4.35446 16 | 
1073741824,8.62861,8.63411,8.65376,8.63117,8.62307 17 | 2147483648,17.2308,17.222,17.2316,17.2251,17.2289 -------------------------------------------------------------------------------- /benchmark/results/old/p100_quad/all2all/rings.csv: -------------------------------------------------------------------------------- 1 | 131072,0.79632,0.783904,0.782688,0.788864,0.7864 2 | 262144,0.816096,0.850592,0.80848,0.791296,0.776864 3 | 524288,0.823104,0.78384,0.80928,0.803808,0.782656 4 | 1048576,0.788608,0.770688,0.78176,0.834208,0.79904 5 | 2097152,0.790656,0.810848,0.781696,0.791424,0.781536 6 | 4194304,0.790272,0.776992,0.826272,0.829248,0.783872 7 | 8388608,0.78416,0.796384,0.788352,0.785952,0.788352 8 | 16777216,0.687456,0.67696,0.680448,0.756288,0.687168 9 | 33554432,0.751744,0.73696,0.737824,0.734144,0.737408 10 | 67108864,0.914048,0.948128,0.941568,0.929568,0.920736 11 | 134217728,1.18038,1.23178,1.18278,1.1879,1.24067 12 | 268435456,1.7687,1.68381,1.68086,1.68589,1.74416 13 | 536870912,2.81421,2.87302,2.78122,2.76944,2.7783 14 | 1073741824,5.07946,5.10278,5.12045,5.13318,5.11184 15 | 2147483648,10.0045,9.9689,10.0106,9.97341,9.96765 16 | 4294967296,19.6316,21.113,19.6234,19.6348,19.6309 17 | 8589934592,39.5561,39.5133,39.551,39.5342,39.4945 -------------------------------------------------------------------------------- /benchmark/results/old/p100_quad/all2all_async/rings.csv: -------------------------------------------------------------------------------- 1 | 131072,0.45488,0.447072,0.456128,0.452288,0.457376 2 | 262144,0.449408,0.463136,0.464544,0.447008,0.449696 3 | 524288,0.449056,0.4464,0.4584,0.459456,0.449824 4 | 1048576,0.497472,0.444576,0.458592,0.446336,0.46576 5 | 2097152,0.482272,0.489088,0.449568,0.461408,0.447232 6 | 4194304,0.478464,0.484992,0.46896,0.453152,0.449024 7 | 8388608,0.469792,0.50688,0.47152,0.458656,0.459584 8 | 16777216,0.464896,0.466112,0.473024,0.527584,0.485504 9 | 33554432,0.473824,0.428992,0.424736,0.417792,0.415872 10 | 
67108864,0.499168,0.513568,0.5,0.506528,0.500992 11 | 134217728,1.01405,1.03712,1.06192,1.06195,1.33677 12 | 268435456,1.80138,1.84128,1.81606,1.82608,1.8001 13 | 536870912,3.43418,3.43434,3.4345,3.45114,3.44051 14 | 1073741824,6.6417,6.64016,6.63146,6.63418,6.63434 15 | 2147483648,13.1537,13.1502,13.17,13.1634,13.1761 16 | 4294967296,26.1654,26.2101,26.1792,26.2072,26.1577 17 | 8589934592,52.2627,52.2667,52.2605,52.2693,52.2506 -------------------------------------------------------------------------------- /benchmark/results/old/p100_quad/gather/rings.csv: -------------------------------------------------------------------------------- 1 | 32768,0.143392,0.144704,0.13984,0.138848,0.1392 2 | 65536,0.141312,0.143232,0.14368,0.148352,0.142336 3 | 131072,0.141248,0.141504,0.142112,0.154112,0.142592 4 | 262144,0.137312,0.138592,0.140032,0.139392,0.141248 5 | 524288,0.141248,0.140608,0.146944,0.142016,0.142752 6 | 1048576,0.146624,0.14656,0.14352,0.161632,0.149312 7 | 2097152,0.146848,0.143264,0.143008,0.142336,0.141696 8 | 4194304,0.146496,0.148992,0.145248,0.159488,0.150048 9 | 8388608,0.17024,0.1768,0.17552,0.199648,0.19888 10 | 16777216,0.26448,0.267232,0.264576,0.2704,0.272288 11 | 33554432,0.468128,0.475584,0.467168,0.475488,0.473888 12 | 67108864,0.879904,0.88752,0.882528,0.878016,0.877472 13 | 134217728,1.68688,1.68381,1.68637,1.68618,1.68576 14 | 268435456,3.3001,3.30355,3.30214,3.30822,3.31386 15 | 536870912,6.54634,6.5352,6.53056,6.54237,6.54605 16 | 1073741824,13.022,13.0033,13.0086,13.0199,13.0139 17 | 2147483648,25.9713,25.9761,25.9999,25.9739,25.9742 -------------------------------------------------------------------------------- /benchmark/results/old/p100_quad/scatter/rings.csv: -------------------------------------------------------------------------------- 1 | 32768,0.157248,0.149952,0.151808,0.150368,0.149312 2 | 65536,0.168512,0.168576,0.149696,0.148384,0.157888 3 | 131072,0.16336,0.149888,0.150112,0.153824,0.15216 4 | 
262144,0.157248,0.148096,0.15168,0.155936,0.151296 5 | 524288,0.145696,0.151552,0.15008,0.15264,0.159648 6 | 1048576,0.150464,0.150368,0.153248,0.14992,0.149824 7 | 2097152,0.152832,0.155968,0.173408,0.16096,0.15424 8 | 4194304,0.172576,0.174784,0.173088,0.203232,0.175168 9 | 8388608,0.247776,0.250464,0.240704,0.241792,0.244 10 | 16777216,0.312672,0.314176,0.311488,0.322752,0.318912 11 | 33554432,0.549024,0.552224,0.547392,0.547168,0.56048 12 | 67108864,1.00934,1.00733,1.01261,1.00922,1.00835 13 | 134217728,1.92973,1.93344,1.93418,1.93075,1.93181 14 | 268435456,3.74563,3.74643,3.74522,3.74867,3.75456 15 | 536870912,7.40819,7.39795,7.39939,7.40637,7.41232 16 | 1073741824,14.7264,14.7191,14.7318,14.7319,14.7218 17 | 2147483648,29.4246,29.4318,29.4179,29.4318,29.4223 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/all2all/direct.csv: -------------------------------------------------------------------------------- 1 | 131072,0.2048,0.177152,0.178176,0.175104,0.1792 2 | 262144,0.191488,0.182272,0.19968,0.19456,0.207872 3 | 524288,0.198656,0.192512,0.205824,0.649216,0.323584 4 | 1048576,0.19968,0.313344,0.19968,0.203776,0.192512 5 | 2097152,0.15872,0.19456,0.147456,0.196608,0.150528 6 | 4194304,0.197632,0.200704,0.193536,0.622592,0.227328 7 | 8388608,0.2048,0.203776,0.203776,0.192512,0.214016 8 | 16777216,0.216064,0.218112,0.214016,0.214016,0.520192 9 | 33554432,0.282624,0.269312,0.258048,0.247808,0.239616 10 | 67108864,0.339968,0.374784,0.344064,0.365568,0.346112 11 | 134217728,0.505856,0.514048,0.540672,0.509952,0.510976 12 | 268435456,0.989184,0.867328,0.8704,0.8704,0.873472 13 | 536870912,1.58618,1.58822,1.59232,1.57184,1.57389 14 | 1073741824,3.04128,3.0423,3.04435,3.03923,3.04333 15 | 2147483648,5.87776,5.87366,5.87878,5.90438,5.87366 16 | 4294967296,11.6859,11.6808,11.692,11.6828,11.7012 17 | 8589934592,23.2366,23.2489,23.2356,23.255,23.2315 18 | 17179869184,46.3483,46.3493,46.3647,46.3708,46.3452 
-------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/all2all/opt.csv: -------------------------------------------------------------------------------- 1 | 131072,0.203776,0.159744,0.5376,0.19456,0.207872 2 | 262144,0.166912,0.193536,0.197632,0.195584,0.192512 3 | 524288,0.197632,0.3072,0.203776,0.19968,0.643072 4 | 1048576,0.202752,0.200704,0.161792,0.191488,0.2048 5 | 2097152,0.198656,0.1536,0.198656,0.195584,0.198656 6 | 4194304,0.200704,0.197632,0.201728,0.195584,0.21504 7 | 8388608,0.211968,0.175104,0.231424,0.197632,0.202752 8 | 16777216,0.195584,0.18944,0.31232,0.20992,0.190464 9 | 33554432,0.221184,0.23552,0.2304,0.25088,0.216064 10 | 67108864,0.693248,0.336896,0.398336,0.379904,0.39936 11 | 134217728,0.557056,0.560128,0.566272,0.565248,0.569344 12 | 268435456,0.88064,0.93184,0.8704,0.920576,0.888832 13 | 536870912,1.59949,1.60256,1.61485,1.60666,1.60256 14 | 1073741824,3.05152,3.0505,3.0761,3.05971,3.06995 15 | 2147483648,5.87981,5.90234,5.89824,5.88902,5.88902 16 | 4294967296,11.7033,11.6992,11.7074,11.7023,11.6931 17 | 8589934592,23.2305,23.2612,23.2315,23.2376,23.2509 18 | 17179869184,46.3452,46.3442,46.3616,46.3452,46.3432 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/all2all/rings.csv: -------------------------------------------------------------------------------- 1 | 131072,1.33837,1.14586,1.29946,1.17862,0.802816 2 | 262144,1.13971,1.23494,1.13254,1.14278,1.1561 3 | 524288,0.833536,1.08749,1.1305,1.16941,1.10797 4 | 1048576,1.14893,1.1479,1.15507,1.11002,1.31174 5 | 2097152,1.14893,1.3783,1.13664,1.42234,1.13254 6 | 4194304,1.12538,1.15712,1.15917,1.17862,1.16634 7 | 8388608,1.14688,1.10285,1.19091,1.1049,1.12947 8 | 16777216,1.0711,1.21446,1.03219,1.16838,1.01274 9 | 33554432,0.940032,1.03117,1.2073,0.941056,0.900096 10 | 67108864,0.97792,0.955392,1.12026,1.10285,1.03014 11 | 
134217728,1.16941,1.28102,1.1991,1.37011,1.21242 12 | 268435456,1.94355,1.6343,1.65786,1.62714,1.62714 13 | 536870912,2.67162,2.93478,2.70848,2.6921,2.69517 14 | 1073741824,4.84557,4.98278,4.85478,4.87834,4.86605 15 | 2147483648,9.29075,9.28666,9.26925,9.32454,9.31635 16 | 4294967296,18.174,18.1535,18.1637,18.1637,18.1422 17 | 8589934592,35.8707,36.0305,35.9332,35.9516,35.9322 18 | 17179869184,71.4424,71.3728,71.297,71.4179,71.3462 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/all2all_async/direct.csv: -------------------------------------------------------------------------------- 1 | 131072,0.177152,0.169984,0.246784,0.197632,0.205824 2 | 262144,0.177152,0.231424,0.1792,0.167936,0.182272 3 | 524288,0.173056,0.216064,0.181248,0.321536,0.176128 4 | 1048576,0.176128,0.18432,0.180224,0.181248,0.176128 5 | 2097152,0.164864,0.175104,0.176128,0.176128,0.178176 6 | 4194304,0.17408,0.180224,0.173056,0.239616,0.183296 7 | 8388608,0.183296,0.180224,0.180224,0.177152,0.19968 8 | 16777216,0.200704,0.197632,0.2048,0.188416,0.310272 9 | 33554432,0.221184,0.222208,0.224256,0.224256,0.212992 10 | 67108864,0.338944,0.344064,0.4352,0.354304,0.315392 11 | 134217728,0.516096,0.493568,0.49664,0.495616,0.50176 12 | 268435456,0.90112,0.941056,0.85504,0.858112,0.851968 13 | 536870912,1.61792,1.57082,1.56979,1.56774,1.56467 14 | 1073741824,3.02285,3.02797,3.04845,3.02797,3.02592 15 | 2147483648,5.8583,5.85728,5.85421,5.89619,5.85523 16 | 4294967296,11.6726,11.6849,11.7064,11.7156,11.6572 17 | 8589934592,23.2458,23.2212,23.2223,23.2387,23.2243 18 | 17179869184,46.3206,46.3176,46.3227,46.3053,46.3186 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/all2all_async/opt.csv: -------------------------------------------------------------------------------- 1 | 131072,0.2816,0.177152,0.18432,0.172032,0.263168 2 | 262144,0.188416,0.175104,0.23552,0.377856,0.181248 3 | 
524288,0.178176,0.280576,0.180224,0.178176,0.26112 4 | 1048576,0.176128,0.185344,0.169984,0.175104,0.181248 5 | 2097152,0.176128,0.185344,0.185344,0.203776,0.178176 6 | 4194304,0.289792,0.183296,0.130048,0.272384,0.181248 7 | 8388608,0.187392,0.232448,0.186368,0.187392,0.305152 8 | 16777216,0.20992,0.20992,0.192512,0.219136,0.206848 9 | 33554432,0.259072,0.32256,0.258048,0.221184,0.237568 10 | 67108864,0.340992,0.311296,0.304128,0.305152,0.308224 11 | 134217728,0.50176,0.488448,0.498688,0.493568,0.495616 12 | 268435456,0.861184,0.858112,0.85504,0.858112,0.85504 13 | 536870912,1.57491,1.57082,1.56467,1.59846,1.59744 14 | 1073741824,3.0208,3.03616,3.02592,3.02899,3.02694 15 | 2147483648,5.94739,5.86138,5.86854,5.9177,5.85421 16 | 4294967296,11.6603,11.7176,11.6623,11.7187,11.7207 17 | 8589934592,23.2284,23.2192,23.2131,23.2294,23.212 18 | 17179869184,46.2991,46.3596,46.294,46.3012,46.3278 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/all2all_async/rings.csv: -------------------------------------------------------------------------------- 1 | 131072,0.758784,0.641024,0.75264,0.607232,0.636928 2 | 262144,0.671744,0.81408,0.647168,0.635904,0.653312 3 | 524288,0.521216,0.56832,0.649216,0.647168,0.62464 4 | 1048576,0.64512,0.662528,0.649216,0.846848,0.850944 5 | 2097152,0.628736,0.872448,0.698368,0.740352,0.847872 6 | 4194304,0.642048,0.638976,0.677888,0.659456,0.651264 7 | 8388608,0.67584,0.44544,0.809984,0.647168,0.536576 8 | 16777216,0.758784,0.661504,0.669696,0.656384,0.62464 9 | 33554432,0.530432,0.64512,0.68096,0.528384,0.524288 10 | 67108864,0.65536,0.564224,0.628736,0.631808,0.610304 11 | 134217728,0.695296,0.724992,0.766976,0.805888,0.69632 12 | 268435456,1.22675,1.14896,1.16429,1.20218,1.15098 13 | 536870912,2.08691,2.16166,2.1545,2.10534,2.08998 14 | 1073741824,3.99565,4.02842,4.03046,3.99258,3.99974 15 | 2147483648,7.85613,7.80493,7.79366,7.81107,7.84691 16 | 
4294967296,15.487,15.4982,15.4829,15.5003,15.4849 17 | 8589934592,30.7128,30.6975,30.7128,30.6964,30.7098 18 | 17179869184,61.2219,61.1502,61.14,61.1871,61.1348 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/gather/direct.csv: -------------------------------------------------------------------------------- 1 | 32768,0.079872,0.082944,0.080896,0.0768,0.082944 2 | 65536,0.167936,0.091136,0.079872,0.077824,0.086016 3 | 131072,0.0768,0.078848,0.080896,0.162816,0.052224 4 | 262144,0.078848,0.077824,0.079872,0.080896,0.078848 5 | 524288,0.06656,0.075776,0.12288,0.077824,0.077824 6 | 1048576,0.0768,0.07984,0.075776,0.10752,0.077824 7 | 2097152,0.082944,0.080896,0.08192,0.080896,0.082944 8 | 4194304,0.183296,0.157696,0.104448,0.10752,0.182272 9 | 8388608,0.149504,0.1536,0.151552,0.151552,0.147456 10 | 16777216,0.22016,0.22016,0.238592,0.221184,0.22016 11 | 33554432,0.396288,0.397312,0.39936,0.39936,0.398336 12 | 67108864,0.75264,0.753664,0.74752,0.745472,0.746496 13 | 134217728,1.4592,1.44589,1.45101,1.44794,1.44896 14 | 268435456,2.84979,2.8457,2.84877,2.85082,2.84672 15 | 536870912,5.64429,5.64634,5.64736,5.65043,5.64531 16 | 1073741824,11.2384,11.2384,11.263,11.2548,11.2364 17 | 2147483648,22.4215,22.4236,22.4225,22.4225,22.4215 18 | 4294967296,44.7959,44.7939,44.7949,44.7959,44.7959 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/gather/opt.csv: -------------------------------------------------------------------------------- 1 | 32768,0.192512,0.183296,0.324576,0.190464,0.185344 2 | 65536,0.188416,0.185344,0.169984,0.18944,0.186368 3 | 131072,0.188416,0.180224,0.187392,0.192512,0.181248 4 | 262144,0.195584,0.217088,0.191488,0.185344,0.183296 5 | 524288,0.185344,0.186368,0.186368,0.191488,0.191488 6 | 1048576,0.187392,0.18944,0.190464,0.264192,0.185344 7 | 2097152,0.186368,0.18432,0.192512,0.183296,0.188416 8 | 
4194304,0.18944,0.193536,0.18432,0.193536,0.182272 9 | 8388608,0.188416,0.186368,0.18944,0.177152,0.188416 10 | 16777216,0.2816,0.264192,0.262144,0.285696,0.263168 11 | 33554432,0.478208,0.47616,0.479232,0.478208,0.477184 12 | 67108864,0.896,0.898048,0.898048,0.894976,0.894976 13 | 134217728,1.72954,1.73158,1.73056,1.73056,1.73158 14 | 268435456,3.40378,3.40173,3.39661,3.39968,3.40173 15 | 536870912,6.75226,6.75123,6.7543,6.75328,6.75635 16 | 1073741824,13.482,13.483,13.483,13.4851,13.483 17 | 2147483648,26.8042,26.8206,26.8022,26.8012,26.8063 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/gather/rings.csv: -------------------------------------------------------------------------------- 1 | 32768,0.226304,0.1792,0.219136,0.203776,0.193536 2 | 65536,0.198656,0.19968,0.19456,0.191488,0.128 3 | 131072,0.19456,0.222208,0.197632,0.227328,0.188416 4 | 262144,0.196608,0.19968,0.19456,0.208896,0.197632 5 | 524288,0.190464,0.238592,0.193536,0.197632,0.19456 6 | 1048576,0.210944,0.19456,0.195584,0.201728,0.195584 7 | 2097152,0.234496,0.154624,0.201728,0.238592,0.205824 8 | 4194304,0.24064,0.201728,0.134144,0.196608,0.218112 9 | 8388608,0.18944,0.18432,0.185344,0.207872,0.208896 10 | 16777216,0.270336,0.284672,0.284672,0.260096,0.273408 11 | 33554432,0.432128,0.4352,0.44544,0.439296,0.434176 12 | 67108864,0.794624,0.797696,0.789504,0.787456,0.78848 13 | 134217728,1.4807,1.49299,1.49094,1.48787,1.48787 14 | 268435456,2.87642,2.88051,2.87232,2.87744,2.88051 15 | 536870912,5.66374,5.6617,5.65862,5.66579,5.66067 16 | 1073741824,11.2978,11.2794,11.2845,11.2794,11.2957 17 | 2147483648,22.4072,22.4174,22.4113,22.4123,22.4061 18 | 4294967296,44.8205,44.8184,44.8266,44.8266,44.8164 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/scatter/direct.csv: -------------------------------------------------------------------------------- 1 | 
32768,0.074752,0.08192,0.0768,0.074752,0.078848 2 | 65536,0.088064,0.090112,0.075776,0.074752,0.078848 3 | 131072,0.074752,0.073728,0.0768,0.0768,0.055296 4 | 262144,0.079872,0.08192,0.077824,0.077824,0.078848 5 | 524288,0.075776,0.077824,0.077824,0.077824,0.075776 6 | 1048576,0.078848,0.08192,0.079872,0.082944,0.08192 7 | 2097152,0.093184,0.091136,0.090112,0.090112,0.094208 8 | 4194304,0.124928,0.123904,0.120832,0.115712,0.121856 9 | 8388608,0.162816,0.162816,0.162816,0.16384,0.159744 10 | 16777216,0.232448,0.232448,0.239616,0.233472,0.232448 11 | 33554432,0.4096,0.428032,0.429056,0.407552,0.408576 12 | 67108864,0.769024,0.77312,0.765952,0.784384,0.768 13 | 134217728,1.47661,1.47046,1.47661,1.47251,1.50528 14 | 268435456,2.88051,2.88358,2.88666,2.88461,2.88563 15 | 536870912,5.71494,5.69446,5.72006,5.69446,5.69242 16 | 1073741824,11.3449,11.3469,11.3521,11.3408,11.3623 17 | 2147483648,22.6068,22.6089,22.6048,22.6048,22.5976 18 | 4294967296,45.142,45.1348,45.1523,45.1369,45.141 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/scatter/opt.csv: -------------------------------------------------------------------------------- 1 | 32768,0.282624,0.285696,0.427008,0.285696,0.27136 2 | 65536,0.285696,0.277504,0.26112,0.2816,0.270336 3 | 131072,0.27648,0.273408,0.279552,0.32256,0.648192 4 | 262144,0.297984,0.284672,0.346112,0.273408,0.27648 5 | 524288,0.278528,0.283648,0.275456,0.29184,0.282624 6 | 1048576,0.288768,0.27648,0.28672,0.273408,0.27136 7 | 2097152,0.272384,0.26624,0.278528,0.27648,0.277504 8 | 4194304,0.306176,0.309248,0.301056,0.305152,0.297984 9 | 8388608,0.406528,0.398336,0.402432,0.420864,0.39936 10 | 16777216,0.635904,0.633856,0.648192,0.638976,0.628736 11 | 33554432,1.03834,1.03424,1.04038,1.06906,1.05062 12 | 67108864,1.89747,1.87494,1.88723,1.87802,1.87392 13 | 134217728,3.53075,3.5328,3.5369,3.53485,3.53485 14 | 268435456,6.84032,6.81779,6.81984,6.82906,6.81062 15 | 
536870912,13.4205,13.4093,13.4318,13.4226,13.4441 16 | 1073741824,26.5697,26.583,26.5841,26.5851,26.5851 17 | 2147483648,53.0084,52.9992,52.9981,52.992,53.0135 -------------------------------------------------------------------------------- /benchmark/results/old/v100_quad/scatter/rings.csv: -------------------------------------------------------------------------------- 1 | 32768,0.234496,0.208896,0.227328,0.275456,0.205824 2 | 65536,0.211968,0.20992,0.205824,0.202752,0.160768 3 | 131072,0.2304,0.218112,0.206848,0.239616,0.197632 4 | 262144,0.208896,0.205824,0.207872,0.253952,0.208896 5 | 524288,0.2048,0.274432,0.206848,0.210944,0.208896 6 | 1048576,0.299008,0.207872,0.2048,0.268288,0.206848 7 | 2097152,0.208896,0.192512,0.282624,0.334848,0.23552 8 | 4194304,0.283648,0.218112,0.200704,0.216064,0.210944 9 | 8388608,0.231424,0.249856,0.222208,0.243712,0.247808 10 | 16777216,0.285696,0.347136,0.347136,0.288768,0.320512 11 | 33554432,0.444416,0.464896,0.448512,0.467968,0.467968 12 | 67108864,0.831488,0.817152,0.828416,0.807936,0.82432 13 | 134217728,1.51142,1.51757,1.52781,1.5319,1.51757 14 | 268435456,2.93478,2.92966,2.93478,2.97062,2.9225 15 | 536870912,5.75795,5.75386,5.77229,5.76717,5.75898 16 | 1073741824,11.4534,11.4319,11.4432,11.436,11.4289 17 | 2147483648,22.7574,22.7717,22.7901,22.786,22.783 18 | 4294967296,45.3806,45.3827,45.4031,45.4164,45.4062 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/all2all/direct.csv: -------------------------------------------------------------------------------- 1 | 131072,0.169664,0.174944,0.167424,0.166752,0.166656 2 | 262144,0.170688,0.167168,0.169184,0.167104,0.168384 3 | 524288,0.175328,0.16848,0.176,0.1664,0.16624 4 | 1048576,0.168992,0.166528,0.176,0.171456,0.1704 5 | 2097152,0.168896,0.171008,0.168608,0.168192,0.171392 6 | 4194304,0.176672,0.168256,0.170784,0.172064,0.170304 7 | 8388608,0.18,0.175584,0.176064,0.179424,0.175136 8 | 
16777216,0.21856,0.215808,0.241024,0.220832,0.216864 9 | 33554432,0.250848,0.248864,0.254944,0.249568,0.249216 10 | 67108864,0.371296,0.367488,0.37216,0.368512,0.37168 11 | 134217728,0.608128,0.604928,0.608928,0.611872,0.609088 12 | 268435456,1.0848,1.13018,1.08429,1.08218,1.13827 13 | 536870912,2.08634,2.08234,2.07654,2.04205,2.03517 14 | 1073741824,3.93213,3.91661,3.9087,3.90886,3.89878 15 | 2147483648,7.70829,7.6976,7.67277,7.69296,7.68992 16 | 4294967296,15.2421,15.229,15.2152,15.2067,15.2106 17 | 8589934592,30.4344,30.5196,30.4453,30.5063,30.4369 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/all2all/opt.csv: -------------------------------------------------------------------------------- 1 | 131072,0.183424,0.176256,0.16864,0.166368,0.17904 2 | 262144,0.165216,0.168096,0.168768,0.16912,0.168032 3 | 524288,0.165728,0.17376,0.168128,0.173824,0.167424 4 | 1048576,0.171552,0.168192,0.168704,0.176384,0.18736 5 | 2097152,0.170112,0.170208,0.168928,0.170912,0.166816 6 | 4194304,0.16736,0.17056,0.185376,0.1832,0.168416 7 | 8388608,0.177664,0.17392,0.173344,0.173536,0.173216 8 | 16777216,0.21632,0.21776,0.211136,0.215616,0.216768 9 | 33554432,0.252,0.251712,0.253344,0.281504,0.24784 10 | 67108864,0.370624,0.368864,0.369312,0.370656,0.370656 11 | 134217728,0.608608,0.608928,0.604704,0.606304,0.60928 12 | 268435456,1.08691,1.07923,1.13376,1.13146,1.13056 13 | 536870912,2.06358,2.08045,2.08349,2.0625,2.04586 14 | 1073741824,3.92288,3.91795,3.91462,3.91907,3.91965 15 | 2147483648,7.69232,7.73238,7.68093,7.69024,7.69392 16 | 4294967296,15.2119,15.2082,15.2066,15.1738,15.2198 17 | 8589934592,30.4417,30.4936,30.4391,30.4643,30.47 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/all2all/rings.csv: -------------------------------------------------------------------------------- 1 | 131072,0.860672,0.881024,0.872224,0.863936,0.581408 2 | 
262144,0.866528,0.901984,0.59008,0.590592,0.603584 3 | 524288,0.591104,0.869824,0.584128,0.606336,0.876576 4 | 1048576,0.588704,0.60528,0.881408,0.58448,0.872224 5 | 2097152,0.585568,0.876,0.587584,0.869952,0.589728 6 | 4194304,0.582208,0.872768,0.871456,0.59504,0.595392 7 | 8388608,0.87488,0.871584,0.618464,0.881056,0.591648 8 | 16777216,0.803648,0.796448,0.827872,0.840832,0.799456 9 | 33554432,0.787584,0.740224,0.784896,0.779584,0.771008 10 | 67108864,0.970112,0.926976,1.03562,0.92304,0.922304 11 | 134217728,1.18582,1.2145,1.1983,1.18906,1.20822 12 | 268435456,1.76432,1.68749,1.72317,1.69418,1.68717 13 | 536870912,2.93792,2.82157,2.81098,2.79443,2.79341 14 | 1073741824,5.11107,5.08794,5.10298,5.1055,5.1137 15 | 2147483648,9.98893,9.97754,10.0094,9.95952,9.96547 16 | 4294967296,19.6221,19.6338,19.6041,19.6361,19.6252 17 | 8589934592,39.565,39.5202,39.4226,39.5744,39.5572 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/all2all_async/direct.csv: -------------------------------------------------------------------------------- 1 | 131072,0.158656,0.164096,0.160224,0.173408,0.19072 2 | 262144,0.16576,0.16224,0.169408,0.160992,0.161536 3 | 524288,0.178208,0.191488,0.160608,0.15872,0.162272 4 | 1048576,0.161056,0.168736,0.159456,0.16128,0.16032 5 | 2097152,0.167776,0.180928,0.167936,0.159808,0.159424 6 | 4194304,0.164736,0.16464,0.168224,0.16464,0.173184 7 | 8388608,0.175232,0.175712,0.176128,0.1728,0.171584 8 | 16777216,0.214016,0.215776,0.214784,0.214048,0.21536 9 | 33554432,0.248576,0.246528,0.248576,0.25328,0.250944 10 | 67108864,0.36976,0.366752,0.426048,0.370368,0.367456 11 | 134217728,0.604864,0.662912,0.637632,0.66608,0.661888 12 | 268435456,1.10077,1.08522,1.10218,1.10566,1.09059 13 | 536870912,2.04099,2.04083,2.03878,2.03574,2.03837 14 | 1073741824,3.89139,3.89418,3.89629,3.89158,3.9689 15 | 2147483648,7.73971,7.7136,7.71293,7.7232,7.69731 16 | 4294967296,15.2451,15.2679,15.2473,15.2495,15.2469 17 | 
8589934592,30.5028,30.4074,30.4869,30.4215,30.4348 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/all2all_async/opt.csv: -------------------------------------------------------------------------------- 1 | 131072,0.176,0.16096,0.159392,0.158624,0.158336 2 | 262144,0.158464,0.162368,0.15984,0.161664,0.159136 3 | 524288,0.158752,0.160704,0.157984,0.161376,0.159552 4 | 1048576,0.158464,0.159744,0.162528,0.163712,0.17824 5 | 2097152,0.182464,0.160896,0.163776,0.163808,0.15936 6 | 4194304,0.164544,0.164672,0.16256,0.166368,0.166368 7 | 8388608,0.170432,0.178848,0.174048,0.171456,0.174432 8 | 16777216,0.215328,0.216544,0.21184,0.21568,0.21232 9 | 33554432,0.250496,0.24784,0.248192,0.250464,0.248576 10 | 67108864,0.36912,0.368352,0.369216,0.370784,0.367648 11 | 134217728,0.606432,0.658848,0.620896,0.656384,0.60288 12 | 268435456,1.10358,1.09709,1.0888,1.08429,1.0856 13 | 536870912,2.0417,2.04221,2.04208,2.04256,2.0353 14 | 1073741824,3.89658,3.91062,3.90253,3.88979,3.90429 15 | 2147483648,7.70589,7.72381,7.71414,7.71888,7.72368 16 | 4294967296,15.2168,15.211,15.216,15.208,15.166 17 | 8589934592,30.4653,30.4902,30.4765,30.5019,30.4692 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/all2all_async/rings.csv: -------------------------------------------------------------------------------- 1 | 131072,0.472384,0.487776,0.48656,0.481408,0.486336 2 | 262144,0.477952,0.48448,0.4856,0.492416,0.342688 3 | 524288,0.482688,0.490816,0.489376,0.491456,0.484 4 | 1048576,0.330144,0.499232,0.484352,0.482176,0.480768 5 | 2097152,0.49248,0.484896,0.484544,0.49584,0.33584 6 | 4194304,0.519328,0.492256,0.486272,0.490528,0.489472 7 | 8388608,0.49696,0.49968,0.514944,0.503008,0.514944 8 | 16777216,0.535616,0.530432,0.526528,0.529856,0.537312 9 | 33554432,0.45968,0.445952,0.502272,0.47824,0.496416 10 | 67108864,0.562464,0.535712,0.54736,0.491904,0.583872 11 | 
134217728,0.712064,0.708352,0.722752,0.697696,0.736512 12 | 268435456,1.14605,1.16826,1.13536,1.16544,1.16192 13 | 536870912,2.06973,2.07341,2.07078,2.0689,2.08291 14 | 1073741824,3.9511,3.96531,3.95514,3.95322,3.95834 15 | 2147483648,7.75862,7.76253,7.76733,7.77286,7.76342 16 | 4294967296,15.3711,15.3737,15.3685,15.3795,15.3904 17 | 8589934592,30.7092,30.7343,30.7398,30.7244,30.6951 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/gather/direct.csv: -------------------------------------------------------------------------------- 1 | 32768,0.072288,0.072064,0.072256,0.078336,0.073376 2 | 65536,0.069536,0.072096,0.071072,0.07392,0.070176 3 | 131072,0.069184,0.053344,0.07888,0.068416,0.073024 4 | 262144,0.070784,0.083808,0.0552,0.07264,0.057312 5 | 524288,0.07264,0.071072,0.0744,0.073472,0.072512 6 | 1048576,0.067808,0.076224,0.07264,0.07456,0.071488 7 | 2097152,0.082624,0.080672,0.085344,0.081376,0.081056 8 | 4194304,0.11904,0.119648,0.118944,0.118848,0.118112 9 | 8388608,0.178208,0.179232,0.177312,0.174016,0.174976 10 | 16777216,0.28896,0.289376,0.274496,0.298048,0.29472 11 | 33554432,0.508224,0.506464,0.50624,0.510912,0.507296 12 | 67108864,0.971264,0.969856,0.97072,0.97056,0.9696 13 | 134217728,1.89715,1.89216,1.89722,1.89904,1.8937 14 | 268435456,3.74842,3.74333,3.74448,3.75466,3.7505 15 | 536870912,7.46243,7.46643,7.45626,7.47158,7.44848 16 | 1073741824,14.8481,14.8399,14.8564,14.858,14.8498 17 | 2147483648,29.6434,29.6339,29.6396,29.6355,29.6348 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/gather/opt.csv: -------------------------------------------------------------------------------- 1 | 32768,0.114528,0.1544,0.164864,0.155136,0.152064 2 | 65536,0.161536,0.152576,0.167936,0.153952,0.153344 3 | 131072,0.159808,0.155328,0.108512,0.15328,0.160448 4 | 262144,0.153952,0.163424,0.154112,0.163328,0.165568 5 | 
524288,0.156096,0.156928,0.155296,0.114912,0.153184 6 | 1048576,0.151584,0.160736,0.158848,0.151104,0.152672 7 | 2097152,0.195552,0.156928,0.155872,0.161184,0.159424 8 | 4194304,0.162144,0.161632,0.16128,0.167168,0.16064 9 | 8388608,0.167104,0.170528,0.180992,0.169504,0.169632 10 | 16777216,0.242176,0.24352,0.24544,0.246656,0.243296 11 | 33554432,0.426656,0.427488,0.42752,0.428928,0.430656 12 | 67108864,0.781376,0.780288,0.777984,0.776544,0.780832 13 | 134217728,1.47872,1.48483,1.48,1.47898,1.47891 14 | 268435456,2.87645,2.87485,2.88189,2.88144,2.88074 15 | 536870912,5.66906,5.65875,5.66214,5.65619,5.67082 16 | 1073741824,11.3286,11.345,11.3376,11.3428,11.3342 17 | 2147483648,22.4854,22.5068,22.4849,22.495,22.4826 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/gather/rings.csv: -------------------------------------------------------------------------------- 1 | 32768,0.150944,0.161664,0.154464,0.15984,0.156224 2 | 65536,0.151648,0.149024,0.149408,0.1504,0.155936 3 | 131072,0.160992,0.157728,0.159584,0.159776,0.159552 4 | 262144,0.105824,0.154528,0.152672,0.151552,0.149664 5 | 524288,0.148832,0.155072,0.1592,0.152576,0.10688 6 | 1048576,0.15488,0.153024,0.120192,0.15152,0.155584 7 | 2097152,0.150048,0.14992,0.1632,0.151328,0.1536 8 | 4194304,0.160032,0.16336,0.158784,0.164,0.161696 9 | 8388608,0.19424,0.194208,0.191424,0.188864,0.1944 10 | 16777216,0.248416,0.25008,0.247936,0.255744,0.251616 11 | 33554432,0.422624,0.420512,0.422464,0.42,0.422784 12 | 67108864,0.770144,0.77104,0.770336,0.77216,0.770432 13 | 134217728,1.46682,1.46765,1.46794,1.46701,1.46653 14 | 268435456,2.87043,2.88474,2.86819,2.87354,2.87021 15 | 536870912,5.65005,5.64877,5.64771,5.64854,5.65546 16 | 1073741824,11.3135,11.3083,11.3078,11.3061,11.3113 17 | 2147483648,22.435,22.4549,22.4536,22.4464,22.438 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/scatter/direct.csv: 
-------------------------------------------------------------------------------- 1 | 32768,0.07904,0.076928,0.078912,0.076608,0.08112 2 | 65536,0.075232,0.079072,0.083168,0.077984,0.077248 3 | 131072,0.074528,0.060416,0.07856,0.074592,0.07696 4 | 262144,0.07616,0.077376,0.06176,0.076704,0.065536 5 | 524288,0.077696,0.078016,0.078048,0.0808,0.078208 6 | 1048576,0.084672,0.090048,0.085344,0.087456,0.082816 7 | 2097152,0.10384,0.099584,0.110592,0.101696,0.10032 8 | 4194304,0.136544,0.140192,0.135296,0.136544,0.135616 9 | 8388608,0.208448,0.206272,0.200224,0.19856,0.196096 10 | 16777216,0.335392,0.35072,0.313536,0.333248,0.334272 11 | 33554432,0.532704,0.548064,0.534912,0.53312,0.536576 12 | 67108864,0.993312,0.99296,0.992896,0.99056,0.995744 13 | 134217728,1.89942,1.90768,1.90458,1.9048,1.90544 14 | 268435456,3.73699,3.73517,3.73731,3.73229,3.73232 15 | 536870912,7.45472,7.45475,7.44941,7.45706,7.45642 16 | 1073741824,14.8698,14.8493,14.859,14.8589,14.8459 17 | 2147483648,29.4061,29.3815,29.4004,29.4127,29.3916 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/scatter/opt.csv: -------------------------------------------------------------------------------- 1 | 32768,0.126432,0.164384,0.16896,0.16512,0.1768 2 | 65536,0.16368,0.165472,0.16688,0.16496,0.165472 3 | 131072,0.1696,0.16448,0.120064,0.164832,0.16784 4 | 262144,0.166112,0.165824,0.164736,0.165504,0.169376 5 | 524288,0.165472,0.16896,0.175168,0.120096,0.16448 6 | 1048576,0.172192,0.169152,0.167488,0.163136,0.163712 7 | 2097152,0.170624,0.169376,0.166048,0.179712,0.171936 8 | 4194304,0.17792,0.177408,0.17536,0.184128,0.184224 9 | 8388608,0.189856,0.198304,0.199264,0.19536,0.192256 10 | 16777216,0.268064,0.268128,0.26704,0.302976,0.275136 11 | 33554432,0.46192,0.46192,0.46304,0.464576,0.463072 12 | 67108864,0.815488,0.817088,0.815392,0.815008,0.818656 13 | 134217728,1.52147,1.52374,1.5248,1.52243,1.52493 14 | 
268435456,2.95299,2.94966,2.9575,2.95146,2.96019 15 | 536870912,5.74086,5.72915,5.73286,5.7279,5.74064 16 | 1073741824,11.3889,11.398,11.3884,11.3863,11.38 17 | 2147483648,22.6211,22.6181,22.6172,22.6153,22.6085 -------------------------------------------------------------------------------- /benchmark/results/p100_quad/scatter/rings.csv: -------------------------------------------------------------------------------- 1 | 32768,0.165792,0.16368,0.159744,0.1616,0.15952 2 | 65536,0.16976,0.165216,0.159712,0.161024,0.158464 3 | 131072,0.16,0.158432,0.157504,0.162144,0.161024 4 | 262144,0.115328,0.15952,0.162752,0.158368,0.165408 5 | 524288,0.160416,0.161344,0.161536,0.161728,0.115904 6 | 1048576,0.1648,0.160736,0.162656,0.1592,0.165152 7 | 2097152,0.162784,0.163872,0.178528,0.166496,0.167584 8 | 4194304,0.183744,0.191808,0.182656,0.185888,0.186368 9 | 8388608,0.23232,0.237856,0.227168,0.220512,0.236384 10 | 16777216,0.272096,0.31936,0.26784,0.288704,0.290624 11 | 33554432,0.450272,0.469536,0.453824,0.449792,0.45328 12 | 67108864,0.804032,0.805696,0.80208,0.8048,0.808448 13 | 134217728,1.50342,1.5073,1.5144,1.50499,1.50563 14 | 268435456,2.92179,2.92899,2.92493,2.93616,2.92973 15 | 536870912,5.75146,5.75411,5.7504,5.75194,5.74093 16 | 1073741824,11.314,11.3356,11.322,11.3223,11.3143 17 | 2147483648,22.75,22.77,22.7373,22.742,22.7477 -------------------------------------------------------------------------------- /benchmark/results/throughput.txt: -------------------------------------------------------------------------------- 1 | dgx1_all2all 2 | results/dgx1/all2all_async/direct.csv peak troughput: 57 GB/s 3 | results/dgx1/all2all_async/rings.csv peak troughput: 523 GB/s 4 | results/dgx1/all2all_async/opt_1chunk.csv peak troughput: 468 GB/s 5 | results/dgx1/all2all_async/opt.csv peak troughput: 526 GB/s 6 | results/dgx1/all2all/opt.csv peak troughput: 477 GB/s 7 | dgx1_scatter 8 | results/dgx1/scatter/direct.csv peak troughput: 29 GB/s 9 | 
results/dgx1/scatter/rings.csv peak troughput: 148 GB/s 10 | results/dgx1/scatter/opt_1chunk.csv peak troughput: 116 GB/s 11 | results/dgx1/scatter/opt.csv peak troughput: 148 GB/s 12 | p100_quad_all2all 13 | results/p100_quad/all2all_async/direct.csv peak troughput: 262 GB/s 14 | results/p100_quad/all2all_async/rings.csv peak troughput: 260 GB/s 15 | results/p100_quad/all2all_async/opt.csv peak troughput: 262 GB/s 16 | p100_quad_scatter 17 | results/p100_quad/scatter/direct.csv peak troughput: 68 GB/s 18 | results/p100_quad/scatter/rings.csv peak troughput: 88 GB/s 19 | results/p100_quad/scatter/opt.csv peak troughput: 88 GB/s 20 | dgx1_quad_all2all 21 | results/dgx1_quad/all2all_async/direct_p100.csv peak troughput: 344 GB/s 22 | results/dgx1_quad/all2all_async/rings_p100.csv peak troughput: 344 GB/s 23 | results/dgx1_quad/all2all_async/opt_p100.csv peak troughput: 427 GB/s 24 | results/dgx1_quad/all2all/opt_p100.csv peak troughput: 344 GB/s 25 | dgx1_quad_scatter 26 | results/dgx1_quad/scatter/direct_p100.csv peak troughput: 88 GB/s 27 | results/dgx1_quad/scatter/rings_p100.csv peak troughput: 116 GB/s 28 | results/dgx1_quad/scatter/opt_p100.csv peak troughput: 116 GB/s 29 | results/dgx1_quad/scatter/opt.csv peak troughput: 144 GB/s 30 | -------------------------------------------------------------------------------- /execute.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "include/gossip.cuh" 8 | #include "executor.cuh" 9 | #include "include/plan_parser.hpp" 10 | #include "include/clipp/include/clipp.h" 11 | 12 | template 13 | void all2all( 14 | gossip::transfer_plan_t transfer_plan, 15 | const size_t batch_size, 16 | const size_t batch_size_secure) { 17 | 18 | gossip::all2all::verify_plan(transfer_plan); 19 | 20 | auto num_gpus = transfer_plan.num_gpus(); 21 | 22 | if(transfer_plan.valid()) { 23 | 24 | auto context = 
gossip::context_t(num_gpus); 25 | // context.print_connectivity_matrix(); 26 | auto all2all = gossip::all2all_t(context, transfer_plan); 27 | auto multisplit = gossip::multisplit_t(context); 28 | auto point2point = gossip::point2point_t(context); 29 | 30 | run_multisplit_all2all( 31 | context, all2all, multisplit, point2point, 32 | batch_size, batch_size_secure); 33 | 34 | context.sync_hard(); 35 | } 36 | } 37 | 38 | template 39 | void all2all_async( 40 | gossip::transfer_plan_t transfer_plan, 41 | const size_t batch_size, 42 | const size_t batch_size_secure) { 43 | 44 | gossip::all2all::verify_plan(transfer_plan); 45 | 46 | auto num_gpus = transfer_plan.num_gpus(); 47 | 48 | if(transfer_plan.valid()) { 49 | 50 | auto context = gossip::context_t(num_gpus); 51 | // context.print_connectivity_matrix(); 52 | auto all2all = gossip::all2all_async_t(context, transfer_plan); 53 | auto multisplit = gossip::multisplit_t(context); 54 | auto point2point = gossip::point2point_t(context); 55 | 56 | run_multisplit_all2all_async( 57 | context, all2all, multisplit, point2point, 58 | batch_size, batch_size_secure); 59 | 60 | context.sync_hard(); 61 | } 62 | } 63 | 64 | template 65 | void scatter_gather( 66 | gossip::transfer_plan_t scatter_plan, 67 | gossip::transfer_plan_t gather_plan, 68 | const size_t batch_size, 69 | const size_t batch_size_secure) { 70 | 71 | gossip::scatter::verify_plan(scatter_plan); 72 | gossip::gather::verify_plan(gather_plan); 73 | 74 | auto num_gpus = scatter_plan.num_gpus(); 75 | if(num_gpus != gather_plan.num_gpus()) { 76 | std::cout << "scatter and gather num_gpus does not match" << std::endl; 77 | return; 78 | } 79 | 80 | auto main_gpu = scatter_plan.main_gpu(); 81 | if(main_gpu != gather_plan.main_gpu()) { 82 | std::cout << "scatter and gather main_gpu does not match" << std::endl; 83 | return; 84 | } 85 | 86 | if(scatter_plan.valid() && gather_plan.valid()) { 87 | 88 | auto context = gossip::context_t(num_gpus); 89 | // 
context.print_connectivity_matrix(); 90 | auto point2point = gossip::point2point_t(context); 91 | auto multisplit = gossip::multisplit_t(context); 92 | auto scatter = gossip::scatter_t(context, scatter_plan); 93 | auto gather = gossip::gather_t(context, gather_plan); 94 | 95 | run_multisplit_scatter_gather( 96 | context, point2point, multisplit, scatter, gather, 97 | main_gpu, 98 | batch_size, batch_size_secure); 99 | 100 | context.sync_hard(); 101 | } 102 | } 103 | 104 | template 105 | void broadcaster( 106 | gossip::transfer_plan_t transfer_plan, 107 | const size_t batch_size, 108 | const size_t batch_size_secure) { 109 | 110 | gossip::broadcast::verify_plan(transfer_plan); 111 | 112 | auto num_gpus = transfer_plan.num_gpus(); 113 | 114 | if(transfer_plan.valid()) { 115 | 116 | auto context = gossip::context_t(num_gpus); 117 | // context.print_connectivity_matrix(); 118 | auto broadcast = gossip::broadcast_t(context, transfer_plan); 119 | auto multisplit = gossip::multisplit_t(context); 120 | auto point2point = gossip::point2point_t(context); 121 | 122 | run_multisplit_broadcast( 123 | context, point2point, multisplit, broadcast, 124 | batch_size, batch_size_secure); 125 | 126 | context.sync_hard(); 127 | } 128 | } 129 | 130 | int main (int argc, char *argv[]) { 131 | using data_t = uint64_t; // base data type 132 | 133 | // parse args using https://github.com/muellan/clipp 134 | using namespace clipp; 135 | enum class mode {all2all, all2all_async, scatter_gather, broadcast, help}; 136 | 137 | mode selected; 138 | double security_factor = 1.5; 139 | size_t data_size = 28; 140 | std::string plan_file, scatter_plan_file, gather_plan_file; 141 | 142 | auto cli = 143 | ( 144 | ( 145 | ( 146 | ( 147 | ( 148 | command("all2all").set(selected, mode::all2all) | 149 | command("all2all_async").set(selected, mode::all2all_async) | 150 | command("broadcast").set(selected, mode::broadcast) 151 | ), 152 | value("transfer plan", plan_file) 153 | ) | 154 | ( 155 | 
command("scatter_gather").set(selected, mode::scatter_gather), 156 | value("scatter plan", scatter_plan_file), value("gather plan", gather_plan_file) 157 | ) 158 | ), 159 | option("--size", "-s") & value("size", data_size) % "data size (bytes log2) [default: 28]", 160 | option("--memory-factor") & value("factor", security_factor) % "memory security factor [default: 1.5]" 161 | ) | 162 | command("help").set(selected, mode::help) 163 | ); 164 | 165 | if(parse(argc, argv, cli)) 166 | { 167 | assert(data_size >= 4); 168 | data_size = 1UL << data_size; 169 | size_t data_size_secure = data_size * security_factor; 170 | 171 | // execute selected collective 172 | switch(selected) 173 | { 174 | case mode::all2all: 175 | std::cout << "RUN: all2all" << std::endl; 176 | all2all(parse_plan(plan_file.c_str()), data_size, data_size_secure); 177 | break; 178 | case mode::all2all_async: 179 | std::cout << "RUN: all2all_async" << std::endl; 180 | all2all_async(parse_plan(plan_file.c_str()), data_size, data_size_secure); 181 | break; 182 | case mode::broadcast: 183 | std::cout << "RUN: broadcast" << std::endl; 184 | broadcaster(parse_plan(plan_file.c_str()), data_size, data_size_secure); 185 | break; 186 | case mode::scatter_gather: 187 | std::cout << "RUN: scatter_gather" << std::endl; 188 | scatter_gather(parse_plan(scatter_plan_file.c_str()), parse_plan(gather_plan_file.c_str()), data_size, data_size_secure); 189 | break; 190 | case mode::help: 191 | std::cout << make_man_page(cli, "execute"). 
192 | prepend_section("DESCRIPTION", " test gossip on uniformly distributed data"); 193 | break; 194 | } 195 | } 196 | else 197 | { 198 | std::cout << usage_lines(cli, "execute") << '\n'; 199 | } 200 | 201 | } 202 | -------------------------------------------------------------------------------- /include/gossip.cuh: -------------------------------------------------------------------------------- 1 | # pragma once 2 | 3 | #include "hpc_helpers/include/cuda_helpers.cuh" 4 | 5 | #include "gossip/config.h" 6 | #include "gossip/context.cuh" 7 | #include "gossip/all_to_all_async.cuh" 8 | #include "gossip/all_to_all.cuh" 9 | #include "gossip/broadcast.cuh" 10 | #include "gossip/scatter.cuh" 11 | #include "gossip/gather.cuh" 12 | #include "gossip/multisplit.cuh" 13 | #include "gossip/point_to_point.cuh" 14 | #include "gossip/memory_manager.cuh" 15 | -------------------------------------------------------------------------------- /include/gossip/all_to_all_async.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "config.h" 6 | #include "error_checking.hpp" 7 | #include "common.cuh" 8 | #include "context.cuh" 9 | #include "all_to_all_plan.hpp" 10 | 11 | namespace gossip { 12 | 13 | class all2all_async_t { 14 | 15 | const context_t * context; 16 | 17 | transfer_plan_t transfer_plan; 18 | bool plan_valid; 19 | 20 | public: 21 | all2all_async_t ( 22 | const context_t& context_) 23 | : context(&context_), 24 | transfer_plan( all2all::default_plan(context->get_num_devices()) ), 25 | plan_valid( transfer_plan.valid() ) 26 | { 27 | check(context->is_valid(), 28 | "You have to pass a valid context!"); 29 | } 30 | 31 | all2all_async_t ( 32 | const context_t& context_, 33 | const transfer_plan_t& transfer_plan_) 34 | : context(&context_), 35 | transfer_plan(transfer_plan_), 36 | plan_valid(false) 37 | { 38 | check(context->is_valid(), 39 | "You have to pass a valid context!"); 40 | 41 | 
if(!transfer_plan.valid()) 42 | all2all::verify_plan(transfer_plan); 43 | 44 | check(get_num_devices() == transfer_plan.num_gpus(), 45 | "Plan does fit number of gpus of context!"); 46 | 47 | plan_valid = (get_num_devices() == transfer_plan.num_gpus()) && 48 | transfer_plan.valid(); 49 | } 50 | 51 | void show_plan() const { 52 | if(!plan_valid) 53 | std::cout << "WARNING: plan does fit number of gpus\n"; 54 | 55 | transfer_plan.show_plan(); 56 | } 57 | 58 | private: 59 | template < 60 | typename table_t> 61 | transfer_handler makeTransferHandler ( 62 | const std::vector >& send_counts, 63 | bool verbose = false 64 | ) const { 65 | const auto num_phases = transfer_plan.num_steps(); 66 | const auto num_chunks = transfer_plan.num_chunks(); 67 | 68 | std::vector > src_displacements(get_num_devices(), std::vector(get_num_devices()+1)); 69 | // horizontal scan to get src offsets 70 | for (gpu_id_t gpu = 0; gpu < get_num_devices(); ++gpu) { 71 | for (gpu_id_t part = 0; part < get_num_devices(); ++part) { 72 | src_displacements[gpu][part+1] = send_counts[gpu][part]+src_displacements[gpu][part]; 73 | } 74 | } 75 | std::vector > trg_displacements(get_num_devices()+1, std::vector(get_num_devices())); 76 | // vertical scan to get trg offsets 77 | for (gpu_id_t gpu = 0; gpu < get_num_devices(); ++gpu) { 78 | for (gpu_id_t part = 0; part < get_num_devices(); ++part) { 79 | trg_displacements[part+1][gpu] = send_counts[part][gpu]+trg_displacements[part][gpu]; 80 | } 81 | } 82 | 83 | transfer_handler transfers(context, 84 | src_displacements, 85 | trg_displacements, 86 | send_counts, 87 | num_phases, num_chunks); 88 | 89 | // prepare transfers according to transfer_plan 90 | for (const auto& sequence : transfer_plan.transfer_sequences()) { 91 | transfers.push_back(sequence.seq, sequence.size, verbose); 92 | } 93 | 94 | if(verbose) { 95 | for (size_t p = 0; p < num_phases; ++p) { 96 | transfers.show_phase(p); 97 | } 98 | } 99 | 100 | return transfers; 101 | } 102 | 103 | public: 104 
| /** 105 | * Calculate buffer lengths needed to execute all2all with given send_counts. 106 | * The lenghts of the parameters have to match the context. 107 | * @param send_counts send_counts[k][l] elements are sent from device_ids[k] to device_ids[l]. 108 | * @param verbose if true, show details for each transfer. 109 | * @return bufs_len bufs_len[k] is required length of bufs[k] array. 110 | */ 111 | template < 112 | typename table_t> 113 | const std::vector calcBufferLengths ( 114 | const std::vector >& send_counts, 115 | bool verbose = false 116 | ) const { 117 | if (!check(plan_valid, "Invalid plan. Abort.")) 118 | return {}; 119 | 120 | for (const auto& counts : send_counts) { 121 | if (!check(counts.size() == get_num_devices(), 122 | "table size does not match number of gpus.")) 123 | return {}; 124 | } 125 | 126 | transfer_handler transfers = makeTransferHandler(send_counts, verbose); 127 | 128 | return transfers.aux_offsets; 129 | } 130 | 131 | /** 132 | * Execute all2all asynchronously using the given context. 133 | * The lenghts of the parameters have to match the context. 134 | * @param srcs pointers to source arrays. srcs[k] array should reside on device_ids[k]. 135 | * @param srcs_len srcs_len[k] is length of srcs[k] array. 136 | * @param dsts pointers to destination arrays. dsts[k] array should reside on device_ids[k]. 137 | * @param dsts_len dsts_len[k] is length of dsts[k] array. 138 | * @param bufs pointers to buffer arrays. bufs[k] array should reside on device_ids[k]. 139 | * @param bufs_len bufs_len[k] is length of bufs[k] array. 140 | * @param send_counts send_counts[k][l] elements are sent from device_ids[k] to device_ids[l]. 141 | * @param verbose if true, show details for each transfer. 142 | * @return true if executed successfully. 
143 | */ 144 | template < 145 | typename value_t, 146 | typename index_t, 147 | typename table_t> 148 | bool execAsync ( 149 | const std::vector& srcs, 150 | const std::vector& srcs_lens, 151 | const std::vector& dsts, 152 | const std::vector& dsts_lens, 153 | const std::vector& bufs, 154 | const std::vector& bufs_lens, 155 | const std::vector >& send_counts, 156 | bool verbose = false 157 | ) const { 158 | if (!check(plan_valid, "Invalid plan. Abort.")) 159 | return false; 160 | 161 | if (!check(srcs.size() == get_num_devices(), 162 | "srcs size does not match number of gpus.")) 163 | return false; 164 | if (!check(srcs_lens.size() == get_num_devices(), 165 | "srcs_lens size does not match number of gpus.")) 166 | return false; 167 | if (!check(dsts.size() == get_num_devices(), 168 | "dsts size does not match number of gpus.")) 169 | return false; 170 | if (!check(dsts_lens.size() == get_num_devices(), 171 | "dsts_lens size does not match number of gpus.")) 172 | return false; 173 | if (!check(bufs.size() == get_num_devices(), 174 | "bufs size does not match number of gpus.")) 175 | return false; 176 | if (!check(bufs_lens.size() == get_num_devices(), 177 | "bufs_lens size does not match number of gpus.")) 178 | return false; 179 | if (!check(send_counts.size() == get_num_devices(), 180 | "table size does not match number of gpus.")) 181 | return false; 182 | for (const auto& counts : send_counts) { 183 | if (!check(counts.size() == get_num_devices(), 184 | "table size does not match number of gpus.")) 185 | return false; 186 | } 187 | 188 | transfer_handler transfers = makeTransferHandler(send_counts, verbose); 189 | 190 | if(!check_size(transfers.aux_offsets, bufs_lens)) return false; 191 | if(!check_size(transfers.trg_offsets.back(), dsts_lens)) return false; 192 | 193 | for (size_t p = 0; p < transfers.num_phases; ++p) 194 | transfers.execute_phase(p, srcs, dsts, bufs); 195 | 196 | return true; 197 | } 198 | 199 | gpu_id_t get_num_devices () const noexcept { 
200 | return context->get_num_devices(); 201 | } 202 | 203 | void sync () const noexcept { 204 | context->sync_all_streams(); 205 | } 206 | 207 | void sync_hard () const noexcept { 208 | context->sync_hard(); 209 | } 210 | 211 | const context_t& get_context() const noexcept { 212 | return *context; 213 | } 214 | }; 215 | 216 | } // namespace 217 | -------------------------------------------------------------------------------- /include/gossip/all_to_all_plan.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "error_checking.hpp" 7 | #include "transfer_plan.hpp" 8 | 9 | namespace gossip { 10 | 11 | class all2all { 12 | 13 | public: 14 | static void verify_plan(transfer_plan_t& plan) { 15 | bool valid = true; 16 | 17 | valid &= check(plan.num_steps() >= 1, 18 | "planned sequence must be at least of length 2."); 19 | 20 | for (const auto& sequence : plan.transfer_sequences()) 21 | valid &= check(sequence.seq.size() == plan.num_steps()+1, 22 | "planned sequences must have same lengths."); 23 | 24 | std::vector > completeness(plan.num_gpus(), std::vector(plan.num_gpus())); 25 | for (const auto& sequence : plan.transfer_sequences()) { 26 | completeness[sequence.seq.front()][sequence.seq.back()] += sequence.size; 27 | } 28 | for (gpu_id_t src = 0; src < plan.num_gpus(); ++src) { 29 | for (gpu_id_t trg = 0; trg < plan.num_gpus(); ++trg) { 30 | valid &= check(completeness[src][trg] == plan.num_chunks(), 31 | "transfer plan is incomplete."); 32 | } 33 | } 34 | 35 | if(valid) 36 | plan.validate(); 37 | } 38 | 39 | static transfer_plan_t default_plan(const gpu_id_t num_gpus) { 40 | 41 | std::vector > sequences; 42 | 43 | sequences.reserve(num_gpus*num_gpus); 44 | 45 | // plan direct transfers from src to trg gpu 46 | for (gpu_id_t src = 0; src < num_gpus; ++src) { 47 | for (gpu_id_t trg = 0; trg < num_gpus; ++trg) { 48 | sequences.emplace_back(std::vector{src,trg}); 49 | } 50 | } 
51 | 52 | transfer_plan_t plan("all2all", num_gpus, sequences); 53 | 54 | verify_plan(plan); 55 | 56 | return plan; 57 | } 58 | 59 | }; 60 | 61 | } // namespace 62 | -------------------------------------------------------------------------------- /include/gossip/broadcast_plan.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "error_checking.hpp" 7 | #include "transfer_plan.hpp" 8 | 9 | namespace gossip { 10 | 11 | class broadcast { 12 | 13 | public: 14 | static void verify_plan(transfer_plan_t& plan) { 15 | bool valid = true; 16 | 17 | valid &= check(plan.main_gpu() != gpu_id_t(-1), 18 | "main gpu not set in plan."); 19 | 20 | valid &= check(plan.num_steps() >= 1, 21 | "planned sequence must be at least of length 2."); 22 | 23 | for (const auto& sequence : plan.transfer_sequences()) 24 | valid &= check(sequence.seq.size() == plan.num_steps()+1, 25 | "planned sequences must have same lengths."); 26 | 27 | for (const auto& sequence : plan.transfer_sequences()) { 28 | valid &= check(sequence.seq.front() == plan.main_gpu(), 29 | "all sequences must have same source."); 30 | } 31 | 32 | std::vector completeness(plan.num_gpus()); 33 | // sum up all chunks for each target gpu 34 | for (const auto& sequence : plan.transfer_sequences()) { 35 | completeness[sequence.seq.back()] += 1; 36 | } 37 | for (gpu_id_t trg = 0; trg < plan.num_gpus(); ++trg) { 38 | valid &= check(completeness[trg] == plan.num_chunks(), 39 | "transfer plan is incomplete."); 40 | } 41 | 42 | if(valid) 43 | plan.validate(); 44 | } 45 | 46 | static transfer_plan_t default_plan(const gpu_id_t num_gpus, const gpu_id_t source) { 47 | 48 | std::vector > sequences; 49 | 50 | sequences.reserve(num_gpus); 51 | 52 | // plan direct transfers from source to trg gpu 53 | for (gpu_id_t trg = 0; trg < num_gpus; ++trg) { 54 | sequences.emplace_back(std::vector{source,trg}); 55 | } 56 | 57 | const size_t num_chunks = 1; 
58 | const std::vector chunks(num_gpus, 0); 59 | 60 | transfer_plan_t plan("broadcast", num_gpus, sequences, num_chunks, chunks); 61 | 62 | plan.main_gpu(source); 63 | 64 | verify_plan(plan); 65 | 66 | return plan; 67 | } 68 | 69 | }; 70 | 71 | } // namespace 72 | -------------------------------------------------------------------------------- /include/gossip/config.h: -------------------------------------------------------------------------------- 1 | # pragma once 2 | 3 | #include 4 | 5 | #define THROW_EXCEPTIONS 1 6 | 7 | namespace gossip { 8 | 9 | using gpu_id_t = uint16_t; 10 | // type of multisplit counters 11 | using cnter_t = uint64_t; 12 | 13 | enum class PEER_STATUS : uint8_t { 14 | SLOW = 0, 15 | DIAG = 1, 16 | FAST = 2 17 | }; 18 | 19 | } // namespace 20 | -------------------------------------------------------------------------------- /include/gossip/context.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "config.h" 8 | #include "error_checking.hpp" 9 | 10 | namespace gossip { 11 | 12 | class context_t { 13 | 14 | gpu_id_t num_gpus; 15 | std::vector device_ids; 16 | std::vector> streams; 17 | std::vector> peer_status; 18 | bool valid = true; 19 | 20 | public: 21 | 22 | context_t (const gpu_id_t num_gpus_) { 23 | 24 | valid = check(num_gpus_ > 0, 25 | "Invalid number of devices."); 26 | 27 | num_gpus = num_gpus_; 28 | 29 | device_ids.resize(num_gpus); 30 | std::iota(device_ids.begin(), device_ids.end(), 0); 31 | 32 | initialize(); 33 | } 34 | 35 | context_t (const std::vector& device_ids_) { 36 | 37 | valid = check(!device_ids_.empty(), 38 | "Invalid number of device ids."); 39 | 40 | num_gpus = device_ids_.size(); 41 | 42 | device_ids = device_ids_; 43 | 44 | initialize(); 45 | } 46 | 47 | private: 48 | void initialize() { 49 | if(!valid) return; 50 | 51 | streams.resize(num_gpus, std::vector(num_gpus)); 52 | 53 | // create num_gpus^2 
streams where streams[gpu*num_gpus+part] 54 | // denotes the stream to be used for GPU gpu and partition part 55 | for (gpu_id_t src_gpu = 0; src_gpu < num_gpus; ++src_gpu) { 56 | cudaSetDevice(get_device_id(src_gpu)); 57 | cudaDeviceSynchronize(); 58 | for (gpu_id_t part = 0; part < num_gpus; ++part) { 59 | cudaStreamCreate(&streams[src_gpu][part]); 60 | } 61 | } CUERR 62 | 63 | peer_status.resize(num_gpus, std::vector(num_gpus)); 64 | 65 | // compute the connectivity matrix 66 | for (gpu_id_t src_gpu = 0; src_gpu < num_gpus; ++src_gpu) { 67 | const gpu_id_t src = get_device_id(src_gpu); 68 | cudaSetDevice(src); 69 | for (gpu_id_t dst_gpu = 0; dst_gpu < num_gpus; ++dst_gpu) { 70 | const gpu_id_t dst = get_device_id(dst_gpu); 71 | 72 | // check if src can access dst 73 | if (src == dst) { 74 | peer_status[src_gpu][dst_gpu] = PEER_STATUS::DIAG; 75 | } else { 76 | int32_t status; 77 | cudaDeviceCanAccessPeer(&status, src, dst); 78 | peer_status[src_gpu][dst_gpu] = status ? 79 | PEER_STATUS::FAST : 80 | PEER_STATUS::SLOW ; 81 | } 82 | } 83 | } CUERR 84 | 85 | for (gpu_id_t src_gpu = 0; src_gpu < num_gpus; ++src_gpu) { 86 | const gpu_id_t src = get_device_id(src_gpu); 87 | cudaSetDevice(src); 88 | for (gpu_id_t dst_gpu = 0; dst_gpu < num_gpus; ++dst_gpu) { 89 | const gpu_id_t dst = get_device_id(dst_gpu); 90 | 91 | if (src_gpu != dst_gpu) { 92 | if (src == dst) 93 | std::cout << "WARNING: device identifiers are not unique." 94 | << std::endl; 95 | } 96 | 97 | if (peer_status[src_gpu][dst_gpu] == PEER_STATUS::FAST) { 98 | cudaDeviceEnablePeerAccess(dst, 0); 99 | 100 | // consume error for rendundant 101 | // peer access initialization 102 | const cudaError_t cuerr = cudaGetLastError(); 103 | if (cuerr == cudaErrorPeerAccessAlreadyEnabled) 104 | std::cout << "STATUS: redundant enabling of " 105 | << "peer access from GPU " << src 106 | << " to GPU " << dst << " attempted." 
107 | << std::endl; 108 | else if (cuerr) 109 | std::cout << "CUDA error: " 110 | << cudaGetErrorString(cuerr) << " : " 111 | << __FILE__ << ", line " 112 | << __LINE__ << std::endl; 113 | } 114 | 115 | } 116 | } CUERR 117 | } 118 | 119 | public: 120 | ~context_t () { 121 | 122 | if(!valid) return; 123 | 124 | // synchronize and destroy streams 125 | for (gpu_id_t src_gpu = 0; src_gpu < num_gpus; ++src_gpu) { 126 | cudaSetDevice(get_device_id(src_gpu)); 127 | cudaDeviceSynchronize(); 128 | for (gpu_id_t part = 0; part < num_gpus; ++part) { 129 | cudaStreamSynchronize(get_streams(src_gpu)[part]); 130 | cudaStreamDestroy(streams[src_gpu][part]); 131 | } 132 | } CUERR 133 | 134 | // disable peer access 135 | for (gpu_id_t src_gpu = 0; src_gpu < num_gpus; ++src_gpu) { 136 | const gpu_id_t src = get_device_id(src_gpu); 137 | cudaSetDevice(src); 138 | for (gpu_id_t dst_gpu = 0; dst_gpu < num_gpus; ++dst_gpu) { 139 | const gpu_id_t dst = get_device_id(dst_gpu); 140 | 141 | if (peer_status[src_gpu][dst_gpu] == PEER_STATUS::FAST) { 142 | cudaDeviceDisablePeerAccess(dst); 143 | 144 | // consume error for rendundant 145 | // peer access deactivation 146 | const cudaError_t cuerr = cudaGetLastError(); 147 | if (cuerr == cudaErrorPeerAccessNotEnabled) 148 | std::cout << "STATUS: redundant disabling of " 149 | << "peer access from GPU " << src_gpu 150 | << " to GPU " << dst << " attempted." 
151 | << std::endl; 152 | else if (cuerr) 153 | std::cout << "CUDA error: " 154 | << cudaGetErrorString(cuerr) << " : " 155 | << __FILE__ << ", line " 156 | << __LINE__ << std::endl; 157 | } 158 | } 159 | } CUERR 160 | } 161 | 162 | // return the number of devices belonging to context 163 | gpu_id_t get_num_devices () const noexcept { 164 | return num_gpus; 165 | } 166 | 167 | // return the actual device identifier of specified GPU 168 | gpu_id_t get_device_id (const gpu_id_t gpu) const noexcept { 169 | return device_ids[gpu]; 170 | } 171 | 172 | // return vector of streams associated with to specified GPU 173 | const std::vector& get_streams (const gpu_id_t gpu) const noexcept { 174 | return streams[gpu]; 175 | } 176 | 177 | // sync all streams associated with the specified GPU 178 | void sync_gpu_streams (const gpu_id_t gpu) const noexcept { 179 | cudaSetDevice(get_device_id(gpu)); 180 | for (gpu_id_t part = 0; part < num_gpus; ++part) 181 | cudaStreamSynchronize(get_streams(gpu)[part]); 182 | } 183 | 184 | // sync all streams of the context 185 | void sync_all_streams () const noexcept { 186 | for (gpu_id_t gpu = 0; gpu < num_gpus; ++gpu) 187 | sync_gpu_streams(gpu); 188 | } 189 | 190 | // sync all GPUs 191 | void sync_hard () const noexcept { 192 | for (gpu_id_t gpu = 0; gpu < num_gpus; ++gpu) { 193 | cudaSetDevice(get_device_id(gpu)); 194 | cudaDeviceSynchronize(); 195 | } 196 | } 197 | 198 | // check if both streams and device identifiers are created 199 | bool is_valid () const noexcept { 200 | return !streams.empty() && !device_ids.empty(); 201 | } 202 | 203 | void print_connectivity_matrix () const { 204 | std::cout << "STATUS: connectivity matrix:" << std::endl; 205 | for (gpu_id_t src_gpu = 0; src_gpu < num_gpus; ++src_gpu) 206 | for (gpu_id_t dst_gpu = 0; dst_gpu < num_gpus; ++dst_gpu) 207 | std::cout << (dst_gpu == 0 ? "STATUS: |" : "") 208 | << uint32_t(peer_status[src_gpu][dst_gpu]) 209 | << (dst_gpu+1 == num_gpus ? 
"|\n" : " "); 210 | } 211 | }; 212 | 213 | } // namespace 214 | -------------------------------------------------------------------------------- /include/gossip/error_checking.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "config.h" 8 | 9 | namespace gossip { 10 | 11 | inline 12 | bool check(bool statement, const char* message) { 13 | if(!statement) { 14 | #ifdef THROW_EXCEPTIONS 15 | throw std::invalid_argument(message); 16 | #else 17 | std::cerr << message << std::endl; 18 | return false; 19 | #endif 20 | } 21 | return true; 22 | } 23 | 24 | template 25 | bool check_size( 26 | const size_t transfer_size, 27 | const index_t buffer_length 28 | ) { 29 | return check(transfer_size <= buffer_length, 30 | "buffer not large enough for transfers."); 31 | } 32 | 33 | template 34 | bool check_size( 35 | const std::vector& transfer_sizes, 36 | const std::vector& buffer_lengths 37 | ) { 38 | for (gpu_id_t i = 0; i < transfer_sizes.size(); ++i) { 39 | if (!check_size(transfer_sizes[i], buffer_lengths[i])) 40 | return false; 41 | } 42 | return true; 43 | } 44 | 45 | } // namespace 46 | -------------------------------------------------------------------------------- /include/gossip/gather.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "config.h" 6 | #include "error_checking.hpp" 7 | #include "common.cuh" 8 | #include "context.cuh" 9 | #include "gather_plan.hpp" 10 | 11 | namespace gossip { 12 | 13 | class gather_t { 14 | 15 | private: 16 | const context_t * context; 17 | 18 | transfer_plan_t transfer_plan; 19 | bool plan_valid; 20 | 21 | public: 22 | gather_t ( 23 | const context_t& context_, 24 | const gpu_id_t main_gpu_) 25 | : context(&context_), 26 | transfer_plan( gather::default_plan(context->get_num_devices(), main_gpu_) ), 27 | plan_valid( transfer_plan.valid() ) 28 
| { 29 | check(context->is_valid(), 30 | "You have to pass a valid context!"); 31 | } 32 | 33 | gather_t ( 34 | const context_t& context_, 35 | const transfer_plan_t& transfer_plan_) 36 | : context(&context_), 37 | transfer_plan(transfer_plan_), 38 | plan_valid(false) 39 | { 40 | check(context->is_valid(), 41 | "You have to pass a valid context!"); 42 | 43 | if(!transfer_plan.valid()) 44 | gather::verify_plan(transfer_plan); 45 | 46 | check(get_num_devices() == transfer_plan.num_gpus(), 47 | "Plan does fit number of gpus of context!"); 48 | 49 | plan_valid = (get_num_devices() == transfer_plan.num_gpus()) && 50 | transfer_plan.valid(); 51 | } 52 | 53 | void show_plan() const { 54 | if(!plan_valid) 55 | std::cout << "WARNING: plan does fit number of gpus\n"; 56 | 57 | transfer_plan.show_plan(); 58 | } 59 | 60 | private: 61 | template < 62 | typename table_t> 63 | transfer_handler makeTransferHandler ( 64 | const std::vector& send_counts, 65 | bool verbose = false 66 | ) const { 67 | const auto main_gpu = transfer_plan.main_gpu(); 68 | const auto num_phases = transfer_plan.num_steps(); 69 | const auto num_chunks = transfer_plan.num_chunks(); 70 | 71 | std::vector > src_displacements(get_num_devices(), std::vector(get_num_devices()+1)); 72 | 73 | std::vector > trg_displacements(get_num_devices()+1, std::vector(get_num_devices())); 74 | for (gpu_id_t part = 0; part < get_num_devices(); ++part) { 75 | // exclusive scan to get trg_displacements 76 | trg_displacements[part+1][main_gpu] = send_counts[part] + trg_displacements[part][main_gpu]; 77 | } 78 | 79 | std::vector > sizes(get_num_devices(), std::vector(get_num_devices())); 80 | for (gpu_id_t part = 0; part < get_num_devices(); ++part) { 81 | sizes[part][main_gpu] = send_counts[part]; 82 | } 83 | 84 | transfer_handler transfers(context, 85 | src_displacements, 86 | trg_displacements, 87 | sizes, 88 | num_phases, num_chunks); 89 | 90 | // prepare transfers according to transfer_plan 91 | for (const auto& sequence : 
transfer_plan.transfer_sequences()) { 92 | transfers.push_back(sequence.seq, sequence.size, verbose); 93 | } 94 | 95 | if(verbose) { 96 | for (size_t p = 0; p < num_phases; ++p) { 97 | transfers.show_phase(p); 98 | } 99 | } 100 | 101 | return transfers; 102 | } 103 | 104 | public: 105 | /** 106 | * Calculate buffer lengths needed to execute scatter with given send_counts. 107 | * The lenghts of the parameters have to match the context. 108 | * @param send_counts send_counts[k] elements are sent to device_ids[k]. 109 | * @param verbose if true, show details for each transfer. 110 | * @return bufs_len bufs_len[k] is required length of bufs[k] array. 111 | */ 112 | template < 113 | typename table_t> 114 | const std::vector calcBufferLengths ( 115 | const std::vector& send_counts, 116 | bool verbose = false 117 | ) const { 118 | if (!check(plan_valid, "Invalid plan. Abort.")) 119 | return {}; 120 | 121 | if (!check(send_counts.size() == get_num_devices(), 122 | "table size does not match number of gpus.")) 123 | return {}; 124 | 125 | transfer_handler transfers = makeTransferHandler(send_counts, verbose); 126 | 127 | return transfers.aux_offsets; 128 | } 129 | 130 | /** 131 | * Execute gather asynchronously using the given context. 132 | * The lenghts of the parameters have to match the context. 133 | * @param srcs pointers to source arrays. srcs[k] array should reside on device_ids[k]. 134 | * @param srcs_len srcs_len[k] is length of srcs[k] array. 135 | * @param send_counts send_counts[k] elements are sent from device_ids[k]. 136 | * @param dst pointer to destination array. should reside on device_ids[main_gpu]. 137 | * @param dst_len dst_len is length of dst array. 138 | * @param verbose if true, show details for each transfer. 139 | * @return true if executed successfully. 
140 | */ 141 | template < 142 | typename value_t, 143 | typename index_t, 144 | typename table_t> 145 | bool execAsync ( 146 | const std::vector& srcs, 147 | const std::vector& srcs_lens, 148 | value_t * dst, 149 | const index_t dst_len, 150 | const std::vector& bufs, 151 | const std::vector& bufs_lens, 152 | const std::vector& send_counts, 153 | bool verbose = false 154 | ) const { 155 | if (!check(plan_valid, "Invalid plan. Abort.")) 156 | return false; 157 | 158 | if (!check(srcs.size() == get_num_devices(), 159 | "srcs size does not match number of gpus.")) 160 | return false; 161 | if (!check(srcs_lens.size() == get_num_devices(), 162 | "srcs_lens size does not match number of gpus.")) 163 | return false; 164 | if (!check(bufs.size() == get_num_devices(), 165 | "bufs size does not match number of gpus.")) 166 | return false; 167 | if (!check(bufs_lens.size() == get_num_devices(), 168 | "bufs_lens size does not match number of gpus.")) 169 | return false; 170 | if (!check(send_counts.size() == get_num_devices(), 171 | "table size does not match number of gpus.")) 172 | return false; 173 | 174 | transfer_handler transfers = makeTransferHandler(send_counts, verbose); 175 | 176 | // check source array sizes 177 | if (!check_size(send_counts, srcs_lens)) return false; 178 | // check buffer array sizes 179 | if(!check_size(transfers.aux_offsets, bufs_lens)) return false; 180 | // check destination array size 181 | if(!check_size(transfers.trg_offsets.back()[transfer_plan.main_gpu()], dst_len)) 182 | return false; 183 | 184 | std::vector dsts(get_num_devices()); 185 | dsts[transfer_plan.main_gpu()] = dst; 186 | 187 | for (size_t p = 0; p < transfers.num_phases; ++p) 188 | transfers.execute_phase(p, srcs, dsts, bufs); 189 | 190 | return true; 191 | } 192 | 193 | gpu_id_t get_num_devices () const noexcept { 194 | return context->get_num_devices(); 195 | } 196 | 197 | void sync () const noexcept { 198 | context->sync_all_streams(); 199 | } 200 | 201 | void sync_hard () 
const noexcept { 202 | context->sync_hard(); 203 | } 204 | 205 | const context_t& get_context() const noexcept { 206 | return *context; 207 | } 208 | }; 209 | 210 | } // namespace 211 | -------------------------------------------------------------------------------- /include/gossip/gather_plan.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "error_checking.hpp" 7 | #include "transfer_plan.hpp" 8 | 9 | namespace gossip { 10 | 11 | class gather { 12 | 13 | public: 14 | static void verify_plan(transfer_plan_t& plan) { 15 | bool valid = true; 16 | 17 | valid &= check(plan.main_gpu() != gpu_id_t(-1), 18 | "main gpu not set in plan."); 19 | 20 | valid &= check(plan.num_steps() >= 1, 21 | "planned sequence must be at least of length 2."); 22 | 23 | for (const auto& sequence : plan.transfer_sequences()) 24 | valid &= check(sequence.seq.size() == plan.num_steps()+1, 25 | "planned sequences must have same lengths."); 26 | 27 | for (const auto& sequence : plan.transfer_sequences()) { 28 | valid &= check(sequence.seq.back() == plan.main_gpu(), 29 | "all sequences must have same target."); 30 | } 31 | 32 | std::vector completeness(plan.num_gpus()); 33 | // sum up all chunks for each source gpu 34 | for (const auto& sequence : plan.transfer_sequences()) { 35 | completeness[sequence.seq.front()] += sequence.size; 36 | } 37 | for (gpu_id_t trg = 0; trg < plan.num_gpus(); ++trg) { 38 | valid &= check(completeness[trg] == plan.num_chunks(), 39 | "transfer plan is incomplete."); 40 | } 41 | 42 | if(valid) 43 | plan.validate(); 44 | } 45 | 46 | static transfer_plan_t default_plan(const gpu_id_t num_gpus, const gpu_id_t target) { 47 | 48 | std::vector > sequences; 49 | 50 | sequences.reserve(num_gpus); 51 | 52 | // plan direct transfers from src to target gpu 53 | for (gpu_id_t src = 0; src < num_gpus; ++src) { 54 | sequences.emplace_back(std::vector{src,target}); 55 | } 56 | 57 | 
transfer_plan_t plan("gather", num_gpus, sequences); 58 | 59 | plan.main_gpu(target); 60 | 61 | verify_plan(plan); 62 | 63 | return plan; 64 | } 65 | 66 | }; 67 | 68 | } // namespace 69 | -------------------------------------------------------------------------------- /include/gossip/memory_manager.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "config.h" 6 | #include "error_checking.hpp" 7 | #include "context.cuh" 8 | 9 | namespace gossip { 10 | 11 | class memory_manager_t { 12 | 13 | const context_t * context; 14 | 15 | public: 16 | memory_manager_t (const context_t& context_) : context(&context_) 17 | { 18 | check(context->is_valid(), 19 | "You have to pass a valid context!"); 20 | } 21 | 22 | template < 23 | typename value_t, 24 | typename index_t> 25 | std::vector 26 | alloc_device( 27 | const std::vector& lens, 28 | const bool zero=true) const { 29 | 30 | std::vector data = {}; 31 | 32 | if (!check(lens.size() == get_num_devices(), 33 | "lens size does not match number of gpus.")) 34 | return data; 35 | 36 | data.resize(get_num_devices()); 37 | 38 | // malloc as device-sided memory 39 | for (gpu_id_t gpu = 0; gpu < get_num_devices(); ++gpu) { 40 | cudaSetDevice(context->get_device_id(gpu)); 41 | cudaMalloc(&data[gpu], sizeof(value_t)*lens[gpu]); 42 | if (zero) 43 | cudaMemsetAsync(data[gpu], 0, sizeof(value_t)*lens[gpu], 44 | context->get_streams(gpu)[0]); 45 | } 46 | CUERR 47 | 48 | return std::move(data); 49 | } 50 | 51 | template < 52 | typename value_t, 53 | typename index_t> 54 | std::vector 55 | alloc_host( 56 | const std::vector& lens, 57 | const bool zero=true) const { 58 | 59 | std::vector data = {}; 60 | 61 | if (!check(lens.size() == get_num_devices(), 62 | "lens size does not match number of gpus.")) 63 | return data; 64 | 65 | data.resize(get_num_devices()); 66 | 67 | // malloc as host-sided pinned memory 68 | for (gpu_id_t gpu = 0; gpu < get_num_devices(); 
++gpu) { 69 | cudaMallocHost(&data[gpu], sizeof(value_t)*lens[gpu]); 70 | if (zero) 71 | std::memset(data[gpu], 0, sizeof(value_t)*lens[gpu]); 72 | } 73 | CUERR 74 | 75 | return std::move(data); 76 | } 77 | 78 | template < 79 | typename value_t> 80 | bool free_device(std::vector& data) const { 81 | 82 | if (!check(data.size() == get_num_devices(), 83 | "data size does not match number of gpus.")) 84 | return false; 85 | 86 | for (gpu_id_t gpu = 0; gpu < get_num_devices(); ++gpu) { 87 | cudaSetDevice(context->get_device_id(gpu)); 88 | cudaFree(data[gpu]); 89 | } 90 | CUERR 91 | 92 | return true; 93 | } 94 | 95 | template < 96 | typename value_t> 97 | bool free_host(std::vector& data) const { 98 | 99 | if (!check(data.size() == get_num_devices(), 100 | "data size does not match number of gpus.")) 101 | return false; 102 | 103 | for (gpu_id_t gpu = 0; gpu < get_num_devices(); ++gpu) 104 | cudaFreeHost(data[gpu]); 105 | CUERR 106 | 107 | return true; 108 | } 109 | 110 | gpu_id_t get_num_devices () const noexcept { 111 | return context->get_num_devices(); 112 | } 113 | }; 114 | 115 | } // namespace 116 | -------------------------------------------------------------------------------- /include/gossip/point_to_point.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "config.h" 5 | #include "error_checking.hpp" 6 | #include "context.cuh" 7 | 8 | namespace gossip { 9 | 10 | class point2point_t { 11 | 12 | const context_t * context; 13 | 14 | public: 15 | point2point_t ( 16 | const context_t& context_) 17 | : context(&context_) 18 | { 19 | check(context->is_valid(), 20 | "You have to pass a valid context!"); 21 | } 22 | 23 | template < 24 | cudaMemcpyKind cudaMemcpyDirection, 25 | typename value_t, 26 | typename index_t> 27 | bool execAsync ( 28 | const std::vector& srcs, 29 | const std::vector& dsts, 30 | const std::vector& lens 31 | ) const { 32 | CUERR 33 | if (!check(srcs.size() == 
get_num_devices(), 34 | "srcs size does not match number of gpus.")) 35 | return false; 36 | if (!check(dsts.size() == get_num_devices(), 37 | "dsts size does not match number of gpus.")) 38 | return false; 39 | if (!check(lens.size() == get_num_devices(), 40 | "lens size does not match number of gpus.")) 41 | return false; 42 | 43 | for (gpu_id_t src_gpu = 0; src_gpu < get_num_devices(); ++src_gpu) { 44 | if (lens[src_gpu] > 0) { 45 | cudaSetDevice(context->get_device_id(src_gpu)); 46 | 47 | cudaMemcpyAsync(dsts[src_gpu], srcs[src_gpu], 48 | sizeof(value_t)*lens[src_gpu], 49 | cudaMemcpyDirection, 50 | context->get_streams(src_gpu)[0]); 51 | } 52 | } CUERR 53 | 54 | return true; 55 | } 56 | 57 | template < 58 | typename value_t, 59 | typename index_t> 60 | bool execH2DAsync ( 61 | const std::vector& srcs, 62 | const std::vector& dsts, 63 | const std::vector& lens 64 | ) const { 65 | return execAsync(srcs, dsts, lens); 66 | } 67 | 68 | template < 69 | typename value_t, 70 | typename index_t> 71 | bool execD2HAsync ( 72 | const std::vector& srcs, 73 | const std::vector& dsts, 74 | const std::vector& lens 75 | ) const { 76 | return execAsync(srcs, dsts, lens); 77 | } 78 | 79 | template < 80 | typename value_t, 81 | typename index_t> 82 | bool execD2DAsync ( 83 | const std::vector& srcs, 84 | const std::vector& dsts, 85 | const std::vector& lens 86 | ) const { 87 | return execAsync(srcs, dsts, lens); 88 | } 89 | 90 | gpu_id_t get_num_devices () const noexcept { 91 | return context->get_num_devices(); 92 | } 93 | 94 | void sync () const noexcept { 95 | context->sync_all_streams(); 96 | } 97 | 98 | void sync_hard () const noexcept { 99 | context->sync_hard(); 100 | } 101 | 102 | const context_t& get_context() const noexcept { 103 | return *context; 104 | } 105 | }; 106 | 107 | } // namespace 108 | -------------------------------------------------------------------------------- /include/gossip/scatter.cuh: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "config.h" 6 | #include "error_checking.hpp" 7 | #include "common.cuh" 8 | #include "context.cuh" 9 | #include "scatter_plan.hpp" 10 | 11 | namespace gossip { 12 | 13 | class scatter_t { 14 | 15 | private: 16 | const context_t * context; 17 | 18 | transfer_plan_t transfer_plan; 19 | bool plan_valid; 20 | 21 | public: 22 | scatter_t ( 23 | const context_t& context_, 24 | const gpu_id_t main_gpu_) 25 | : context(&context_), 26 | transfer_plan( scatter::default_plan(context->get_num_devices(), main_gpu_) ), 27 | plan_valid( transfer_plan.valid() ) 28 | { 29 | check(context->is_valid(), 30 | "You have to pass a valid context!"); 31 | } 32 | 33 | scatter_t ( 34 | const context_t& context_, 35 | const transfer_plan_t& transfer_plan_) 36 | : context(&context_), 37 | transfer_plan(transfer_plan_), 38 | plan_valid(false) 39 | { 40 | check(context->is_valid(), 41 | "You have to pass a valid context!"); 42 | 43 | if(!transfer_plan.valid()) 44 | scatter::verify_plan(transfer_plan); 45 | 46 | check(get_num_devices() == transfer_plan.num_gpus(), 47 | "Plan does fit number of gpus of context!"); 48 | 49 | plan_valid = (get_num_devices() == transfer_plan.num_gpus()) && 50 | transfer_plan.valid(); 51 | } 52 | 53 | void show_plan() const { 54 | if(!plan_valid) 55 | std::cout << "WARNING: plan does fit number of gpus\n"; 56 | 57 | transfer_plan.show_plan(); 58 | } 59 | 60 | private: 61 | template < 62 | typename table_t> 63 | transfer_handler makeTransferHandler ( 64 | const std::vector& send_counts, 65 | bool verbose = false 66 | ) const { 67 | const auto main_gpu = transfer_plan.main_gpu(); 68 | const auto num_phases = transfer_plan.num_steps(); 69 | const auto num_chunks = transfer_plan.num_chunks(); 70 | 71 | std::vector > src_displacements(get_num_devices(), std::vector(get_num_devices()+1)); 72 | for (gpu_id_t part = 0; part < get_num_devices(); 
++part) { 73 | // exclusive scan to get src_displacements 74 | src_displacements[main_gpu][part+1] = send_counts[part] + src_displacements[main_gpu][part]; 75 | } 76 | 77 | std::vector > trg_displacements(get_num_devices()+1, std::vector(get_num_devices())); 78 | 79 | std::vector > all_send_counts(get_num_devices(), std::vector(get_num_devices())); 80 | for (gpu_id_t part = 0; part < get_num_devices(); ++part) { 81 | all_send_counts[main_gpu][part] = send_counts[part]; 82 | } 83 | 84 | transfer_handler transfers(context, 85 | src_displacements, 86 | trg_displacements, 87 | all_send_counts, 88 | num_phases, num_chunks); 89 | 90 | // prepare transfers according to transfer_plan 91 | for (const auto& sequence : transfer_plan.transfer_sequences()) { 92 | transfers.push_back(sequence.seq, sequence.size, verbose); 93 | } 94 | 95 | if(verbose) { 96 | for (size_t p = 0; p < num_phases; ++p) { 97 | transfers.show_phase(p); 98 | } 99 | } 100 | 101 | return transfers; 102 | } 103 | 104 | public: 105 | /** 106 | * Calculate buffer lengths needed to execute scatter with given send_counts. 107 | * The lenghts of the parameters have to match the context. 108 | * @param send_counts send_counts[k] elements are sent to device_ids[k]. 109 | * @param verbose if true, show details for each transfer. 110 | * @return bufs_len bufs_len[k] is required length of bufs[k] array. 111 | */ 112 | template < 113 | typename table_t> 114 | const std::vector calcBufferLengths ( 115 | const std::vector& send_counts, 116 | bool verbose = false 117 | ) const { 118 | if (!check(plan_valid, "Invalid plan. Abort.")) 119 | return {}; 120 | 121 | if (!check(send_counts.size() == get_num_devices(), 122 | "table size does not match number of gpus.")) 123 | return {}; 124 | 125 | transfer_handler transfers = makeTransferHandler(send_counts, verbose); 126 | 127 | return transfers.aux_offsets; 128 | } 129 | 130 | /** 131 | * Execute scatter asynchronously using the given context. 
132 | * The lenghts of the parameters have to match the context. 133 | * @param src pointer to source array. should reside on device_ids[main_gpu]. 134 | * @param src_len src_len is length of src array. 135 | * @param dsts pointers to destination arrays. dsts[k] array should reside on device_ids[k]. 136 | * @param dsts_len dsts_len[k] is length of dsts[k] array. 137 | * @param bufs pointers to buffer arrays. bufs[k] array should reside on device_ids[k]. 138 | * @param bufs_len bufs_len[k] is length of bufs[k] array. 139 | * @param send_counts send_counts[k] elements are sent to device_ids[k] 140 | * @param verbose if true, show details for each transfer. 141 | * @return true if executed successfully. 142 | */ 143 | template < 144 | typename value_t, 145 | typename index_t, 146 | typename table_t> 147 | bool execAsync ( 148 | value_t * src, 149 | const index_t src_len, 150 | const std::vector& dsts, 151 | const std::vector& dsts_lens, 152 | const std::vector& bufs, 153 | const std::vector& bufs_lens, 154 | const std::vector& send_counts, 155 | bool verbose = false 156 | ) const { 157 | if (!check(plan_valid, "Invalid plan. 
Abort.")) 158 | return false; 159 | 160 | if (!check(dsts.size() == get_num_devices(), 161 | "dsts size does not match number of gpus.")) 162 | return false; 163 | if (!check(dsts_lens.size() == get_num_devices(), 164 | "dsts_lens size does not match number of gpus.")) 165 | return false; 166 | if (!check(bufs.size() == get_num_devices(), 167 | "bufs size does not match number of gpus.")) 168 | return false; 169 | if (!check(bufs_lens.size() == get_num_devices(), 170 | "bufs_lens size does not match number of gpus.")) 171 | return false; 172 | if (!check(send_counts.size() == get_num_devices(), 173 | "table size does not match number of gpus.")) 174 | return false; 175 | 176 | transfer_handler transfers = makeTransferHandler(send_counts, verbose); 177 | 178 | // check source array size 179 | if (!check_size(transfers.src_offsets[transfer_plan.main_gpu()].back(), src_len)) 180 | return false; 181 | // check buffer array sizes 182 | if(!check_size(transfers.aux_offsets, bufs_lens)) return false; 183 | // check destination array sizes 184 | if(!check_size(send_counts, dsts_lens)) return false; 185 | 186 | std::vector srcs(get_num_devices()); 187 | srcs[transfer_plan.main_gpu()] = src; 188 | 189 | for (size_t p = 0; p < transfers.num_phases; ++p) 190 | transfers.execute_phase(p, srcs, dsts, bufs); 191 | 192 | return true; 193 | } 194 | 195 | gpu_id_t get_num_devices () const noexcept { 196 | return context->get_num_devices(); 197 | } 198 | 199 | void sync () const noexcept { 200 | context->sync_all_streams(); 201 | } 202 | 203 | void sync_hard () const noexcept { 204 | context->sync_hard(); 205 | } 206 | 207 | const context_t& get_context() const noexcept { 208 | return *context; 209 | } 210 | }; 211 | 212 | } // namespace 213 | -------------------------------------------------------------------------------- /include/gossip/scatter_plan.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 
| 6 | #include "error_checking.hpp" 7 | #include "transfer_plan.hpp" 8 | 9 | namespace gossip { 10 | 11 | class scatter { 12 | 13 | public: 14 | static void verify_plan(transfer_plan_t& plan) { 15 | bool valid = true; 16 | 17 | valid &= check(plan.main_gpu() != gpu_id_t(-1), 18 | "main gpu not set in plan."); 19 | 20 | valid &= check(plan.num_steps() >= 1, 21 | "planned sequence must be at least of length 2."); 22 | 23 | for (const auto& sequence : plan.transfer_sequences()) 24 | valid &= check(sequence.seq.size() == plan.num_steps()+1, 25 | "planned sequences must have same lengths."); 26 | 27 | for (const auto& sequence : plan.transfer_sequences()) { 28 | valid &= check(sequence.seq.front() == plan.main_gpu(), 29 | "all sequences must have same source."); 30 | } 31 | 32 | std::vector completeness(plan.num_gpus()); 33 | // sum up all chunks for each target gpu 34 | for (const auto& sequence : plan.transfer_sequences()) { 35 | completeness[sequence.seq.back()] += sequence.size; 36 | } 37 | for (gpu_id_t trg = 0; trg < plan.num_gpus(); ++trg) { 38 | valid &= check(completeness[trg] == plan.num_chunks(), 39 | "transfer plan is incomplete."); 40 | } 41 | 42 | if(valid) 43 | plan.validate(); 44 | } 45 | 46 | static transfer_plan_t default_plan(const gpu_id_t num_gpus, const gpu_id_t source) { 47 | 48 | std::vector > sequences; 49 | 50 | sequences.reserve(num_gpus); 51 | 52 | // plan direct transfers from source to trg gpu 53 | for (gpu_id_t trg = 0; trg < num_gpus; ++trg) { 54 | sequences.emplace_back(std::vector{source,trg}); 55 | } 56 | 57 | transfer_plan_t plan("scatter", num_gpus, sequences); 58 | 59 | plan.main_gpu(source); 60 | 61 | verify_plan(plan); 62 | 63 | return plan; 64 | } 65 | 66 | }; 67 | 68 | } // namespace 69 | -------------------------------------------------------------------------------- /include/gossip/transfer_plan.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 
5 | 6 | #include "config.h" 7 | 8 | namespace gossip { 9 | 10 | class transfer_plan_t { 11 | 12 | struct transfer_sequence { 13 | std::vector seq; 14 | size_t size; 15 | }; 16 | 17 | std::string type_; 18 | gpu_id_t num_gpus_; 19 | gpu_id_t main_gpu_; 20 | size_t num_steps_; 21 | size_t num_chunks_; 22 | std::vector transfer_sequences_; 23 | std::vector sync_steps_; 24 | bool valid_; 25 | 26 | public: 27 | transfer_plan_t( 28 | const std::string type, 29 | const gpu_id_t num_gpus, 30 | const std::vector>& sequences 31 | ) : 32 | type_(type), 33 | num_gpus_(num_gpus), 34 | main_gpu_(gpu_id_t(-1)), 35 | num_steps_(0), 36 | num_chunks_(1), 37 | valid_(false) 38 | { 39 | if(sequences.size()) 40 | num_steps_ = sequences[0].size()-1; 41 | transfer_sequences_.reserve(sequences.size()); 42 | for(const auto& sequence : sequences) 43 | transfer_sequences_.push_back({sequence, 1}); 44 | } 45 | 46 | transfer_plan_t( 47 | const std::string type, 48 | const gpu_id_t num_gpus, 49 | const std::vector>& sequences, 50 | const size_t num_chunks, 51 | const std::vector& transfer_sizes 52 | ) : 53 | type_(type), 54 | num_gpus_(num_gpus), 55 | main_gpu_(gpu_id_t(-1)), 56 | num_steps_(0), 57 | num_chunks_(num_chunks), 58 | valid_(false) 59 | { 60 | if(sequences.size() == transfer_sizes.size()) { 61 | if(sequences.size()) 62 | num_steps_ = sequences[0].size()-1; 63 | 64 | transfer_sequences_.reserve(sequences.size()); 65 | 66 | for(size_t i = 0; i < sequences.size(); ++i) 67 | transfer_sequences_.push_back({sequences[i], transfer_sizes[i]}); 68 | } 69 | } 70 | 71 | public: 72 | std::string type() const noexcept { 73 | return type_; 74 | } 75 | 76 | gpu_id_t num_gpus() const noexcept { 77 | return num_gpus_; 78 | } 79 | 80 | gpu_id_t main_gpu() const noexcept { 81 | return main_gpu_; 82 | } 83 | 84 | void main_gpu(const gpu_id_t gpu) { 85 | main_gpu_ = gpu; 86 | } 87 | 88 | size_t num_steps() const noexcept { 89 | return num_steps_; 90 | } 91 | 92 | size_t num_chunks() const noexcept { 93 
| return num_chunks_; 94 | } 95 | 96 | const std::vector& transfer_sequences() const { 97 | return transfer_sequences_; 98 | } 99 | 100 | const std::vector& sync_steps() { 101 | return sync_steps_; 102 | } 103 | 104 | void sync_steps(const std::vector& steps) { 105 | sync_steps_ = steps; 106 | } 107 | 108 | bool synchronized() const noexcept { 109 | return sync_steps_.size() > 0; 110 | } 111 | 112 | bool valid() const noexcept { 113 | return valid_; 114 | } 115 | 116 | void validate() { 117 | valid_ = true; 118 | } 119 | 120 | void invalidate() { 121 | valid_ = false; 122 | } 123 | 124 | void show_plan() const { 125 | if(!valid_) 126 | std::cout << "ERROR: invalid plan\n"; 127 | 128 | std::cout << "INFO: Transfer plan for " << uint32_t(num_gpus_) << " gpus\n"; 129 | std::cout << "INFO: Transfer " << uint32_t(num_chunks_) << " chunks in " << num_steps_ << " steps\n"; 130 | 131 | if(synchronized()) { 132 | std::cout << "INFO: Plan synchronizes after steps "; 133 | for(const auto& s : sync_steps_) 134 | std::cout << s << ' '; 135 | std::cout << '\n'; 136 | } 137 | else { 138 | std::cout << "INFO: Plan is without synchronization\n"; 139 | } 140 | 141 | for (const auto& sequence : transfer_sequences_) { 142 | std::cout << "\tTransfer " 143 | << sequence.size 144 | << " chunks via ["; 145 | for(const auto& item : sequence.seq) 146 | std::cout << uint32_t(item) << ' '; 147 | std::cout << "]\n"; 148 | } 149 | std::cout << std::endl; 150 | } 151 | 152 | }; 153 | 154 | } // namespace 155 | -------------------------------------------------------------------------------- /include/gossip/utils.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "error_checking.hpp" 4 | #include "context.cuh" 5 | 6 | namespace gossip { 7 | 8 | // only for convenience 9 | template < 10 | typename value_t, 11 | typename index_t> 12 | bool clear( 13 | const context_t * context, 14 | const std::vector& mem, 15 | const std::vector& 
mem_lens 16 | ) { 17 | if(!check(mem.size() == context->get_num_devices(), 18 | "mem size does not match number of gpus.")) 19 | return false; 20 | if(!check(mem_lens.size() == context->get_num_devices(), 21 | "mem_lens size does not match number of gpus.")) 22 | return false; 23 | 24 | context->sync_all_streams(); 25 | for (gpu_id_t gpu = 0; gpu < context->get_num_devices(); gpu++) { 26 | const gpu_id_t id = context->get_device_id(gpu); 27 | const auto stream = context->get_streams(gpu)[0]; 28 | cudaSetDevice(id); 29 | const size_t size = mem_lens[gpu] 30 | * sizeof(value_t); 31 | cudaMemsetAsync(mem[gpu], 0, size, stream); 32 | } CUERR 33 | 34 | return true; 35 | } 36 | 37 | } // namespace 38 | -------------------------------------------------------------------------------- /include/plan_parser.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "plan_parser.hpp" 7 | #include "json.hpp" 8 | using json = nlohmann::json; 9 | 10 | using gpu_id_t = gossip::gpu_id_t; 11 | 12 | gossip::transfer_plan_t 13 | parse_plan(const char* filename) { 14 | std::string type = ""; 15 | gpu_id_t num_gpus = 0; 16 | gpu_id_t main_gpu = -1; 17 | size_t num_steps = 0; 18 | size_t num_chunks = 0; 19 | std::vector> transfer_sequences = {}; 20 | std::vector transfer_sizes = {}; 21 | 22 | std::ifstream ifs(filename); 23 | json json_plan; 24 | 25 | if(ifs.good()) 26 | ifs >> json_plan; 27 | else { 28 | std::cerr << "error reading " << filename << std::endl; 29 | auto plan = gossip::transfer_plan_t{type, num_gpus, transfer_sequences}; 30 | return plan; 31 | } 32 | 33 | // get plan from json 34 | auto it = json_plan.find("type"); 35 | if(it != json_plan.end()) 36 | type = *it; 37 | 38 | it = json_plan.find("num_gpus"); 39 | if(it != json_plan.end()) 40 | num_gpus = *it; 41 | 42 | it = json_plan.find("main_gpu"); 43 | if(it != json_plan.end()) 44 | main_gpu = *it; 45 | 46 | it = 
json_plan.find("num_steps"); 47 | if(it != json_plan.end()) 48 | num_steps = *it; 49 | 50 | it = json_plan.find("num_chunks"); 51 | if(it != json_plan.end()) 52 | num_chunks = *it; 53 | 54 | it = json_plan.find("plan"); 55 | if(it != json_plan.end()) { 56 | for(const auto& seq : *it) { 57 | transfer_sequences.push_back(seq); 58 | //TODO cut surplus items from seq 59 | } 60 | if(transfer_sequences.back().size()-1 != num_steps) 61 | std::cerr << "transfer sequence length does not match num steps!" << std::endl; 62 | } 63 | 64 | it = json_plan.find("chunks"); 65 | if(it != json_plan.end()) 66 | for(const auto& seq : *it) { 67 | transfer_sizes.push_back(seq); 68 | } 69 | 70 | auto plan = gossip::transfer_plan_t{type, num_gpus, transfer_sequences, num_chunks, transfer_sizes}; 71 | 72 | plan.main_gpu(main_gpu); 73 | 74 | return plan; 75 | } 76 | -------------------------------------------------------------------------------- /include/plan_parser.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "gossip/transfer_plan.hpp" 4 | 5 | gossip::transfer_plan_t parse_plan(const char* filename); 6 | -------------------------------------------------------------------------------- /plans/8v100_switched/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [ 3 | 1, 1, 1, 1, 1, 1, 1, 1, 4 | 1, 1, 1, 1, 1, 1, 1, 1, 5 | 1, 1, 1, 1, 1, 1, 1, 1, 6 | 1, 1, 1, 1, 1, 1, 1, 1, 7 | 1, 1, 1, 1, 1, 1, 1, 1, 8 | 1, 1, 1, 1, 1, 1, 1, 1, 9 | 1, 1, 1, 1, 1, 1, 1, 1, 10 | 1, 1, 1, 1, 1, 1, 1, 1 11 | ], 12 | "plan": [ 13 | [ 0, 0, 0, 0, 0, 0, 0, 0], 14 | [ 1, 1, 1, 1, 1, 1, 1, 1], 15 | [ 2, 2, 2, 2, 2, 2, 2, 2], 16 | [ 3, 3, 3, 3, 3, 3, 3, 3], 17 | [ 4, 4, 4, 4, 4, 4, 4, 4], 18 | [ 5, 5, 5, 5, 5, 5, 5, 5], 19 | [ 6, 6, 6, 6, 6, 6, 6, 6], 20 | [ 7, 7, 7, 7, 7, 7, 7, 7], 21 | 22 | [ 0, 1, 1, 1, 1, 1, 1, 1], 23 | [ 1, 2, 2, 2, 2, 2, 2, 2], 24 | [ 2, 3, 3, 3, 3, 3, 3, 3], 
25 | [ 3, 4, 4, 4, 4, 4, 4, 4], 26 | [ 4, 5, 5, 5, 5, 5, 5, 5], 27 | [ 5, 6, 6, 6, 6, 6, 6, 6], 28 | [ 6, 7, 7, 7, 7, 7, 7, 7], 29 | [ 7, 0, 0, 0, 0, 0, 0, 0], 30 | 31 | [ 0, 0, 2, 2, 2, 2, 2, 2], 32 | [ 1, 1, 3, 3, 3, 3, 3, 3], 33 | [ 2, 2, 4, 4, 4, 4, 4, 4], 34 | [ 3, 3, 5, 5, 5, 5, 5, 5], 35 | [ 4, 4, 6, 6, 6, 6, 6, 6], 36 | [ 5, 5, 7, 7, 7, 7, 7, 7], 37 | [ 6, 6, 0, 0, 0, 0, 0, 0], 38 | [ 7, 7, 1, 1, 1, 1, 1, 1], 39 | 40 | [ 0, 0, 0, 3, 3, 3, 3, 3], 41 | [ 1, 1, 1, 4, 4, 4, 4, 4], 42 | [ 2, 2, 2, 5, 5, 5, 5, 5], 43 | [ 3, 3, 3, 6, 6, 6, 6, 6], 44 | [ 4, 4, 4, 7, 7, 7, 7, 7], 45 | [ 5, 5, 5, 0, 0, 0, 0, 0], 46 | [ 6, 6, 6, 1, 1, 1, 1, 1], 47 | [ 7, 7, 7, 2, 2, 2, 2, 2], 48 | 49 | [ 0, 0, 0, 0, 4, 4, 4, 4], 50 | [ 1, 1, 1, 1, 5, 5, 5, 5], 51 | [ 2, 2, 2, 2, 6, 6, 6, 6], 52 | [ 3, 3, 3, 3, 7, 7, 7, 7], 53 | [ 4, 4, 4, 4, 0, 0, 0, 0], 54 | [ 5, 5, 5, 5, 1, 1, 1, 1], 55 | [ 6, 6, 6, 6, 2, 2, 2, 2], 56 | [ 7, 7, 7, 7, 3, 3, 3, 3], 57 | 58 | [ 0, 0, 0, 0, 0, 5, 5, 5], 59 | [ 1, 1, 1, 1, 1, 6, 6, 6], 60 | [ 2, 2, 2, 2, 2, 7, 7, 7], 61 | [ 3, 3, 3, 3, 3, 0, 0, 0], 62 | [ 4, 4, 4, 4, 4, 1, 1, 1], 63 | [ 5, 5, 5, 5, 5, 2, 2, 2], 64 | [ 6, 6, 6, 6, 6, 3, 3, 3], 65 | [ 7, 7, 7, 7, 7, 4, 4, 4], 66 | 67 | [ 0, 0, 0, 0, 0, 0, 6, 6], 68 | [ 1, 1, 1, 1, 1, 1, 7, 7], 69 | [ 2, 2, 2, 2, 2, 2, 0, 0], 70 | [ 3, 3, 3, 3, 3, 3, 1, 1], 71 | [ 4, 4, 4, 4, 4, 4, 2, 2], 72 | [ 5, 5, 5, 5, 5, 5, 3, 3], 73 | [ 6, 6, 6, 6, 6, 6, 4, 4], 74 | [ 7, 7, 7, 7, 7, 7, 5, 5], 75 | 76 | [ 0, 0, 0, 0, 0, 0, 0, 7], 77 | [ 1, 1, 1, 1, 1, 1, 1, 0], 78 | [ 2, 2, 2, 2, 2, 2, 2, 1], 79 | [ 3, 3, 3, 3, 3, 3, 3, 2], 80 | [ 4, 4, 4, 4, 4, 4, 4, 3], 81 | [ 5, 5, 5, 5, 5, 5, 5, 4], 82 | [ 6, 6, 6, 6, 6, 6, 6, 5], 83 | [ 7, 7, 7, 7, 7, 7, 7, 6] 84 | ], 85 | "type": "all2all", 86 | "num_chunks": 1, 87 | "num_gpus": 8, 88 | "num_steps": 7 89 | } 90 | -------------------------------------------------------------------------------- /plans/8v100_switched/gather_plan.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type": "gather", 3 | "num_gpus": 8, 4 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1], 5 | "main_gpu": 0, 6 | "num_steps": 1, 7 | "plan": [ 8 | [0, 0], 9 | [1, 0], 10 | [2, 0], 11 | [3, 0], 12 | [4, 0], 13 | [5, 0], 14 | [6, 0], 15 | [7, 0] 16 | ], 17 | "num_chunks": 1 18 | } 19 | -------------------------------------------------------------------------------- /plans/8v100_switched/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1], 3 | "type": "scatter", 4 | "main_gpu": 0, 5 | "plan": [ 6 | [0, 0], 7 | [0, 1], 8 | [0, 2], 9 | [0, 3], 10 | [0, 4], 11 | [0, 5], 12 | [0, 6], 13 | [0, 7] 14 | ], 15 | "num_chunks": 1, 16 | "num_steps": 1, 17 | "num_gpus": 8 18 | } 19 | -------------------------------------------------------------------------------- /plans/dgx1_direct/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 3 | "num_steps": 1, 4 | "type": "all2all", 5 | "plan": [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7], 6 | [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7], 7 | [2, 0], [2, 1], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7], 8 | [3, 0], [3, 1], [3, 2], [3, 3], [3, 4], [3, 5], [3, 6], [3, 7], 9 | [4, 0], [4, 1], [4, 2], [4, 3], [4, 4], [4, 5], [4, 6], [4, 7], 10 | [5, 0], [5, 1], [5, 2], [5, 3], [5, 4], [5, 5], [5, 6], [5, 7], 11 | [6, 0], [6, 1], [6, 2], [6, 3], [6, 4], [6, 5], [6, 6], [6, 7], 12 | [7, 0], [7, 1], [7, 2], [7, 3], [7, 4], [7, 5], [7, 6], [7, 7]], 13 | "num_chunks": 1, 14 | "num_gpus": 8 15 | } -------------------------------------------------------------------------------- 
/plans/dgx1_direct/broadcast_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [0, 0, 0, 0, 0, 0, 0, 0], 3 | "type": "broadcast", 4 | "num_gpus": 8, 5 | "num_steps": 1, 6 | "plan": [ 7 | [0, 0], 8 | [0, 1], 9 | [0, 2], 10 | [0, 3], 11 | [0, 4], 12 | [0, 5], 13 | [0, 6], 14 | [0, 7] 15 | ], 16 | "main_gpu": 0, 17 | "num_chunks": 1 18 | } 19 | -------------------------------------------------------------------------------- /plans/dgx1_direct/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "gather", 3 | "num_gpus": 8, 4 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1], 5 | "main_gpu": 0, 6 | "num_steps": 1, 7 | "plan": [ 8 | [0, 0], 9 | [1, 0], 10 | [2, 0], 11 | [3, 0], 12 | [4, 0], 13 | [5, 0], 14 | [6, 0], 15 | [7, 0] 16 | ], 17 | "num_chunks": 1 18 | } 19 | -------------------------------------------------------------------------------- /plans/dgx1_direct/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1], 3 | "type": "scatter", 4 | "main_gpu": 0, 5 | "plan": [ 6 | [0, 0], 7 | [0, 1], 8 | [0, 2], 9 | [0, 3], 10 | [0, 4], 11 | [0, 5], 12 | [0, 6], 13 | [0, 7] 14 | ], 15 | "num_chunks": 1, 16 | "num_steps": 1, 17 | "num_gpus": 8 18 | } 19 | -------------------------------------------------------------------------------- /plans/dgx1_opt/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "all2all", 3 | "plan": [[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 4], [0, 1, 1], [0, 1, 5], [0, 2, 2], [0, 2, 6], [0, 3, 3], [0, 4, 4], [0, 4, 5], [0, 4, 6], [0, 4, 7], 4 | [1, 0, 0], [1, 0, 4], [1, 1, 0], [1, 1, 1], [1, 1, 2], [1, 1, 5], [1, 2, 2], [1, 2, 6], [1, 3, 3], [1, 5, 4], [1, 5, 5], [1, 5, 6], [1, 5, 7], 5 | [2, 0, 4], [2, 1, 1], [2, 1, 5], [2, 2, 0], [2, 2, 1], [2, 2, 2], [2, 2, 3], [2, 2, 6], [2, 3, 
3], [2, 3, 7], [2, 6, 5], [2, 6, 6], [2, 6, 7], 6 | [3, 0, 0], [3, 0, 4], [3, 1, 5], [3, 2, 2], [3, 2, 6], [3, 3, 0], [3, 3, 1], [3, 3, 2], [3, 3, 3], [3, 3, 7], [3, 7, 4], [3, 7, 6], [3, 7, 7], 7 | [4, 0, 0], [4, 0, 1], [4, 0, 2], [4, 0, 3], [4, 4, 0], [4, 4, 4], [4, 4, 5], [4, 4, 6], [4, 5, 1], [4, 5, 5], [4, 6, 2], [4, 6, 6], [4, 7, 7], 8 | [5, 1, 0], [5, 1, 1], [5, 1, 2], [5, 1, 3], [5, 4, 0], [5, 4, 4], [5, 5, 1], [5, 5, 4], [5, 5, 5], [5, 5, 6], [5, 6, 2], [5, 6, 6], [5, 7, 7], 9 | [6, 2, 1], [6, 2, 2], [6, 2, 3], [6, 4, 0], [6, 5, 1], [6, 5, 5], [6, 6, 2], [6, 6, 4], [6, 6, 5], [6, 6, 6], [6, 6, 7], [6, 7, 3], [6, 7, 7], 10 | [7, 3, 0], [7, 3, 2], [7, 3, 3], [7, 4, 0], [7, 4, 4], [7, 5, 1], [7, 6, 2], [7, 6, 6], [7, 7, 3], [7, 7, 4], [7, 7, 5], [7, 7, 6], [7, 7, 7]], 11 | "num_steps": 2, 12 | "num_chunks": 3, 13 | "num_gpus": 8, 14 | "chunks": [3, 2, 1, 1, 1, 2, 2, 1, 3, 2, 1, 2, 3, 15 | 1, 2, 2, 3, 1, 1, 2, 1, 3, 1, 2, 2, 3, 16 | 3, 1, 2, 3, 2, 3, 2, 1, 1, 2, 1, 2, 1, 17 | 1, 2, 3, 2, 1, 2, 3, 1, 3, 2, 1, 2, 1, 18 | 2, 1, 2, 3, 1, 3, 2, 1, 2, 1, 1, 2, 3, 19 | 1, 2, 2, 3, 2, 1, 1, 2, 3, 1, 1, 2, 3, 20 | 1, 2, 1, 3, 2, 1, 1, 3, 2, 3, 2, 2, 1, 21 | 1, 2, 1, 2, 1, 3, 1, 2, 2, 2, 3, 1, 3] 22 | } -------------------------------------------------------------------------------- /plans/dgx1_opt/broadcast_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 8, 3 | "main_gpu": 0, 4 | "type": "broadcast", 5 | "chunks": [ 6 | 0, 7 | 0, 8 | 0, 9 | 0, 10 | 0, 11 | 0, 12 | 0, 13 | 0, 14 | 1, 15 | 1, 16 | 1, 17 | 1, 18 | 1, 19 | 1, 20 | 1, 21 | 1, 22 | 2, 23 | 2, 24 | 2, 25 | 2, 26 | 2, 27 | 2, 28 | 2, 29 | 2, 30 | 3, 31 | 3, 32 | 3, 33 | 3, 34 | 3, 35 | 3, 36 | 3, 37 | 3 38 | ], 39 | "plan": [ 40 | [0, 0, 0, 0, 0], 41 | [0, 3, 2, 1, 1], 42 | [0, 3, 2, 2, 2], 43 | [0, 3, 3, 3, 3], 44 | [0, 3, 7, 7, 4], 45 | [0, 3, 2, 1, 5], 46 | [0, 3, 2, 6, 6], 47 | [0, 3, 7, 7, 7], 48 | [0, 0, 0, 0, 0], 49 | [0, 0, 0, 1, 1], 50 
| [0, 0, 0, 3, 2], 51 | [0, 0, 0, 3, 3], 52 | [0, 4, 4, 4, 4], 53 | [0, 4, 7, 6, 5], 54 | [0, 4, 7, 6, 6], 55 | [0, 4, 7, 7, 7], 56 | [0, 0, 0, 0, 0], 57 | [0, 0, 3, 2, 1], 58 | [0, 0, 3, 2, 2], 59 | [0, 0, 3, 3, 3], 60 | [0, 0, 4, 4, 4], 61 | [0, 0, 4, 5, 5], 62 | [0, 0, 4, 7, 6], 63 | [0, 0, 4, 7, 7], 64 | [0, 0, 0, 0, 0], 65 | [0, 1, 1, 1, 1], 66 | [0, 1, 1, 2, 2], 67 | [0, 0, 0, 0, 3], 68 | [0, 0, 0, 4, 4], 69 | [0, 1, 1, 5, 5], 70 | [0, 1, 1, 5, 6], 71 | [0, 0, 0, 4, 7] 72 | ], 73 | "num_steps": 4, 74 | "num_chunks": 4 75 | } 76 | -------------------------------------------------------------------------------- /plans/dgx1_opt/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_chunks": 6, 3 | "main_gpu": 0, 4 | "num_steps": 14, 5 | "num_gpus": 8, 6 | "type": "gather", 7 | "chunks": [ 8 | 6, 9 | 1, 10 | 1, 11 | 1, 12 | 1, 13 | 1, 14 | 1, 15 | 1, 16 | 1, 17 | 1, 18 | 1, 19 | 1, 20 | 1, 21 | 1, 22 | 1, 23 | 1, 24 | 1, 25 | 1, 26 | 1, 27 | 1, 28 | 1, 29 | 1, 30 | 1, 31 | 1, 32 | 1, 33 | 1, 34 | 1, 35 | 1, 36 | 1, 37 | 1, 38 | 1, 39 | 1, 40 | 1, 41 | 1, 42 | 1, 43 | 1, 44 | 1, 45 | 1, 46 | 1, 47 | 1, 48 | 1, 49 | 1, 50 | 1 51 | ], 52 | "plan": [ 53 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 54 | [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 55 | [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 56 | [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 57 | [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], 58 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 59 | [1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 60 | [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 61 | [2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 62 | [2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0], 63 | [2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0], 64 | [2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 65 | [2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 66 | [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 67 | [3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0], 68 | [3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 69 | [3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0], 70 | [3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0], 71 | [3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0], 72 | [4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 73 | [4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 74 | [4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 75 | [4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 76 | [4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 77 | [4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0], 78 | [5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0], 79 | [5, 5, 5, 5, 1, 1, 1, 1, 2, 2, 3, 0, 0, 0, 0], 80 | [5, 5, 5, 5, 5, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0], 81 | [5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 0, 0, 0, 0, 0], 82 | [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 0, 0, 0, 0], 83 | [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 0, 0], 84 | [6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 3, 0, 0, 0], 85 | [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 0, 0], 86 | [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 0, 0], 87 | [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 4, 0], 88 | [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 3, 3, 0], 89 | [6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 4, 0, 0, 0], 90 | [7, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0], 91 | [7, 7, 7, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0], 92 | [7, 7, 7, 7, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0], 93 | [7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0], 94 | [7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 4, 0, 0, 0, 0], 95 | [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 0, 0] 96 | ] 97 | } 98 | -------------------------------------------------------------------------------- /plans/dgx1_opt/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 8, 3 | "chunks": [ 4 | 6, 5 | 1, 6 | 1, 7 | 1, 8 | 1, 9 | 1, 10 | 1, 11 | 1, 12 | 1, 13 | 1, 14 | 1, 15 | 1, 16 | 1, 17 | 1, 18 | 1, 19 | 1, 20 | 1, 21 | 1, 22 | 1, 23 | 1, 24 | 1, 25 | 1, 26 | 1, 27 | 1, 28 | 1, 29 | 1, 30 | 1, 31 | 1, 32 | 1, 33 | 1, 34 | 1, 35 | 1, 36 | 1, 37 | 1, 38 | 1, 39 | 
1, 40 | 1, 41 | 1, 42 | 1, 43 | 1, 44 | 1, 45 | 1, 46 | 1 47 | ], 48 | "main_gpu": 0, 49 | "plan": [ 50 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 51 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3], 52 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4], 53 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], 54 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2], 55 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3], 56 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4], 57 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3], 58 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4], 59 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 5, 5], 60 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 1], 61 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 7, 7], 62 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 7, 7], 63 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3], 64 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4], 65 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 5, 5, 5], 66 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 6, 6], 67 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3], 68 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 6, 6, 6], 69 | [0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 1, 1, 1, 1, 1], 70 | [0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 7, 7, 7, 7, 7], 71 | [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 5, 5, 5, 5], 72 | [0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2], 73 | [0, 0, 0, 0, 0, 0, 0, 3, 7, 7, 7, 7, 7, 7, 7], 74 | [0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 5, 5, 5], 75 | [0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 2, 2, 2, 2, 2], 76 | [0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4], 77 | [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 78 | [0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1], 79 | [0, 0, 0, 0, 0, 3, 3, 3, 2, 6, 6, 6, 6, 6, 6], 80 | [0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], 81 | [0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], 82 | [0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 7, 6, 6, 6], 83 | [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 84 | [0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 85 | [0, 0, 0, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2], 86 | [0, 0, 0, 4, 4, 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5], 87 | [0, 0, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], 88 | [0, 0, 4, 4, 4, 4, 4, 4, 4, 7, 7, 7, 7, 7, 7], 89 | [0, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5], 90 | [0, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6], 91 | [0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2], 92 | [0, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6] 93 | ], 94 | "num_chunks": 6, 95 | "type": "scatter", 96 | "num_steps": 14 97 | } 98 | -------------------------------------------------------------------------------- /plans/dgx1_opt_1chunk/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [ 3 | 1, 4 | 1, 5 | 1, 6 | 1, 7 | 1, 8 | 1, 9 | 1, 10 | 1, 11 | 1, 12 | 1, 13 | 1, 14 | 1, 15 | 1, 16 | 1, 17 | 1, 18 | 1, 19 | 1, 20 | 1, 21 | 1, 22 | 1, 23 | 1, 24 | 1, 25 | 1, 26 | 1, 27 | 1, 28 | 1, 29 | 1, 30 | 1, 31 | 1, 32 | 1, 33 | 1, 34 | 1, 35 | 1, 36 | 1, 37 | 1, 38 | 1, 39 | 1, 40 | 1, 41 | 1, 42 | 1, 43 | 1, 44 | 1, 45 | 1, 46 | 1, 47 | 1, 48 | 1, 49 | 1, 50 | 1, 51 | 1, 52 | 1, 53 | 1, 54 | 1, 55 | 1, 56 | 1, 57 | 1, 58 | 1, 59 | 1, 60 | 1, 61 | 1, 62 | 1, 63 | 1, 64 | 1, 65 | 1, 66 | 1 67 | ], 68 | "plan": [ 69 | [0, 0, 0], 70 | [0, 0, 1], 71 | [0, 1, 5], 72 | [0, 2, 2], 73 | [0, 3, 3], 74 | [0, 4, 4], 75 | [0, 4, 6], 76 | [0, 4, 7], 77 | [1, 0, 4], 78 | [1, 1, 0], 79 | [1, 1, 1], 80 | [1, 2, 2], 81 | [1, 3, 3], 82 | [1, 5, 5], 83 | [1, 5, 6], 84 | [1, 5, 7], 85 | [2, 0, 4], 86 | [2, 1, 5], 87 | [2, 2, 0], 88 | [2, 2, 1], 89 | [2, 2, 2], 90 | [2, 2, 3], 91 | [2, 3, 7], 92 | [2, 6, 6], 93 | [3, 0, 4], 94 | [3, 1, 5], 95 | [3, 2, 6], 96 | [3, 3, 0], 97 | [3, 3, 1], 98 | [3, 3, 2], 99 | [3, 3, 3], 100 | [3, 7, 7], 101 | [4, 0, 0], 102 | [4, 0, 2], 103 | [4, 0, 3], 104 | [4, 4, 4], 105 | [4, 4, 5], 106 | [4, 5, 1], 107 | [4, 6, 6], 108 | [4, 7, 7], 109 | [5, 1, 1], 110 | [5, 1, 2], 111 | [5, 1, 3], 112 | [5, 4, 0], 113 | [5, 5, 4], 114 | [5, 5, 5], 115 | [5, 6, 6], 116 | [5, 7, 7], 117 | [6, 2, 2], 118 | [6, 4, 0], 119 
| [6, 5, 1], 120 | [6, 6, 4], 121 | [6, 6, 5], 122 | [6, 6, 6], 123 | [6, 6, 7], 124 | [6, 7, 3], 125 | [7, 3, 3], 126 | [7, 4, 0], 127 | [7, 5, 1], 128 | [7, 6, 2], 129 | [7, 7, 4], 130 | [7, 7, 5], 131 | [7, 7, 6], 132 | [7, 7, 7] 133 | ], 134 | "type": "all2all", 135 | "num_chunks": 1, 136 | "num_gpus": 8, 137 | "num_steps": 2 138 | } 139 | -------------------------------------------------------------------------------- /plans/dgx1_opt_1chunk/broadcast_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [0, 0, 0, 0, 0, 0, 0, 0], 3 | "type": "broadcast", 4 | "num_gpus": 8, 5 | "num_steps": 3, 6 | "plan": [ 7 | [0, 0, 0, 0], 8 | [0, 1, 1, 1], 9 | [0, 3, 2, 2], 10 | [0, 3, 3, 3], 11 | [0, 4, 4, 4], 12 | [0, 1, 1, 5], 13 | [0, 4, 7, 6], 14 | [0, 4, 7, 7] 15 | ], 16 | "main_gpu": 0, 17 | "num_chunks": 1 18 | } 19 | -------------------------------------------------------------------------------- /plans/dgx1_opt_1chunk/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "plan": [ 3 | [0, 0, 0, 0], 4 | [1, 3, 3, 0], 5 | [2, 3, 0, 0], 6 | [3, 0, 0, 0], 7 | [4, 0, 0, 0], 8 | [5, 1, 0, 0], 9 | [6, 4, 4, 0], 10 | [7, 4, 0, 0] 11 | ], 12 | "type": "gather", 13 | "num_chunks": 1, 14 | "main_gpu": 0, 15 | "num_steps": 3, 16 | "num_gpus": 8, 17 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1] 18 | } 19 | -------------------------------------------------------------------------------- /plans/dgx1_opt_1chunk/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "plan": [ 3 | [0, 0, 0, 0], 4 | [0, 0, 0, 4], 5 | [0, 0, 2, 2], 6 | [0, 0, 3, 3], 7 | [0, 0, 4, 7], 8 | [0, 1, 1, 5], 9 | [0, 3, 2, 1], 10 | [0, 4, 6, 6] 11 | ], 12 | "num_steps": 3, 13 | "type": "scatter", 14 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1], 15 | "num_gpus": 8, 16 | "num_chunks": 1, 17 | "main_gpu": 0 18 | } 19 | 
-------------------------------------------------------------------------------- /plans/dgx1_quad_opt/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 4, 3 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 4 | "num_steps": 1, 5 | "plan": [ 6 | [0, 0], 7 | [0, 1], 8 | [0, 2], 9 | [0, 3], 10 | [1, 0], 11 | [1, 1], 12 | [1, 2], 13 | [1, 3], 14 | [2, 0], 15 | [2, 1], 16 | [2, 2], 17 | [2, 3], 18 | [3, 0], 19 | [3, 1], 20 | [3, 2], 21 | [3, 3] 22 | ], 23 | "type": "all2all", 24 | "num_chunks": 1 25 | } 26 | -------------------------------------------------------------------------------- /plans/dgx1_quad_opt/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "plan": [ 3 | [0, 0, 0, 0, 0, 3, 3], 4 | [0, 0, 0, 0, 3, 3, 3], 5 | [0, 0, 0, 3, 3, 3, 3], 6 | [0, 0, 3, 3, 3, 3, 3], 7 | [0, 3, 3, 3, 3, 3, 3], 8 | [1, 1, 1, 1, 0, 0, 3], 9 | [1, 1, 1, 1, 1, 3, 3], 10 | [1, 1, 1, 3, 3, 3, 3], 11 | [1, 1, 2, 2, 2, 2, 3], 12 | [1, 3, 3, 3, 3, 3, 3], 13 | [2, 2, 2, 2, 2, 3, 3], 14 | [2, 2, 2, 2, 3, 3, 3], 15 | [2, 2, 2, 3, 3, 3, 3], 16 | [2, 2, 3, 3, 3, 3, 3], 17 | [2, 3, 3, 3, 3, 3, 3], 18 | [3, 3, 3, 3, 3, 3, 3] 19 | ], 20 | "main_gpu": 3, 21 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5], 22 | "type": "gather", 23 | "num_steps": 6, 24 | "num_gpus": 4, 25 | "num_chunks": 5 26 | } 27 | -------------------------------------------------------------------------------- /plans/dgx1_quad_opt/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "scatter", 3 | "num_gpus": 4, 4 | "main_gpu": 3, 5 | "num_steps": 6, 6 | "num_chunks": 5, 7 | "plan": [ 8 | [3, 3, 0, 0, 0, 0, 0], 9 | [3, 3, 3, 0, 0, 0, 0], 10 | [3, 3, 3, 3, 0, 0, 0], 11 | [3, 3, 3, 3, 3, 0, 0], 12 | [3, 3, 3, 3, 3, 3, 0], 13 | [3, 0, 1, 1, 1, 1, 1], 14 | [3, 1, 1, 1, 1, 1, 1], 15 | [3, 3, 3, 1, 1, 1, 1], 16 | [3, 
2, 2, 2, 1, 1, 1], 17 | [3, 3, 3, 3, 3, 1, 1], 18 | [3, 3, 2, 2, 2, 2, 2], 19 | [3, 3, 3, 2, 2, 2, 2], 20 | [3, 3, 3, 3, 2, 2, 2], 21 | [3, 3, 3, 3, 3, 2, 2], 22 | [3, 3, 3, 3, 3, 3, 2], 23 | [3, 3, 3, 3, 3, 3, 3] 24 | ], 25 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5] 26 | } 27 | -------------------------------------------------------------------------------- /plans/dgx1_quad_opt2/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 4, 3 | "chunks": [5, 5, 5, 5, 4 | 1, 1, 1, 1, 1, 5 | 1, 1, 1, 1, 1, 6 | 1, 1, 1, 1, 1, 7 | 1, 1, 1, 1, 1, 8 | 1, 1, 1, 1, 1, 9 | 1, 1, 1, 1, 1, 10 | 1, 1, 1, 1, 1, 11 | 1, 1, 1, 1, 1, 12 | 1, 1, 1, 1, 1, 13 | 1, 1, 1, 1, 1, 14 | 1, 1, 1, 1, 1, 15 | 1, 1, 1, 1, 1], 16 | "num_steps": 8, 17 | "plan": [ 18 | [0, 0, 0, 0, 0, 0, 0, 0, 0], 19 | [1, 1, 1, 1, 1, 1, 1, 1, 1], 20 | [2, 2, 2, 2, 2, 2, 2, 2, 2], 21 | [3, 3, 3, 3, 3, 3, 3, 3, 3], 22 | [0, 3, 2, 1, 1, 1, 1, 1, 1], 23 | [0, 1, 1, 1, 1, 1, 1, 1, 1], 24 | [0, 0, 0, 1, 1, 1, 1, 1, 1], 25 | [0, 0, 0, 0, 0, 1, 1, 1, 1], 26 | [0, 0, 0, 0, 0, 0, 0, 1, 1], 27 | [0, 0, 3, 2, 2, 2, 2, 2, 2], 28 | [0, 2, 2, 2, 2, 2, 2, 2, 2], 29 | [0, 0, 0, 2, 2, 2, 2, 2, 2], 30 | [0, 0, 0, 0, 0, 2, 2, 2, 2], 31 | [0, 0, 0, 0, 0, 0, 0, 2, 2], 32 | [0, 0, 0, 3, 3, 3, 3, 3, 3], 33 | [0, 0, 0, 0, 3, 3, 3, 3, 3], 34 | [0, 0, 0, 0, 0, 3, 3, 3, 3], 35 | [0, 0, 0, 0, 0, 0, 3, 3, 3], 36 | [0, 0, 0, 0, 0, 0, 0, 3, 3], 37 | [1, 2, 3, 0, 0, 0, 0, 0, 0], 38 | [1, 0, 0, 0, 0, 0, 0, 0, 0], 39 | [1, 1, 1, 0, 0, 0, 0, 0, 0], 40 | [1, 1, 1, 1, 1, 0, 0, 0, 0], 41 | [1, 1, 1, 1, 1, 1, 1, 0, 0], 42 | [1, 1, 1, 2, 2, 2, 2, 2, 2], 43 | [1, 1, 1, 1, 2, 2, 2, 2, 2], 44 | [1, 1, 1, 1, 1, 2, 2, 2, 2], 45 | [1, 1, 1, 1, 1, 1, 2, 2, 2], 46 | [1, 1, 1, 1, 1, 1, 1, 2, 2], 47 | [1, 1, 2, 3, 3, 3, 3, 3, 3], 48 | [1, 3, 3, 3, 3, 3, 3, 3, 3], 49 | [1, 1, 1, 3, 3, 3, 3, 3, 3], 50 | [1, 1, 1, 1, 1, 3, 3, 3, 3], 51 | [1, 1, 1, 1, 1, 1, 1, 3, 3], 52 | [2, 3, 0, 0, 
0, 0, 0, 0, 0], 53 | [2, 0, 0, 0, 0, 0, 0, 0, 0], 54 | [2, 2, 2, 0, 0, 0, 0, 0, 0], 55 | [2, 2, 2, 2, 2, 0, 0, 0, 0], 56 | [2, 2, 2, 2, 2, 2, 2, 0, 0], 57 | [2, 1, 1, 1, 1, 1, 1, 1, 1], 58 | [2, 2, 2, 2, 1, 1, 1, 1, 1], 59 | [2, 2, 2, 2, 2, 1, 1, 1, 1], 60 | [2, 2, 2, 2, 2, 2, 1, 1, 1], 61 | [2, 2, 2, 2, 2, 2, 2, 1, 1], 62 | [2, 2, 2, 2, 3, 3, 3, 3, 3], 63 | [2, 2, 2, 2, 2, 3, 3, 3, 3], 64 | [2, 2, 2, 2, 2, 2, 3, 3, 3], 65 | [2, 2, 2, 2, 2, 2, 2, 3, 3], 66 | [2, 2, 2, 2, 2, 2, 2, 2, 3], 67 | [3, 0, 0, 0, 0, 0, 0, 0, 0], 68 | [3, 3, 3, 3, 0, 0, 0, 0, 0], 69 | [3, 3, 3, 3, 3, 0, 0, 0, 0], 70 | [3, 3, 3, 3, 3, 3, 0, 0, 0], 71 | [3, 3, 3, 3, 3, 3, 3, 0, 0], 72 | [3, 2, 1, 1, 1, 1, 1, 1, 1], 73 | [3, 1, 1, 1, 1, 1, 1, 1, 1], 74 | [3, 3, 3, 1, 1, 1, 1, 1, 1], 75 | [3, 3, 3, 3, 3, 1, 1, 1, 1], 76 | [3, 3, 3, 3, 3, 3, 3, 1, 1], 77 | [3, 3, 3, 3, 2, 2, 2, 2, 2], 78 | [3, 3, 3, 3, 3, 2, 2, 2, 2], 79 | [3, 3, 3, 3, 3, 3, 2, 2, 2], 80 | [3, 3, 3, 3, 3, 3, 3, 2, 2], 81 | [3, 3, 3, 3, 3, 3, 3, 3, 2] 82 | ], 83 | "type": "all2all", 84 | "num_chunks": 5 85 | } -------------------------------------------------------------------------------- /plans/dgx1_quad_opt2/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "gather", 3 | "num_gpus": 4, 4 | "num_chunks": 4, 5 | "num_steps": 6, 6 | "chunks": [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 7 | "plan": [[0, 0, 0, 0, 0, 0, 0], 8 | [1, 0, 0, 0, 0, 0, 0], 9 | [1, 1, 1, 0, 0, 0, 0], 10 | [1, 1, 1, 1, 1, 0, 0], 11 | [1, 1, 1, 1, 3, 3, 0], 12 | [2, 0, 0, 0, 0, 0, 0], 13 | [2, 2, 2, 0, 0, 0, 0], 14 | [2, 2, 2, 2, 2, 0, 0], 15 | [2, 3, 3, 0, 0, 0, 0], 16 | [3, 0, 0, 0, 0, 0, 0], 17 | [3, 3, 0, 0, 0, 0, 0], 18 | [3, 3, 3, 3, 0, 0, 0], 19 | [3, 3, 3, 3, 3, 0, 0]], 20 | "main_gpu": 0 21 | } -------------------------------------------------------------------------------- /plans/dgx1_quad_opt2/scatter_plan.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "num_chunks": 4, 3 | "main_gpu": 0, 4 | "num_gpus": 4, 5 | "type": "scatter", 6 | "chunks": [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 7 | "plan": [ 8 | [0, 0, 0, 0, 0, 0, 0], 9 | [0, 0, 0, 0, 0, 0, 3], 10 | [0, 0, 0, 0, 0, 1, 1], 11 | [0, 0, 0, 0, 0, 2, 2], 12 | [0, 0, 0, 0, 0, 3, 3], 13 | [0, 0, 0, 0, 3, 3, 3], 14 | [0, 0, 0, 1, 1, 1, 1], 15 | [0, 0, 0, 2, 2, 2, 2], 16 | [0, 0, 0, 3, 3, 3, 3], 17 | [0, 0, 3, 3, 3, 1, 1], 18 | [0, 1, 1, 1, 1, 1, 1], 19 | [0, 2, 2, 2, 2, 2, 2], 20 | [0, 3, 3, 3, 2, 2, 2] 21 | ], 22 | "num_steps": 6 23 | } 24 | -------------------------------------------------------------------------------- /plans/dgx1_rings/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_steps": 4, 3 | "plan": [ 4 | [0, 0, 0, 0, 0], 5 | [5, 6, 7, 4, 0], 6 | [6, 7, 4, 0, 0], 7 | [7, 4, 0, 0, 0], 8 | [4, 0, 0, 0, 0], 9 | [5, 1, 2, 3, 0], 10 | [1, 2, 3, 0, 0], 11 | [2, 3, 0, 0, 0], 12 | [3, 0, 0, 0, 0], 13 | [5, 6, 7, 4, 0], 14 | [6, 7, 4, 0, 0], 15 | [7, 4, 0, 0, 0], 16 | [4, 0, 0, 0, 0], 17 | [5, 1, 2, 3, 0], 18 | [1, 2, 3, 0, 0], 19 | [2, 3, 0, 0, 0], 20 | [3, 0, 0, 0, 0], 21 | [5, 7, 3, 1, 0], 22 | [7, 3, 1, 0, 0], 23 | [3, 1, 0, 0, 0], 24 | [1, 0, 0, 0, 0], 25 | [5, 4, 6, 2, 0], 26 | [4, 6, 2, 0, 0], 27 | [6, 2, 0, 0, 0], 28 | [2, 0, 0, 0, 0] 29 | ], 30 | "type": "gather", 31 | "num_gpus": 8, 32 | "num_chunks": 6, 33 | "chunks": [ 34 | 6, 35 | 1, 36 | 2, 37 | 2, 38 | 2, 39 | 1, 40 | 2, 41 | 2, 42 | 2, 43 | 1, 44 | 2, 45 | 2, 46 | 2, 47 | 1, 48 | 2, 49 | 2, 50 | 2, 51 | 1, 52 | 2, 53 | 2, 54 | 2, 55 | 1, 56 | 2, 57 | 2, 58 | 2 59 | ] 60 | } 61 | -------------------------------------------------------------------------------- /plans/dgx1_rings/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 8, 3 | "num_chunks": 6, 4 | "num_steps": 4, 5 | 
"plan": [ 6 | [0, 0, 0, 0, 0], 7 | [0, 4, 7, 6, 5], 8 | [0, 0, 4, 7, 6], 9 | [0, 0, 0, 4, 7], 10 | [0, 0, 0, 0, 4], 11 | [0, 3, 2, 1, 5], 12 | [0, 0, 3, 2, 1], 13 | [0, 0, 0, 3, 2], 14 | [0, 0, 0, 0, 3], 15 | [0, 4, 7, 6, 5], 16 | [0, 0, 4, 7, 6], 17 | [0, 0, 0, 4, 7], 18 | [0, 0, 0, 0, 4], 19 | [0, 3, 2, 1, 5], 20 | [0, 0, 3, 2, 1], 21 | [0, 0, 0, 3, 2], 22 | [0, 0, 0, 0, 3], 23 | [0, 1, 3, 7, 5], 24 | [0, 0, 1, 3, 7], 25 | [0, 0, 0, 1, 3], 26 | [0, 0, 0, 0, 1], 27 | [0, 2, 6, 4, 5], 28 | [0, 0, 2, 6, 4], 29 | [0, 0, 0, 2, 6], 30 | [0, 0, 0, 0, 2] 31 | ], 32 | "chunks": [ 33 | 6, 34 | 1, 35 | 2, 36 | 2, 37 | 2, 38 | 1, 39 | 2, 40 | 2, 41 | 2, 42 | 1, 43 | 2, 44 | 2, 45 | 2, 46 | 1, 47 | 2, 48 | 2, 49 | 2, 50 | 1, 51 | 2, 52 | 2, 53 | 2, 54 | 1, 55 | 2, 56 | 2, 57 | 2 58 | ], 59 | "type": "scatter" 60 | } 61 | -------------------------------------------------------------------------------- /plans/dgx1_symm/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 3 | "num_steps": 1, 4 | "type": "all2all", 5 | "plan": [[0, 0, 0], [0, 1, 1], [0, 2, 2], [0, 3, 3], [0, 4, 4], [0, 4, 5], [0, 4, 6], [0, 4, 7], 6 | [1, 0, 0], [1, 1, 1], [1, 2, 2], [1, 3, 3], [1, 5, 4], [1, 5, 5], [1, 5, 6], [1, 5, 7], 7 | [2, 0, 0], [2, 1, 1], [2, 2, 2], [2, 3, 3], [2, 6, 4], [2, 6, 5], [2, 6, 6], [2, 6, 7], 8 | [3, 0, 0], [3, 1, 1], [3, 2, 2], [3, 3, 3], [3, 7, 4], [3, 7, 5], [3, 7, 6], [3, 7, 7], 9 | [4, 0, 0], [4, 0, 1], [4, 0, 2], [4, 0, 3], [4, 4, 4], [4, 5, 5], [4, 6, 6], [4, 7, 7], 10 | [5, 1, 0], [5, 1, 1], [5, 1, 2], [5, 1, 3], [5, 4, 4], [5, 5, 5], [5, 6, 6], [5, 7, 7], 11 | [6, 2, 0], [6, 2, 1], [6, 2, 2], [6, 2, 3], [6, 4, 4], [6, 5, 5], [6, 6, 6], [6, 7, 7], 12 | [7, 3, 0], [7, 3, 1], [7, 3, 2], [7, 3, 3], [7, 4, 4], [7, 5, 
5], [7, 6, 6], [7, 7, 7]], 13 | "num_chunks": 1, 14 | "num_gpus": 8 15 | } -------------------------------------------------------------------------------- /plans/dgx1_symm/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "gather", 3 | "num_gpus": 8, 4 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1], 5 | "main_gpu": 0, 6 | "num_steps": 2, 7 | "plan": [ 8 | [0, 0, 0], 9 | [1, 0, 0], 10 | [2, 0, 0], 11 | [3, 0, 0], 12 | [4, 0, 0], 13 | [5, 1, 0], 14 | [6, 2, 0], 15 | [7, 3, 0] 16 | ], 17 | "num_chunks": 1 18 | } 19 | -------------------------------------------------------------------------------- /plans/dgx1_symm/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1], 3 | "type": "scatter", 4 | "main_gpu": 0, 5 | "plan": [ 6 | [0, 0, 0], 7 | [0, 0, 1], 8 | [0, 0, 2], 9 | [0, 0, 3], 10 | [0, 0, 4], 11 | [0, 1, 5], 12 | [0, 2, 6], 13 | [0, 3, 7] 14 | ], 15 | "num_chunks": 1, 16 | "num_steps": 2, 17 | "num_gpus": 8 18 | } 19 | -------------------------------------------------------------------------------- /plans/dgx2_direct/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [ 3 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 7 | ], 8 | "num_steps": 1, 9 | "type": "all2all", 10 | "plan": [ 11 | [ 0, 0], [ 0, 1], [ 0, 2], [ 0, 3], [ 0, 4], [ 0, 5], [ 0, 6], [ 0, 7], [ 0, 8], [ 0, 9], [ 0, 10], [ 0, 11], [ 0, 12], [ 0, 13], [ 0, 14], [ 0, 15], 12 | [ 1, 0], [ 1, 1], [ 1, 2], [ 1, 3], [ 1, 4], [ 1, 5], [ 1, 6], [ 1, 7], [ 1, 8], [ 1, 9], [ 1, 10], [ 1, 11], [ 1, 12], [ 1, 13], [ 1, 14], [ 1, 15], 13 | [ 2, 0], [ 2, 1], [ 2, 2], [ 2, 3], [ 2, 4], [ 2, 5], [ 2, 6], [ 2, 7], [ 2, 8], [ 2, 9], [ 2, 10], [ 2, 11], [ 2, 12], [ 2, 13], [ 2, 14], [ 2, 15], 14 | [ 3, 0], [ 3, 1], [ 3, 2], [ 3, 3], [ 3, 4], [ 3, 5], [ 3, 6], [ 3, 7], [ 3, 8], [ 3, 9], [ 3, 10], [ 3, 11], [ 3, 12], [ 3, 13], [ 3, 14], [ 3, 15], 15 | [ 4, 0], [ 4, 1], [ 4, 2], [ 4, 3], [ 4, 4], [ 4, 5], [ 4, 6], [ 4, 7], [ 4, 8], [ 4, 9], [ 4, 10], [ 4, 11], [ 4, 12], [ 4, 13], [ 4, 14], [ 4, 15], 16 | [ 5, 0], [ 5, 1], [ 5, 2], [ 5, 3], [ 5, 4], [ 5, 5], [ 5, 6], [ 5, 7], [ 5, 8], [ 5, 9], [ 5, 10], [ 5, 11], [ 5, 12], [ 5, 13], [ 5, 14], [ 5, 15], 17 | [ 6, 0], [ 6, 1], [ 6, 2], [ 6, 3], [ 6, 4], [ 6, 5], [ 6, 6], [ 6, 7], [ 6, 8], [ 6, 9], [ 6, 10], [ 6, 11], [ 6, 12], [ 6, 13], [ 6, 14], [ 6, 15], 18 | [ 7, 0], [ 7, 1], [ 7, 2], [ 7, 3], [ 7, 4], [ 7, 5], [ 7, 6], [ 7, 7], [ 7, 8], [ 7, 9], [ 7, 10], [ 7, 11], [ 7, 12], [ 7, 13], [ 7, 14], [ 7, 15], 19 | [ 8, 0], [ 8, 1], [ 8, 2], [ 8, 3], [ 8, 4], [ 8, 5], [ 8, 6], [ 8, 7], [ 8, 8], [ 8, 9], [ 8, 10], [ 8, 11], [ 8, 12], [ 8, 13], [ 8, 14], [ 8, 15], 20 | [ 9, 0], [ 9, 1], [ 9, 2], [ 9, 3], [ 9, 4], [ 9, 5], [ 9, 6], [ 9, 7], [ 9, 8], [ 9, 9], [ 9, 10], [ 9, 11], [ 9, 12], [ 9, 13], [ 9, 14], [ 9, 15], 21 | [10, 0], [10, 1], [10, 2], [10, 3], [10, 4], [10, 5], [10, 6], [10, 7], [10, 8], [10, 9], [10, 10], [10, 11], [10, 12], [10, 13], [10, 14], [10, 15], 22 | [11, 0], [11, 1], [11, 2], [11, 3], [11, 4], [11, 5], [11, 6], [11, 7], [11, 8], [11, 9], [11, 10], [11, 11], [11, 12], [11, 13], [11, 
14], [11, 15], 23 | [12, 0], [12, 1], [12, 2], [12, 3], [12, 4], [12, 5], [12, 6], [12, 7], [12, 8], [12, 9], [12, 10], [12, 11], [12, 12], [12, 13], [12, 14], [12, 15], 24 | [13, 0], [13, 1], [13, 2], [13, 3], [13, 4], [13, 5], [13, 6], [13, 7], [13, 8], [13, 9], [13, 10], [13, 11], [13, 12], [13, 13], [13, 14], [13, 15], 25 | [14, 0], [14, 1], [14, 2], [14, 3], [14, 4], [14, 5], [14, 6], [14, 7], [14, 8], [14, 9], [14, 10], [14, 11], [14, 12], [14, 13], [14, 14], [14, 15], 26 | [15, 0], [15, 1], [15, 2], [15, 3], [15, 4], [15, 5], [15, 6], [15, 7], [15, 8], [15, 9], [15, 10], [15, 11], [15, 12], [15, 13], [15, 14], [15, 15] 27 | ], 28 | "num_chunks": 1, 29 | "num_gpus": 16 30 | } -------------------------------------------------------------------------------- /plans/dgx2_direct/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "gather", 3 | "num_gpus": 16, 4 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 5 | "main_gpu": 0, 6 | "num_steps": 1, 7 | "plan": [ 8 | [0, 0], 9 | [1, 0], 10 | [2, 0], 11 | [3, 0], 12 | [4, 0], 13 | [5, 0], 14 | [6, 0], 15 | [7, 0], 16 | [8, 0], 17 | [9, 0], 18 | [10, 0], 19 | [11, 0], 20 | [12, 0], 21 | [13, 0], 22 | [14, 0], 23 | [15, 0] 24 | ], 25 | "num_chunks": 1 26 | } 27 | -------------------------------------------------------------------------------- /plans/dgx2_direct/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 3 | "type": "scatter", 4 | "main_gpu": 0, 5 | "plan": [ 6 | [0, 0], 7 | [0, 1], 8 | [0, 2], 9 | [0, 3], 10 | [0, 4], 11 | [0, 5], 12 | [0, 6], 13 | [0, 7], 14 | [0, 8], 15 | [0, 9], 16 | [0, 10], 17 | [0, 11], 18 | [0, 12], 19 | [0, 13], 20 | [0, 14], 21 | [0, 15] 22 | ], 23 | "num_chunks": 1, 24 | "num_steps": 1, 25 | "num_gpus": 16 26 | } 27 | 
-------------------------------------------------------------------------------- /plans/p100_quad_direct/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 4, 3 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 4 | "num_steps": 1, 5 | "plan": [ 6 | [0, 0], 7 | [0, 1], 8 | [0, 2], 9 | [0, 3], 10 | [1, 0], 11 | [1, 1], 12 | [1, 2], 13 | [1, 3], 14 | [2, 0], 15 | [2, 1], 16 | [2, 2], 17 | [2, 3], 18 | [3, 0], 19 | [3, 1], 20 | [3, 2], 21 | [3, 3] 22 | ], 23 | "type": "all2all", 24 | "num_chunks": 1 25 | } 26 | -------------------------------------------------------------------------------- /plans/p100_quad_direct/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "plan": [[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0]], 3 | "type": "gather", 4 | "num_gpus": 4, 5 | "main_gpu": 0, 6 | "num_chunks": 1, 7 | "chunks": [1, 1, 1, 1], 8 | "num_steps": 2 9 | } 10 | -------------------------------------------------------------------------------- /plans/p100_quad_direct/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "chunks": [1, 1, 1, 1], 3 | "num_steps": 2, 4 | "plan": [[0, 0, 0], [0, 1, 1], [0, 2, 2], [0, 3, 3]], 5 | "type": "scatter", 6 | "main_gpu": 0, 7 | "num_gpus": 4, 8 | "num_chunks": 1 9 | } 10 | -------------------------------------------------------------------------------- /plans/p100_quad_opt/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 4, 3 | "chunks": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 4 | "num_steps": 1, 5 | "plan": [ 6 | [0, 0], 7 | [0, 1], 8 | [0, 2], 9 | [0, 3], 10 | [1, 0], 11 | [1, 1], 12 | [1, 2], 13 | [1, 3], 14 | [2, 0], 15 | [2, 1], 16 | [2, 2], 17 | [2, 3], 18 | [3, 0], 19 | [3, 1], 20 | [3, 2], 21 | [3, 3] 22 | ], 23 | "type": "all2all", 24 | "num_chunks": 1 25 | } 
26 | -------------------------------------------------------------------------------- /plans/p100_quad_opt/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "gather", 3 | "num_gpus": 4, 4 | "num_chunks": 4, 5 | "num_steps": 6, 6 | "chunks": [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 7 | "plan": [[0, 0, 0, 0, 0, 0, 0], 8 | [1, 0, 0, 0, 0, 0, 0], 9 | [1, 1, 1, 0, 0, 0, 0], 10 | [1, 1, 1, 1, 1, 0, 0], 11 | [1, 1, 1, 1, 3, 3, 0], 12 | [2, 0, 0, 0, 0, 0, 0], 13 | [2, 2, 2, 0, 0, 0, 0], 14 | [2, 2, 2, 2, 2, 0, 0], 15 | [2, 3, 3, 0, 0, 0, 0], 16 | [3, 0, 0, 0, 0, 0, 0], 17 | [3, 3, 0, 0, 0, 0, 0], 18 | [3, 3, 3, 3, 0, 0, 0], 19 | [3, 3, 3, 3, 3, 0, 0]], 20 | "main_gpu": 0 21 | } -------------------------------------------------------------------------------- /plans/p100_quad_opt/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_chunks": 4, 3 | "main_gpu": 0, 4 | "num_gpus": 4, 5 | "type": "scatter", 6 | "chunks": [4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 7 | "plan": [ 8 | [0, 0, 0, 0, 0, 0, 0], 9 | [0, 0, 0, 0, 0, 0, 3], 10 | [0, 0, 0, 0, 0, 1, 1], 11 | [0, 0, 0, 0, 0, 2, 2], 12 | [0, 0, 0, 0, 0, 3, 3], 13 | [0, 0, 0, 0, 3, 3, 3], 14 | [0, 0, 0, 1, 1, 1, 1], 15 | [0, 0, 0, 2, 2, 2, 2], 16 | [0, 0, 0, 3, 3, 3, 3], 17 | [0, 0, 3, 3, 3, 1, 1], 18 | [0, 1, 1, 1, 1, 1, 1], 19 | [0, 2, 2, 2, 2, 2, 2], 20 | [0, 3, 3, 3, 2, 2, 2] 21 | ], 22 | "num_steps": 6 23 | } 24 | -------------------------------------------------------------------------------- /plans/p100_quad_rings/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 4, 3 | "chunks": [ 4 | 4, 5 | 4, 6 | 4, 7 | 4, 8 | 1, 9 | 2, 10 | 1, 11 | 2, 12 | 1, 13 | 2, 14 | 1, 15 | 2, 16 | 1, 17 | 2, 18 | 1, 19 | 2, 20 | 1, 21 | 2, 22 | 1, 23 | 2, 24 | 1, 25 | 2, 26 | 1, 27 | 2, 28 | 1, 29 | 2, 30 | 1, 31 | 2, 32 | 1, 33 | 2, 34 | 1, 35 | 
2, 36 | 1, 37 | 2, 38 | 1, 39 | 2 40 | ], 41 | "num_chunks": 4, 42 | "num_steps": 3, 43 | "main_gpu": 0, 44 | "type": "all2all", 45 | "plan": [ 46 | [0, 0, 0, 0], 47 | [1, 1, 1, 1], 48 | [2, 2, 2, 2], 49 | [3, 3, 3, 3], 50 | [0, 1, 2, 2], 51 | [0, 0, 0, 1], 52 | [0, 3, 2, 2], 53 | [0, 0, 0, 3], 54 | [1, 2, 3, 3], 55 | [1, 1, 1, 2], 56 | [1, 0, 3, 3], 57 | [1, 1, 1, 0], 58 | [2, 3, 0, 0], 59 | [2, 2, 2, 3], 60 | [2, 1, 0, 0], 61 | [2, 2, 2, 1], 62 | [3, 0, 1, 1], 63 | [3, 3, 3, 0], 64 | [3, 2, 1, 1], 65 | [3, 3, 3, 2], 66 | [0, 2, 1, 1], 67 | [0, 0, 0, 2], 68 | [0, 3, 1, 1], 69 | [0, 0, 0, 3], 70 | [1, 3, 0, 0], 71 | [1, 1, 1, 3], 72 | [1, 2, 0, 0], 73 | [1, 1, 1, 2], 74 | [2, 1, 3, 3], 75 | [2, 2, 2, 1], 76 | [2, 0, 3, 3], 77 | [2, 2, 2, 0], 78 | [3, 0, 2, 2], 79 | [3, 3, 3, 0], 80 | [3, 1, 2, 2], 81 | [3, 3, 3, 1] 82 | ] 83 | } 84 | -------------------------------------------------------------------------------- /plans/p100_quad_rings/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_gpus": 4, 3 | "num_steps": 2, 4 | "main_gpu": 0, 5 | "plan": [ 6 | [0, 0, 0], 7 | [2, 1, 0], 8 | [1, 0, 0], 9 | [2, 3, 0], 10 | [3, 0, 0], 11 | [1, 2, 0], 12 | [2, 0, 0], 13 | [1, 3, 0], 14 | [3, 0, 0] 15 | ], 16 | "chunks": [4, 1, 2, 1, 2, 1, 2, 1, 2], 17 | "num_chunks": 4, 18 | "type": "gather" 19 | } 20 | -------------------------------------------------------------------------------- /plans/p100_quad_rings/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_steps": 2, 3 | "num_gpus": 4, 4 | "chunks": [4, 1, 2, 1, 2, 1, 2, 1, 2], 5 | "num_chunks": 4, 6 | "type": "scatter", 7 | "main_gpu": 0, 8 | "plan": [ 9 | [0, 0, 0], 10 | [0, 1, 2], 11 | [0, 0, 1], 12 | [0, 3, 2], 13 | [0, 0, 3], 14 | [0, 2, 1], 15 | [0, 0, 2], 16 | [0, 3, 1], 17 | [0, 0, 3] 18 | ] 19 | } 20 | -------------------------------------------------------------------------------- 
/plans/v100_quad_opt/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "all2all", 3 | "num_chunks": 2, 4 | "chunks": [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 5 | "plan": [ 6 | [0, 0], 7 | [0, 1], 8 | [0, 2], 9 | [0, 3], 10 | [1, 0], 11 | [1, 1], 12 | [1, 2], 13 | [1, 3], 14 | [2, 0], 15 | [2, 1], 16 | [2, 2], 17 | [2, 3], 18 | [3, 0], 19 | [3, 1], 20 | [3, 2], 21 | [3, 3] 22 | ], 23 | "num_gpus": 4, 24 | "num_steps": 1 25 | } 26 | -------------------------------------------------------------------------------- /plans/v100_quad_opt/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_chunks": 2, 3 | "main_gpu": 0, 4 | "type": "gather", 5 | "num_gpus": 4, 6 | "chunks": [2, 1, 1, 1, 1, 1, 1], 7 | "num_steps": 2, 8 | "plan": [ 9 | [0, 0, 0], 10 | [1, 0, 0], 11 | [1, 1, 0], 12 | [2, 0, 0], 13 | [2, 2, 0], 14 | [3, 0, 0], 15 | [3, 3, 0] 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /plans/v100_quad_opt/scatter_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "main_gpu": 0, 3 | "num_steps": 2, 4 | "chunks": [2, 1, 1, 1, 1, 1, 1], 5 | "num_chunks": 2, 6 | "type": "scatter", 7 | "num_gpus": 4, 8 | "plan": [ 9 | [0, 0, 0], 10 | [0, 0, 1], 11 | [0, 0, 2], 12 | [0, 0, 3], 13 | [0, 1, 1], 14 | [0, 2, 2], 15 | [0, 3, 3] 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /plans/v100_quad_rings/all2all_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_steps": 3, 3 | "num_chunks": 6, 4 | "chunks": [ 5 | 6, 6 | 6, 7 | 6, 8 | 6, 9 | 1, 10 | 2, 11 | 1, 12 | 2, 13 | 1, 14 | 2, 15 | 1, 16 | 2, 17 | 1, 18 | 2, 19 | 1, 20 | 2, 21 | 1, 22 | 2, 23 | 1, 24 | 2, 25 | 1, 26 | 2, 27 | 1, 28 | 2, 29 | 1, 30 | 2, 31 | 1, 32 | 2, 33 | 1, 34 | 2, 35 | 1, 
36 | 2, 37 | 1, 38 | 2, 39 | 1, 40 | 2, 41 | 1, 42 | 2, 43 | 1, 44 | 2, 45 | 1, 46 | 2, 47 | 1, 48 | 2, 49 | 1, 50 | 2, 51 | 1, 52 | 2, 53 | 1, 54 | 2, 55 | 1, 56 | 2 57 | ], 58 | "type": "all2all", 59 | "plan": [ 60 | [0, 0, 0, 0], 61 | [1, 1, 1, 1], 62 | [2, 2, 2, 2], 63 | [3, 3, 3, 3], 64 | [0, 1, 2, 2], 65 | [0, 0, 0, 1], 66 | [0, 3, 2, 2], 67 | [0, 0, 0, 3], 68 | [1, 2, 3, 3], 69 | [1, 1, 1, 2], 70 | [1, 0, 3, 3], 71 | [1, 1, 1, 0], 72 | [2, 3, 0, 0], 73 | [2, 2, 2, 3], 74 | [2, 1, 0, 0], 75 | [2, 2, 2, 1], 76 | [3, 0, 1, 1], 77 | [3, 3, 3, 0], 78 | [3, 2, 1, 1], 79 | [3, 3, 3, 2], 80 | [0, 1, 3, 3], 81 | [0, 0, 0, 1], 82 | [0, 2, 3, 3], 83 | [0, 0, 0, 2], 84 | [1, 3, 2, 2], 85 | [1, 1, 1, 3], 86 | [1, 0, 2, 2], 87 | [1, 1, 1, 0], 88 | [2, 0, 1, 1], 89 | [2, 2, 2, 0], 90 | [2, 3, 1, 1], 91 | [2, 2, 2, 3], 92 | [3, 2, 0, 0], 93 | [3, 3, 3, 2], 94 | [3, 1, 0, 0], 95 | [3, 3, 3, 1], 96 | [0, 2, 1, 1], 97 | [0, 0, 0, 2], 98 | [0, 3, 1, 1], 99 | [0, 0, 0, 3], 100 | [1, 3, 0, 0], 101 | [1, 1, 1, 3], 102 | [1, 2, 0, 0], 103 | [1, 1, 1, 2], 104 | [2, 1, 3, 3], 105 | [2, 2, 2, 1], 106 | [2, 0, 3, 3], 107 | [2, 2, 2, 0], 108 | [3, 0, 2, 2], 109 | [3, 3, 3, 0], 110 | [3, 1, 2, 2], 111 | [3, 3, 3, 1] 112 | ], 113 | "num_gpus": 4 114 | } 115 | -------------------------------------------------------------------------------- /plans/v100_quad_rings/gather_plan.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_steps": 2, 3 | "num_chunks": 6, 4 | "num_gpus": 4, 5 | "type": "gather", 6 | "chunks": [6, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], 7 | "plan": [ 8 | [0, 0, 0], 9 | [2, 1, 0], 10 | [1, 0, 0], 11 | [2, 3, 0], 12 | [3, 0, 0], 13 | [3, 1, 0], 14 | [1, 0, 0], 15 | [3, 2, 0], 16 | [2, 0, 0], 17 | [1, 2, 0], 18 | [2, 0, 0], 19 | [1, 3, 0], 20 | [3, 0, 0] 21 | ] 22 | } 23 | -------------------------------------------------------------------------------- /plans/v100_quad_rings/scatter_plan.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "num_chunks": 6, 3 | "num_gpus": 4, 4 | "plan": [ 5 | [0, 0, 0], 6 | [0, 1, 2], 7 | [0, 0, 1], 8 | [0, 3, 2], 9 | [0, 0, 3], 10 | [0, 1, 3], 11 | [0, 0, 1], 12 | [0, 2, 3], 13 | [0, 0, 2], 14 | [0, 2, 1], 15 | [0, 0, 2], 16 | [0, 3, 1], 17 | [0, 0, 3] 18 | ], 19 | "type": "scatter", 20 | "num_steps": 2, 21 | "chunks": [6, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2] 22 | } 23 | -------------------------------------------------------------------------------- /scripts/dgx1_topology.txt: -------------------------------------------------------------------------------- 1 | GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_2 mlx5_1 mlx5_3 CPU Affinity 2 | GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS PIX SYS PHB SYS 0-19,40-59 3 | GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS PIX SYS PHB SYS 0-19,40-59 4 | GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PHB SYS PIX SYS 0-19,40-59 5 | GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PHB SYS PIX SYS 0-19,40-59 6 | GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS PIX SYS PHB 20-39,60-79 7 | GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS PIX SYS PHB 20-39,60-79 8 | GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS PHB SYS PIX 20-39,60-79 9 | GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS PHB SYS PIX 20-39,60-79 10 | mlx5_0 PIX PIX PHB PHB SYS SYS SYS SYS X SYS PHB SYS 11 | mlx5_2 SYS SYS SYS SYS PIX PIX PHB PHB SYS X SYS PHB 12 | mlx5_1 PHB PHB PIX PIX SYS SYS SYS SYS PHB SYS X SYS 13 | mlx5_3 SYS SYS SYS SYS PHB PHB PIX PIX SYS PHB SYS X 14 | 15 | Legend: 16 | 17 | X = Self 18 | SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) 19 | NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node 20 | PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) 21 | PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) 22 | PIX = Connection traversing a single PCIe switch 23 
def make_paths(ring, src, forward=True, wait=True):
    """Build the transfer paths that leave ``src`` along one direction of ``ring``.

    One path is produced per hop distance, starting with the longest
    (``half_num_gpus`` hops) and ending with a single hop. Each path is padded
    by repeating its first GPU (waiting before the transfer starts) and, when
    ``wait`` is true, by repeating its last GPU (idling after arrival).

    Reads the module-level ``half_num_gpus`` and ``num_gpus``.

    Args:
        ring: Sequence of GPU ids describing one ring.
        src: GPU id the paths start from; must be a member of ``ring``.
        forward: Walk the ring in listed order if true, in reverse otherwise.
        wait: If true, delay path ``i`` by a triangular number of steps and
            pad the tail so all paths have equal length; if false, delay
            path ``i`` by ``i`` steps and add no tail padding.

    Returns:
        A ``(plan, chunks)`` tuple: ``plan`` is a list of padded GPU-id paths
        and ``chunks`` holds one weight per path (1 for the longest path when
        ``num_gpus`` is even, 2 otherwise).

    Raises:
        ValueError: If ``src`` does not occur in ``ring``.
    """
    ordered = list(ring)
    # Searching an endless cycle for a GPU that is not in the ring would
    # never terminate, so reject that input up front.
    if src not in ordered:
        raise ValueError("GPU %r is not part of ring %r" % (src, ordered))
    if not forward:
        ordered.reverse()

    # Total delay budget; path i waits for the steps used by longer paths.
    full_wait = half_num_gpus * (half_num_gpus + 1) // 2

    plan = []
    chunks = []
    for i in range(half_num_gpus):
        hops = half_num_gpus - i
        # Rotate the ring so it starts at src, then take hops + 1 GPUs.
        rotated = dropwhile(lambda gpu: gpu != src, cycle(ordered))
        path = list(islice(rotated, hops + 1))

        if wait:
            wait_steps = full_wait - hops * (hops + 1) // 2
            fill_steps = (hops - 1) * hops // 2
        else:
            wait_steps = i
            fill_steps = 0

        plan.append([path[0]] * wait_steps + path + [path[-1]] * fill_steps)
        # NOTE(review): the longest path presumably carries half the weight on
        # even rings because its target is also reached from the opposite
        # direction -- confirm against the plan consumers.
        chunks.append(1 if i == 0 and num_gpus % 2 == 0 else 2)

    return plan, chunks
def _ring_walk(ring, src, length, forward):
    """Return ``length`` consecutive GPUs of ``ring`` starting at ``src``."""
    ordered = list(ring) if forward else list(reversed(ring))
    return list(islice(dropwhile(lambda gpu: gpu != src, cycle(ordered)), length))


def make_broadcast_plan(src, chunks_per_ring):
    """Build a pipelined broadcast plan from ``src`` over the module-level ``rings``.

    Every ring is used in both directions. For each direction and each of the
    ``chunks_per_ring`` chunks two paths are emitted under the same chunk id:
    a path that stays on ``src`` and a path that walks the ring once. Walking
    paths are shifted by one step per chunk and by ``chunks_per_ring`` steps
    per preceding ring, and padded at the end so all paths share one length.

    Args:
        src: GPU id that owns the data; must be a member of every ring.
        chunks_per_ring: Number of chunks sent per ring direction.

    Returns:
        A ``(plan, chunks)`` tuple of equally long lists: ``plan`` holds the
        GPU-id paths and ``chunks`` the chunk id assigned to each path.

    Raises:
        ValueError: If ``src`` is missing from one of the rings.
    """
    # An unknown source would make the ring rotation search forever.
    for ring in rings:
        if src not in ring:
            raise ValueError("GPU %r is not part of ring %r" % (src, ring))

    num_rings = len(rings)
    length = len(rings[0])
    path_length = length + chunks_per_ring - 1 + (num_rings - 1) * chunks_per_ring

    plan = []
    chunks = []
    for r, ring in enumerate(rings):
        # direction 0 walks the ring as listed, direction 1 in reverse.
        for direction, forward in enumerate((True, False)):
            walk = _ring_walk(ring, src, length, forward)
            for c in range(chunks_per_ring):
                chunk_id = (2 * r + direction) * chunks_per_ring + c
                # to self
                plan.append([src] * path_length)
                chunks.append(chunk_id)
                # to others: delay by earlier rings and earlier chunks, then
                # idle on the last GPU until the common path length is reached
                lead = r * chunks_per_ring + c
                tail = (chunks_per_ring - c - 1) + (num_rings - r - 1) * chunks_per_ring
                plan.append([walk[0]] * lead + walk + [walk[-1]] * tail)
                chunks.append(chunk_id)
    return plan, chunks
def get_topology_matrix(filename = ""):
    """Return the GPU-to-GPU NVLink matrix parsed from ``nvidia-smi topo -m`` output.

    The text is read from ``filename`` when one is given; otherwise
    ``nvidia-smi topo -m`` is executed and its standard output is used.
    Only lines that begin with ``GPU`` are treated as matrix rows.

    Returns:
        A square numpy array with one row and column per GPU row found. If no
        row mentions ``NV`` the array is all ones. Otherwise the diagonal
        holds the GPU count, ``NV<k>`` cells hold ``k`` and every other cell
        is zero.
    """
    if not filename:
        completed = run(["nvidia-smi", "topo", "-m"], stdout=PIPE, universal_newlines=True)
        raw_text = completed.stdout
    else:
        with open(filename, "r") as handle:
            raw_text = handle.read()

    gpu_rows = [row for row in raw_text.split('\n') if row.startswith("GPU")]
    num_gpus = len(gpu_rows)

    # Drop the row label and keep only the GPU-to-GPU columns.
    cells = [row.split()[1:num_gpus+1] for row in gpu_rows]

    has_nvlink = any("NV" in row for row in gpu_rows)
    if not has_nvlink:
        return np.ones((num_gpus, num_gpus))

    matrix = np.eye(num_gpus) * num_gpus
    for i, row_cells in enumerate(cells):
        for j, cell in enumerate(row_cells):
            if cell.startswith("NV"):
                # "NV2" -> 2 bonded links
                matrix[i, j] = int(cell[2:])

    return matrix
// Runs one multisplit + scatter + gather round driven by two transfer plans.
//
// Both plans are passed through their verify_plan() routines first. The
// function prints a message and returns without running anything when the
// plans disagree on the number of GPUs or on the main GPU, and it silently
// does nothing when either plan reports !valid().
//
// scatter_plan / gather_plan: parsed transfer plans for the two phases.
// batch_size, batch_size_secure: forwarded unchanged to
//     run_multisplit_scatter_gather().
//
// NOTE(review): the template parameter list (and other <...> argument lists)
// appear to have been stripped from this listing -- confirm against the
// original file before compiling.
template
void scatter_gather(
    gossip::transfer_plan_t scatter_plan,
    gossip::transfer_plan_t gather_plan,
    const size_t batch_size,
    const size_t batch_size_secure) {

    gossip::scatter::verify_plan(scatter_plan);
    gossip::gather::verify_plan(gather_plan);

    // Scatter and gather must describe the same set of GPUs ...
    auto num_gpus = scatter_plan.num_gpus();
    if(num_gpus != gather_plan.num_gpus()) {
        std::cout << "scatter and gather num_gpus does not match" << std::endl;
        return;
    }

    // ... and use the same main GPU.
    auto main_gpu = scatter_plan.main_gpu();
    if(main_gpu != gather_plan.main_gpu()) {
        std::cout << "scatter and gather main_gpu does not match" << std::endl;
        return;
    }

    // num_gpus entries, every one of them device id 0.
    std::vector device_ids(num_gpus, 0);

    if(scatter_plan.valid() && gather_plan.valid()) {

        auto context = gossip::context_t(device_ids);
        // context.print_connectivity_matrix();
        auto point2point = gossip::point2point_t(context);
        auto multisplit = gossip::multisplit_t(context);
        auto scatter = gossip::scatter_t(context, scatter_plan);
        auto gather = gossip::gather_t(context, gather_plan);

        run_multisplit_scatter_gather(
            context, point2point, multisplit, scatter, gather,
            main_gpu,
            batch_size, batch_size_secure);

        context.sync_hard();
    }
}
gossip::point2point_t(context); 126 | 127 | run_multisplit_broadcast( 128 | context, point2point, multisplit, broadcast, 129 | batch_size, batch_size_secure); 130 | 131 | context.sync_hard(); 132 | } 133 | } 134 | 135 | int main (int argc, char *argv[]) { 136 | using data_t = uint64_t; // base data type 137 | 138 | // parse args using https://github.com/muellan/clipp 139 | using namespace clipp; 140 | enum class mode {all2all, all2all_async, scatter_gather, broadcast, help}; 141 | 142 | mode selected; 143 | double security_factor = 1.5; 144 | size_t data_size = 28; 145 | std::string plan_file, scatter_plan_file, gather_plan_file; 146 | 147 | auto cli = 148 | ( 149 | ( 150 | ( 151 | ( 152 | ( 153 | command("all2all").set(selected, mode::all2all) | 154 | command("all2all_async").set(selected, mode::all2all_async) | 155 | command("broadcast").set(selected, mode::broadcast) 156 | ), 157 | value("transfer plan", plan_file) 158 | ) | 159 | ( 160 | command("scatter_gather").set(selected, mode::scatter_gather), 161 | value("scatter plan", scatter_plan_file), value("gather plan", gather_plan_file) 162 | ) 163 | ), 164 | option("--size", "-s") & value("size", data_size) % "data size (bytes log2) [default: 28]", 165 | option("--memory-factor") & value("factor", security_factor) % "memory security factor [default: 1.5]" 166 | ) | 167 | command("help").set(selected, mode::help) 168 | ); 169 | 170 | if(parse(argc, argv, cli)) 171 | { 172 | assert(data_size >= 4); 173 | data_size = 1UL << data_size; 174 | size_t data_size_secure = data_size * security_factor; 175 | 176 | // execute selected collective 177 | switch(selected) 178 | { 179 | case mode::all2all: 180 | std::cout << "RUN: all2all" << std::endl; 181 | all2all(parse_plan(plan_file.c_str()), data_size, data_size_secure); 182 | break; 183 | case mode::all2all_async: 184 | std::cout << "RUN: all2all_async" << std::endl; 185 | all2all_async(parse_plan(plan_file.c_str()), data_size, data_size_secure); 186 | break; 187 | case 
mode::broadcast: 188 | std::cout << "RUN: broadcast" << std::endl; 189 | broadcaster(parse_plan(plan_file.c_str()), data_size, data_size_secure); 190 | break; 191 | case mode::scatter_gather: 192 | std::cout << "RUN: scatter_gather" << std::endl; 193 | scatter_gather(parse_plan(scatter_plan_file.c_str()), parse_plan(gather_plan_file.c_str()), data_size, data_size_secure); 194 | break; 195 | case mode::help: 196 | std::cout << make_man_page(cli, "execute"). 197 | prepend_section("DESCRIPTION", " test gossip on uniformly distributed data"); 198 | break; 199 | } 200 | } 201 | else 202 | { 203 | std::cout << usage_lines(cli, "execute") << '\n'; 204 | } 205 | 206 | } 207 | --------------------------------------------------------------------------------