├── .gitmodules
├── README.md
├── collectives
│   ├── Makefile_aurora
│   ├── Makefile_delta
│   ├── Makefile_perlmutter
│   ├── main.cpp
│   ├── run_aurora.sh
│   ├── run_delta.sh
│   └── run_perlmutter.sh
├── hiccl.h
├── main.cpp
├── main.cu
├── misc
│   ├── IPDPS25_rebuttal.md
│   ├── hiccl_collectives_new.pdf
│   ├── hiccl_collectives_new.png
│   ├── rebuttal.md
│   └── test.md
└── source
    ├── bench.h
    ├── broadcast.h
    ├── coll.h
    ├── comm.h
    ├── command.h
    ├── compute.h
    ├── init.h
    └── reduce.h

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "CommBench"]
	path = CommBench
	url = https://github.com/merthidayetoglu/CommBench.git

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# HiCCL

HiCCL is a compositional communication library for hierarchical GPU networks. It offers an API for composing collective functions using *multicast*, *reduction*, and *fence* primitives. These primitives are machine- and library-agnostic, and are defined across GPU endpoints. HiCCL's design principle is to decouple the high-level communication design from machine-specific optimizations. This principle aims to improve productivity, portability, and performance when building custom collective functions.

HiCCL is based on [CommBench](https://github.com/merthidayetoglu/CommBench): a micro-benchmarking software for HPC networks. While HiCCL is a C++ layer for generating communication patterns on an abstract machine, CommBench is the middleware for implementing the patterns on an actual machine. The implementation relies on the point-to-point functions of the chosen communication library (MPI, NCCL, RCCL, or OneCCL), on IPC capabilities (e.g., put, get), and, more recently, on GASNet-EX RMA functions for non-MPI applications.

## API

The collective function is built within a persistent communicator. As an example, the code below shows an in-place composition of the All-Reduce collective.

```c++
#define PORT_CUDA
#include "hiccl.h"

#define T float

using namespace HiCCL;

int main() {

  size_t count = 1e9 / sizeof(T); // 1 GB

  Comm<T> allreduce;

  T *sendbuf;
  T *recvbuf;
  allocate(sendbuf, count * numproc);
  allocate(recvbuf, count * numproc);

  // partial reductions (each GPU gathers count elements from all GPUs for reduction)
  for (int i = 0; i < numproc; i++)
    allreduce.add_reduction(sendbuf + i * count, recvbuf + i * count, count, HiCCL::all, i);
  // express ordering of the primitives
  allreduce.add_fence();
  // multicast partial results (each GPU sends count elements to all GPUs except itself)
  for (int i = 0; i < numproc; i++)
    allreduce.add_multicast(recvbuf + i * count, recvbuf + i * count, count, i, HiCCL::others);

  // optimization parameters
  std::vector<int> hierarchy = {numproc / 12, 6, 2}; // hierarchical factorization
  std::vector<CommBench::library> lib = {CommBench::MPI, CommBench::IPC, CommBench::IPC}; // implementation libraries in each level
  int numstripe(1); // multi-rail striping (off)
  int ring(1); // number of virtual ring nodes (off)
  int pipeline(count / (1e6 / sizeof(T))); // MTU: 1 MB

  // initialize
  allreduce.init(hierarchy, lib, numstripe, ring, pipeline);

  // repetitive communications
  for (int iter = 0; iter < numiter; iter++) {
    // ...
    // nonblocking start
    allreduce.start();
    // ... overlap other things
    // blocking wait
    allreduce.wait();
    // ...
  }
}
```
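For comparison, other collectives compose with the same primitives. The sketch below is illustrative only, reusing the buffers and parameters defined in the example above; the root rank and the reuse of `hierarchy`, `lib`, and the other tuning knobs are assumptions, not library defaults.

```c++
// Hedged sketch: a Broadcast composed from a single multicast primitive.
// root = 0 is illustrative; parameters reuse the ones defined above.
Comm<T> bcast;
bcast.add_multicast(sendbuf, recvbuf, count, 0 /*root*/, HiCCL::all);
bcast.init(hierarchy, lib, numstripe, ring, pipeline);
bcast.start();
bcast.wait();
```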
![Collective throughput.](misc/hiccl_collectives_new.png)

For questions and support, please send an email to merth@stanford.edu.

--------------------------------------------------------------------------------
/collectives/Makefile_aurora:
--------------------------------------------------------------------------------
# ----- Make Macros -----

CXX = mpicxx
CXXFLAGS = -cxx=icpx -fsycl -fsycl-targets=spir64

LD_FLAGS = -qopenmp -fsycl -lze_loader -lccl
CMPIFLAGS =
CMPILIBFLAGS =

TARGETS = HiCCL
OBJECTS = main.o

# ----- Make Rules -----

all: $(TARGETS)

%.o : %.cpp
	${CXX} ${CXXFLAGS} ${CMPIFLAGS} $< -c -o $@

HiCCL: $(OBJECTS)
	$(CXX) -o $@ $(OBJECTS) $(CMPILIBFLAGS) $(LD_FLAGS)

clean:
	rm -f $(TARGETS) *.o *.o.* *.txt *.bin core *.html *.xml

--------------------------------------------------------------------------------
/collectives/Makefile_delta:
--------------------------------------------------------------------------------
# ----- Make Macros -----

NVCC = nvcc
NVCCFLAGS = -lineinfo -O3 -std=c++14 -gencode arch=compute_80,code=sm_80 -ccbin=mpicxx -Xcompiler -fopenmp -Xptxas="-v"

LD_FLAGS = -ccbin=mpicxx -Xcompiler -fopenmp -lnccl

TARGETS = HiCCL
OBJECTS = main.o

# ----- Make Rules -----

all: $(TARGETS)

%.o : %.cu
	${NVCC} ${NVCCFLAGS} $< -c -o $@

HiCCL: $(OBJECTS)
	$(NVCC) -o $@ $(OBJECTS) $(LD_FLAGS)

clean:
	rm -f $(TARGETS) *.o *.o.* *.txt *.bin core *.html *.xml

--------------------------------------------------------------------------------
/collectives/Makefile_perlmutter:
--------------------------------------------------------------------------------
# ----- Make Macros -----

CC = CC -target-accel=nvidia80 -fopenmp

NVCC = nvcc
NVCCFLAGS = -lineinfo -O3 -std=c++14 -gencode arch=compute_80,code=sm_80 -ccbin=CC -Xcompiler -fopenmp -Xptxas="-v"

LD_FLAGS = -L${NCCL_DIR}/lib -lnccl

TARGETS = HiCCL
OBJECTS = main.o

# ----- Make Rules -----

all: $(TARGETS)

%.o : %.cpp
	${CC} $< -c -o $@

%.o : %.cu
	${NVCC} ${NVCCFLAGS} $< -c -o $@

HiCCL: $(OBJECTS)
	$(CC) -o $@ $(OBJECTS) $(LD_FLAGS)

clean:
	rm -f $(TARGETS) *.o *.o.* *.txt *.bin core *.html *.xml
--------------------------------------------------------------------------------
/collectives/main.cpp:
--------------------------------------------------------------------------------
/* Copyright 2023 Stanford University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// #define PORT_SYCL
#define PORT_HIP
// #define PORT_CUDA
#include "../hiccl.h"

#define ROOT 0

// USER DEFINED TYPE
#define Type size_t
/*struct Type
{
  // int tag;
  int data[1];
  // complex x, y, z;
};*/

int main(int argc, char *argv[])
{
  // INITIALIZE
  CommBench::init();
  int myid = CommBench::myid;
  int numproc = CommBench::numproc;
  // MPI_Init(&argc, &argv);
  // MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  // MPI_Comm_size(MPI_COMM_WORLD, &numproc);
  // char machine_name[MPI_MAX_PROCESSOR_NAME];
  // int name_len = 0;
  // MPI_Get_processor_name(machine_name, &name_len);
  // printf("myid %d %s\n", myid, machine_name);

  // INPUT PARAMETERS
  int pattern = atoi(argv[1]);
  size_t count = atol(argv[2]);
  int numstripe = atoi(argv[3]);
  int ringnodes = atoi(argv[4]);
  int pipedepth = atoi(argv[5]);
  int warmup = atoi(argv[6]);
  int numiter = atoi(argv[7]);

  // PRINT NUMBER OF PROCESSES AND THREADS
  if(myid == CommBench::printid)
  {
    printf("\n");
    printf("Number of processes: %d\n", numproc);
    printf("Number of warmup %d\n", warmup);
    printf("Number of iterations %d\n", numiter);
    printf("\n");

    printf("Pattern: ");
    switch(pattern) {
      case HiCCL::gather        : printf("Gather\n"); break;
      case HiCCL::scatter       : printf("Scatter\n"); break;
      case HiCCL::broadcast     : printf("Broadcast\n"); break;
      case HiCCL::reduce        : printf("Reduce\n"); break;
      case HiCCL::alltoall      : printf("All-to-All\n"); break;
      case HiCCL::allgather     : printf("All-Gather\n"); break;
      case HiCCL::reducescatter : printf("Reduce-Scatter\n"); break;
      case HiCCL::allreduce     : printf("All-Reduce\n"); break;
    }
    printf("\n");
    printf("Bytes per Type %lu\n", sizeof(Type));
    printf("count %ld: ", count);
    CommBench::print_data(count * sizeof(Type));
    printf("\n");
    {
      size_t data = 2 * count * numproc * sizeof(Type);
      printf("sendbuf + recvbuf count: %zu + %zu = ", count * numproc, count * numproc);
      CommBench::print_data(data);
      printf("\n");
    }
    printf("Number of stripes: %d\n", numstripe);
    printf("Number of ring nodes: %d\n", ringnodes);
    printf("Pipeline depth: %d\n", pipedepth);
  }

  // ALLOCATE
  Type *sendbuf_d;
  Type *recvbuf_d;
  CommBench::allocate(sendbuf_d, count * numproc);
  CommBench::allocate(recvbuf_d, count * numproc);

  // COLLECTIVE COMMUNICATION
  {
    HiCCL::Comm<Type> coll;

    HiCCL::printid = -1;
    // PATTERN DESCRIPTION
    switch (pattern) {
      case HiCCL::gather :
        for(int sender = 0; sender < numproc; sender++)
          coll.add_bcast(sendbuf_d, 0, recvbuf_d, sender * count, count, sender, ROOT);
        break;
      case HiCCL::scatter :
        for(int recver = 0; recver < numproc; recver++)
          coll.add_reduce(sendbuf_d, recver * count, recvbuf_d, 0, count, ROOT, recver);
        break;
      case HiCCL::broadcast :
        coll.add_bcast(sendbuf_d, 0, recvbuf_d, 0, count * numproc, ROOT, HiCCL::all);
        // SCATTER + ALL-GATHER
        /* for(int recver = 0; recver < numproc; recver++)
             coll.add_reduce(sendbuf_d, recver * count, recvbuf_d, recver * count, count, ROOT, recver);
           coll.add_fence();
           for(int sender = 0; sender < numproc; sender++)
             coll.add_bcast(recvbuf_d, sender * count, recvbuf_d, sender * count, count, sender, HiCCL::others); */
        break;
      case HiCCL::reduce :
        coll.add_reduce(sendbuf_d, 0, recvbuf_d, 0, count * numproc, HiCCL::all, ROOT);
        // REDUCE-SCATTER + GATHER
        /* for(int recver = 0; recver < numproc; recver++)
             coll.add_reduce(sendbuf_d, recver * count, recvbuf_d, recver * count, count, HiCCL::all, recver);
           coll.add_fence();
           for(int sender = 0; sender < numproc; sender++)
             if(sender != ROOT)
               coll.add_bcast(recvbuf_d, sender * count, recvbuf_d, sender * count, count, sender, ROOT); */
        break;
      case HiCCL::alltoall :
        for(int sender = 0; sender < numproc; sender++)
          for(int recver = 0; recver < numproc; recver++)
            coll.add_bcast(sendbuf_d, recver * count, recvbuf_d, sender * count, count, sender, recver);
        break;
      case HiCCL::allgather :
        for(int sender = 0; sender < numproc; sender++)
          coll.add_bcast(sendbuf_d, 0, recvbuf_d, sender * count, count, sender, HiCCL::all);
        break;
      case HiCCL::reducescatter :
        for(int recver = 0; recver < numproc; recver++)
          coll.add_reduce(sendbuf_d, recver * count, recvbuf_d, 0, count, HiCCL::all, recver);
        break;
      case HiCCL::allreduce :
        // REDUCE + BROADCAST
        /* coll.add_reduce(sendbuf_d, 0, recvbuf_d, 0, count * numproc, HiCCL::all, ROOT);
           coll.add_fence();
           coll.add_bcast(recvbuf_d, 0, recvbuf_d, 0, count * numproc, ROOT, HiCCL::others); */
        // REDUCE-SCATTER + ALL-GATHER
        for(int recver = 0; recver < numproc; recver++)
          coll.add_reduce(sendbuf_d, recver * count, recvbuf_d, recver * count, count, HiCCL::all, recver);
        coll.add_fence();
        for(int sender = 0; sender < numproc; sender++)
          coll.add_bcast(recvbuf_d, sender * count, recvbuf_d, sender * count, count, sender, HiCCL::others);
        break;
      default:
        if(myid == CommBench::printid)
          printf("invalid collective option\n");
    }
    HiCCL::printid = 0;

    // INITIALIZE
    coll.set_hierarchy(std::vector<int> {4, 4, 2},
                       std::vector<CommBench::library> {CommBench::MPI, CommBench::IPC, CommBench::IPC});
    // coll.set_hierarchy(std::vector<int> {2, 2, 4, 2},
    //                    std::vector<CommBench::library> {CommBench::MPI, CommBench::MPI, CommBench::IPC, CommBench::IPC});
    // coll.set_hierarchy(std::vector<int> {32, 8},
    //                    std::vector<CommBench::library> {CommBench::MPI, CommBench::IPC});
    coll.set_numstripe(numstripe);
    coll.set_ringnodes(ringnodes);
    coll.set_pipedepth(pipedepth);

    CommBench::printid = -1;
    coll.init();
    CommBench::printid = 0;

    coll.measure(warmup, numiter, count * numproc / pipedepth);

    CommBench::report_memory();
    HiCCL::measure(warmup, numiter, count * numproc, coll);
    HiCCL::validate(sendbuf_d, recvbuf_d, count, pattern, ROOT, coll);
  }
  if(myid == CommBench::printid) {
    printf("approx. message length: ");
    CommBench::print_data((((double)count / numstripe) / pipedepth) * sizeof(Type));
    printf("\n");
  }

  // DEALLOCATE
  CommBench::free(sendbuf_d);
  CommBench::free(recvbuf_d);

  return 0;
} // main()

--------------------------------------------------------------------------------
/collectives/run_aurora.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#PBS -l select=2:system=sunspot,place=scatter
#PBS -A CSC249ADCD01_CNDA
#PBS -l walltime=01:00:00
#PBS -N 2nodes_gpu
#PBS -k doe

export TZ='/usr/share/zoneinfo/US/Central'
export OMP_PROC_BIND=spread
export OMP_NUM_THREADS=8
export OMP_PLACES=threads
#unset OMP_PLACES

date

echo Jobid: $PBS_JOBID
echo Running on host `hostname`
echo Running on nodes `cat $PBS_NODEFILE`

NNODES=`wc -l < $PBS_NODEFILE`
NRANKS=12 # Number of MPI ranks per node
NDEPTH=8 # Number of hardware threads per rank, spacing between MPI ranks on a node
# NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS

export MPICH_GPU_SUPPORT_ENABLED=1

NTOTRANKS=$(( NNODES * NRANKS ))

echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}"
echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES"

export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
PROC_LIST='list:0-7:8-15:16-23:24-31:32-39:40-47:52-59:60-67:68-75:76-83:84-91:92-99'

mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} --cpu-bind=$PROC_LIST gpu_tile_compact.sh ./cxi_assign_rr.sh ./HiCCL

date

--------------------------------------------------------------------------------
/collectives/run_delta.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#SBATCH --exclusive
#SBATCH --mem=0
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=16
#SBATCH --partition=gpuA100x4
#SBATCH --account=bbkf-delta-gpu
#SBATCH --time=00:30:00
### GPU options ###
#SBATCH --gpus-per-node=4

date

scontrol show job ${SLURM_JOBID}

module -t list

pattern=3
count=$((2 ** 25))
numstripe=4
ringnodes=4
pipedepth=128
warmup=5
numiter=10
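# Argument order expected by collectives/main.cpp (see its argv parsing):
#   pattern count numstripe ringnodes pipedepth warmup numiter
# Pattern indices follow the HiCCL::collective enum: 1=gather, 2=scatter,
# 3=broadcast, 4=reduce, 5=all-to-all, 6=all-gather, 7=reduce-scatter, 8=all-reduce.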
message length: "); 186 | CommBench::print_data((((double)count / numstripe) / pipedepth) * sizeof(Type)); 187 | printf("\n"); 188 | } 189 | 190 | // DEALLOCATE 191 | CommBench::free(sendbuf_d); 192 | CommBench::free(recvbuf_d); 193 | 194 | return 0; 195 | } // main() 196 | -------------------------------------------------------------------------------- /collectives/run_aurora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #PBS -l select=2:system=sunspot,place=scatter 4 | #PBS -A CSC249ADCD01_CNDA 5 | #PBS -l walltime=01:00:00 6 | #PBS -N 2nodes_gpu 7 | #PBS -k doe 8 | 9 | export TZ='/usr/share/zoneinfo/US/Central' 10 | export OMP_PROC_BIND=spread 11 | export OMP_NUM_THREADS=8 12 | export OMP_PLACES=threads 13 | #unset OMP_PLACES 14 | 15 | date 16 | 17 | echo Jobid: $PBS_JOBID 18 | echo Running on host `hostname` 19 | echo Running on nodes `cat $PBS_NODEFILE` 20 | 21 | NNODES=`wc -l < $PBS_NODEFILE` 22 | NRANKS=12 # Number of MPI ranks per node 23 | NDEPTH=8 # Number of hardware threads per rank, spacing between MPI ranks on a node 24 | # NTHREADS=$OMP_NUM_THREADS # Number of OMP threads per rank, given to OMP_NUM_THREADS 25 | 26 | export MPICH_GPU_SUPPORT_ENABLED=1 27 | 28 | NTOTRANKS=$(( NNODES * NRANKS )) 29 | 30 | echo "NUM_NODES=${NNODES} TOTAL_RANKS=${NTOTRANKS} RANKS_PER_NODE=${NRANKS} THREADS_PER_RANK=${OMP_NUM_THREADS}" 31 | echo "OMP_PROC_BIND=$OMP_PROC_BIND OMP_PLACES=$OMP_PLACES" 32 | 33 | export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1 34 | PROC_LIST='list:0-7:8-15:16-23:24-31:32-39:40-47:52-59:60-67:68-75:76-83:84-91:92-99' 35 | 36 | mpiexec -np ${NTOTRANKS} -ppn ${NRANKS} --cpu-bind=$PROC_LIST gpu_tile_compact.sh ./cxi_assign_rr.sh ./HiCCL 37 | 38 | date 39 | -------------------------------------------------------------------------------- /collectives/run_delta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --exclusive 4 | #SBATCH --mem=0 5 | #SBATCH --nodes=2 6 | #SBATCH --ntasks-per-node=4 7 | #SBATCH --cpus-per-task=16 8 | #SBATCH --partition=gpuA100x4 9 | #SBATCH --account=bbkf-delta-gpu 10 | #SBATCH --time=00:30:00 11 | ### GPU options ### 12 | #SBATCH --gpus-per-node=4 13 | 14 | date 15 | 16 | scontrol show job ${SLURM_JOBID} 17 | 18 | module -t list 19 | 20 | pattern=3 21 | count=$((2 ** 25)) 22 | numstripe=4 23 | ringnodes=4 24 | pipedepth=128 25 | warmup=5 26 | numiter=10 27 | 28 | srun ./HiCCL $pattern $count $numstripe $ringnodes $pipedepth $warmup $numiter 29 | 30 | date 31 | -------------------------------------------------------------------------------- /collectives/run_perlmutter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A m4301 3 | #SBATCH -C gpu 4 | #SBATCH -q regular 5 | #SBATCH -t 00:01:00 6 | #SBATCH -N 4 7 | #SBATCH --ntasks-per-node=4 8 | #SBATCH -c 32 9 | #SBATCH --gpus-per-task=1 10 | #SBATCH --gpu-bind=none 11 | #SBATCH --array=0-28 12 | 13 | date 14 | 15 | module -t list 16 | 17 | #export MPICH_OFI_NIC_VERBOSE=2 18 | #export MPICH_ENV_DISPLAY=1 19 | 20 | export SLURM_CPU_BIND="cores" 21 | 22 | warmup=5 23 | numiter=10 24 | 25 | ringnodes=1 26 | numstripe=1 27 | stripeoffset=1 28 | pipeoffset=1 29 | 30 | for pattern in 8 31 | do 32 | for pipedepth in 128 33 | do 34 | #for count in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216 33554432 67108864 134217728 268435456 35 | 
#for size in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
for size in $SLURM_ARRAY_TASK_ID
do
count=$((2**size))
srun -N 4 --ntasks-per-node=4 -C gpu -c 32 --gpus-per-task=1 --gpu-bind=none ./HiCCL $pattern $count $numstripe $ringnodes $pipedepth $warmup $numiter
done
done
done

date

--------------------------------------------------------------------------------
/hiccl.h:
--------------------------------------------------------------------------------
/* Copyright 2023 Stanford University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef HICCL_H
#define HICCL_H

// GPU PORTS
// For NVIDIA: #define PORT_CUDA
// For AMD: #define PORT_HIP
// For SYCL: #define PORT_SYCL
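// Example (illustrative): an NVIDIA build defines exactly one port before the
// include, as in main.cu at the repository root:
//   #define PORT_CUDA
//   #include "hiccl.h"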
#include "CommBench/commbench.h"

#include <vector>
#include <list>

namespace HiCCL {

  static int printid = 0;
  static const MPI_Comm &comm_mpi = CommBench::comm_mpi;
  static const int &numproc = CommBench::numproc;
  static const int &myid = CommBench::myid;

  static size_t buffsize = 0;
  static size_t recycle = 0;
  static size_t reuse = 0;

  enum pattern {all, others};
  enum collective {dummy, gather, scatter, broadcast, reduce, alltoall, allgather, reducescatter, allreduce};

#include "source/compute.h"
#include "source/coll.h"
#include "source/command.h"
#include "source/reduce.h"
#include "source/broadcast.h"
// #include "source/init.h"
#include "source/comm.h"
#include "source/bench.h"

}

#endif

--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
#define PORT_HIP
#include "hiccl.h"

#define Type size_t

int main() {

  int myid = CommBench::myid;
  int numproc = CommBench::numproc;
  int root = 0;

  size_t count = 128e6; // 1 GB per GPU
  size_t *weights;
  CommBench::allocate(weights, count);

  HiCCL::Comm<Type> allreduce;
  {
    // END POINTS
    Type *sendbuf;
    Type *recvbuf;
    CommBench::allocate(sendbuf, count);
    CommBench::allocate(recvbuf, count);

    // COMPOSITION
    /*
    // DIRECT REDUCE
    for(int recver = 0; recver < numproc; recver++)
      allreduce.add_reduce(sendbuf, 0, recvbuf, 0, count, HiCCL::all, recver);
    // REDUCE + BROADCAST
    allreduce.add_reduce(sendbuf, 0, recvbuf, 0, count, HiCCL::all, root); // all -> root
    allreduce.add_fence();
    allreduce.add_bcast(recvbuf, 0, recvbuf, 0, count, root, HiCCL::others); // root -> all - root
    // REDUCE + BROADCAST OMITTING THE ROOT REDUCTION
    int nodesize = 8;
    for(int i = 0; i < nodesize; i++)
      allreduce.add_reduce(sendbuf, i * (count / nodesize), recvbuf, i * (count / nodesize), count / nodesize, HiCCL::all, i);
    allreduce.add_fence();
    for(int i = 0; i < nodesize; i++)
      allreduce.add_bcast(recvbuf, i * (count / nodesize), recvbuf, i * (count / nodesize), count / nodesize, i, HiCCL::others);
    */
    // REDUCE-SCATTER + ALL-GATHER
    for(int recver = 0; recver < numproc; recver++)
      allreduce.add_reduce(sendbuf, recver * count / numproc, recvbuf, recver * count / numproc, count / numproc, HiCCL::all, recver);
    allreduce.add_fence();
    for(int sender = 0; sender < numproc; sender++)
      allreduce.add_bcast(recvbuf, sender * count / numproc, recvbuf, sender * count / numproc, count / numproc, sender, HiCCL::others);

    // SET PARAMETERS
    allreduce.set_hierarchy(std::vector<int> {2, 4, 2}, std::vector<CommBench::library> {CommBench::MPI, CommBench::IPC, CommBench::IPC});
    // allreduce.set_numstripe(8);
    // allreduce.set_ringnodes(16);
    allreduce.set_pipedepth(4);
    allreduce.set_endpoints(sendbuf, count, recvbuf, count);
    // CommBench::printid = -1;
    allreduce.init();
    // CommBench::printid = 0;
    CommBench::report_memory();

    HiCCL::validate(sendbuf, recvbuf, count / numproc, HiCCL::allreduce, root, allreduce);
  }

  allreduce.run(weights, weights);

  HiCCL::measure(5, 10, count, allreduce);

  CommBench::free(weights);

  return 0;
}

--------------------------------------------------------------------------------
/main.cu:
--------------------------------------------------------------------------------
#define PORT_CUDA
#include "hiccl.h"

int main() {

  size_t count = 256e6; // 1 GB
  float *weights;
  cudaMalloc(&weights, count * sizeof(float));

  HiCCL::Comm<float> allreduce;
  {
    // END POINTS
    float *sendbuf;
    float *recvbuf;
    CommBench::allocate(sendbuf, count);
    CommBench::allocate(recvbuf, count);

    // COMPOSITION
    int root = 0;
    allreduce.add_reduce(sendbuf, 0, recvbuf, 0, count, HiCCL::all, root); // all -> root
    allreduce.add_fence();
    allreduce.add_bcast(recvbuf, 0, recvbuf, 0, count, root, HiCCL::others); // root -> all - root

    // SET PARAMETERS
    // allreduce.set_hierarchy(std::vector<int> {2, 4}, std::vector<CommBench::library> {CommBench::XCCL, CommBench::IPC_get});
    // allreduce.set_numstripe(4);
    // allreduce.set_pipedepth(12);
    allreduce.set_endpoints(sendbuf, count, recvbuf, count);
    allreduce.init();
    CommBench::report_memory();
  }

  allreduce.run(weights, weights);
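  // Reuse note (illustrative): the communicator is persistent, so a
  // training-style loop could call allreduce.run(weights, weights) every
  // step without re-running the composition or init().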
  HiCCL::measure(5, 10, count, allreduce);

  cudaFree(weights);

  return 0;
}

--------------------------------------------------------------------------------
/misc/IPDPS25_rebuttal.md:
--------------------------------------------------------------------------------
**Common Questions**

**Q1: Performance at large scale (Reviewers 2, 3, 4).**
We are explicit about the limitation of our current approach beyond 256 nodes in S6-E, and mention that HiCCL could be extended with latency-oriented optimizations as a potential future direction. However, HPC strong-scaling workloads, as well as ML inference, typically use fewer than 256 nodes and would still benefit from this work. For example, as large language models become more efficient, inference production is typically done on a few nodes.

**Reviewer 1**

**Race condition in single step:**
HiCCL allows composition of collective communications in multiple steps as explained in S3-C. Each step is composed of reduction and multicast primitives that must not write to the same output. In other words, each output element must be updated by a single primitive.

**Derived data types:**
The data type is templatized and passed when initializing a communicator, as shown in Line 3 of Listing 3. Derived data types can be passed as a structure, and the reduction operation can be extended for the derived data type. In this respect, HiCCL can be used as a drop-in replacement for traditional data types, but it would require some additional engineering when it comes to derived data types.

**Theoretical throughput:**
The theoretical throughputs in Figure 8 are based on Table III. These formulas are based on BW = dg/t, where d is the volume of each point-to-point communication, g is the number of participating GPUs, and t is the elapsed time. Since all-to-all involves more point-to-point communications than the other collectives, all-to-all takes the most elapsed time for the same d. Therefore, all-to-all's algorithmic (theoretical) throughput is the lowest among all collectives.
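As a worked example with illustrative numbers (not taken from the paper): with d = 1 GB per point-to-point message, g = 8 GPUs, and t = 0.1 s, the formula gives

$$\mathrm{BW} = \frac{d \cdot g}{t} = \frac{1\ \mathrm{GB} \times 8}{0.1\ \mathrm{s}} = 80\ \mathrm{GB/s}.$$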
**Reviewer 2**

**Node placement:**
All experiments are conducted in a single SLURM session, resulting in consistent node placement across scaling. Thus all experiments use the same layout between runs. Further control over node placement is challenging without the assistance of administration. We will add this to the paper.

**Reviewer 3**

**Comparison with NCCL:**
As shown in Figure 10(a), NCCL achieves a higher throughput for node counts larger than four. We agree with the reviewer that NCCL is faster on medium to large node counts; however, it is a vendor-specific solution, whereas HiCCL reaches competitive performance while being portable across multiple vendors and architectures.

**Reviewer 4**

**Integration of a new API:**
HiCCL is designed for easy integration of new library APIs for mixed-library implementation. The collectives are ultimately implemented with point-to-point functions, and HiCCL takes advantage of the non-blocking point-to-point API of a new communication library via a simplified interface. We used that interface to integrate the existing libraries: NCCL, MPI, and IPC (CUDA/HIP/OneAPI). In fact, we have recently integrated GASNet for non-MPI applications in one day of engineering effort. In the end, the user can choose whichever library they want in a particular hierarchy level as in Line 14 of Listing 2.

**Intra-node communication hierarchies:**
The key contribution of this work is the abstraction of the communication hierarchies. HiCCL takes inter-tile and inter-GPU interconnects into account as explicitly stated in the 4th sentence of the 2nd paragraph of S6-C.2. The overall hierarchy is set using a vector as in Line 13 of Listing 2. In the Aurora example, the last two elements {6, 2} represent six devices (connected with XeLinks) with two tiles each (connected with MDFI). In the evaluation, we will include this detail and refer to Line 13 of Listing 2 for the convenience of the reader.

**Message size vs. bandwidth:**
We show the throughput for various message sizes in Figure 9 for a few representative cases. Since other systems / collectives show similar curves, we omitted them from the evaluation section for brevity.

**Strong scaling:**
We did not choose the message sizes based on any specific application. We chose them for the sake of stressing the network bandwidth with large messages and to investigate whether we can reach the theoretical limits. Therefore we chose large buffer sizes (8.6 GB and 17.2 GB) for the strong-scaling experiment. We observe that throughput-oriented optimizations break down with a large number of nodes.

**Latency:**
In our scaling experiments, we keep the buffer size per GPU constant while increasing the node count. In the tree configuration, the number of point-to-point communications increases whereas the volume per communication decreases. In strong scaling to thousands of GPUs, the work per GPU becomes so small (sub-MB regime) that latency dominates.

--------------------------------------------------------------------------------
/misc/rebuttal.md:
--------------------------------------------------------------------------------
MB regime.

**Rev1**

1) The hierarchical tree structure is automatically built from the user input, specifically through the vector in Listing 2, Line 13. This vector corresponds to the branching factors in each level. Listing 2 is intended for library developers utilizing HiCCL. An informed user can utilize HiCCL directly, yet setting the input parameters requires expertise.
2) We apologize for not matching the hierarchy in Listing 2 with one of those in Figure 5. Listing 2 is for Aurora, which has six GPUs and each GPU has two dies. Therefore there are 12 endpoints per node in total. When there are two nodes (numproc=24) the factorization in Listing 2 will be {2,6,2} (currently not displayed). For clarity, we will replace Figure 5(a) with a display of the parameters in Listing 2.
3) Figure 8 shows the algorithmic throughput of collective functions in isolation. The geometric mean of HiCCL's speedup over MPI is calculated based on throughput on four systems and eight collectives that are shown in Figure 8.

**Rev2**

- **Parameter selection:** The communication policies must be informed by the user through HiCCL's API. The user must set the intra-node hierarchy according to the dies per device and devices per node as shown in TableV-bold. In our experience, one can determine the optimal configuration in only three guesses per system. The most effective optimization across nodes is multi-NIC striping, which can be turned on by simply setting the number of stripes per node.

- **Alternative topologies:** Most recent GPU systems have hierarchical networks, therefore we designed hierarchical optimizations. A less common topology is the torus (e.g., Tofu), which would require different optimizations. The machine-agnostic collective composition (with three primitives) can be applied directly to any topology. However, the factorization of the primitives will be different, and must be specialized for the given network topology.

- **Round-robin GPU-to-NIC associations:** The Background section explains the achievable bandwidth of 75% due to load imbalance across NICs. This manifests in Aurora as shown in Fig. 8(d) with "not achievable" frames.

**Rev3**

**W3,W5,R1,D12,D15.** Refer-to-**A**

**W7,W11.** Refer-to-**B**.

**W1.** Refer-to-**C**

**W2.** Fence divides the collective into two steps. Each process waits for completion of the first step before starting the second, hence guaranteeing correctness (see the sketch below). Refer-to-**A**.
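A minimal sketch of the two-step semantics described above, following the reduce-then-broadcast composition in `main.cu` (buffer names, `count`, and `root` are illustrative):

```c++
coll.add_reduce(sendbuf, 0, recvbuf, 0, count, HiCCL::all, root);   // step 1: all -> root
coll.add_fence();   // every process completes step 1 before step 2 begins
coll.add_bcast(recvbuf, 0, recvbuf, 0, count, root, HiCCL::others); // step 2: root -> others
```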
**W4.** Yes, the user can tune the parameters in Listing 2 (13-17) for latency or for bandwidth. Refer-to-**C**.

**W6.** We exhaustively tested correctness with various 1) compositions, 2) parameters, and 3) systems. We describe the verification tests in the AD/AE appendix. Refer-to-**A**.

**W8.** We will review our terminology usage in the paper and make updates accordingly. We welcome any further feedback from the reviewer in this regard. Please consult our other answers when considering potential alignment with existing terms.

**W9.** To our knowledge, we have included all the work that directly relates to our contribution in Section VII. We will include happens-before semantics in related work and cite the work pointed out by the reviewer. We are open to adding additional work based on reviewers' suggestions.

**W10.** MPI on our test systems is vendor-provided and is optimized/tested extensively for acceptance tests. We worked with facility staff and MPICH developers and set the necessary tuning flags to maximize throughput. Refer-to-**D8**.

**R2.** Refer-to-**A** & **W9**.

**R3.** a) Refer-to-**A** b) In practice, the collective performance will be affected by additional communication (if any) and so will the theoretical bounds in Table III.

**D1.** The endpoints of primitives (sendbuf/recvbuf) cannot overlap.

**D6.** We chose the three primitives for their simplicity and expressivity. An alternative to multicast and reduction would be allGatherv and reduceScatterv. However, the alternative has a more complex interface than the original.

**D7.** We compare ring and tree for broadcast and reduce in Figure 8 to show the case where saturating bandwidth does not mean higher throughput. Similarly, we discuss that an all-reduce with reduction-only primitives is not communication optimal (SIII-B par.3, SIV-C par.2). On the other hand, a reduce followed by a broadcast (Table II row 14) is load imbalanced because of the reduction on a single GPU. Therefore we chose reduce-scatter followed by all-gather (Table II row 15), which is communication optimal.

**D8.** The core algorithms in MPI implementations are originally implemented for CPUs. GPU-aware MPI (OpenMPI/MPICH) typically moves the data to the CPU, runs the original algorithm as it is, and then moves the results back to the GPU. Therefore they do not take advantage of the direct links across GPUs.

**D9.** We can also run MVAPICH, but we rely on the available MPI implementations. Refer-to-**W10**. Regardless, we also show that there is little room for improvement over HiCCL as it already approaches theoretical limits (Figure 8).

**D11.** Striping is composed algebraically in SIV-C par.2, which can be generalized.

**D14.** For example, Perlmutter has NVLink within nodes and SS-11 across nodes. NCCL may be faster within nodes and MPI may be faster across nodes.

**D16.** Refer-to-**D8**. The core algorithms in GPU-aware MPI implementations are CPU based. It is hard to compare the ideas in HiCCL with GPU-aware MPI libraries.

**D17.** Refer-to-**W9**.

**D18.** [7] is criticized for costly code synthesis for collective communications, not for using SMT in general. We will clarify this in the final version.

**Rev4**

1) HiCCL relies on non-blocking point-to-point functions of existing GPU-aware communication libraries to implement collective communications. The repository includes a header file that can be modified for integrating additional APIs. For example, a user has integrated the GASNet library as an additional option in one day.
Similarly, NVSHMEM, UCX, or another desired library can be integrated as long as it has send/recv or put/get functions that provide a handle for waiting on remote completion.

2) Yes, please refer to SVI-C2 par.2. Our implementation exchanges remote memory handles (hipIpcMemHandle_t on Frontier and ze_ipc_mem_handle_t on Aurora) for utilizing interconnects across tiles and devices separately.

3) Figure 9 shows message sizes on the X axis and throughput on the Y axis for four collective functions on Perlmutter. We observe similar results on other collectives and systems, although we cannot include them in the paper due to space constraints.

4) We performed the scaling experiment for the sake of stressing the network bandwidth with large messages and finding the limit where the scaling breaks down. We found that the throughput is hampered with message sizes smaller than a MB.

5) The all-reduce algorithm (Fig. 2) parallelizes the reduction, where each GPU is responsible for reducing a partial piece of the data. With thousands of GPUs, the work per GPU becomes so small that the network and GPU kernel launch latencies, which would not be significant otherwise, become significant. It is future research to find novel compositions that maintain large message sizes at scale. HiCCL's compositional design will help productivity for developing interesting configurations. For example, one can try employing a single GPU per node (rather than all GPUs) for reducing partial data, to delay messages from getting too small when scaling out.

**Rev5**

- HiCCL's intended use is to replace one or a few throughput-critical functions manually with the original API (Listing 2). Similarly, a drop-in replacement can be achieved with a macro (a hedged sketch follows below). For legacy applications in Fortran, it is possible to create Fortran bindings of the C++ API.

- Refer-to-**B**.

We will address all minor typographical errors and corrections to figures and text in the final version.
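A minimal sketch of the macro idea mentioned in the Rev5 answer, assuming a pre-built HiCCL communicator named `hiccl_allreduce` that was composed and initialized for the same buffers and count as the call site (the macro name and setup are illustrative; HiCCL does not ship this macro):

```c++
// Hypothetical drop-in: route an existing all-reduce call site to a
// persistent HiCCL communicator prepared for the same endpoints.
#define ALLREDUCE_VIA_HICCL(sendbuf, recvbuf) \
  hiccl_allreduce.run((sendbuf), (recvbuf))
```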
--------------------------------------------------------------------------------
/source/bench.h:
--------------------------------------------------------------------------------
template <typename T>
void measure(int warmup, int numiter, size_t count, Comm<T> &comm) {

  double times[numiter];
  if(myid == printid)
    printf("%d warmup iterations (in order):\n", warmup);
  for (int iter = -warmup; iter < numiter; iter++) {

#ifdef PORT_CUDA
    cudaDeviceSynchronize();
#elif defined PORT_HIP
    hipDeviceSynchronize();
#endif
    MPI_Barrier(comm_mpi);
    double time = MPI_Wtime();

    comm.run();

    time = MPI_Wtime() - time;

    MPI_Allreduce(MPI_IN_PLACE, &time, 1, MPI_DOUBLE, MPI_MAX, comm_mpi);
    if(iter < 0) {
      if(myid == printid)
        printf("warmup: %e\n", time);
    }
    else
      times[iter] = time;
  }
  std::sort(times, times + numiter, [](const double &a, const double &b) -> bool {return a < b;});

  if(myid == printid) {
    printf("%d measurement iterations (sorted):\n", numiter);
    for(int iter = 0; iter < numiter; iter++) {
      printf("time: %.4e", times[iter]);
      if(iter == 0)
        printf(" -> min\n");
      else if(iter == numiter / 2)
        printf(" -> median\n");
      else if(iter == numiter - 1)
        printf(" -> max\n");
      else
        printf("\n");
    }
    printf("\n");
    double minTime = times[0];
    double medTime = times[numiter / 2];
    double maxTime = times[numiter - 1];
    double avgTime = 0;
    for(int iter = 0; iter < numiter; iter++)
      avgTime += times[iter];
    avgTime /= numiter;
    double data = count * sizeof(T);
    printf("Total data: "); CommBench::print_data(data); printf("\n");
    printf("Total minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9);
    printf("Total medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9);
    printf("Total maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9);
    printf("Total avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9);
    printf("\n");
  }
}
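// Usage note (illustrative, mirroring main.cu): HiCCL::measure(5, 10, count, allreduce)
// times 10 measured runs after 5 warmup runs and prints min/median/max/average
// throughput, where the reported data volume is count * sizeof(T).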
template <typename T>
void validate(T *sendbuf_d, T *recvbuf_d, size_t count, int patternid, int root, Comm<T> &comm) {

  T *sendbuf;
  T *recvbuf;
#ifdef PORT_CUDA
  cudaMallocHost(&sendbuf, count * numproc * sizeof(T));
  cudaMallocHost(&recvbuf, count * numproc * sizeof(T));
  cudaMemset(recvbuf_d, -1, count * numproc * sizeof(T));
#elif defined PORT_HIP
  hipHostMalloc(&sendbuf, count * numproc * sizeof(T));
  hipHostMalloc(&recvbuf, count * numproc * sizeof(T));
  hipMemset(recvbuf_d, -1, count * numproc * sizeof(T));
#elif defined PORT_SYCL
  sendbuf = sycl::malloc_host<T>(count * numproc, CommBench::q);
  recvbuf = sycl::malloc_host<T>(count * numproc, CommBench::q);
  CommBench::q.memset(recvbuf_d, -1, count); // call a kernel;
#endif
#pragma omp parallel for
  for(size_t i = 0; i < count * numproc; i++)
    sendbuf[i] = i;
#ifdef PORT_CUDA
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(sendbuf_d, sendbuf, count * numproc * sizeof(T), cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);
#elif defined PORT_HIP
  hipStream_t stream;
  hipStreamCreate(&stream);
  hipMemcpyAsync(sendbuf_d, sendbuf, count * numproc * sizeof(T), hipMemcpyHostToDevice, stream);
  hipStreamSynchronize(stream);
#elif defined PORT_SYCL
  CommBench::q.memcpy(sendbuf_d, sendbuf, count * numproc * sizeof(T)).wait();
#endif
  MPI_Barrier(comm_mpi);

  // comm.run();
  // comm.run(sendbuf_d, recvbuf_d);
  comm.start();
  comm.wait();

#ifdef PORT_CUDA
  cudaMemcpyAsync(recvbuf, recvbuf_d, count * numproc * sizeof(T), cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
#elif defined PORT_HIP
  hipMemcpyAsync(recvbuf, recvbuf_d, count * numproc * sizeof(T), hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);
#elif defined PORT_SYCL
  CommBench::q.memcpy(recvbuf, recvbuf_d, count * numproc * sizeof(T));
  CommBench::q.wait();
#endif

  MPI_Barrier(comm_mpi);
  double time = MPI_Wtime();
  unsigned long errorcount = 0;
  bool pass = true;
  switch(patternid) {
    case gather: if(myid == printid) printf("VERIFY GATHER ROOT = %d: ", root);
      if(myid == root)
        for(int p = 0; p < numproc; p++)
          for(size_t i = 0; i < count; i++) {
            // printf("myid %d recvbuf[%zu] = %d\n", myid, p * count + i, recvbuf[p * count + i]);
            if(recvbuf[p * count + i] != i) {
              pass = false;
              errorcount++;
            }
          }
      break;
    case scatter: if(myid == printid) printf("VERIFY SCATTER ROOT = %d: ", root);
      for(size_t i = 0; i < count; i++) {
        // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
        if(recvbuf[i] != myid * count + i) {
          pass = false;
          errorcount++;
        }
      }
      break;
    case broadcast: if(myid == printid) printf("VERIFY BCAST ROOT = %d: ", root);
      for(size_t i = 0; i < count * numproc; i++) {
        // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
        if(recvbuf[i] != i) {
          pass = false;
          errorcount++;
        }
      }
      break;
    case reduce: if(myid == printid) printf("VERIFY REDUCE ROOT = %d: ", root);
      if(myid == root)
        for(size_t i = 0; i < count * numproc; i++) {
          // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
          if(recvbuf[i] != i * numproc) {
            pass = false;
            errorcount++;
          }
        }
      break;
    case alltoall: if(myid == printid) printf("VERIFY ALL-TO-ALL: ");
      for(int p = 0; p < numproc; p++)
        for(size_t i = 0; i < count; i++) {
          // printf("myid %d recvbuf[%d] = %d\n", myid, i, recvbuf[i]);
          if(recvbuf[p * count + i] != myid * count + i) {
            pass = false;
            errorcount++;
          }
        }
      break;
    case allgather: if(myid == printid) printf("VERIFY ALL-GATHER: ");
      for(int p = 0; p < numproc; p++)
        for(size_t i = 0; i < count; i++) {
          // if(myid == printid) printf("myid %d recvbuf[%d] = %d (%d)\n", myid, p * count + i, recvbuf[p * count + i], i);
          if(recvbuf[p * count + i] != i) {
            pass = false;
            errorcount++;
          }
        }
      break;
    case reducescatter: if(myid == printid) printf("VERIFY REDUCE-SCATTER: ");
      for(size_t i = 0; i < count; i++) {
        // if(myid == printid) printf("myid %d recvbuf[%d] = %d (%d)\n", myid, i, recvbuf[i], (myid * count + i) * numproc);
        if(recvbuf[i] != (myid * count + i) * numproc) {
          pass = false;
          errorcount++;
        }
      }
      break;
    case allreduce: if(myid == printid) printf("VERIFY ALL-REDUCE: ");
      for(size_t i = 0; i < count * numproc; i++) {
        // if(myid == printid) printf("myid %d recvbuf[%d] = %d (%d)\n", myid, i, recvbuf[i], i * numproc);
        if(recvbuf[i] != i * numproc) {
          pass = false;
          errorcount++;
        }
      }
      break;
    default:
      pass = false;
      break;
  }
  MPI_Allreduce(MPI_IN_PLACE, &pass, 1, MPI_C_BOOL, MPI_LAND, comm_mpi);
  if(myid == printid) {
    if(pass)
      printf("PASSED!\n");
    else
      printf("FAILED!!!\n");
  }
  if(myid == printid)
    printf("verification time: %e seconds\n", MPI_Wtime() - time);
  if(!pass) {
    std::vector<unsigned long> errorcounts(numproc);
    MPI_Allgather(&errorcount, 1, MPI_UNSIGNED_LONG, errorcounts.data(), 1, MPI_UNSIGNED_LONG, comm_mpi);
    MPI_Allreduce(MPI_IN_PLACE, &errorcount, 1, MPI_UNSIGNED_LONG, MPI_SUM, comm_mpi);
    if(myid == printid) {
      printf("count %zu total errorcount %zu\n", count, errorcount);
      for(int proc = 0; proc < numproc; proc++)
        printf("proc %d errorcount: %zu\n", proc, errorcounts[proc]);
    }
  }

#ifdef PORT_CUDA
  cudaFreeHost(sendbuf);
  cudaFreeHost(recvbuf);
#elif defined PORT_HIP
  hipHostFree(sendbuf);
  hipHostFree(recvbuf);
#endif
}

--------------------------------------------------------------------------------
/source/broadcast.h:
--------------------------------------------------------------------------------

template <typename T>
struct BROADCAST {
  T* sendbuf;
  size_t sendoffset;
  T* recvbuf;
  size_t recvoffset;
  size_t count;
  int sendid;
  std::vector<int> recvids;

  void report() {
    if(printid < 0)
      return;
    MPI_Barrier(comm_mpi);
    if(myid == sendid) {
      MPI_Send(&sendbuf, sizeof(T*), MPI_BYTE, printid, 0, comm_mpi);
      MPI_Send(&sendoffset, sizeof(size_t), MPI_BYTE, printid, 0, comm_mpi);
    }
    for(auto &recvid : this->recvids) {
      if(myid == recvid) {
        MPI_Send(&recvbuf, sizeof(T*), MPI_BYTE, printid, 0, comm_mpi);
        MPI_Send(&recvoffset, sizeof(size_t), MPI_BYTE, printid, 0, comm_mpi);
      }
    }
    if(myid == printid) {
      T* sendbuf_sendid;
      size_t sendoffset_sendid;
      MPI_Recv(&sendbuf_sendid, sizeof(T*), MPI_BYTE, sendid, 0, comm_mpi, MPI_STATUS_IGNORE);
      MPI_Recv(&sendoffset_sendid, sizeof(size_t), MPI_BYTE, sendid, 0, comm_mpi, MPI_STATUS_IGNORE);
      std::vector<T*> recvbuf_recvid(recvids.size());
      std::vector<size_t> recvoffset_recvid(recvids.size());
      for(int recv = 0; recv < recvids.size(); recv++) {
        MPI_Recv(&recvbuf_recvid[recv], sizeof(T*), MPI_BYTE, recvids[recv], 0, comm_mpi, MPI_STATUS_IGNORE);
        MPI_Recv(&recvoffset_recvid[recv], sizeof(size_t), MPI_BYTE, recvids[recv], 0, comm_mpi, MPI_STATUS_IGNORE);
      }
      printf("BROADCAST report: count %lu (", count);
      CommBench::print_data(count * sizeof(T));
      printf(")\n");
      char text[1000];
      int n = sprintf(text, "sendid %d sendbuf %p sendoffset %lu -> ", sendid, sendbuf_sendid, sendoffset_sendid);
      printf("%s", text);
      memset(text, ' ', n);
      for(int recv = 0; recv < recvids.size(); recv++) {
        printf("recvid: %d recvbuf %p recvoffset %lu\n", recvids[recv], recvbuf_recvid[recv], recvoffset_recvid[recv]);
        printf("%s", text);
      }
      printf("\n");
    }
  }

  BROADCAST(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, int sendid, std::vector<int> &recvids)
    : sendbuf(sendbuf), sendoffset(sendoffset), recvbuf(recvbuf), recvoffset(recvoffset), count(count), sendid(sendid), recvids(recvids) {}
  BROADCAST(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, int sendid, int recvid)
    : sendbuf(sendbuf), sendoffset(sendoffset), recvbuf(recvbuf), recvoffset(recvoffset), count(count), sendid(sendid) {
    for(int i = 0; i < numproc; i++) {
      if(recvid == numproc)
        recvids.push_back(i);
      else if(recvid == -1) {
        if(i != sendid)
          recvids.push_back(i);
      }
      else
        if(i == recvid)
          recvids.push_back(i);
    }
  }
};
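// Construction sketch (values illustrative, not from the repository): rank 0
// multicasting `count` elements to ranks 1-3 through the vector constructor:
//   std::vector<int> recvids = {1, 2, 3};
//   BROADCAST<float> bcast(sendbuf, 0, recvbuf, 0, count, 0, recvids);
// The int overload above expands recvid == numproc to all ranks and
// recvid == -1 to every rank except the sender.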
allocates\n", recvid, myid); 157 | } 158 | coll_temp->add(bcast.sendbuf, bcast.sendoffset, recvbuf, recvoffset, bcast.count, bcast.sendid, recvid); 159 | if(recvids.size()) 160 | bcastlist_new.push_back(BROADCAST(recvbuf, recvoffset, bcast.recvbuf, bcast.recvoffset, bcast.count, recvid, recvids)); 161 | } 162 | } 163 | } 164 | } 165 | } 166 | } 167 | if(coll_temp->numcomm) 168 | coll_list.push_back(coll_temp); 169 | else 170 | delete coll_temp; 171 | bcast_tree(numlevel, groupsize, lib, bcastlist_new, level + 1, coll_list); 172 | } 173 | 174 | template 175 | void bcast_ring(int groupsize, CommBench::library lib, std::vector> &bcastlist, std::vector> &bcastlist_intra, std::list*> &coll_list) { 176 | 177 | std::vector> bcastlist_extra; 178 | 179 | Coll *coll_temp = new Coll(lib); 180 | 181 | for(auto &bcast : bcastlist) { 182 | int sendnode = bcast.sendid / groupsize; 183 | std::vector recvids_intra; 184 | std::vector recvids_extra; 185 | for(auto &recvid : bcast.recvids) { 186 | int recvnode = recvid / groupsize; 187 | if(sendnode == recvnode) 188 | recvids_intra.push_back(recvid); 189 | else 190 | recvids_extra.push_back(recvid); 191 | } 192 | // if(printid == printid) 193 | // printf("recvids_intra: %zu recvids_extra: %zu\n", recvids_intra.size(), recvids_extra.size()); 194 | if(recvids_intra.size()) 195 | bcastlist_intra.push_back(BROADCAST(bcast.sendbuf, bcast.sendoffset, bcast.recvbuf, bcast.recvoffset, bcast.count, bcast.sendid, recvids_intra)); 196 | if(recvids_extra.size()) { 197 | T *recvbuf; 198 | size_t recvoffset; 199 | int recvid = ((sendnode + 1) % (numproc / groupsize)) * groupsize + bcast.sendid % groupsize; 200 | bool found = false; 201 | for(auto it = recvids_extra.begin(); it != recvids_extra.end(); it++) 202 | if(*it == recvid) { 203 | found = true; 204 | recvids_extra.erase(it); 205 | break; 206 | } 207 | if(myid == recvid) { 208 | if(found) { 209 | recvbuf = bcast.recvbuf; 210 | recvoffset = bcast.recvoffset; 211 | reuse += bcast.count; 212 | } 213 | else { 214 | CommBench::allocate(recvbuf, bcast.count); 215 | recvoffset = 0; 216 | buffsize += bcast.count; 217 | } 218 | } 219 | coll_temp->add(bcast.sendbuf, bcast.sendoffset, recvbuf, recvoffset, bcast.count, bcast.sendid, recvid); 220 | if(recvids_extra.size()) 221 | bcastlist_extra.push_back(BROADCAST(recvbuf, recvoffset, bcast.recvbuf, bcast.recvoffset, bcast.count, recvid, recvids_extra)); 222 | } 223 | } 224 | if(coll_temp->numcomm) 225 | coll_list.push_back(coll_temp); 226 | else 227 | delete coll_temp; 228 | 229 | if(bcastlist_extra.size()) // IMPLEMENT RING FOR EXTRA-NODE COMMUNICATIONS (IF THERE IS STILL LEFT) 230 | bcast_ring(groupsize, lib, bcastlist_extra, bcastlist_intra, coll_list); 231 | /*else { // ELSE IMPLEMENT TREE FOR INTRA-NODE COMMUNICATION 232 | std::vector groupsize_temp(groupsize, groupsize + numlevel); 233 | groupsize_temp[0] = numproc; 234 | bcast_tree(numlevel, groupsize_temp.data(), lib, bcastlist_intra, 1, coll_list); 235 | }*/ 236 | } 237 | 238 | template 239 | void stripe(int numstripe, std::vector> &bcastlist, std::vector

&split_list) { 240 | 241 | int nodesize = numstripe; 242 | 243 | // SEPARATE INTRA AND INTER NODES 244 | std::vector> bcastlist_intra; 245 | std::vector> bcastlist_inter; 246 | for(auto &bcast : bcastlist) { 247 | int sendid = bcast.sendid; 248 | std::vector recvid_intra; 249 | std::vector recvid_inter; 250 | for(auto &recvid : bcast.recvids) 251 | if(sendid / nodesize == recvid / nodesize) 252 | recvid_intra.push_back(recvid); 253 | else 254 | recvid_inter.push_back(recvid); 255 | if(recvid_inter.size()) 256 | bcastlist_inter.push_back(BROADCAST(bcast.sendbuf, bcast.sendoffset, bcast.recvbuf, bcast.recvoffset, bcast.count, bcast.sendid, bcast.recvids)); 257 | else 258 | bcastlist_intra.push_back(BROADCAST(bcast.sendbuf, bcast.sendoffset, bcast.recvbuf, bcast.recvoffset, bcast.count, bcast.sendid, bcast.recvids)); 259 | } 260 | // CLEAR BROADCASTLIST 261 | bcastlist.clear(); 262 | // ADD INTRA-NODE BROADCAST DIRECTLY (IF ANY) 263 | for(auto &bcast : bcastlist_intra) 264 | bcastlist.push_back(BROADCAST(bcast.sendbuf, bcast.sendoffset, bcast.recvbuf, bcast.recvoffset, bcast.count, bcast.sendid, bcast.recvids)); 265 | 266 | // ADD INTER-NODE BROADCAST BY STRIPING 267 | if(bcastlist_inter.size()) { 268 | for(auto &bcast : bcastlist_inter) { 269 | int sendgroup = bcast.sendid / nodesize; 270 | size_t splitoffset = 0; 271 | for(int stripe = 0; stripe < numstripe; stripe++) { 272 | int sender = sendgroup * nodesize + stripe; 273 | size_t splitcount = bcast.count / numstripe + (stripe < bcast.count % numstripe ? 1 : 0); 274 | if(splitcount) { 275 | T *sendbuf; 276 | size_t sendoffset; 277 | std::vector recvids = bcast.recvids; 278 | if(sender != bcast.sendid) { 279 | bool found = false; 280 | // REUSE 281 | for(auto it = recvids.begin(); it < recvids.end(); it++) { 282 | if(*it == sender) { 283 | recvids.erase(it); 284 | found = true; 285 | break; 286 | } 287 | } 288 | if(found) { 289 | if(myid == sender) { 290 | sendbuf = bcast.recvbuf; 291 | sendoffset = bcast.recvoffset + splitoffset; 292 | reuse += splitcount; 293 | } 294 | } 295 | else { 296 | if(myid == sender) { 297 | CommBench::allocate(sendbuf, splitcount); 298 | sendoffset = 0; 299 | buffsize += splitcount; 300 | } 301 | } 302 | split_list.push_back(P(bcast.sendbuf, bcast.sendoffset + splitoffset, sendbuf, sendoffset, splitcount, bcast.sendid, sender)); 303 | } 304 | else { 305 | if(myid == sender) { 306 | sendbuf = bcast.sendbuf; 307 | sendoffset = bcast.sendoffset + splitoffset; 308 | reuse += splitcount; 309 | } 310 | } 311 | bcastlist.push_back(BROADCAST(sendbuf, sendoffset, bcast.recvbuf, bcast.recvoffset + splitoffset, splitcount, sender, recvids)); 312 | splitoffset += splitcount; 313 | } 314 | else 315 | break; 316 | } 317 | } 318 | } 319 | } 320 | 321 | template 322 | void partition(std::vector> &bcastlist, int numbatch, std::vector>> &bcast_batch) { 323 | for(auto &bcast : bcastlist) { 324 | size_t batchoffset = 0; 325 | for(int batch = 0; batch < numbatch; batch++) { 326 | size_t batchsize = bcast.count / numbatch + (batch < bcast.count % numbatch ? 
      if(batchsize) {
        bcast_batch[batch].push_back(BROADCAST<T>(bcast.sendbuf, bcast.sendoffset + batchoffset, bcast.recvbuf, bcast.recvoffset + batchoffset, batchsize, bcast.sendid, bcast.recvids));
        batchoffset += batchsize;
      }
      else
        break;
    }
  }
}

--------------------------------------------------------------------------------
/source/coll.h:
--------------------------------------------------------------------------------
template <typename T>
class Coll {

public:

  CommBench::library lib;

  // Communication
  int numcomm = 0;
  std::vector<T*> sendbuf;
  std::vector<size_t> sendoffset;
  std::vector<T*> recvbuf;
  std::vector<size_t> recvoffset;
  std::vector<size_t> count;
  std::vector<int> sendid;
  std::vector<int> recvid;

  // Computation
  int numcompute = 0;
  std::vector<std::vector<T*>> inputbuf;
  std::vector<T*> outputbuf;
  std::vector<size_t> numreduce;
  std::vector<int> compid;

  Coll(CommBench::library lib) : lib(lib) {}

  void add(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, int sendid, int recvid) {
    this->sendbuf.push_back(sendbuf);
    this->sendoffset.push_back(sendoffset);
    this->recvbuf.push_back(recvbuf);
    this->recvoffset.push_back(recvoffset);
    this->count.push_back(count);
    this->sendid.push_back(sendid);
    this->recvid.push_back(recvid);
    numcomm++;
  }

  void add(std::vector<T*> inputbuf, T* outputbuf, size_t numreduce, int compid) {
    this->inputbuf.push_back(inputbuf);
    this->outputbuf.push_back(outputbuf);
    this->numreduce.push_back(numreduce);
    this->compid.push_back(compid);
    numcompute++;
  }
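  // Usage sketch (ranks and buffers illustrative): a level built over IPC that
  // registers one point-to-point transfer and prints its communication matrix:
  //   Coll<float> level(CommBench::IPC);
  //   level.add(sendbuf, 0, recvbuf, 0, count, /*sendid=*/0, /*recvid=*/1);
  //   level.report();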
"); 66 | printf("\n"); 67 | } 68 | printf("\n"); 69 | } 70 | if(this->numcompute) { 71 | printf("computation: "); 72 | std::vector input(numproc, 0); 73 | std::vector output(numproc, 0); 74 | size_t inputdata = 0; 75 | size_t outputdata = 0; 76 | for(int i = 0; i < this->numcompute; i++) { 77 | inputdata += this->numreduce[i] * sizeof(T) * this->inputbuf[i].size(); 78 | outputdata += this->numreduce[i] * sizeof(T); 79 | input[this->compid[i]] += this->inputbuf[i].size(); 80 | output[this->compid[i]]++; 81 | } 82 | printf("input "); 83 | CommBench::print_data(inputdata); 84 | printf(" output "); 85 | CommBench::print_data(outputdata); 86 | printf("\n"); 87 | if(numproc < 64) 88 | for(int p = 0; p < numproc; p++) 89 | if(output[p]) 90 | printf("%d: %d -> %d\n", p, input[p], output[p]); 91 | printf("\n"); 92 | } 93 | } 94 | } 95 | }; 96 | 97 | template 98 | void report_pipeline(std::vector*>> &coll_batch) { 99 | 100 | // REPORT PIPELINE 101 | if(myid == printid) { 102 | printf("********************************************\n\n"); 103 | printf("pipeline depth %zu\n", coll_batch.size()); 104 | if(coll_batch.size()) 105 | printf("coll_list size %zu\n", coll_batch[0].size()); 106 | printf("\n"); 107 | int print_batch_size = (coll_batch.size() > 16 ? 16 : coll_batch.size()); 108 | using Iter = typename std::list*>::iterator; 109 | std::vector coll_ptr(print_batch_size); 110 | for(int i = 0; i < print_batch_size; i++) 111 | coll_ptr[i] = coll_batch[i].begin(); 112 | int collindex = 0; 113 | while(true) { 114 | bool finished = true; 115 | for(int i = 0; i < print_batch_size; i++) 116 | if(coll_ptr[i] != coll_batch[i].end()) 117 | finished = false; 118 | if(finished) 119 | break; 120 | 121 | printf("proc %d index %d: |", myid, collindex); 122 | for(int i = 0; i < print_batch_size; i++) 123 | if(coll_ptr[i] != coll_batch[i].end()) { 124 | if((*coll_ptr[i])->numcomm) 125 | printf(" %d ", (*coll_ptr[i])->numcomm); 126 | else 127 | printf(" "); 128 | if((*coll_ptr[i])->numcomm + (*coll_ptr[i])->numcompute) 129 | CommBench::print_lib((*coll_ptr[i])->lib); 130 | else 131 | switch((*coll_ptr[i])->lib) { 132 | case CommBench::dummy : printf(" - "); break; 133 | case CommBench::IPC : printf(" P "); break; 134 | case CommBench::IPC_get : printf(" G "); break; 135 | case CommBench::MPI : printf(" M "); break; 136 | case CommBench::XCCL : printf(" X "); break; 137 | case CommBench::numlib : printf(" NUMLIB "); break; 138 | } 139 | if((*coll_ptr[i])->numcompute) 140 | printf(" %d |", (*coll_ptr[i])->numcompute); 141 | else 142 | printf(" |"); 143 | coll_ptr[i]++; 144 | } 145 | else 146 | printf(" |"); 147 | printf("\n"); 148 | collindex++; 149 | } 150 | printf("\n"); 151 | } 152 | } 153 | 154 | -------------------------------------------------------------------------------- /source/comm.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2023 Stanford University 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License. 
--------------------------------------------------------------------------------
/source/comm.h:
--------------------------------------------------------------------------------
/* Copyright 2023 Stanford University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

template <typename T>
class Comm {

  // PRIMITIVES
  std::vector<std::vector<BROADCAST<T>>> bcast_epoch;
  std::vector<std::vector<REDUCE<T>>> reduce_epoch;
  int numepoch = 0;

  // HiCCL PARAMETERS
  std::vector<int> hierarchy = {numproc};
  std::vector<CommBench::library> library = {CommBench::MPI};
  int numstripe = 1;
  int ringnodes = 1;
  int pipedepth = 1;
  // ENDPOINTS
  T *sendbuf = nullptr;
  T *recvbuf = nullptr;
  size_t sendcount = 0;
  size_t recvcount = 0;

  public:

  // PIPELINE
  std::vector<std::list<Command<T>>> command_batch;
  std::vector<std::list<Coll<T>*>> coll_batch;

  // SETTERS
  void set_hierarchy(std::vector<int> hierarchy, std::vector<CommBench::library> library) {
    if(hierarchy.size() != library.size()) {
      if(myid == printid)
        printf("hierarchy and library must have the same size!\n");
      return;
    }
    else {
      this->hierarchy = hierarchy;
      this->library = library;
    }
  }
  void set_pipedepth(int pipedepth) {
    this->pipedepth = pipedepth;
  }
  void set_numstripe(int numstripe) {
    this->numstripe = numstripe;
  }
  void set_ringnodes(int ringnodes) {
    this->ringnodes = ringnodes;
  }
  // SET ENDPOINTS
  void set_endpoints(T *sendbuf, size_t sendcount, T *recvbuf, size_t recvcount) {
    this->sendbuf = sendbuf;
    this->sendcount = sendcount;
    this->recvbuf = recvbuf;
    this->recvcount = recvcount;
  }

  void print_parameters() {
    if(myid == printid) {
      printf("**************** HiCCL PARAMETERS\n");
      printf("%ld-level hierarchy:\n", hierarchy.size());
      for(int i = 0; i < hierarchy.size(); i++) {
        printf("  level %d factor: %d library: ", i, hierarchy[i]);
        CommBench::print_lib(library[i]);
        if(hierarchy[0] == numproc && library[0] == CommBench::MPI)
          printf(" (default)\n");
        else
          printf("\n");
      }
      printf("numstripe: %d", numstripe);
      if(numstripe == 1)
        printf(" (default)\n");
      else
        printf("\n");
      printf("ringnodes: %d", ringnodes);
      if(ringnodes == 1)
        printf(" (default)\n");
      else
        printf("\n");
      printf("pipedepth: %d", pipedepth);
      if(pipedepth == 1)
        printf(" (default)\n");
      else
        printf("\n");
      printf("sendbuf: %p, sendcount %ld", sendbuf, sendcount);
      if(sendbuf == nullptr)
        printf(" (default)\n");
      else
        printf("\n");
      printf("recvbuf: %p, recvcount %ld", recvbuf, recvcount);
      if(recvbuf == nullptr)
        printf(" (default)\n");
      else
        printf("\n");
      printf("*********************************\n");
    }
  }

  void add_fence() {
    bcast_epoch.push_back(std::vector<BROADCAST<T>>());
    reduce_epoch.push_back(std::vector<REDUCE<T>>());
    if(myid == printid)
      printf("Add epoch %d\n", numepoch);
    numepoch++;
  }

  Comm() {
    // DEFAULT PARAMETERS
    /*if(myid == printid) {
      printf("DEFAULT PARAMETERS:\n");
      print_parameters();
    }*/
    // DEFAULT EPOCH
    add_fence();
  }

  // ADD FUNCTIONS FOR BROADCAST AND REDUCE PRIMITIVES
  void add_bcast(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, int sendid, std::vector<int> &recvids) {
    bcast_epoch.back().push_back(BROADCAST<T>(sendbuf, sendoffset, recvbuf, recvoffset, count, sendid, recvids));
    bcast_epoch.back().back().report();
  }
  void add_bcast(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, int sendid, int recvid) {
    bcast_epoch.back().push_back(BROADCAST<T>(sendbuf, sendoffset, recvbuf, recvoffset, count, sendid, recvid));
    bcast_epoch.back().back().report();
  }
  void add_bcast(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, int sendid, pattern recv_pattern) {
    int recvid = (recv_pattern == pattern::others ? -1 : numproc);
    bcast_epoch.back().push_back(BROADCAST<T>(sendbuf, sendoffset, recvbuf, recvoffset, count, sendid, recvid));
    bcast_epoch.back().back().report();
  }
  void add_reduce(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, std::vector<int> &sendids, int recvid) {
    reduce_epoch.back().push_back(REDUCE<T>(sendbuf, sendoffset, recvbuf, recvoffset, count, sendids, recvid));
    reduce_epoch.back().back().report();
  }
  void add_reduce(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, int sendid, int recvid) {
    reduce_epoch.back().push_back(REDUCE<T>(sendbuf, sendoffset, recvbuf, recvoffset, count, sendid, recvid));
    reduce_epoch.back().back().report();
  }
  void add_reduce(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, pattern send_pattern, int recvid) {
    int sendid = (send_pattern == pattern::others ? -1 : numproc);
    reduce_epoch.back().push_back(REDUCE<T>(sendbuf, sendoffset, recvbuf, recvoffset, count, sendid, recvid));
    reduce_epoch.back().back().report();
  }
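  // Note on the sentinel encoding above: a recvid (or sendid) of -1 stands
  // for pattern::others (all endpoints except the root) and numproc stands
  // for pattern::all; the REDUCE constructor in reduce.h (and the analogous
  // BROADCAST constructor) expands these sentinels into explicit endpoint
  // lists.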
  #include "init.h"

  void init() {
    if(myid == printid) {
      printf("FINAL PARAMETERS\n");
      print_parameters();
    }
    // CONVERT FACTORIZATION TO GROUPSIZE
    int numlevel = hierarchy.size();
    std::vector<int> groupsize(numlevel);
    groupsize[numlevel - 1] = hierarchy[numlevel - 1];
    for(int i = numlevel - 2; i > -1; i--)
      groupsize[i] = groupsize[i + 1] * hierarchy[i];
    groupsize[0] = numproc / ringnodes;
    MPI_Barrier(comm_mpi);
    double init_time = MPI_Wtime();
    // init.h
    init(numlevel, groupsize.data(), library.data(), numstripe, pipedepth);
    MPI_Barrier(comm_mpi);
    if(myid == printid)
      printf("initialization time: %e seconds\n", MPI_Wtime() - init_time);
  }
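  // Worked example (hypothetical machine): with numproc = 24, ringnodes = 1,
  // and hierarchy = {2, 6, 2}, the loop above produces groupsize = {24, 12, 2}:
  // groupsize[2] = 2, groupsize[1] = 2 * 6 = 12, groupsize[0] = 12 * 2 = 24.
  // Each entry is the number of endpoints that one group spans at that level.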
  void run() {
    using Iter = typename std::list<Command<T>>::iterator;
    std::vector<Iter> commandptr(command_batch.size());
    for(int i = 0; i < command_batch.size(); i++)
      commandptr[i] = command_batch[i].begin();
    while(true) {
      bool finished = true;
      for(int i = 0; i < command_batch.size(); i++)
        if(commandptr[i] != command_batch[i].end()) {
          commandptr[i]->comm->start();
          finished = false;
        }
      if(finished)
        break;
      for(int i = command_batch.size() - 1; i > -1; i--)
        if(commandptr[i] != command_batch[i].end()) {
          commandptr[i]->comm->wait();
          commandptr[i]->compute->start();
        }
      for(int i = 0; i < command_batch.size(); i++)
        if(commandptr[i] != command_batch[i].end()) {
          commandptr[i]->compute->wait();
          commandptr[i]++;
        }
    }
  }

  void run(T *sendbuf, T *recvbuf) {
    CommBench::memcpyD2D(this->sendbuf, sendbuf, sendcount);
    run();
    CommBench::memcpyD2D(recvbuf, this->recvbuf, recvcount);
  }

  static void* run_async(void* arg) {
    CommBench::setup_gpu();
    Comm<T> *test = (Comm<T>*) arg;
    test->run();
    pthread_exit(NULL);
  }

  pthread_t thread;
  void start() {
    pthread_create(&thread, NULL, Comm::run_async, this);
  }
  void wait() {
    pthread_join(thread, NULL);
  }
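  // Stepping sketch: each pass of the while loop in run() issues
  // comm->start() on the head command of every batch, then (in reverse
  // batch order) waits on the communication and launches the attached
  // reduction, and finally waits on the reductions and advances the
  // iterators. Communication of one pipeline stage thereby overlaps with
  // computation of the others.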
  void measure(int warmup, int numiter, size_t count) {
    if(myid == printid) {
      printf("command_batch size %zu\n", command_batch.size());
      if(command_batch.size())
        printf("commandlist size %zu\n", command_batch[0].size());
    }
    MPI_Barrier(comm_mpi);
    {
      using Iter = typename std::list<Command<T>>::iterator;
      std::vector<Iter> commandptr(command_batch.size());
      for(int i = 0; i < command_batch.size(); i++)
        commandptr[i] = command_batch[i].begin();
      while(true) {
        bool finished = true;
        for(int i = 0; i < command_batch.size(); i++)
          if(commandptr[i] != command_batch[i].end())
            finished = false;
        if(finished)
          break;
        if(myid == printid) printf("******************************************* MEASURE COMMANDS ************************************************\n");
        for(int i = 0; i < command_batch.size(); i++)
          if(commandptr[i] != command_batch[i].end()) {
            commandptr[i]->measure(warmup, numiter, count);
            commandptr[i]++;
          }
        /*if(myid == printid) printf("******************************************* MEASURE STEP ************************************************\n");
        MPI_Barrier(comm_mpi);
        double time = MPI_Wtime();
        for(int i = 0; i < command_batch.size(); i++)
          if(commandptr[i] != command_batch[i].end())
            commandptr[i]->comm->start();
        for(int i = 0; i < command_batch.size(); i++)
          if(commandptr[i] != command_batch[i].end()) {
            commandptr[i]->comm->wait();
            commandptr[i]++;
          }
        MPI_Barrier(comm_mpi);
        time = MPI_Wtime() - time;
        if(myid == printid)
          printf("time: %e\n", time);*/
      }
    }
  }

  void report() {
    if(myid == printid) {
      printf("command_batch size %zu\n", command_batch.size());
      printf("commandlist size %zu\n", command_batch[0].size());
    }
    int command = 0;
    for(auto it = command_batch[0].begin(); it != command_batch[0].end(); it++) {
      if(myid == printid)
        printf("command %d", command);
      it->report();
      command++;
    }
  }
  void time() {
    if(myid == printid) {
      printf("********************************************\n\n");
      printf("pipeline depth %zu\n", command_batch.size());
      printf("commandlist size %zu\n", command_batch[0].size());
      printf("\n");
    }
    int print_batch_size = (command_batch.size() > 16 ? 16 : command_batch.size());
    {
      using Iter = typename std::list<Command<T>>::iterator;
      std::vector<Iter> commandptr(print_batch_size);
      for(int i = 0; i < print_batch_size; i++)
        commandptr[i] = command_batch[i].begin();
      int command = 0;
      while(true) {
        if(myid == printid)
          printf("proc %d command %d: |", myid, command);
        bool finished = true;
        for(int i = 0; i < print_batch_size; i++) {
          if(commandptr[i] != command_batch[i].end()) {
            if(commandptr[i]->comm) {
              int numsend = commandptr[i]->comm->numsend;
              int numrecv = commandptr[i]->comm->numrecv;
              //MPI_Allreduce(MPI_IN_PLACE, &numsend, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
              //MPI_Allreduce(MPI_IN_PLACE, &numrecv, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
              if(myid == printid) {
                if(numsend) printf(" %d", numsend);
                else        printf("  ");
                if(numrecv) printf("+%d", numrecv);
                else        printf("  ");
                if(numsend + numrecv) {
                  printf(" ");
                  CommBench::print_lib(commandptr[i]->comm->lib);
                }
                else // printf("- ");
                  switch(commandptr[i]->comm->lib) {
                    case CommBench::IPC     : printf("P "); break;
                    case CommBench::IPC_get : printf("G "); break;
                    case CommBench::MPI     : printf("M "); break;
                    case CommBench::XCCL    : printf("X "); break;
                    default : break;
                  }
              }
              if(commandptr[i]->compute) {
                int numcomp = commandptr[i]->compute->numcomp;
                //MPI_Allreduce(MPI_IN_PLACE, &numcomp, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
                if(myid == printid) {
                  if(numcomp) printf(" %d*", numcomp);
                  else        printf("*  ");
                }
              }
              if(myid == printid)
                printf(" |");
            }
            else if(commandptr[i]->compute) {
              int numcomp = commandptr[i]->compute->numcomp;
              //MPI_Allreduce(MPI_IN_PLACE, &numcomp, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
              if(myid == printid) {
                if(numcomp) printf(" %d *** |", numcomp);
                else        printf("  *  |");
              }
            }
            finished = false;
            commandptr[i]++;
          }
          else
            if(myid == printid)
              printf("     |");
        }
        if(myid == printid)
          printf("\n");
        if(finished)
          break;
        command++;
      }
    }

    using Iter = typename std::list<Command<T>>::iterator;
    std::vector<Iter> commandptr(command_batch.size());
    for(int i = 0; i < command_batch.size(); i++)
      commandptr[i] = command_batch[i].begin();

    int command = 0;
    double totalstarttime = 0;
    double totalwaittime = 0;
    MPI_Barrier(comm_mpi);
    double totaltime = MPI_Wtime();
    while(true) {
      double starttime;
      double waittime;
      bool finished = true;
      {
        MPI_Barrier(comm_mpi);
        double time = MPI_Wtime();
        for(int i = 0; i < command_batch.size(); i++)
          if(commandptr[i] != command_batch[i].end()) {
            commandptr[i]->start();
            finished = false;
          }
        MPI_Barrier(comm_mpi);
        starttime = MPI_Wtime() - time;
      }
      MPI_Allreduce(MPI_IN_PLACE, &finished, 1, MPI_C_BOOL, MPI_LOR, comm_mpi);
      if(finished)
        break;
      {
        MPI_Barrier(comm_mpi);
        double time = MPI_Wtime();
        for(int i = 0; i < command_batch.size(); i++)
          if(commandptr[i] != command_batch[i].end()) {
            commandptr[i]->wait();
            commandptr[i]++;
          }
        MPI_Barrier(comm_mpi);
        waittime = MPI_Wtime() - time;
      }
      if(myid == printid)
        printf("command %d start: %e wait: %e\n", command, starttime, waittime);
      totalstarttime += starttime;
      totalwaittime += waittime;
      command++;
    }
    MPI_Barrier(comm_mpi);
    totaltime = MPI_Wtime() - totaltime;
    if(myid == printid) {
      printf("start %e wait %e other %e\n", totalstarttime, totalwaittime, totaltime - totalstarttime - totalwaittime);
      printf("total time %e\n", totaltime);
    }
  }
};
--------------------------------------------------------------------------------
/source/command.h:
--------------------------------------------------------------------------------

template <typename T>
class Command {

  public:

  CommBench::Comm<T> *comm = nullptr;
  Compute<T> *compute = nullptr;

  // COMMUNICATION
  // Command(CommBench::Comm<T> *comm) : comm(comm) {}
  // COMPUTATION
  // Command(HiCCL::Compute<T> *compute) : compute(compute) {}
  // COMMUNICATION + COMPUTATION
  Command(CommBench::Comm<T> *comm, Compute<T> *compute) : comm(comm), compute(compute) {}

  void measure(int warmup, int numiter, size_t count) {
    int numcomm = 0;
    int numcomp = 0;
    MPI_Allreduce(&(comm->numsend), &numcomm, 1, MPI_INT, MPI_SUM, comm_mpi);
    MPI_Allreduce(&(comm->numrecv), &numcomm, 1, MPI_INT, MPI_SUM, comm_mpi);
    MPI_Allreduce(&(compute->numcomp), &numcomp, 1, MPI_INT, MPI_SUM, comm_mpi);
    if(numcomm) {
      if(myid == printid) {
        if(compute->numcomp) printf("COMMAND TYPE: COMMUNICATION + COMPUTATION\n");
        else                 printf("COMMAND TYPE: COMMUNICATION\n");
      }
      comm->measure(warmup, numiter, count);
      if(numcomp)
        compute->measure(warmup, numiter, count);
    }
    else if(numcomp) {
      if(myid == printid)
        printf("COMMAND TYPE: COMPUTATION\n");
      compute->measure(warmup, numiter, count);
    }
  }
};

template <typename T>
void implement(std::vector<std::list<Coll<T>*>> &coll_batch, std::vector<std::list<Command<T>>> &pipeline, int pipeoffset) {

  for(auto &coll : coll_batch[0])
    coll->report();

  // REPORT MEMORY
  {
    long buffsize_tot = buffsize * sizeof(T);
    long recycle_tot = recycle * sizeof(T);
    long reuse_tot = reuse * sizeof(T);
    MPI_Allreduce(MPI_IN_PLACE, &buffsize_tot, 1, MPI_LONG, MPI_SUM, comm_mpi);
    MPI_Allreduce(MPI_IN_PLACE, &recycle_tot, 1, MPI_LONG, MPI_SUM, comm_mpi);
    MPI_Allreduce(MPI_IN_PLACE, &reuse_tot, 1, MPI_LONG, MPI_SUM, comm_mpi);
    if(myid == printid) {
      printf("********************************************\n\n");
      printf("total buffsize: ");
      CommBench::print_data(buffsize_tot);
      printf(" reuse: ");
      CommBench::print_data(reuse_tot);
      printf(" recycle: ");
      CommBench::print_data(recycle_tot);
      printf("\n");
    }
    std::vector<size_t> buffsize_all(numproc);
    std::vector<size_t> recycle_all(numproc);
    std::vector<size_t> reuse_all(numproc);
    MPI_Allgather(&buffsize, sizeof(size_t), MPI_BYTE, buffsize_all.data(), sizeof(size_t), MPI_BYTE, comm_mpi);
    MPI_Allgather(&recycle, sizeof(size_t), MPI_BYTE, recycle_all.data(), sizeof(size_t), MPI_BYTE, comm_mpi);
    MPI_Allgather(&reuse, sizeof(size_t), MPI_BYTE, reuse_all.data(), sizeof(size_t), MPI_BYTE, comm_mpi);
    if(myid == printid) {
      for(int p = 0; p < numproc; p++)
        printf("HiCCL Memory [%d]: %zu bytes (%.2f GB) - %.2f GB reused - %.2f GB recycled\n", p, buffsize_all[p] * sizeof(T), buffsize_all[p] * sizeof(T) / 1.e9, reuse_all[p] * sizeof(T) / 1.e9, recycle_all[p] * sizeof(T) / 1.e9);
      printf("coll_batch size %zu: ", coll_batch.size());
      for(int i = 0; i < coll_batch.size(); i++)
        printf("%zu ", coll_batch[i].size());
      printf("\n\n");
    }
  }

  std::vector<std::list<Coll<T>*>> coll_pipeline;
  std::vector<Coll<T>*> coll_mixed;

  std::vector<int> lib;
  std::vector<int> lib_hash(CommBench::numlib);
  {
    for(int i = 0; i < coll_batch.size(); i++) {
      for(auto &coll : coll_batch[i])
        lib_hash[coll->lib]++;
      for(int j = 0; j < i * pipeoffset; j++)
        coll_batch[i].push_front(new Coll<T>(CommBench::dummy));
    }
    for(int i = 0; i < CommBench::numlib; i++)
      if(lib_hash[i]) {
        lib_hash[i] = lib.size();
        lib.push_back(i);
        pipeline.push_back(std::list<Command<T>>());
        coll_pipeline.push_back(std::list<Coll<T>*>());
      }
  }

  // REPORT DEGENERATE PIPELINE
  report_pipeline(coll_batch);

  {
    using Iter = typename std::list<Coll<T>*>::iterator;
    std::vector<Iter> coll_ptr(coll_batch.size());
    for(int i = 0; i < coll_batch.size(); i++)
      coll_ptr[i] = coll_batch[i].begin();
    while(true) {
      bool finished = true;
      for(int i = 0; i < coll_batch.size(); i++)
        if(coll_ptr[i] != coll_batch[i].end())
          finished = false;
      if(finished)
        break;
      Coll<T> *coll_total = new Coll<T>(CommBench::dummy);
      std::vector<Coll<T>*> coll_temp(lib.size());
      std::vector<CommBench::Comm<T>*> comm_temp(lib.size());
      std::vector<Compute<T>*> compute_temp(lib.size());
      for(int i = 0; i < lib.size(); i++) {
        coll_temp[i] = new Coll<T>((CommBench::library) lib[i]);
        comm_temp[i] = new CommBench::Comm<T>((CommBench::library) lib[i]);
        compute_temp[i] = new Compute<T>();
      }
      for(int i = 0; i < coll_batch.size(); i++)
        if(coll_ptr[i] != coll_batch[i].end()) {
          Coll<T> *coll = *coll_ptr[i];
          coll_ptr[i]++;
          for(int j = 0; j < coll->numcomm; j++) {
            coll_total->add(coll->sendbuf[j], coll->sendoffset[j], coll->recvbuf[j], coll->recvoffset[j], coll->count[j], coll->sendid[j], coll->recvid[j]);
            coll_temp[lib_hash[coll->lib]]->add(coll->sendbuf[j], coll->sendoffset[j], coll->recvbuf[j], coll->recvoffset[j], coll->count[j], coll->sendid[j], coll->recvid[j]);
            comm_temp[lib_hash[coll->lib]]->add(coll->sendbuf[j], coll->sendoffset[j], coll->recvbuf[j], coll->recvoffset[j], coll->count[j], coll->sendid[j], coll->recvid[j]);
          }
          for(int j = 0; j < coll->numcompute; j++) {
            coll_total->add(coll->inputbuf[j], coll->outputbuf[j], coll->numreduce[j], coll->compid[j]);
            coll_temp[lib_hash[coll->lib]]->add(coll->inputbuf[j], coll->outputbuf[j], coll->numreduce[j], coll->compid[j]);
            compute_temp[lib_hash[coll->lib]]->add(coll->inputbuf[j], coll->outputbuf[j], coll->numreduce[j], coll->compid[j]);
          }
        }
      if(coll_total->numcomm + coll_total->numcompute) {
        for(int i = 0; i < lib.size(); i++) {
          coll_pipeline[i].push_back(coll_temp[i]);
          pipeline[i].push_back(Command<T>(comm_temp[i], compute_temp[i]));
        }
        coll_mixed.push_back(coll_total);
      }
      else {
        delete coll_total;
        for(int i = 0; i < lib.size(); i++) {
          delete coll_temp[i];
          delete comm_temp[i];
          delete compute_temp[i];
        }
      }
    }
  }
  // REPORT MIXED PIPELINE
  for(int i = 0; i < coll_mixed.size(); i++)
    if(i < coll_batch[0].size() || i >= coll_mixed.size() - coll_batch[0].size()) {
      if(myid == printid)
        printf("MIXED (OVERLAPPED) STEP: %d / %ld\n", i + 1, coll_mixed.size());
      coll_mixed[i]->report();
    }
  report_pipeline(coll_pipeline);
}
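// Staggering sketch (hypothetical three-batch pipeline, pipeoffset = 1):
// batch i is padded with i dummy steps up front, so the merged schedule is
//
//   batch 0:  c0  c1  c2
//   batch 1:  --  c0  c1  c2
//   batch 2:  --  --  c0  c1  c2
//
// where '--' is a dummy Coll. Each column is then fused into one mixed
// step (coll_mixed) whose per-library commands are issued together.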
--------------------------------------------------------------------------------
/source/compute.h:
--------------------------------------------------------------------------------

#if defined PORT_CUDA || defined PORT_HIP
template <typename T>
__global__ void reduce_kernel(T *output, size_t count, T **input, int numinput) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < count) {
    T acc = 0;
    for(int in = 0; in < numinput; in++)
      acc += input[in][i];
    output[i] = acc;
  }
}
#else
template <typename T>
void reduce_kernel(T *output, size_t count, T **input, int numinput) {
  #pragma omp parallel for
  for(size_t i = 0; i < count; i++) {
    T acc = 0;
    for(int in = 0; in < numinput; in++)
      acc += input[in][i];
    output[i] = acc;
  }
}
#endif

template <typename T>
class Compute {

  public:

  int numcomp = 0;

  std::vector<std::vector<T*>> inputbuf;
  std::vector<T*> outputbuf;
  std::vector<size_t> count;
  std::vector<T**> inputbuf_d;
#ifdef PORT_CUDA
  std::vector<cudaStream_t*> stream;
#elif defined PORT_HIP
  std::vector<hipStream_t*> stream;
#elif defined PORT_SYCL
  std::vector<sycl::queue*> queue;
#endif

  int printid = CommBench::printid;

  void add(std::vector<T*> &inputbuf, T *outputbuf, size_t count, int compid) {
    if(printid > -1) {
      MPI_Barrier(comm_mpi);
      if(myid == compid) {
        MPI_Send(&outputbuf, sizeof(T*), MPI_BYTE, printid, 0, comm_mpi);
        for(int in = 0; in < inputbuf.size(); in++)
          MPI_Send(&inputbuf[in], sizeof(T*), MPI_BYTE, printid, 0, comm_mpi);
      }
      if(myid == printid) {
        T *outputbuf;
        MPI_Recv(&outputbuf, sizeof(T*), MPI_BYTE, compid, 0, comm_mpi, MPI_STATUS_IGNORE);
        printf("add compute (%d) outputbuf %p, count %zu\n", compid, outputbuf, count);
        for(int in = 0; in < inputbuf.size(); in++) {
          T *inputbuf;
          MPI_Recv(&inputbuf, sizeof(T*), MPI_BYTE, compid, 0, comm_mpi, MPI_STATUS_IGNORE);
          printf("  inputbuf %p\n", inputbuf);
        }
      }
    }
    if(myid == compid) {
      this->inputbuf.push_back(inputbuf); // CPU COPY OF GPU POINTERS
      this->outputbuf.push_back(outputbuf);
      this->count.push_back(count);
      T **inputbuf_d;
      CommBench::allocate(inputbuf_d, inputbuf.size());
      CommBench::memcpyH2D(inputbuf_d, inputbuf.data(), inputbuf.size());
#ifdef PORT_CUDA
      stream.push_back(new cudaStream_t());
      cudaStreamCreate(stream[numcomp]);
#elif defined PORT_HIP
      stream.push_back(new hipStream_t());
      hipStreamCreate(stream[numcomp]);
#elif defined PORT_SYCL
      queue.push_back(new sycl::queue(sycl::gpu_selector_v));
#endif
      this->inputbuf_d.push_back(inputbuf_d);
      numcomp++;
    }
  }

  void start() {
    for(int comp = 0; comp < numcomp; comp++) {
#if defined PORT_CUDA || defined PORT_HIP
      int blocksize = 256;
      reduce_kernel<<<(count[comp] + blocksize - 1) / blocksize, blocksize, 0, *stream[comp]>>>(outputbuf[comp], count[comp], inputbuf_d[comp], inputbuf[comp].size());
#elif defined PORT_SYCL
      T *output = outputbuf[comp];
      int numinput = inputbuf[comp].size();
      T **input = inputbuf_d[comp];
      queue[comp]->parallel_for(sycl::range<1>{count[comp]}, [=] (sycl::id<1> i) {
        T acc = 0;
        for(int in = 0; in < numinput; in++)
          acc += input[in][i];
        output[i] = acc;
      });
#else
      reduce_kernel(outputbuf[comp], count[comp], inputbuf_d[comp], inputbuf[comp].size());
#endif
    }
  }

  void wait() {
    for(int comp = 0; comp < numcomp; comp++) {
#ifdef PORT_CUDA
      cudaStreamSynchronize(*stream[comp]);
#elif defined PORT_HIP
      hipStreamSynchronize(*stream[comp]);
#elif defined PORT_SYCL
      queue[comp]->wait();
#endif
    }
  }

  void report() {
    std::vector<int> numcomp_all(numproc);
    std::vector<int> numinput_all(numproc);

    MPI_Allgather(&numcomp, 1, MPI_INT, numcomp_all.data(), 1, MPI_INT, comm_mpi);
    int numinput = 0;
    for(int comp = 0; comp < numcomp; comp++)
      numinput += inputbuf[comp].size();
    MPI_Allgather(&numinput, 1, MPI_INT, numinput_all.data(), 1, MPI_INT, comm_mpi);
    if(myid == printid) {
      printf("numcomp: ");
      for(int p = 0; p < numproc; p++)
        printf("%d(%d) ", numcomp_all[p], numinput_all[p]);
      printf("\n");
      printf("\n");
    }
  }

  void measure(int warmup, int numiter, size_t count) {
    this->report();
    double times[numiter];
    if(myid == printid) {
      printf("Measure Reduction Kernel\n");
      printf("%d warmup iterations (in order)\n", warmup);
    }
    for (int iter = -warmup; iter < numiter; iter++) {
#ifdef PORT_CUDA
      cudaDeviceSynchronize();
#elif defined PORT_HIP
      hipDeviceSynchronize();
#endif
      MPI_Barrier(comm_mpi);
      double time = MPI_Wtime();
      this->start();
      double start = MPI_Wtime() - time;
      this->wait();
      time = MPI_Wtime() - time;
      MPI_Allreduce(MPI_IN_PLACE, &start, 1, MPI_DOUBLE, MPI_MAX, comm_mpi);
      MPI_Allreduce(MPI_IN_PLACE, &time, 1, MPI_DOUBLE, MPI_MAX, comm_mpi);
      if(iter < 0) {
        if(myid == printid)
          printf("startup %.2e warmup: %e\n", start, time);
      }
      else
        times[iter] = time;
    }
    std::sort(times, times + numiter, [](const double &a, const double &b) -> bool { return a < b; });

    if(myid == printid) {
      printf("%d measurement iterations (sorted):\n", numiter);
      for(int iter = 0; iter < numiter; iter++) {
        printf("time: %.4e", times[iter]);
        if(iter == 0)
          printf(" -> min\n");
        else if(iter == numiter / 2)
          printf(" -> median\n");
        else if(iter == numiter - 1)
          printf(" -> max\n");
        else
          printf("\n");
      }
      printf("\n");
      double minTime = times[0];
      double medTime = times[numiter / 2];
      double maxTime = times[numiter - 1];
      double avgTime = 0;
      for(int iter = 0; iter < numiter; iter++)
        avgTime += times[iter];
      avgTime /= numiter;
      size_t data = count * sizeof(T);
      printf("data: "); CommBench::print_data(data); printf("\n");
      printf("minTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", minTime * 1e6, minTime / data * 1e12, data / minTime / 1e9);
      printf("medTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", medTime * 1e6, medTime / data * 1e12, data / medTime / 1e9);
      printf("maxTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", maxTime * 1e6, maxTime / data * 1e12, data / maxTime / 1e9);
      printf("avgTime: %.4e us, %.4e ms/GB, %.4e GB/s\n", avgTime * 1e6, avgTime / data * 1e12, data / avgTime / 1e9);
      printf("\n");
    }
  }
  void measure(int warmup, int numiter) {
    size_t count_total = 0;
    for(int comp = 0; comp < numcomp; comp++)
      count_total += count[comp] * (inputbuf[comp].size() + 1);
    MPI_Allreduce(MPI_IN_PLACE, &count_total, 1, MPI_UNSIGNED_LONG, MPI_SUM, comm_mpi);
    measure(warmup, numiter, count_total);
  }
};
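// Usage sketch (illustrative; bufA, bufB, bufOut are hypothetical device
// pointers valid on the calling rank):
//
//   Compute<float> compute;
//   std::vector<float*> inputs = {bufA, bufB};
//   compute.add(inputs, bufOut, count, /*compid*/ myid); // bufOut[i] = bufA[i] + bufB[i]
//   compute.start(); // launch the reduction kernel asynchronously
//   compute.wait();  // synchronize the stream/queue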
--------------------------------------------------------------------------------
/source/init.h:
--------------------------------------------------------------------------------
// INITIALIZE BROADCAST AND REDUCTION TREES
void init(int numlevel, int groupsize[], CommBench::library lib[], int numstripe, int numbatch) {

  if(myid == printid) {
    printf("NUMBER OF EPOCHS: %d\n", numepoch);
    for(int epoch = 0; epoch < numepoch; epoch++)
      printf("epoch %d: %zu bcast %zu reduction\n", epoch, bcast_epoch[epoch].size(), reduce_epoch[epoch].size());
    printf("Initialize HiCCL with %d levels\n", numlevel);
    for(int level = 0; level < numlevel; level++) {
      printf("level %d groupsize %d library: ", level, groupsize[level]);
      CommBench::print_lib(lib[level]);
      if(level == 0)
        if(groupsize[0] != numproc)
          printf(" *");
      printf("\n");
    }
    printf("\n");
  }

  // ALLOCATE COMMAND BATCH
  for(int batch = 0; batch < numbatch; batch++)
    coll_batch.push_back(std::list<Coll<T>*>());

  // TEMP HIERARCHY FOR TREE
  std::vector<int> groupsize_temp(groupsize, groupsize + numlevel);
  groupsize_temp[0] = numproc;

  // FOR EACH EPOCH
  for(int epoch = 0; epoch < numepoch; epoch++) {
    // INIT BROADCAST
    std::vector<BROADCAST<T>> &bcastlist = bcast_epoch[epoch];
    if(bcastlist.size()) {
      // PARTITION INTO BATCHES
      std::vector<std::vector<BROADCAST<T>>> bcast_batch(numbatch);
      partition(bcastlist, numbatch, bcast_batch);
      // FOR EACH BATCH
      for(int batch = 0; batch < numbatch; batch++) {
        // STRIPE BROADCAST PRIMITIVES
        std::vector<REDUCE<T>> split_list;
        stripe(numstripe, bcast_batch[batch], split_list);

        // APPLY REDUCE TREE TO ROOTS FOR STRIPING
        std::vector<T*> recvbuff; // for memory recycling
        // reduce_tree(numlevel, groupsize_temp.data(), lib, split_list, numlevel - 1, coll_batch[batch], recvbuff, 0);
        reduce_tree(1, groupsize_temp.data(), &lib[numlevel - 1], split_list, 0, coll_batch[batch], recvbuff, 0);

        // APPLY RING TO BRANCHES ACROSS NODES
        std::vector<BROADCAST<T>> bcast_intra; // for accumulating intra-node communications for tree (internally)
        bcast_ring(groupsize[0], lib[0], bcast_batch[batch], bcast_intra, coll_batch[batch]);

        // APPLY TREE TO THE LEAVES WITHIN NODES
        bcast_tree(numlevel, groupsize_temp.data(), lib, bcast_intra, 1, coll_batch[batch]);
      }
    }
    // INIT REDUCTION
    std::vector<REDUCE<T>> &reducelist = reduce_epoch[epoch];
    if(reducelist.size()) {
      // PARTITION INTO BATCHES
      std::vector<std::vector<REDUCE<T>>> reduce_batch(numbatch);
      partition(reducelist, numbatch, reduce_batch);
      // FOR EACH BATCH
      for(int batch = 0; batch < numbatch; batch++) {
        // STRIPE REDUCTION
        std::vector<BROADCAST<T>> merge_list;
        stripe(numstripe, reduce_batch[batch], merge_list);
        // HIERARCHICAL REDUCTION RING + TREE
        std::vector<REDUCE<T>> reduce_intra; // for accumulating intra-node communications for tree (internally)
        reduce_ring(numlevel, groupsize, lib, reduce_batch[batch], reduce_intra, coll_batch[batch]);
        // COMPLETE STRIPING BY INTRA-NODE GATHER
        bcast_tree(numlevel, groupsize_temp.data(), lib, merge_list, 1, coll_batch[batch]);
      }
    }
  }
  // IMPLEMENT WITH COMMBENCH
  implement(coll_batch, command_batch, 1);
}
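// Flow summary (per epoch, per batch): partition() slices each primitive
// across the pipeline batches; stripe() splits inter-node payloads over
// the GPUs of a node; reduce_tree(), bcast_ring(), and bcast_tree() then
// lower the primitives level by level onto the hierarchy; implement()
// (command.h) finally fuses the resulting Coll lists into CommBench
// commands.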
--------------------------------------------------------------------------------
/source/reduce.h:
--------------------------------------------------------------------------------

template <typename T>
struct REDUCE {
  T* sendbuf;
  size_t sendoffset;
  T* recvbuf;
  size_t recvoffset;
  size_t count;
  std::vector<int> sendids;
  int recvid;

  void report() {
    if(printid < 0)
      return;
    MPI_Barrier(comm_mpi);
    if(myid == recvid) {
      MPI_Send(&recvbuf, sizeof(T*), MPI_BYTE, printid, 0, comm_mpi);
      MPI_Send(&recvoffset, sizeof(size_t), MPI_BYTE, printid, 0, comm_mpi);
    }
    for(auto &sendid : this->sendids)
      if(myid == sendid) {
        MPI_Send(&sendbuf, sizeof(T*), MPI_BYTE, printid, 0, comm_mpi);
        MPI_Send(&sendoffset, sizeof(size_t), MPI_BYTE, printid, 0, comm_mpi);
      }
    if(myid == printid) {
      T* recvbuf_recvid;
      size_t recvoffset_recvid;
      MPI_Recv(&recvbuf_recvid, sizeof(T*), MPI_BYTE, recvid, 0, comm_mpi, MPI_STATUS_IGNORE);
      MPI_Recv(&recvoffset_recvid, sizeof(size_t), MPI_BYTE, recvid, 0, comm_mpi, MPI_STATUS_IGNORE);
      std::vector<T*> sendbuf_sendid(sendids.size());
      std::vector<size_t> sendoffset_sendid(sendids.size());
      for(int send = 0; send < sendids.size(); send++) {
        MPI_Recv(sendbuf_sendid.data() + send, sizeof(T*), MPI_BYTE, sendids[send], 0, comm_mpi, MPI_STATUS_IGNORE);
        MPI_Recv(sendoffset_sendid.data() + send, sizeof(size_t), MPI_BYTE, sendids[send], 0, comm_mpi, MPI_STATUS_IGNORE);
      }
      printf("REDUCE report: count %lu (", count);
      CommBench::print_data(count * sizeof(T));
      printf(")\n");
      char text[1000];
      int n = sprintf(text, "recvid %d recvbuf %p recvoffset %lu <- ", recvid, recvbuf_recvid, recvoffset_recvid);
      printf("%s", text);
      memset(text, ' ', n);
      for(int send = 0; send < sendids.size(); send++) {
        printf("sendid: %d sendbuf %p sendoffset %lu\n", sendids[send], sendbuf_sendid[send], sendoffset_sendid[send]);
        printf("%s", text);
      }
      printf("\n");
    }
  }

  REDUCE(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, std::vector<int> &sendids, int recvid)
  : sendbuf(sendbuf), sendoffset(sendoffset), recvbuf(recvbuf), recvoffset(recvoffset), count(count), sendids(sendids), recvid(recvid) { }

  REDUCE(T *sendbuf, size_t sendoffset, T *recvbuf, size_t recvoffset, size_t count, int sendid, int recvid)
  : sendbuf(sendbuf), sendoffset(sendoffset), recvbuf(recvbuf), recvoffset(recvoffset), count(count), recvid(recvid) {
    for(int i = 0; i < numproc; i++) {
      if(sendid == numproc)
        sendids.push_back(i);
      else if(sendid == -1) {
        if(i != recvid)
          sendids.push_back(i);
      }
      else
        if(i == sendid)
          sendids.push_back(i);
    }
  }
};
template <typename T>
void reduce_tree(int numlevel, int groupsize[], CommBench::library lib[], std::vector<REDUCE<T>> reducelist, int level, std::list<Coll<T>*> &coll_list, std::vector<T*> &recvbuf_ptr, int numrecvbuf) {

  if(numproc != groupsize[0]) {
    printf("ERROR!!! groupsize[0] must be equal to numproc.\n");
    return;
  }
  if(reducelist.size() == 0)
    return;

  // EXIT CONDITION
  if(level == -1)
    return;

  Coll<T> *coll_temp = new Coll<T>(lib[level]);

  std::vector<REDUCE<T>> reducelist_new;

  int numgroup = numproc / groupsize[level];

  // if(myid == printid) {
  //   printf("level %d groupsize %d numgroup %d\n", level, groupsize[level], numgroup);
  // }
  // for(auto &reduce : reducelist)
  //   reduce.report();

  {
    for(auto reduce : reducelist) {
      std::vector<int> sendids_new;
      std::vector<T*> sendbuf_new;
      std::vector<size_t> sendoffset_new;
      // int recvgroup = reduce.recvid / groupsize[level];
      for(int sendgroup = 0; sendgroup < numgroup; sendgroup++) {
        std::vector<int> sendids;
        for(auto &sendid : reduce.sendids)
          if(sendid / groupsize[level] == sendgroup)
            sendids.push_back(sendid);
        if(sendids.size()) {
          /*if(myid == printid) {
            printf("recvgroup: %d recvid: %d sendgroup: %d sendids: ", recvgroup, reduce.recvid, sendgroup);
            for(auto sendid : sendids)
              printf("%d ", sendid);
            printf("\n");
          }*/
          int recvid = sendgroup * groupsize[level] + reduce.recvid % groupsize[level];
          T* outputbuf;
          size_t outputoffset;
          if(recvid == reduce.recvid) {
            if(myid == recvid) {
              outputbuf = reduce.recvbuf;
              outputoffset = reduce.recvoffset;
              reuse += reduce.count;
            }
            // if(myid == printid)
            //   printf("recvid %d reuses send memory\n", recvid);
          }
          else {
            if(myid == recvid) {
              CommBench::allocate(outputbuf, reduce.count);
              outputoffset = 0;
              buffsize += reduce.count;
            }
            // if(myid == printid)
            //   printf("^^^^^^^^ proc %d send malloc %zu\n", recvid, reduce.count * sizeof(T));
          }
          if(sendids.size() > 1) {
            std::vector<T*> inputbuf;
            for(auto &sendid : sendids) {
              if(sendid != recvid) {
                T *recvbuf;
                if(numrecvbuf < recvbuf_ptr.size()) {
                  if(myid == recvid) {
                    recvbuf = recvbuf_ptr[numrecvbuf]; // recycle memory
                    recycle += reduce.count;
                    numrecvbuf++;
                  }
                  // if(myid == printid)
                  //   printf("recvid %d reuses recv memory\n", recvid);
                }
                else {
                  if(myid == recvid) {
                    CommBench::allocate(recvbuf, reduce.count);
                    recvbuf_ptr.push_back(recvbuf);
                    buffsize += reduce.count;
                    numrecvbuf++;
                  }
                  if(myid == numproc)
                    printf("-"); // this is necessary for Frontier
                  //printf("^^^^^^^^ proc %d recv malloc %zu\n", recvid, reduce.count * sizeof(T));
                }
                /// ADD COMMUNICATION
                coll_temp->add(reduce.sendbuf, reduce.sendoffset, recvbuf, 0, reduce.count, sendid, recvid);
                inputbuf.push_back(recvbuf);
              }
              else {
                inputbuf.push_back(reduce.sendbuf + reduce.sendoffset);
              }
            }
            // ADD COMPUTATION
            coll_temp->add(inputbuf, outputbuf + outputoffset, reduce.count, recvid);
          }
          else {
            if(sendids[0] != recvid) {
              /// ADD COMMUNICATION
              coll_temp->add(reduce.sendbuf, reduce.sendoffset, outputbuf, outputoffset, reduce.count, sendids[0], recvid);
            }
            else {
              if(level == numlevel - 1) {
                /// ADD COMMUNICATION
                coll_temp->add(reduce.sendbuf, reduce.sendoffset, outputbuf, outputoffset, reduce.count, sendids[0], recvid);
              }
              else {
                outputbuf = reduce.sendbuf;
                outputoffset = reduce.sendoffset;
              }
            }
          }
          sendids_new.push_back(recvid);
          sendbuf_new.push_back(outputbuf);
          sendoffset_new.push_back(outputoffset);
        }
      }
      if(sendids_new.size()) {
        T *sendbuf;
        size_t sendoffset;
        for(int i = 0; i < sendids_new.size(); i++)
          if(myid == sendids_new[i]) {
            sendbuf = sendbuf_new[i];
            sendoffset = sendoffset_new[i];
          }
        reducelist_new.push_back(REDUCE<T>(sendbuf, sendoffset, reduce.recvbuf, reduce.recvoffset, reduce.count, sendids_new, reduce.recvid));
      }
    }
  }
  // ADD COMMUNICATION FOLLOWED BY COMPUTE (IF ANY) OTHERWISE CLEAR MEMORY
  if(coll_temp->numcomm + coll_temp->numcompute)
    coll_list.push_back(coll_temp);
  else
    delete coll_temp;

  reduce_tree(numlevel, groupsize, lib, reducelist_new, level - 1, coll_list, recvbuf_ptr, 0);
}
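// Recursion sketch (hypothetical machine: 2 nodes of 4 GPUs, numlevel = 2,
// groupsize = {8, 4}): at level 1 each node reduces its local contributions
// onto the GPU whose intra-node rank mirrors the root's; the recursive call
// at level 0 then reduces the two per-node partials onto the root itself.
// Each level appends one Coll of communications plus reductions.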
template <typename T>
void reduce_ring(int numlevel, int groupsize[], CommBench::library lib[], std::vector<REDUCE<T>> &reducelist, std::vector<REDUCE<T>> &reducelist_intra, std::list<Coll<T>*> &coll_list) {

  // if(myid == printid)
  //   printf("number of original reductions %ld\n", reducelist.size());

  // std::vector<REDUCE<T>> reducelist_intra;
  std::vector<REDUCE<T>> reducelist_extra;

  Coll<T> *coll_temp = new Coll<T>(lib[0]);

  // if(myid == printid)
  //   printf("number of original reductions %ld\n", reducelist.size());
  for(auto &reduce : reducelist) {
    // if(myid == printid)
    //   printf("reduce recvid: %d numsend: %ld\n", reduce.recvid, reduce.sendids.size());
    int recvnode = reduce.recvid / groupsize[0];
    std::vector<int> sendids_intra;
    std::vector<int> sendids_extra;
    for(auto &sendid : reduce.sendids) {
      int sendnode = sendid / groupsize[0];
      if(sendnode == recvnode)
        sendids_intra.push_back(sendid);
      else
        sendids_extra.push_back(sendid);
    }
    // if(myid == printid)
    //   printf("recvid %d numsend %ld sendids_intra: %zu sendids_extra: %zu\n", reduce.recvid, reduce.sendids.size(), sendids_intra.size(), sendids_extra.size());
    if(sendids_extra.size()) {
      int numnode = numproc / groupsize[0];
      int sendnode = (numnode + recvnode + 1) % numnode;
      std::vector<std::vector<int>> sendids(numnode);
      for(auto &sendid : reduce.sendids)
        sendids[sendid / groupsize[0]].push_back(sendid);
      int sendid = sendnode * groupsize[0] + reduce.recvid % groupsize[0];
      /*if(myid == printid) {
        printf("****************** recvnode %d recvid %d sendnode %d sendid %d\n", recvnode, reduce.recvid, sendnode, sendid);
        for(int node = 0; node < numnode; node++) {
          printf("for node %d / %d: ", node, numnode);
          for(auto &sendid : sendids[node])
            printf("%d ", sendid);
          printf("\n");
        }
      }*/
      // FOR SENDING NODE
      T *sendbuf;
      size_t sendoffset;
      bool sendreuse = false;
      if(sendids[sendnode].size() == 1)
        if(sendids[sendnode][0] == sendid) {
          sendbuf = reduce.sendbuf;
          sendoffset = reduce.sendoffset;
          sendreuse = true;
          sendids[sendnode].clear();
          reuse += reduce.count;
          // if(myid == printid)
          //   printf("proc %d reuse %ld\n", sendid, reduce.count);
        }
      if(!sendreuse) {
        if(myid == sendid) {
          CommBench::allocate(sendbuf, reduce.count);
          sendoffset = 0;
          buffsize += reduce.count;
        }
        // if(myid == printid)
        //   printf("proc %d allocate %ld\n", sendid, reduce.count);
      }
      std::vector<int> sendids_extra;
      for(int node = 0; node < numnode; node++)
        if(node != recvnode)
          for(auto &sendid : sendids[node])
            sendids_extra.push_back(sendid);
      reducelist_extra.push_back(REDUCE<T>(reduce.sendbuf, reduce.sendoffset, sendbuf, sendoffset, reduce.count, sendids_extra, sendid));
      // if(myid == printid)
      //   printf("recvid %d sendids_intra: %zu sendids_extra: %zu\n", reduce.recvid, sendids_intra.size(), sendids_extra.size());
      // FOR RECEIVING NODE
      T *recvbuf;
      size_t recvoffset;
      if(sendids_intra.size() == 0) {
        recvbuf = reduce.recvbuf;
        recvoffset = reduce.recvoffset;
        reuse += reduce.count;
      }
      else {
        T *recvbuf_intra;
        if(myid == reduce.recvid) {
          CommBench::allocate(recvbuf, reduce.count);
          buffsize += reduce.count;
          recvoffset = 0;
          CommBench::allocate(recvbuf_intra, reduce.count);
          buffsize += reduce.count;
        }
        reducelist_intra.push_back(REDUCE<T>(reduce.sendbuf, reduce.sendoffset, recvbuf_intra, 0, reduce.count, sendids_intra, reduce.recvid));
        std::vector<T*> inputbuf = {recvbuf, recvbuf_intra};
        // ADD COMPUTATION
        coll_temp->add(inputbuf, reduce.recvbuf + reduce.recvoffset, reduce.count, reduce.recvid);
        sendids_intra.push_back(reduce.recvid);
      }
      // ADD COMMUNICATION
      coll_temp->add(sendbuf, sendoffset, recvbuf, recvoffset, reduce.count, sendid, reduce.recvid);
    }
    else
      reducelist_intra.push_back(REDUCE<T>(reduce.sendbuf, reduce.sendoffset, reduce.recvbuf, reduce.recvoffset, reduce.count, reduce.sendids, reduce.recvid));
  }
  /*if(myid == printid) {
    printf("intra reductions: %ld extra reductions: %ld\n\n", reducelist_intra.size(), reducelist_extra.size());
  }*/

  if(reducelist_extra.size())
    reduce_ring(numlevel, groupsize, lib, reducelist_extra, reducelist_intra, coll_list);
  else {
    // COMPLETE RING WITH INTRA-NODE TREE REDUCTION
    std::vector<int> groupsize_temp(groupsize, groupsize + numlevel);
    groupsize_temp[0] = numproc;
    std::vector<T*> recvbuff; // for memory recycling
    reduce_tree(numlevel, groupsize_temp.data(), lib, reducelist_intra, numlevel - 1, coll_list, recvbuff, 0);
  }

  if(coll_temp->numcomm + coll_temp->numcompute)
    coll_list.push_back(coll_temp);
  else
    delete coll_temp;
}
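// Ring step arithmetic: with numnode = 4 and recvnode = 1, the expression
// (numnode + recvnode + 1) % numnode picks node 2 as the upstream neighbor,
// so partial results flow 1 <- 2 <- 3 <- 0 around the virtual ring, one hop
// per recursive call, until only intra-node reductions remain.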
template <typename T, typename P>
void stripe(int numstripe, std::vector<REDUCE<T>> &reducelist, std::vector<P> &merge_list) {

  int nodesize = numstripe;

  // SEPARATE INTRA AND INTER NODES
  std::vector<REDUCE<T>> reducelist_intra;
  std::vector<REDUCE<T>> reducelist_inter;
  for(auto &reduce : reducelist) {
    int recvid = reduce.recvid;
    std::vector<int> sendid_intra;
    std::vector<int> sendid_inter;
    for(auto &sendid : reduce.sendids)
      if(sendid / nodesize == recvid / nodesize)
        sendid_intra.push_back(sendid);
      else
        sendid_inter.push_back(sendid);
    if(sendid_inter.size())
      reducelist_inter.push_back(REDUCE<T>(reduce.sendbuf, reduce.sendoffset, reduce.recvbuf, reduce.recvoffset, reduce.count, reduce.sendids, reduce.recvid));
    else
      reducelist_intra.push_back(REDUCE<T>(reduce.sendbuf, reduce.sendoffset, reduce.recvbuf, reduce.recvoffset, reduce.count, reduce.sendids, reduce.recvid));
  }
  // CLEAR REDUCELIST
  reducelist.clear();
  // ADD INTRA-NODE REDUCTION DIRECTLY (IF ANY)
  for(auto &reduce : reducelist_intra)
    reducelist.push_back(REDUCE<T>(reduce.sendbuf, reduce.sendoffset, reduce.recvbuf, reduce.recvoffset, reduce.count, reduce.sendids, reduce.recvid));

  // ADD INTER-NODE REDUCTIONS BY STRIPING
  if(reducelist_inter.size()) {
    for(auto &reduce : reducelist_inter) {
      int recvnode = reduce.recvid / nodesize;
      size_t splitoffset = 0;
      for(int stripe = 0; stripe < numstripe; stripe++) {
        int recver = recvnode * nodesize + stripe;
        size_t splitcount = reduce.count / numstripe + (stripe < reduce.count % numstripe ? 1 : 0);
        if(splitcount) {
          T *recvbuf;
          size_t recvoffset;
          if(recver != reduce.recvid) {
            if(myid == recver) {
              CommBench::allocate(recvbuf, splitcount);
              recvoffset = 0;
              buffsize += splitcount;
            }
            merge_list.push_back(P(recvbuf, recvoffset, reduce.recvbuf, reduce.recvoffset + splitoffset, splitcount, recver, reduce.recvid));
          }
          else
            if(myid == recver) {
              recvbuf = reduce.recvbuf;
              recvoffset = reduce.recvoffset + splitoffset;
              reuse += splitcount;
            }
          reducelist.push_back(REDUCE<T>(reduce.sendbuf, reduce.sendoffset + splitoffset, recvbuf, recvoffset, splitcount, reduce.sendids, recver));
          splitoffset += splitcount;
        }
        else
          break;
      }
    }
  }
}

template <typename T>
void partition(std::vector<REDUCE<T>> &reducelist, int numbatch, std::vector<std::vector<REDUCE<T>>> &reduce_batch) {
  for(auto &reduce : reducelist) {
    size_t batchoffset = 0;
    for(int batch = 0; batch < numbatch; batch++) {
      size_t batchsize = reduce.count / numbatch + (batch < reduce.count % numbatch ? 1 : 0);
      if(batchsize) {
        reduce_batch[batch].push_back(REDUCE<T>(reduce.sendbuf, reduce.sendoffset + batchoffset, reduce.recvbuf, reduce.recvoffset + batchoffset, batchsize, reduce.sendids, reduce.recvid));
        batchoffset += batchsize;
      }
      else
        break;
    }
  }
}
--------------------------------------------------------------------------------