├── Guia_Pratica_NCCL_MPI_Operacoes_Coletivas_portugues-2021@muriloboratto.pdf ├── Guion_Practica_NCCL_MPI_Operaciones_Colectivas_espanol-2021@muriloboratto.pdf ├── Guide_Laboratory_NCCL_MPI_Collective_Operations_english-2021@muriloboratto.pdf ├── .gitignore ├── samples ├── gatter │ ├── mpiGather.c │ ├── mpiAllGather.c │ └── ncclAllGather.cu ├── p2p │ ├── mpiSendRecv.c │ └── ncclSendRecv.cu ├── broadcast │ ├── mpiBcast.c │ └── ncclBcast.cu ├── reduce │ ├── mpiReduce.c │ └── ncclReduce.cu └── reducescatter │ └── ncclReduceScatter.cu └── README.md /Guia_Pratica_NCCL_MPI_Operacoes_Coletivas_portugues-2021@muriloboratto.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muriloboratto/NCCL/HEAD/Guia_Pratica_NCCL_MPI_Operacoes_Coletivas_portugues-2021@muriloboratto.pdf -------------------------------------------------------------------------------- /Guion_Practica_NCCL_MPI_Operaciones_Colectivas_espanol-2021@muriloboratto.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muriloboratto/NCCL/HEAD/Guion_Practica_NCCL_MPI_Operaciones_Colectivas_espanol-2021@muriloboratto.pdf -------------------------------------------------------------------------------- /Guide_Laboratory_NCCL_MPI_Collective_Operations_english-2021@muriloboratto.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muriloboratto/NCCL/HEAD/Guide_Laboratory_NCCL_MPI_Collective_Operations_english-2021@muriloboratto.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore the node_modules directory 2 | _*/ 3 | 4 | # Ignore Logs 5 | logs 6 | *.log 7 | 8 | # Ignore the build directory 9 | .ipynb_checkpoints 10 | 11 | # The file containing environment variables 12 | .env 13 | 14 | 
# Ignore IDE specific files 15 | .DS_Store 16 | i 17 | -------------------------------------------------------------------------------- /samples/gatter/mpiGather.c: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************80 2 | % Code: 3 | % mpiGather.cu 4 | % 5 | % Purpose: 6 | % Implements sample collective operation GATHER using MPI. 7 | % 8 | % Modified: 9 | % Sep 18 2018 10:57 10 | % 11 | % Author: 12 | % Murilo Boratto 13 | % 14 | % How to Compile: 15 | % mpicxx mpiGather.c -o mpiGather 16 | % 17 | % Execute: 18 | % mpirun -np 4 ./mpiGather 19 | % 20 | %****************************************************************************80*/ 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | int main( int argc, char **argv){ 27 | 28 | int isend; 29 | int *irecv = (int *) calloc (4, sizeof(int)); 30 | int rank, size; 31 | 32 | MPI_Init( &argc, &argv ); 33 | MPI_Comm_rank( MPI_COMM_WORLD, &rank ); 34 | MPI_Comm_size( MPI_COMM_WORLD, &size ); 35 | 36 | isend = rank + 1; 37 | 38 | MPI_Gather(&isend, 1, MPI_INT, irecv, 1, MPI_INT, 0, MPI_COMM_WORLD); 39 | 40 | if(rank == 0) 41 | printf("rank = %d\tisend = %d\tirecv = %d %d %d %d\n", rank, isend, irecv[0], irecv[1], irecv[2], irecv[3]); 42 | else 43 | printf("rank = %d\tisend = %d\tirecv = %d %d %d %d\n", rank, isend, irecv[0], irecv[1], irecv[2], irecv[3]); 44 | 45 | free(irecv); 46 | 47 | MPI_Finalize(); 48 | 49 | return 0; 50 | 51 | }/*main*/ -------------------------------------------------------------------------------- /samples/gatter/mpiAllGather.c: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************80 2 | % Code: 3 | % mpiAllGather.cu 4 | % 5 | % Purpose: 6 | % Implements sample collective operation ALLGATHER using MPI. 
7 | % 8 | % Modified: 9 | % Jan 09 2019 10:57 10 | % 11 | % Author: 12 | % Murilo Boratto 13 | % 14 | % How to Compile: 15 | % mpicxx mpiAllGather.c -o mpiAllGather 16 | % 17 | % Execute: 18 | % mpirun -np 4 ./mpiAllGather 19 | % 20 | % Comments: 21 | % 1) Simple testbed with size problem = 4 on 4 process. 22 | % 23 | %****************************************************************************80*/ 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | int main( int argc, char **argv){ 30 | 31 | int isend; 32 | int *irecv = (int *) calloc (4, sizeof(int)); 33 | int rank, size; 34 | 35 | MPI_Init( &argc, &argv ); 36 | MPI_Comm_rank( MPI_COMM_WORLD, &rank ); 37 | MPI_Comm_size( MPI_COMM_WORLD, &size ); 38 | 39 | switch(rank) { 40 | case 0 : isend = rank + 10; break; 41 | case 1 : isend = rank + 19; break; 42 | case 2 : isend = rank + 28; break; 43 | case 3 : isend = rank + 37; break; 44 | } 45 | 46 | MPI_Allgather(&isend, 1, MPI_INT, irecv, 1, MPI_INT, MPI_COMM_WORLD); 47 | 48 | printf("rank = %d\tisend = %d\tirecv = %d %d %d %d\n", rank, isend, irecv[0], irecv[1], irecv[2], irecv[3]); 49 | 50 | free(irecv); 51 | 52 | MPI_Finalize(); 53 | 54 | return 0; 55 | 56 | }/*main*/ -------------------------------------------------------------------------------- /samples/p2p/mpiSendRecv.c: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************801 2 | % Code: 3 | % mpiSendRecv.c 4 | % 5 | % Purpose: 6 | % Implements sample send/recv code using the package MPI. 
7 | % 8 | % Modified: 9 | % Aug 17 2020 10:57 10 | % 11 | % Author: 12 | % Murilo Boratto 13 | % 14 | % How to Compile: 15 | % mpicxx mpiSendRecv.c -o mpiSendRecv 16 | % 17 | % How to Execute: 18 | % mpirun -np 2 ./mpiSendRecv 19 | %****************************************************************************80*/ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | int main (int argc, char *argv[]){ 26 | 27 | int size = 8; 28 | int sendbuff[size]; 29 | int recvbuff[size]; 30 | int numprocessors, rank, dest, i, tag = 1000; 31 | 32 | MPI_Init(&argc, &argv); 33 | MPI_Comm_size(MPI_COMM_WORLD, &numprocessors); 34 | MPI_Comm_rank(MPI_COMM_WORLD,&rank); 35 | MPI_Status status; 36 | 37 | if (rank == 0){ 38 | 39 | printf("Rank %d\n", rank); 40 | 41 | for(int i = 0; i < size; i++) 42 | printf("%d\t", sendbuff[i] = i + 1); 43 | 44 | printf("\n"); 45 | 46 | for (dest = 1; dest < numprocessors; dest++) 47 | MPI_Send(&sendbuff, size, MPI_INT, dest, tag, MPI_COMM_WORLD); 48 | 49 | }else{ 50 | 51 | MPI_Recv(&recvbuff, size, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); 52 | 53 | printf("Rank %d\n", rank); 54 | 55 | for(i = 0; i < size; i++) 56 | printf("%d\t", recvbuff[i]+10); 57 | 58 | printf("\n"); 59 | 60 | } 61 | 62 | MPI_Finalize(); 63 | 64 | }/*main*/ 65 | -------------------------------------------------------------------------------- /samples/broadcast/mpiBcast.c: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************80 2 | % Code: 3 | % mpiBcast.c 4 | % 5 | % Purpose: 6 | % Implements sample code using collective operator Broadcast with the package MPI. 7 | % The code multiple the vector position per 2. 
8 | % 9 | % Modified: 10 | % Aug 17 2020 10:57 11 | % 12 | % Author: 13 | % Murilo Boratto 14 | % 15 | % How to Compile: 16 | % mpicxx mpiBcast.c -o mpiBcast 17 | % 18 | % Execute: 19 | % mpirun -np 4 ./mpiBcast 20 | % 21 | %****************************************************************************80*/ 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | void print_vector(int rank, int *in, int n, int label){ 28 | 29 | if(label) 30 | printf("[%d]\t", rank); 31 | else 32 | printf(" \t"); 33 | 34 | for(int i=0; i < n; i++) 35 | printf("%d\t", in[i]); 36 | 37 | printf("\n"); 38 | 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | 43 | int i, rank, size; 44 | 45 | MPI_Init (&argc, &argv); 46 | MPI_Comm_rank (MPI_COMM_WORLD, &rank); 47 | MPI_Comm_size (MPI_COMM_WORLD, &size); 48 | 49 | int data_size = 8; 50 | 51 | int *data = (int*) malloc(data_size * sizeof(int)); 52 | 53 | if(rank == 0) { 54 | for(int i = 0; i < data_size; i++) 55 | data[i] = rand()%(10-2)*2; 56 | 57 | print_vector(rank, data, data_size, 0); 58 | } 59 | 60 | MPI_Bcast(data, data_size, MPI_INT, 0, MPI_COMM_WORLD); 61 | 62 | for(int i = 0; i < data_size; i++) 63 | data[i] *= 2; 64 | 65 | print_vector(rank, data, data_size, 1); 66 | 67 | MPI_Finalize(); 68 | 69 | return 0; 70 | 71 | }/*main*/ 72 | 73 | -------------------------------------------------------------------------------- /samples/reduce/mpiReduce.c: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************80 2 | % Code: 3 | % mpiReduce.c 4 | % 5 | % Purpose: 6 | % Implements sample code using collective operator Reduce with the package MPI. 7 | % The code calculate the dot product(scalar product). 8 | % x = (xo, x1, x2, ..., xn) 9 | % y = (yo, y1, y2, ..., yn) 10 | % c = (xo . yo + x1 . y1 + ..., xn . 
yn) 11 | % 12 | % Modified: 13 | % Aug 18 2020 10:57 14 | % 15 | % Author: 16 | % Murilo Boratto 17 | % 18 | % How to Compile: 19 | % mpicxx mpiReduce.c -o mpiReduce 20 | % 21 | % Execute: 22 | % mpirun -np 2 ./mpiReduce 23 | % 24 | %****************************************************************************80*/ 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | void print_vector(double *in, int n){ 31 | 32 | for(int i=0; i < n; i++) 33 | printf("%1.2f\t", in[i]); 34 | 35 | printf("\n"); 36 | 37 | }/*print_vector*/ 38 | 39 | 40 | int main(int argc, char* argv[]) { 41 | 42 | int i, rank, size; 43 | double result = 0, result_f; 44 | 45 | MPI_Init (&argc, &argv); 46 | MPI_Comm_rank (MPI_COMM_WORLD, &rank); 47 | MPI_Comm_size (MPI_COMM_WORLD, &size); 48 | 49 | int data_size = 8; 50 | 51 | double *x = (double*) malloc(data_size * sizeof(double)); 52 | double *y = (double*) malloc(data_size * sizeof(double)); 53 | 54 | for(int i = 0; i < data_size; i++){ 55 | x[i] = 1; 56 | y[i] = 2; 57 | result = result + x[i] * y[i]; 58 | } 59 | 60 | if(rank == 0 || rank){ 61 | printf("Rank %d\n", rank); 62 | print_vector(x, data_size); 63 | print_vector(y, data_size); 64 | } 65 | 66 | MPI_Reduce(&result, &result_f, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); 67 | 68 | if(rank == 0) 69 | printf("dot(x,y) = %f\n", result_f); 70 | 71 | MPI_Finalize(); 72 | 73 | return 0; 74 | 75 | }/*main*/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sample Codes using NCCL on Multi-GPU 2 | 3 | It is necessary to carry out communication operations involving multiple computational resources in most parallel applications. These communication operations can be implemented through point-to-point operations. However, this approach is not very efficient for the programmer. 
Parallel and distributed solutions based on collective operations have long been chosen for these applications. The MPI standard has a set of very efficient routines that perform collective operations, making better use of the computing capacity of available computational resources. Also, with the advent of new computational resources, similar routines have appeared for multi-GPU systems. This repository covers the handling of NCCL routines for multi-GPU environments, constantly comparing them with the MPI standard and showing the differences and similarities between the two execution environments. 4 | 5 | ---- 6 | ## What is NCCL? 7 | see [NVIDIA](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/index.html) 8 | 9 | > This repository is a sample of how to call the collective operation functions of NCCL (NVIDIA Collective Communications Library) on multi-GPU systems. It contains simple examples of the broadcast, reduce, allGather, and reduceScatter operations. 10 | 11 | ---- 12 | 13 | ## NCCL Solution 14 | 15 | NVIDIA addresses this interconnect issue with the NVIDIA Collective Communications Library (NCCL). This library provides routines such as all-gather, all-reduce, broadcast, reduce, and reduce-scatter, optimized to achieve high bandwidth over PCIe and NVLink high-speed interconnects, and implements multi-GPU and multi-node collective communication primitives that are performance-optimized for NVIDIA GPUs. NCCL is a library of multi-GPU collective communication primitives that are topology-aware and easily integrated into your application. Initially developed as an open-source research project, NCCL is lightweight, depending only on the usual C++ and CUDA libraries. 
16 | 17 | ---- 18 | 19 | ## Collective Operations 20 | 21 | At present, this repository provides samples of the following collective operations: 22 | 23 | * broadcast 24 | * gather 25 | * send-recv 26 | * reduce 27 | * reduce-scatter 28 | 29 | ---- 30 | 31 | ## Requirements 32 | 33 | NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe-based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported. 34 | -------------------------------------------------------------------------------- /samples/broadcast/ncclBcast.cu: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************80 2 | % Code: 3 | % ncclBcast.cu 4 | % 5 | % Purpose: 6 | % Implements sample BROADCAST code using the package NCCL (ncclBcast). 7 | % Using 'Multiple Devices per Thread'. 8 | % The code multiplies each vector element by 2 on the GPUs. 
9 | % 10 | % Modified: 11 | % Aug 17 2020 10:57 12 | % 13 | % Author: 14 | % Murilo Boratto 15 | % 16 | % How to Compile: 17 | % nvcc ncclBcast.cu -o ncclBcast -lnccl 18 | % 19 | % Execute: 20 | % ./ncclBcast 21 | % 22 | %****************************************************************************80*/

/* NOTE(review): the header names were missing in the reviewed source; these
 * three were reconstructed from the symbols used below. */
#include <stdio.h>
#include <stdlib.h>
#include <nccl.h>

/* Abort with file/line context when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                      \
  do {                                                                        \
    cudaError_t err_ = (call);                                                \
    if (err_ != cudaSuccess) {                                                \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,           \
              cudaGetErrorString(err_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/* Abort with file/line context when an NCCL call fails. */
#define NCCL_CHECK(call)                                                      \
  do {                                                                        \
    ncclResult_t res_ = (call);                                               \
    if (res_ != ncclSuccess) {                                                \
      fprintf(stderr, "NCCL error %s:%d: %s\n", __FILE__, __LINE__,           \
              ncclGetErrorString(res_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/*
 * Doubles a[threadIdx.x] in place and prints the new value.
 * Launch layout: a single block with exactly one thread per vector element;
 * the kernel has no bounds check, so the caller must not launch more threads
 * than `a` has elements.
 */
__global__ void kernel(int *a)
{
  int index = threadIdx.x;

  a[index] *= 2;
  printf("%d\t", a[index]);

}/*kernel*/


/* Prints the n entries of the host vector `in` on one tab-separated line. */
void print_vector(int *in, int n){

  for(int i=0; i < n; i++)
    printf("%d\t", in[i]);

  printf("\n");

}/*print_vector*/


/*
 * Builds a host vector, copies it to GPU 0, broadcasts it from rank 0 to all
 * visible GPUs with ncclBcast, then doubles and prints it on every GPU.
 */
int main(int argc, char* argv[]) {

  int data_size = 8 ;
  int nGPUs = 0;
  CUDA_CHECK(cudaGetDeviceCount(&nGPUs));

  if(nGPUs < 1){
    fprintf(stderr, "No CUDA device available\n");
    return EXIT_FAILURE;
  }

  int  *DeviceList = (int *)  malloc (nGPUs     * sizeof(int));
  int  *data       = (int*)   malloc (data_size * sizeof(int));
  int **d_data     = (int**)  malloc (nGPUs     * sizeof(int*));

  /*Initializing NCCL with Multiples Devices per Thread*/
  ncclComm_t*   comms = (ncclComm_t*)  malloc(sizeof(ncclComm_t)  * nGPUs);
  cudaStream_t* s     = (cudaStream_t*)malloc(sizeof(cudaStream_t)* nGPUs);

  if(!DeviceList || !data || !d_data || !comms || !s){
    fprintf(stderr, "Host allocation failed\n");
    return EXIT_FAILURE;
  }

  for(int i = 0; i < nGPUs; i++)
    DeviceList[i] = i;

  NCCL_CHECK(ncclCommInitAll(comms, nGPUs, DeviceList));

  /*Population the data vector: (rand() % 8) * 2, unseeded*/
  for(int i = 0; i < data_size; i++)
    data[i] = rand()%(10-2)*2;

  print_vector(data, data_size);

  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    CUDA_CHECK(cudaStreamCreate(&s[g]));
    CUDA_CHECK(cudaMalloc(&d_data[g], data_size * sizeof(int)));

    if(g == 0)  /*Copy from Host to Device: only the broadcast root needs the data*/
      CUDA_CHECK(cudaMemcpy(d_data[g], data, data_size * sizeof(int), cudaMemcpyHostToDevice));
  }

  NCCL_CHECK(ncclGroupStart());

  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    NCCL_CHECK(ncclBcast(d_data[g], data_size, ncclInt, 0, comms[g], s[g])); /*Broadcasting it to all*/
  }

  NCCL_CHECK(ncclGroupEnd());

  for (int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    printf("\nThis is device %d\n", g);
    /* Launched on the same stream as the broadcast, so the kernel is ordered
     * after it explicitly rather than through default-stream semantics. */
    kernel <<< 1 , data_size, 0, s[g] >>> (d_data[g]);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaStreamSynchronize(s[g]));
  }

  printf("\n");

  for(int g = 0; g < nGPUs; g++) {  /*Release per-device resources*/
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    CUDA_CHECK(cudaStreamDestroy(s[g]));
    CUDA_CHECK(cudaFree(d_data[g]));  /* the device buffers were never freed before */
  }

  for(int g = 0; g < nGPUs; g++)   /*Finalizing NCCL*/
    NCCL_CHECK(ncclCommDestroy(comms[g]));

  /*Freeing host memory. d_data is a host array of device pointers, so it is
    released with free(), not cudaFree().*/
  free(s);
  free(comms);
  free(data);
  free(DeviceList);
  free(d_data);

  return 0;

}/*main*/

-------------------------------------------------------------------------------- /samples/p2p/ncclSendRecv.cu: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************80 2 | % Code: 3 | % ncclSendRecv.cu 4 | % 5 | % Purpose: 6 | % Implements sample send/recv code using the package NCCL (p2p).
7 | % 8 | % Modified: 9 | % Aug 18 2020 10:57 10 | % 11 | % Author: 12 | % Murilo Boratto 13 | % 14 | % How to Compile: 15 | % nvcc ncclSendRecv.cu -o object -lnccl 16 | % 17 | % HowtoExecute: 18 | % ./object 19 | % 20 | %****************************************************************************80*/

/* NOTE(review): the header names were missing in the reviewed source (four
 * #include lines); these were reconstructed from the symbols used below. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <nccl.h>

/* Abort with file/line context when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                      \
  do {                                                                        \
    cudaError_t err_ = (call);                                                \
    if (err_ != cudaSuccess) {                                                \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,           \
              cudaGetErrorString(err_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/* Abort with file/line context when an NCCL call fails. */
#define NCCL_CHECK(call)                                                      \
  do {                                                                        \
    ncclResult_t res_ = (call);                                               \
    if (res_ != ncclSuccess) {                                                \
      fprintf(stderr, "NCCL error %s:%d: %s\n", __FILE__, __LINE__,           \
              ncclGetErrorString(res_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/*
 * Prints a[threadIdx.x]; ranks other than 0 print the value multiplied by 10.
 * Launch layout: a single block with exactly one thread per element of `a`
 * (no bounds check inside the kernel).
 */
__global__ void kernel(int *a, int rank) {

  if(rank == 0)
    printf("%d\t", a[threadIdx.x]);
  else
    printf("%d\t", a[threadIdx.x]*10);
}

/* Prints the n entries of the host vector `in`, framed by blank lines. */
void show_all(int *in, int n){

  printf("\n");

  for(int i=0; i < n; i++)
    printf("%d\t", in[i]);

  printf("\n");

}/*show_all*/


/*
 * GPU 0 holds the vector 1..size and sends it to every other GPU with
 * ncclSend/ncclRecv. GPU 0 then prints its send buffer; every other GPU
 * prints what it received (times 10, see kernel).
 */
int main(int argc, char* argv[]) {

  int size = 8;

  /*Get current amounts number of GPU*/
  int nGPUs = 0;
  CUDA_CHECK(cudaGetDeviceCount(&nGPUs));
  printf("nGPUs = %d\n",nGPUs);

  if(nGPUs < 1){
    fprintf(stderr, "No CUDA device available\n");
    return EXIT_FAILURE;
  }

  /*List GPU Device*/
  int *DeviceList = (int *) malloc ( nGPUs * sizeof(int));

  /*NCCL Init*/
  ncclComm_t*   comms = (ncclComm_t*)  malloc(sizeof(ncclComm_t)  * nGPUs);
  cudaStream_t* s     = (cudaStream_t*)malloc(sizeof(cudaStream_t)* nGPUs);

  /*General variables*/
  int  *host     = (int*)  malloc(size  * sizeof(int));
  int **sendbuff = (int**) malloc(nGPUs * sizeof(int*));
  int **recvbuff = (int**) malloc(nGPUs * sizeof(int*));

  if(!DeviceList || !comms || !s || !host || !sendbuff || !recvbuff){
    fprintf(stderr, "Host allocation failed\n");
    return EXIT_FAILURE;
  }

  for(int i = 0; i < nGPUs; ++i)
    DeviceList[i] = i;

  NCCL_CHECK(ncclCommInitAll(comms, nGPUs, DeviceList));

  /*Population of vector*/
  for(int i = 0; i < size; i++)
    host[i] = i + 1;

  show_all(host, size);

  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    CUDA_CHECK(cudaStreamCreate(&s[g]));
    CUDA_CHECK(cudaMalloc(&sendbuff[g], size * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&recvbuff[g], size * sizeof(int)));

    if(g == 0)  /* only the sender needs the payload */
      CUDA_CHECK(cudaMemcpy(sendbuff[g], host, size * sizeof(int), cudaMemcpyHostToDevice));

  }/*for*/

  /* Rank 0 sends to every other rank g, and rank g receives from rank 0.
   * The `peer` argument is the rank on the other side of the transfer, and
   * each call must be issued on the communicator/stream of the rank that owns
   * the buffer. The previous code used peer = g on comms[g] for both calls,
   * i.e. every rank exchanged with itself while reading GPU 0's buffer. */
  NCCL_CHECK(ncclGroupStart());

  for(int g = 1; g < nGPUs; g++) {
    NCCL_CHECK(ncclSend(sendbuff[0], size, ncclInt, g, comms[0], s[0]));
    NCCL_CHECK(ncclRecv(recvbuff[g], size, ncclInt, 0, comms[g], s[g]));
  }

  NCCL_CHECK(ncclGroupEnd());

  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    printf("\nThis is device %d\n", g);

    /* Device 0 shows what it sent; the others show what they received.
     * Launching on s[g] orders the kernel after the transfer on that stream. */
    int *shown = (g == 0) ? sendbuff[g] : recvbuff[g];
    kernel <<< 1 , size, 0, s[g] >>> (shown, g);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaStreamSynchronize(s[g]));
  }

  printf("\n");

  for(int g = 0; g < nGPUs; g++) {  /* release per-device resources */
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    CUDA_CHECK(cudaStreamDestroy(s[g]));
    CUDA_CHECK(cudaFree(sendbuff[g]));  /* device buffers were never freed before */
    CUDA_CHECK(cudaFree(recvbuff[g]));
  }

  for(int g = 0; g < nGPUs; g++) {
    NCCL_CHECK(ncclCommDestroy(comms[g]));
  }

  /* sendbuff/recvbuff are host arrays of device pointers: free(), not cudaFree(). */
  free(s);
  free(comms);
  free(host);
  free(DeviceList);
  free(sendbuff);
  free(recvbuff);

  return 0;

}/*main*/
-------------------------------------------------------------------------------- /samples/gatter/ncclAllGather.cu: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************80 2 | % Code: 3 | % ncclAllGather.cu 4 | % 5 | % Purpose: 6 | % Implements a simple collective operation ALLGATHER using NCCL (ncclAllGather). 7 | % 8 | % Modified: 9 | % Jan 09 2019 10:57 10 | % 11 | % Author: 12 | % Murilo Boratto [muriloboratto 'at' gmail.com] 13 | % 14 | % How to Compile: 15 | % nvcc ncclAllGather.cu -o ncclAllGather -lnccl 16 | % 17 | % How to Execute: 18 | % ./ncclAllGather 19 | % 20 | % Comments: 21 | % 22 | % 1) For ncclAllGather, in place operations are done when the per-rank pointer is located at the rank offset 23 | % of the global buffer.
More precisely, these calls are considered in place: 24 | % 25 | % ncclAllGather(data+rank*sendcount, data, sendcount, datatype, comm, stream); 26 | % 27 | % 2) Simple Testbed with size problem = 4 on environment with 4 GPUs. 28 | % 29 | %****************************************************************************80*/

/* NOTE(review): the first header name was missing in the reviewed source and
 * is reconstructed as <stdio.h>; <stdlib.h> is added for malloc/free/exit. */
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "nccl.h"

/* Abort with file/line context when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                      \
  do {                                                                        \
    cudaError_t err_ = (call);                                                \
    if (err_ != cudaSuccess) {                                                \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,           \
              cudaGetErrorString(err_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/* Abort with file/line context when an NCCL call fails. */
#define NCCL_CHECK(call)                                                      \
  do {                                                                        \
    ncclResult_t res_ = (call);                                               \
    if (res_ != ncclSuccess) {                                                \
      fprintf(stderr, "NCCL error %s:%d: %s\n", __FILE__, __LINE__,           \
              ncclGetErrorString(res_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/*
 * Prints x[threadIdx.x].
 * Launch layout: a single block with one thread per element to print
 * (no bounds check inside the kernel).
 */
__global__ void Dev_print(float *x) {

  int i = threadIdx.x;

  printf("%1.2f\t", x[i]);

}/*Dev_print*/


/* Prints only the non-zero entries of `in`, tab-separated, without a newline. */
void print_vector(float *in, int n){

  for(int i=0; i < n; i++)
    if(in[i])
      printf("%1.2f\t", in[i]);

}/*print_vector*/


/*
 * 4-GPU testbed: GPU g owns a vector whose only non-zero entry is at index g.
 * Each GPU contributes that single element and ncclAllGather assembles the
 * four contributions into recvbuff on every GPU, which then prints it.
 */
int main(int argc, char* argv[]){

  /*Variables*/
  int size      = 4;
  int nGPUs     = 4;
  int sendcount = 1;
  int DeviceList[4] = {0, 1, 2, 3}; /* (GPUs Id) Testbed on environment with 4 GPUs*/

  /* The sample is hard-wired to 4 devices: fail early and clearly otherwise. */
  int available = 0;
  CUDA_CHECK(cudaGetDeviceCount(&available));

  if(available < nGPUs){
    fprintf(stderr, "This sample needs %d GPUs but only %d are available\n", nGPUs, available);
    return EXIT_FAILURE;
  }

  /*Initializing NCCL with Multiples Devices per Thread*/
  ncclComm_t*   comms = (ncclComm_t*)  malloc(sizeof(ncclComm_t)  * nGPUs);
  cudaStream_t* s     = (cudaStream_t*)malloc(sizeof(cudaStream_t)* nGPUs);

  /*Host tables of per-device buffer pointers*/
  float** sendbuff = (float**) malloc(nGPUs * sizeof(float*));
  float** recvbuff = (float**) malloc(nGPUs * sizeof(float*));

  if(!comms || !s || !sendbuff || !recvbuff){
    fprintf(stderr, "Host allocation failed\n");
    return EXIT_FAILURE;
  }

  NCCL_CHECK(ncclCommInitAll(comms, nGPUs, DeviceList));

  /*Host vectors: row g is the data owned by GPU g*/
  float host_x[4][4] = { { 10,  0,  0,  0},
                         {  0, 20,  0,  0},
                         {  0,  0, 30,  0},
                         {  0,  0,  0, 40} };

  for (int i = 0; i < nGPUs; ++i)
    print_vector(host_x[i], size);

  for (int i = 0; i < nGPUs; ++i) {

    CUDA_CHECK(cudaSetDevice(DeviceList[i]));

    CUDA_CHECK(cudaMalloc(&sendbuff[i], size * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&recvbuff[i], size * sizeof(float)));

    /*Copy from host to devices*/
    CUDA_CHECK(cudaMemcpy(sendbuff[i], host_x[i], size * sizeof(float), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaStreamCreate(s+i));

  }

  NCCL_CHECK(ncclGroupStart());

  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    /* sendbuff[g] + g points at the single non-zero element owned by GPU g. */
    NCCL_CHECK(ncclAllGather(sendbuff[g] + g, recvbuff[g], sendcount, ncclFloat, comms[g], s[g])); /*All Gathering the data on GPUs*/
  }

  NCCL_CHECK(ncclGroupEnd());


  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    printf("\nThis is device %d\n", g);
    /* Same stream as the collective, so the print is ordered after it. */
    Dev_print <<< 1, size, 0, s[g] >>> (recvbuff[g]); /*Call the CUDA Kernel: Print vector on GPUs*/
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaStreamSynchronize(s[g]));
  }

  printf("\n");

  for (int i = 0; i < nGPUs; ++i) { /*Release per-device resources*/
    CUDA_CHECK(cudaSetDevice(DeviceList[i]));
    CUDA_CHECK(cudaStreamDestroy(s[i]));   /* streams were never destroyed before */
    CUDA_CHECK(cudaFree(sendbuff[i]));
    CUDA_CHECK(cudaFree(recvbuff[i]));
  }

  for(int i = 0; i < nGPUs; ++i)   /*Finalizing NCCL*/
    NCCL_CHECK(ncclCommDestroy(comms[i]));

  /*Freeing host memory. sendbuff/recvbuff are host arrays of device pointers,
    so they are released with free(), not cudaFree().*/
  free(sendbuff);
  free(recvbuff);
  free(s);
  free(comms);

  return 0;

}/*main*/
-------------------------------------------------------------------------------- /samples/reducescatter/ncclReduceScatter.cu: -------------------------------------------------------------------------------- 1 | /*%****************************************************************************80 2 | % Code: 3 | % ncclReduceScatter.cu 4 | % 5 | % Purpose: 6 | % Implements a simple collective operation REDUCESCATTER using NCCL (ncclReduceScatter).
7 | % 8 | % Modified: 9 | % Aug 18 2020 10:57 10 | % 11 | % Author: 12 | % Murilo Boratto 13 | % 14 | % How to Compile: 15 | % nvcc ncclReduceScatter.cu -o ncclReduceScatter -lnccl 16 | % 17 | % How to Execute: 18 | % ./ncclReduceScatter 19 | % 20 | % Comments: 21 | % 22 | % 1) For ncclReduceScatter, in place operations are done when the per-rank pointer is located at the rank offset 23 | % of the global buffer. More precisely, these calls are considered in place: 24 | % 25 | % ncclReduceScatter(data, data+rank*recvcount, recvcount, datatype, op, comm, stream); 26 | % 27 | % 2) Simple Testbed with size problem = 4 on environment with 4 GPUs. 28 | %****************************************************************************80*/

/* NOTE(review): the first header name was missing in the reviewed source and
 * is reconstructed as <stdio.h>; <stdlib.h> is added for malloc/free/exit. */
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "nccl.h"

/* Abort with file/line context when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                      \
  do {                                                                        \
    cudaError_t err_ = (call);                                                \
    if (err_ != cudaSuccess) {                                                \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,           \
              cudaGetErrorString(err_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/* Abort with file/line context when an NCCL call fails. */
#define NCCL_CHECK(call)                                                      \
  do {                                                                        \
    ncclResult_t res_ = (call);                                               \
    if (res_ != ncclSuccess) {                                                \
      fprintf(stderr, "NCCL error %s:%d: %s\n", __FILE__, __LINE__,           \
              ncclGetErrorString(res_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/*
 * Prints x[threadIdx.x].
 * Launch layout: a single block with one thread per element to print
 * (no bounds check inside the kernel).
 */
__global__ void Dev_print(float *x) {

  int i = threadIdx.x;

  printf("%1.2f\t", x[i]);


}/*Dev_print*/


/* Prints the n entries of the host vector `in` on one tab-separated line. */
void print_vector(float *in, int n){

  for(int i=0; i < n; i++)
    printf("%1.2f\t", in[i]);

  printf("\n");

}/*print_vector*/


/*
 * 4-GPU testbed: every GPU owns a 4-element vector. ncclReduceScatter sums
 * the vectors element-wise across GPUs and leaves recvcount (= 1) element of
 * the result in each GPU's recvbuff, which is then printed.
 */
int main(int argc, char* argv[]){

  /*Variables*/
  int size      = 4;
  int nGPUs     = 4;
  int recvcount = 1;
  int DeviceList[4] = {0, 1, 2, 3}; /* (GPUs Id) Testbed on environment with 4 GPUs*/

  /* The sample is hard-wired to 4 devices: fail early and clearly otherwise. */
  int available = 0;
  CUDA_CHECK(cudaGetDeviceCount(&available));

  if(available < nGPUs){
    fprintf(stderr, "This sample needs %d GPUs but only %d are available\n", nGPUs, available);
    return EXIT_FAILURE;
  }

  /*Initializing NCCL with Multiples Devices per Thread*/
  ncclComm_t*   comms = (ncclComm_t*)  malloc(sizeof(ncclComm_t)  * nGPUs);
  cudaStream_t* s     = (cudaStream_t*)malloc(sizeof(cudaStream_t)* nGPUs);

  /*Host tables of per-device buffer pointers*/
  float** sendbuff = (float**) malloc(nGPUs * sizeof(float*));
  float** recvbuff = (float**) malloc(nGPUs * sizeof(float*));

  if(!comms || !s || !sendbuff || !recvbuff){
    fprintf(stderr, "Host allocation failed\n");
    return EXIT_FAILURE;
  }

  NCCL_CHECK(ncclCommInitAll(comms, nGPUs, DeviceList));

  /*Host vectors: row g is the data owned by GPU g*/
  float host_x[4][4] = { { 10, 50,  90, 130},
                         { 20, 60, 100, 140},
                         { 30, 70, 110, 150},
                         { 40, 80, 120, 160} };

  for (int i = 0; i < nGPUs; ++i)
    print_vector(host_x[i], size);

  for (int i = 0; i < nGPUs; ++i) {

    CUDA_CHECK(cudaSetDevice(DeviceList[i]));

    CUDA_CHECK(cudaMalloc(&sendbuff[i], size * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&recvbuff[i], size * sizeof(float)));

    /* Only recvcount element(s) of recvbuff are written by the collective but
     * `size` elements are printed below; zero the buffer so the remaining
     * columns are defined instead of uninitialised device memory. */
    CUDA_CHECK(cudaMemset(recvbuff[i], 0, size * sizeof(float)));

    /*Copy from host to devices*/
    CUDA_CHECK(cudaMemcpy(sendbuff[i], host_x[i], size * sizeof(float), cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaStreamCreate(s+i));

  }

  NCCL_CHECK(ncclGroupStart());

  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    NCCL_CHECK(ncclReduceScatter(sendbuff[g], recvbuff[g], recvcount, ncclFloat, ncclSum, comms[g], s[g])); /*All Reducing and Scattering the data on GPUs*/
  }

  NCCL_CHECK(ncclGroupEnd());


  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    printf("\nThis is device %d\n", g);
    /* Same stream as the collective, so the print is ordered after it. */
    Dev_print <<< 1, size, 0, s[g] >>> (recvbuff[g]); /*Call the CUDA Kernel: Print vector on GPUs*/
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaStreamSynchronize(s[g]));
  }

  printf("\n");

  for (int i = 0; i < nGPUs; ++i) { /*Release per-device resources*/
    CUDA_CHECK(cudaSetDevice(DeviceList[i]));
    CUDA_CHECK(cudaStreamDestroy(s[i]));   /* streams were never destroyed before */
    CUDA_CHECK(cudaFree(sendbuff[i]));
    CUDA_CHECK(cudaFree(recvbuff[i]));
  }

  for(int i = 0; i < nGPUs; ++i)   /*Finalizing NCCL*/
    NCCL_CHECK(ncclCommDestroy(comms[i]));

  /*Freeing host memory. sendbuff/recvbuff are host arrays of device pointers,
    so they are released with free(), not cudaFree().*/
  free(sendbuff);
  free(recvbuff);
  free(s);
  free(comms);

  return 0;

}/*main*/
--------------------------------------------------------------------------------
/samples/reduce/ncclReduce.cu:
--------------------------------------------------------------------------------
/*%****************************************************************************80
%  Code:
%   ncclReduce.cu
%
%  Purpose:
%   Implements sample reduce code using the package NCCL(ncclReduce).
%   Using 'Multiples Devices per Thread'.
%   Implements dot product(scalar product).
%   x = (xo, x1, x2, ..., xn)
%   y = (yo, y1, y2, ..., yn)
%   c = (xo . yo + x1 . y1 + ..., xn . yn)
%
%  Modified:
%   Aug 18 2020 10:57
%
%  Author:
%   Murilo Boratto
%
%  How to Compile:
%   nvcc ncclReduce.cu -o ncclReduce -lnccl
%
%  Execute:
%   ./ncclReduce
%
%****************************************************************************80*/

/* NOTE(review): the header names were missing in the reviewed source (five
 * #include lines); the ones below were reconstructed from the symbols used. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <nccl.h>

/* Abort with file/line context when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                      \
  do {                                                                        \
    cudaError_t err_ = (call);                                                \
    if (err_ != cudaSuccess) {                                                \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,           \
              cudaGetErrorString(err_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/* Abort with file/line context when an NCCL call fails. */
#define NCCL_CHECK(call)                                                      \
  do {                                                                        \
    ncclResult_t res_ = (call);                                               \
    if (res_ != ncclSuccess) {                                                \
      fprintf(stderr, "NCCL error %s:%d: %s\n", __FILE__, __LINE__,           \
              ncclGetErrorString(res_));                                      \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)

/*
 * Block-level tree reduction: sums the first n entries of `x` covered by the
 * block in shared memory, stores the block total in y[blockIdx.x] and prints
 * it from thread 0.
 *
 * Launch requirements: blockDim.x must be a power of two (the stride halves
 * each step) and at most 512 (size of the shared scratch array).
 *
 * NOTE(review): despite the name, the kernel only sums `x`; `y` is used as
 * the output slot and no element-wise product is formed. The arithmetic is
 * kept unchanged so the sample's printed result stays the same - confirm
 * whether a true dot product was intended.
 */
__global__ void Dev_dot(double *x, double *y, int n) {

  __shared__ double tmp[512];

  int i = threadIdx.x;
  int t = blockDim.x * blockIdx.x + threadIdx.x;

  /* Threads past the end contribute 0 so the reduction never reads an
   * uninitialised shared-memory slot. */
  tmp[i] = (t < n) ? x[t] : 0.0;

  __syncthreads();

  for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {

    if (i < stride)
      tmp[i] += tmp[i + stride];

    /* Outside the branch: every thread of the block must reach the barrier. */
    __syncthreads();

  }

  if (threadIdx.x == 0) {
    y[blockIdx.x] = tmp[0];
    printf("\tdot(x,y) = %1.2f\n", y[blockIdx.x]);
  }

}/*Dev_dot*/


/*
 * Prints x[threadIdx.x].
 * Launch layout: a single block with one thread per element to print
 * (no bounds check inside the kernel).
 */
__global__ void Dev_print(double *x) {

  int i = threadIdx.x;

  printf("%1.2f\t", x[i]);

}/*Dev_print*/


/* Prints the n entries of the host vector `in` on one tab-separated line. */
void print_vector(double *in, int n){

  for(int i=0; i < n; i++)
    printf("%1.2f\t", in[i]);

  printf("\n");

}/*print_vector*/


/*
 * Copies x (all 1) and y (all 2) to every visible GPU, sums each vector
 * element-wise across GPUs onto rank 0 with ncclReduce, then runs Dev_dot on
 * every GPU over the reduced y buffer.
 */
int main(int argc, char* argv[]) {

  /*Variables*/
  int nGPUs = 0;
  CUDA_CHECK(cudaGetDeviceCount(&nGPUs));

  if(nGPUs < 1){
    fprintf(stderr, "No CUDA device available\n");
    return EXIT_FAILURE;
  }

  int *DeviceList = (int *) malloc ( nGPUs * sizeof(int));

  int data_size = 8;

  double *x          = (double*)  malloc(data_size * sizeof(double));
  double *y          = (double*)  malloc(data_size * sizeof(double));
  double **x_d_data  = (double**) malloc(nGPUs * sizeof(double*));
  double **y_d_data  = (double**) malloc(nGPUs * sizeof(double*));
  double **Sx_d_data = (double**) malloc(nGPUs * sizeof(double*));
  double **Sy_d_data = (double**) malloc(nGPUs * sizeof(double*));

  /*Initializing NCCL with Multiples Devices per Thread*/
  ncclComm_t*   comms = (ncclComm_t*)  malloc(sizeof(ncclComm_t)  * nGPUs);
  cudaStream_t* s     = (cudaStream_t*)malloc(sizeof(cudaStream_t)* nGPUs);

  if(!DeviceList || !x || !y || !x_d_data || !y_d_data ||
     !Sx_d_data || !Sy_d_data || !comms || !s){
    fprintf(stderr, "Host allocation failed\n");
    return EXIT_FAILURE;
  }

  for (int i = 0; i < nGPUs; ++i)
    DeviceList[i] = i;

  NCCL_CHECK(ncclCommInitAll(comms, nGPUs, DeviceList));

  /*Population vectors*/
  for(int i = 0; i < data_size; i++){
    x[i] = 1;
    y[i] = 2;
  }

  print_vector(x, data_size);
  print_vector(y, data_size);


  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    CUDA_CHECK(cudaStreamCreate(&s[g]));

    CUDA_CHECK(cudaMalloc(&x_d_data[g], data_size * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&y_d_data[g], data_size * sizeof(double)));

    CUDA_CHECK(cudaMalloc(&Sx_d_data[g], data_size * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&Sy_d_data[g], data_size * sizeof(double)));

    /* ncclReduce only fills the receive buffers on the root, but Dev_dot is
     * run on every GPU below. Zero them so non-root devices read defined
     * values instead of uninitialised device memory. */
    CUDA_CHECK(cudaMemset(Sx_d_data[g], 0, data_size * sizeof(double)));
    CUDA_CHECK(cudaMemset(Sy_d_data[g], 0, data_size * sizeof(double)));

    CUDA_CHECK(cudaMemcpy(x_d_data[g], x, data_size * sizeof(double), cudaMemcpyHostToDevice)); /*Copy from Host to Devices*/
    CUDA_CHECK(cudaMemcpy(y_d_data[g], y, data_size * sizeof(double), cudaMemcpyHostToDevice));
  }

  NCCL_CHECK(ncclGroupStart());

  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    NCCL_CHECK(ncclReduce(x_d_data[g], Sx_d_data[g], data_size, ncclDouble, ncclSum, 0, comms[g], s[g])); /*Reducing x vector*/
    NCCL_CHECK(ncclReduce(y_d_data[g], Sy_d_data[g], data_size, ncclDouble, ncclSum, 0, comms[g], s[g])); /*Reducing y vector*/
  }

  NCCL_CHECK(ncclGroupEnd());


  for(int g = 0; g < nGPUs; g++) {
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    printf("\n This is device %d\n", g);
    /* Same stream as the reductions, so the kernel is ordered after them.
     * Argument order is unchanged: the reduced y is the input, and the first
     * slot of the reduced x receives the total. */
    Dev_dot <<< 1, data_size, 0, s[g] >>> (Sy_d_data[g], Sx_d_data[g], data_size); /*Call the CUDA Kernel: dot product*/
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaStreamSynchronize(s[g]));
  }

  for(int g = 0; g < nGPUs; g++) {  /*Release per-device resources*/
    CUDA_CHECK(cudaSetDevice(DeviceList[g]));
    CUDA_CHECK(cudaStreamDestroy(s[g]));
    CUDA_CHECK(cudaFree(x_d_data[g]));   /* device buffers were never freed before */
    CUDA_CHECK(cudaFree(y_d_data[g]));
    CUDA_CHECK(cudaFree(Sx_d_data[g]));
    CUDA_CHECK(cudaFree(Sy_d_data[g]));
  }

  for(int g = 0; g < nGPUs; g++)   /*Finalizing NCCL*/
    NCCL_CHECK(ncclCommDestroy(comms[g]));

  /*Freeing host memory. The *_d_data tables are host arrays of device
    pointers, so they are released with free(), not cudaFree().*/
  free(s);
  free(comms);
  free(x);
  free(y);
  free(DeviceList);
  free(x_d_data);
  free(y_d_data);
  free(Sx_d_data);
  free(Sy_d_data);

  return 0;

}/*main*/
--------------------------------------------------------------------------------