├── .gitignore ├── README.md ├── communications └── gpt-2.cpp ├── cosmoflow.cpp ├── dlrm.cpp └── gpt-3.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | secretz.sh 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | logo 5 | 6 |

7 | Distributed DNNs (Deep Neural Networks) 8 |

9 |

10 | C++/MPI proxies to perform distributed training of DNNs 11 |
12 |

13 | 14 | [![Github][github]][github-url] 15 | 16 | 17 |
18 | 19 |
20 | 21 | ## Table of Contents 22 | 23 |
    24 | 📝 About
    25 | 💻 How to build
    26 | 🔧 Tools used 27 | 29 | 👤 Contact 30 |
31 | 32 |
33 | 34 | ## 📝About 35 | 36 | C++/MPI proxies to perform distributed training of DNNs (deep neural networks): 37 | - `GPT-2` 38 | - `GPT-3` 39 | - `CosmoFlow` 40 | - `DLRM` 41 | 42 | These proxies cover: 43 | - *Data parallelism*: the same NN is replicated across multiple processors, but each copy processes a different subset of the data 44 | - *Operator parallelism*: splitting different operations (i.e. layers) of a NN across multiple processors 45 | - *Pipeline parallelism*: different stages of a NN are processed on different processors, in a pipelined fashion 46 | - *Hybrid parallelism*: combines two or more of the above types of parallelism, i.e. different parts of the NN are processed in parallel across different processors AND the data is also split across processors 47 | 48 | ### Benchmarking GPU interconnect performance • NCCL/MPI 49 | 50 | - **MPI for distributed training**: managing communication between nodes in a distributed system, enabling efficient data parallelism and model parallelism strategies 51 | - **NCCL for optimized GPU communication**: common communication operations such as `all-reduce` performed on NVIDIA GPUs 52 | 53 | 54 | ### Scaling techniques for model parallelism 55 | 56 | - **Essential for large model** training, i.e. models that don't even fit into the memory of a single GPU 57 | - **The GPT-3 example** shows a hybrid approach to model and data parallelism. It scales out training of extremely large models (GPT-3 has over 150 billion parameters) across multiple GPUs and nodes 58 | 59 | 60 | ### Optimizing CNNs 61 | 62 | - **The CosmoFlow example** illustrates distributed training of a CNN, leveraging GPU acceleration for performance gains. 
63 | 64 | 65 | 66 | ## 💻 How to build 67 | 68 | Compile via: 69 | 70 | `mpicxx communications/gpt-2.cpp -o gpt-2` 71 | 72 | Then run: 73 | 74 | `mpirun -n 32 ./gpt-2` 75 | 76 | Set the total num of **Transformer layers** AND total num of **pipeline stages**: 77 | 78 | `mpirun -n 32 ./gpt-2 64 8` 79 | 80 | 81 | 82 | ## 🔧Tools Used 83 | 84 | C++ 88 | MPI 92 | NCCL 96 | pyTorch 100 | 101 | ## 👤Contact 102 | 103 | 104 | [![Email][email]][email-url] 105 | [![Twitter][twitter]][twitter-url] 106 | 107 | 108 | 109 | 110 | [email]: https://img.shields.io/badge/me@vd7.io-FFCA28?style=for-the-badge&logo=Gmail&logoColor=00bbff&color=black 111 | [email-url]: # 112 | [github]: https://img.shields.io/badge/Github-2496ED?style=for-the-badge&logo=github&logoColor=white&color=black 113 | [github-url]: https://github.com/vdutts7/dnn-distributed 114 | [twitter]: https://img.shields.io/badge/Twitter-FFCA28?style=for-the-badge&logo=Twitter&logoColor=00bbff&color=black 115 | [twitter-url]: https://twitter.com/vdutts7/ -------------------------------------------------------------------------------- /communications/gpt-2.cpp: -------------------------------------------------------------------------------- 1 | // C++/MPI proxy • GPT2-large model 2 | // Distributed training (hybrid pipeline x data parallelism) 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #define RUN_COUNT 256 14 | #define WARM_UP_ITERATIONS 10 15 | 16 | //p2p msg size for GPT-2 with micro-batch size=1 and seq_length=632 17 | #define P2P_MESSAGE_SIZE 808960 18 | 19 | #define BEGINNING_SIZE 85317120 20 | #define INTERMEDIATE_SIZE 19677440 21 | #define ENDING_SIZE 84008960 22 | 23 | #define MESSAGE_AGGREGATION 1 24 | 25 | #ifdef MESSAGE_AGGREGATION 26 | //message aggregation 27 | #define BEGINNING_NUM 1 28 | #define INTERMEDIATE_NUM 1 29 | #define ENDING_NUM 1 30 | int first_layer_grad_sizes[BEGINNING_NUM] = {BEGINNING_SIZE}; 31 | int 
intermediate_layer_grad_sizes[INTERMEDIATE_NUM] = {INTERMEDIATE_SIZE}; 32 | int end_layer_grad_sizes[ENDING_NUM] = {ENDING_SIZE}; 33 | 34 | #else 35 | #define BEGINNING_NUM 14 36 | #define INTERMEDIATE_NUM 12 37 | #define ENDING_NUM 15 38 | //sizes for the gradients per layer of gpt-2 39 | int first_layer_grad_sizes[BEGINNING_NUM] = {64328960, 1310720, 1280, 4915200, 1638400, 1280, 6553600, 6553600, 1280, 3840, 1280, 1280, 5120, 1280}; 40 | int intermediate_layer_grad_sizes[INTERMEDIATE_NUM] = {1280, 4915200, 1638400, 1280, 6553600, 6553600, 1280, 3840, 1280, 1280, 5120, 1280}; 41 | int end_layer_grad_sizes[ENDING_NUM] = {1280, 4915200, 1638400, 1280, 6553600, 6553600, 1280, 64328960, 1280, 3840, 1280, 1280, 5120, 1280, 1280}; 42 | 43 | #endif 44 | 45 | int run_gpt2_training(int grad_accumulation_steps, int stage_number, int num_grad_per_stage, 46 | int total_stages, int allreduce_group_size, 47 | float **start_stage_grad_ptrs, 48 | float **sum_start_stage_grad_ptrs, 49 | float **finish_stage_grad_ptrs, 50 | float **sum_finish_stage_grad_ptrs, 51 | float **intermediate_stage_grad_ptrs, 52 | float **sum_intermediate_stage_grad_ptrs, 53 | int *stage_grad_sizes, 54 | MPI_Comm p2p_comm, MPI_Comm allreduce_comm){ 55 | 56 | float *send_buffer = (float *)calloc(P2P_MESSAGE_SIZE, sizeof(float)); 57 | float *recv_buffer = (float *)calloc(P2P_MESSAGE_SIZE, sizeof(float)); 58 | 59 | //p2p forward 60 | for(int i=0; i 1){ 101 | if(stage_number == 0){ 102 | for(int i=0; i 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define WARM_UP 8 13 | #define RUNS 128 14 | 15 | #define NUM_LAYERS 8 16 | 17 | 18 | int fwd_rt_per_layer[NUM_LAYERS] = {6567, 13135, 6567, 3283, 1641, 5, 3, 1}; 19 | int bwd_rt_per_layer[NUM_LAYERS] = {2, 6, 10, 3283, 6567, 13135, 26270, 13135}; 20 | 21 | #define NUM_CONV_LAYERS 5 22 | 23 | // 2x2 2D spatial decomposition for 3D tensors. Each worker has two neighbors in 2D decomposition! 
24 | 25 | // Conv layer halo exchange message sizes in forward 26 | int conv_fwd_halo_sizes[NUM_CONV_LAYERS-1] = {2097152, 1048576, 524288, 262144}; 27 | 28 | // Conv layer halo exchange message sizes in backward 29 | int conv_bwd_halo_sizes[NUM_CONV_LAYERS-1] = {131072, 262144, 524288, 1048576}; 30 | 31 | #define NUM_DENSE_LAYERS 3 32 | 33 | // Dense layer allgather msg sizes in forward 34 | int dense_fwd_allgather_sizes[NUM_DENSE_LAYERS] = {65536, 256, 128}; 35 | 36 | // Dense layer reduce_scatter msg sizes in backward 37 | int dense_bwd_reduce_scatter_sizes[NUM_DENSE_LAYERS] = {128, 256, 65536}; 38 | 39 | // Allreduce sizes for gradients with message aggregation 40 | // Aggregate all dense layers: Dense2-0 Conv4 Conv3 Conv2 Conv1 Conv0 41 | int allreduce_sizes[NUM_LAYERS-2] = {1050737, 3539456, 884992, 221312, 55360, 3488}; 42 | 43 | 44 | 45 | int run_parallel_model(float** fwd_send_buff0_ptrs, 46 | float** fwd_send_buff1_ptrs, 47 | float** fwd_recv_buff0_ptrs, 48 | float** fwd_recv_buff1_ptrs, 49 | float** bwd_send_buff0_ptrs, 50 | float** bwd_send_buff1_ptrs, 51 | float** bwd_recv_buff0_ptrs, 52 | float** bwd_recv_buff1_ptrs, 53 | float** dense_fwd_allgather_sbuff_ptrs, 54 | float** dense_fwd_allgather_rbuff_ptrs, 55 | float** dense_bwd_rs_sbuff_ptrs, 56 | float** dense_bwd_rs_rbuff_ptrs, 57 | float** grad_ptrs, 58 | float** sum_grad_ptrs, 59 | MPI_Comm model_comm, 60 | MPI_Comm dense_comm){ 61 | 62 | // forward (fwd) 63 | int model_group_rank; 64 | MPI_Comm_rank(model_comm, &model_group_rank); 65 | for(int i=0; i=1 && i=NUM_CONV_LAYERS){ // All gather for dense layers 76 | int msg_idx = i-NUM_CONV_LAYERS; 77 | MPI_Allgather(dense_fwd_allgather_sbuff_ptrs[msg_idx], dense_fwd_allgather_sizes[msg_idx], MPI_FLOAT, dense_fwd_allgather_rbuff_ptrs[msg_idx], dense_fwd_allgather_sizes[msg_idx], MPI_FLOAT, model_comm); 78 | } 79 | 80 | usleep(fwd_rt_per_layer[i]); // Compute 81 | } 82 | 83 | // backward (bwd) 84 | MPI_Request grad_allreduce_reqs[NUM_CONV_LAYERS+1]; 85 
| for(int i=0; i NUM_DENSE_L) 91 | MPI_Testany(NUM_CONV_LAYERS+1, grad_allreduce_reqs, &index, &flag, MPI_STATUSES_IGNORE); // Advance MPI in the background 92 | 93 | usleep(bwd_rt_per_layer[i]); // Compute 94 | 95 | if(i < NUM_DENSE_L){ // Dense layers 96 | MPI_Reduce_scatter_block(dense_bwd_rs_sbuff_ptrs[i], dense_bwd_rs_rbuff_ptrs[i], dense_bwd_reduce_scatter_sizes[i], MPI_FLOAT, MPI_SUM, model_comm); 97 | } 98 | else if(i < NUM_LAYERS-1){ // Conv layers 99 | int msg_idx = i-NUM_DENSE_L; 100 | MPI_Request requests[4]; 101 | MPI_Isend(bwd_send_buff0_ptrs[msg_idx], conv_bwd_halo_sizes[msg_idx], MPI_FLOAT, model_group_rank^1, i, model_comm, &requests[0]); 102 | MPI_Isend(bwd_send_buff1_ptrs[msg_idx], conv_bwd_halo_sizes[msg_idx], MPI_FLOAT, model_group_rank^2, i, model_comm, &requests[1]); 103 | MPI_Irecv(bwd_recv_buff0_ptrs[msg_idx], conv_bwd_halo_sizes[msg_idx], MPI_FLOAT, model_group_rank^1, i, model_comm, &requests[2]); 104 | MPI_Irecv(bwd_recv_buff1_ptrs[msg_idx], conv_bwd_halo_sizes[msg_idx], MPI_FLOAT, model_group_rank^2, i, model_comm, &requests[3]); 105 | MPI_Waitall(4, requests, MPI_STATUSES_IGNORE); 106 | } 107 | 108 | if(i == NUM_DENSE_L-1){ 109 | MPI_Iallreduce(grad_ptrs[0], sum_grad_ptrs[0], allreduce_sizes[0], MPI_FLOAT, MPI_SUM, dense_comm, &grad_allreduce_reqs[0]); 110 | } 111 | else if(i > NUM_DENSE_L-1){ 112 | MPI_Iallreduce(grad_ptrs[i-NUM_DENSE_L+1], sum_grad_ptrs[i-NUM_DENSE_L+1], allreduce_sizes[i-NUM_DENSE_L+1], MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD, &grad_allreduce_reqs[i-NUM_DENSE_L+1]); 113 | } 114 | } 115 | 116 | MPI_Waitall(NUM_CONV_LAYERS+1, grad_allreduce_reqs, MPI_STATUSES_IGNORE); 117 | return 0; 118 | } 119 | 120 | int main(int argc, char *argv[]){ 121 | int rank, world_size; 122 | 123 | int model_shards = 4; // Do not change this 124 | 125 | MPI_Init(&argc,&argv); 126 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 127 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 128 | 129 | int dense_comm_rank, model_group_rank; 130 | int dense_comm_size, 
model_group_size; 131 | 132 | // The number of processes should be a multiple of model_shards = 4 133 | assert(world_size % model_shards == 0); 134 | int dense_comm_color = rank % model_shards; 135 | 136 | MPI_Comm dense_comm; 137 | MPI_Comm_split(MPI_COMM_WORLD, dense_comm_color, rank, &dense_comm); 138 | 139 | MPI_Comm_rank(dense_comm, &dense_comm_rank); 140 | MPI_Comm_size(dense_comm, &dense_comm_size); 141 | 142 | MPI_Comm model_comm; 143 | MPI_Comm_split(MPI_COMM_WORLD, dense_comm_rank, rank, &model_comm); 144 | MPI_Comm_rank(model_comm, &model_group_rank); 145 | MPI_Comm_size(model_comm, &model_group_size); 146 | 147 | assert(dense_comm_color == model_group_rank); 148 | assert(model_shards == model_group_size); 149 | 150 | float* fwd_send_buff0_ptrs[NUM_CONV_LAYERS-1]; 151 | float* fwd_send_buff1_ptrs[NUM_CONV_LAYERS-1]; 152 | float* fwd_recv_buff0_ptrs[NUM_CONV_LAYERS-1]; 153 | float* fwd_recv_buff1_ptrs[NUM_CONV_LAYERS-1]; 154 | 155 | float* bwd_send_buff0_ptrs[NUM_CONV_LAYERS-1]; 156 | float* bwd_send_buff1_ptrs[NUM_CONV_LAYERS-1]; 157 | float* bwd_recv_buff0_ptrs[NUM_CONV_LAYERS-1]; 158 | float* bwd_recv_buff1_ptrs[NUM_CONV_LAYERS-1]; 159 | for(int i=0; i 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define NUM_RUNS 1 24 | #define WARMUP_ITERATIONS 0 25 | 26 | #define MLP_BOTTOM_SIZE 49536 27 | #define MLP_TOP_SIZE 728065 28 | #define ALL2ALL_EMB_SIZE 262144 29 | 30 | #define FORWARD_BOTTOM_MLP 341 31 | #define FORWARD_TOP_MLP 455 32 | #define FORWARD_INTER 209 33 | #define FORWARD_EMB 95 34 | 35 | void run_custom_dlrm(int num_procs, 36 | float *top_gradient, 37 | float *sum_top_gradient, 38 | float *bottom_gradient, 39 | float *sum_bottom_gradient, 40 | float *fwd_alltoall_send, 41 | float *fwd_alltoall_recv, 42 | float *bwd_alltoall_send, 43 | float *bwd_alltoall_recv) { 44 | 45 | MPI_Request gradient_allreduce_requests[2]; 46 | usleep(FORWARD_EMB); // Forward pass 47 | MPI_Alltoall(fwd_alltoall_send, 
ALL2ALL_EMB_SIZE/num_procs, MPI_FLOAT, fwd_alltoall_recv, ALL2ALL_EMB_SIZE/num_procs, MPI_FLOAT, MPI_COMM_WORLD); 48 | 49 | usleep(FORWARD_BOTTOM_MLP); // Forward pass 50 | usleep(FORWARD_INTER); // Forward pass 51 | 52 | usleep(FORWARD_TOP_MLP); // Forward pass 53 | 54 | usleep(FORWARD_TOP_MLP * 2); // Backward pass 55 | MPI_Iallreduce(top_gradient, sum_top_gradient, MLP_TOP_SIZE, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD, &gradient_allreduce_requests[0]); 56 | 57 | usleep(FORWARD_INTER); // Backward pass 58 | usleep(FORWARD_BOTTOM_MLP * 2); // Backward pass 59 | MPI_Iallreduce(bottom_gradient, sum_bottom_gradient, MLP_BOTTOM_SIZE, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD, &gradient_allreduce_requests[1]); 60 | 61 | MPI_Alltoall(bwd_alltoall_send, ALL2ALL_EMB_SIZE/num_procs, MPI_FLOAT, bwd_alltoall_recv, ALL2ALL_EMB_SIZE/num_procs, MPI_FLOAT, MPI_COMM_WORLD); 62 | usleep(FORWARD_EMB * 2); // Backward pass 63 | 64 | MPI_Waitall(2, gradient_allreduce_requests, MPI_STATUSES_IGNORE); 65 | } 66 | 67 | int main(int argc, char *argv[]) { 68 | int process_rank, total_processes; 69 | double start_time, elapsed_time; 70 | 71 | MPI_Init(&argc, &argv); 72 | MPI_Comm_size(MPI_COMM_WORLD, &total_processes); 73 | MPI_Comm_rank(MPI_COMM_WORLD, &process_rank); 74 | 75 | float *top_gradient = (float *)calloc(MLP_TOP_SIZE, sizeof(float)); 76 | float *sum_top_gradient = (float *)calloc(MLP_TOP_SIZE, sizeof(float)); 77 | float *bottom_gradient = (float *)calloc(MLP_BOTTOM_SIZE, sizeof(float)); 78 | float *sum_bottom_gradient = (float *)calloc(MLP_BOTTOM_SIZE, sizeof(float)); 79 | 80 | float *fwd_alltoall_send = (float *)calloc(ALL2ALL_EMB_SIZE, sizeof(float)); 81 | float *fwd_alltoall_recv = (float *)calloc(ALL2ALL_EMB_SIZE, sizeof(float)); 82 | float *bwd_alltoall_send = (float *)calloc(ALL2ALL_EMB_SIZE, sizeof(float)); 83 | float *bwd_alltoall_recv = (float *)calloc(ALL2ALL_EMB_SIZE, sizeof(float)); 84 | 85 | MPI_Barrier(MPI_COMM_WORLD); 86 | 87 | // Warm-up 88 | for(int warmup_iter = 0; 
warmup_iter < WARMUP_ITERATIONS; warmup_iter++) { 89 | run_custom_dlrm(total_processes, 90 | top_gradient, 91 | sum_top_gradient, 92 | bottom_gradient, 93 | sum_bottom_gradient, 94 | fwd_alltoall_send, 95 | fwd_alltoall_recv, 96 | bwd_alltoall_send, 97 | bwd_alltoall_recv); 98 | } 99 | 100 | start_time = MPI_Wtime(); 101 | for(int iteration = 0; iteration < NUM_RUNS; iteration++) { 102 | run_custom_dlrm(total_processes, 103 | top_gradient, 104 | sum_top_gradient, 105 | bottom_gradient, 106 | sum_bottom_gradient, 107 | fwd_alltoall_send, 108 | fwd_alltoall_recv, 109 | bwd_alltoall_send, 110 | bwd_alltoall_recv); 111 | } 112 | elapsed_time = (MPI_Wtime() - start_time) / NUM_RUNS; 113 | 114 | if (process_rank == 0) 115 | printf("Performance Metrics: Rank = %d, Total Processes = %d, Global Batch Size = %d, DLRM Runtime per Iteration = %f seconds\n", process_rank, total_processes, 2048, elapsed_time); 116 | 117 | MPI_Finalize(); 118 | } -------------------------------------------------------------------------------- /gpt-3.cpp: -------------------------------------------------------------------------------- 1 | // C++/MPI proxy • 175B parameter GPT-3 model 2 | // Distributed training (hybrid of model x data parallelism) 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #define MODEL_PARALLEL_SIZE 96 13 | #define DATA_PARALLEL_SIZE 4 14 | #define P2P_BUFFER_SIZE 25165824 15 | #define FORWARD_COMPUTE_TIME 15915 16 | #define BACKWARD_COMPUTE_TIME 31830 17 | 18 | #define RUNS 128 19 | #define WARM_UP 8 20 | #define NUM_L 96 21 | #define ACC_STEP_SCALE 2 22 | #define MODEL_SHARDS 4 23 | 24 | // Function declarations 25 | void run_forward_pass(int steps_for_accumulation, int stage_index, int total_pipeline_stages, 26 | float *send_buffer_fwd, float *recv_buffer_fwd, 27 | float **buffers_fwd_mp, float **buffers_fwd_mp_reduced, 28 | MPI_Comm comm_pp, MPI_Comm comm_mp); 29 | 30 | void run_backward_pass(int 
steps_for_accumulation, int stage_index, int total_pipeline_stages, 31 | float *send_buffer_bwd, float *recv_buffer_bwd, 32 | float **buffers_bwd_mp, float **buffers_bwd_mp_reduced, 33 | MPI_Comm comm_pp); 34 | 35 | void aggregate_gradients(float *grad_buffer, float *aggregated_grad_buffer, 36 | MPI_Comm comm_dp); 37 | 38 | int main() { 39 | // Define message sizes and runtime constants 40 | #define MP_ALLREDUCE_SIZE 25165824 41 | #define MOE_ALL2ALL_SIZE 25165824 42 | #define DP_ALLREDUCE_SIZE 452984832 43 | #define FWD_RT 15915 44 | #define BWD_RT 31830 45 | #define BWD_RT_GPIPE 47745 46 | 47 | // Define MPI communicators 48 | MPI_Comm comm_dp, comm_mp, comm_pp; 49 | // Initialize MPI communicators 50 | 51 | // Allocate buffers and arrays 52 | float grad_buffer[DP_ALLREDUCE_SIZE]; 53 | float aggregated_grad_buffer[DATA_PARALLEL_SIZE]; 54 | float send_buffer_fwd[P2P_BUFFER_SIZE], recv_buffer_fwd[P2P_BUFFER_SIZE]; 55 | float send_buffer_bwd[P2P_BUFFER_SIZE], recv_buffer_bwd[P2P_BUFFER_SIZE]; 56 | float *buffers_fwd_mp[2], *buffers_fwd_mp_reduced[2]; 57 | float *buffers_bwd_mp[2], *buffers_bwd_mp_reduced[2]; 58 | 59 | for (int i = 0; i < 2; i++) { 60 | buffers_fwd_mp[i] = new float[MODEL_PARALLEL_SIZE]; 61 | buffers_fwd_mp_reduced[i] = new float[MODEL_PARALLEL_SIZE]; 62 | buffers_bwd_mp[i] = new float[MODEL_PARALLEL_SIZE]; 63 | buffers_bwd_mp_reduced[i] = new float[MODEL_PARALLEL_SIZE]; 64 | } 65 | 66 | // Run the pipeline stage 67 | int steps_for_accumulation = 10; 68 | int stage_index = 2; 69 | int total_pipeline_stages = 4; 70 | 71 | run_forward_pass(steps_for_accumulation, stage_index, total_pipeline_stages, 72 | send_buffer_fwd, recv_buffer_fwd, 73 | buffers_fwd_mp, buffers_fwd_mp_reduced, 74 | comm_pp, comm_mp); 75 | 76 | run_backward_pass(steps_for_accumulation, stage_index, total_pipeline_stages, 77 | send_buffer_bwd, recv_buffer_bwd, 78 | buffers_bwd_mp, buffers_bwd_mp_reduced, 79 | comm_pp); 80 | 81 | aggregate_gradients(grad_buffer, 
aggregated_grad_buffer, comm_dp); 82 | 83 | // Deallocate buffers 84 | for (int i = 0; i < 2; i++) { 85 | delete[] buffers_fwd_mp[i]; 86 | delete[] buffers_fwd_mp_reduced[i]; 87 | delete[] buffers_bwd_mp[i]; 88 | delete[] buffers_bwd_mp_reduced[i]; 89 | } 90 | 91 | return 0; 92 | } 93 | 94 | void run_forward_pass(int steps_for_accumulation, int stage_index, int total_pipeline_stages, 95 | float *send_buffer_fwd, float *recv_buffer_fwd, 96 | float **buffers_fwd_mp, float **buffers_fwd_mp_reduced, 97 | MPI_Comm comm_pp, MPI_Comm comm_mp) { 98 | 99 | MPI_Request reqs_fwd[2]; 100 | 101 | for (int i = 0; i < 2; i++) { 102 | reqs_fwd[i] = MPI_REQUEST_NULL; 103 | } 104 | 105 | for (int step = 0; step < steps_for_accumulation; step++) { 106 | if (stage_index == 0) { 107 | MPI_Wait(&reqs_fwd[0], MPI_STATUS_IGNORE); 108 | usleep(FORWARD_COMPUTE_TIME); // Emulate computation time 109 | MPI_Isend(send_buffer_fwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index + 1, step, comm_pp, &reqs_fwd[0]); 110 | } else if (stage_index == total_pipeline_stages - 1) { 111 | MPI_Irecv(recv_buffer_fwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index - 1, step, comm_pp, &reqs_fwd[1]); 112 | MPI_Wait(&reqs_fwd[1], MPI_STATUS_IGNORE); 113 | usleep(FORWARD_COMPUTE_TIME); // Emulate computation time 114 | } else { 115 | MPI_Irecv(recv_buffer_fwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index - 1, step, comm_pp, &reqs_fwd[1]); 116 | MPI_Wait(&reqs_fwd[1], MPI_STATUS_IGNORE); 117 | usleep(FORWARD_COMPUTE_TIME); // Emulate computation time 118 | MPI_Isend(send_buffer_fwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index + 1, step, comm_pp, &reqs_fwd[0]); 119 | } 120 | 121 | for (int j = 0; j < 2; j++) { 122 | MPI_Allreduce(buffers_fwd_mp[j], buffers_fwd_mp_reduced[j], MODEL_PARALLEL_SIZE, MPI_FLOAT, MPI_SUM, comm_mp); 123 | } 124 | } 125 | } 126 | 127 | void run_backward_pass(int steps_for_accumulation, int stage_index, int total_pipeline_stages, 128 | float *send_buffer_bwd, float *recv_buffer_bwd, 129 | float **buffers_bwd_mp, 
float **buffers_bwd_mp_reduced, 130 | MPI_Comm comm_pp) { 131 | 132 | MPI_Request reqs_bwd[2]; 133 | 134 | for (int i = 0; i < 2; i++) { 135 | reqs_bwd[i] = MPI_REQUEST_NULL; 136 | } 137 | 138 | for (int step = 0; step < steps_for_accumulation; step++) { 139 | if (stage_index == total_pipeline_stages - 1) { 140 | usleep(BACKWARD_COMPUTE_TIME); // Emulate computation time 141 | MPI_Isend(send_buffer_bwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index - 1, step, comm_pp, &reqs_bwd[0]); 142 | } else if (stage_index == 0) { 143 | MPI_Irecv(recv_buffer_bwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index + 1, step, comm_pp, &reqs_bwd[1]); 144 | MPI_Wait(&reqs_bwd[1], MPI_STATUS_IGNORE); 145 | usleep(BACKWARD_COMPUTE_TIME); // Emulate computation time 146 | } else { 147 | MPI_Irecv(recv_buffer_bwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index + 1, step, comm_pp, &reqs_bwd[1]); 148 | MPI_Wait(&reqs_bwd[1], MPI_STATUS_IGNORE); 149 | usleep(BACKWARD_COMPUTE_TIME); // Emulate computation time 150 | MPI_Isend(send_buffer_bwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index - 1, step, comm_pp, &reqs_bwd[0]); 151 | } 152 | } 153 | } 154 | 155 | void aggregate_gradients(float *grad_buffer, float *aggregated_grad_buffer, 156 | MPI_Comm comm_dp) { 157 | // Aggregate gradients across data parallel group 158 | MPI_Allreduce(grad_buffer, aggregated_grad_buffer, DATA_PARALLEL_SIZE, MPI_FLOAT, MPI_SUM, comm_dp); 159 | } --------------------------------------------------------------------------------