├── .gitignore
├── README.md
├── communications
│   └── gpt-2.cpp
├── cosmoflow.cpp
├── dlrm.cpp
└── gpt-3.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | secretz.sh
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |

5 |
6 |
7 | Distributed DNNs (Deep Neural Networks)
8 |
9 |
10 | C++/MPI proxies to perform distributed training of DNNs
11 |
12 |
13 |
14 | [![Github][github]][github-url]
15 |
16 |
17 |
18 |
19 |
20 |
21 | ## Table of Contents
22 |
23 |
24 | 📝 About
25 | 💻 How to build
26 | 🔧 Tools used
27 |
29 | 👤 Contact
30 |
31 |
32 |
33 |
34 | ## 📝About
35 |
36 | C++/MPI proxies to perform distributed training of DNNs (deep neural networks):
37 | - `GPT-2`
38 | - `GPT-3`
39 | - `CosmoFlow`
40 | - `DLRM`
41 |
42 | These proxies cover:
43 | - *Data parallelism*: same NN replicated across multiple processors, but each copy processes a different subset of the data
44 | - *Operator parallelism*: splitting different operations (i.e. layers) of a NN across multiple processors
45 | - *Pipeline parallelism*: different stages of a NN are processed on different processors, in a pipelined fashion
46 | - *Hybrid parallelism*: combines two or more of the above types of parallelism, e.g. different parts of the NN are processed in parallel across different processors while the data is also split across processors
47 |
48 | ### Benchmarking GPU interconnect performance • NCCL/MPI
49 |
50 | - **MPI for distributed training**: managing communication between nodes in a distributed system, enabling efficient data parallelism and model parallelism strategies
51 | - **NCCL for optimized GPU communication**: common communication operations such as `all-reduce` performed on NVIDIA GPUs
52 |
53 |
54 | ### Scaling techniques for model parallelism
55 |
56 | - **Essential for training large models**, i.e. ones that do not fit into the memory of a single GPU
57 | - **The GPT-3 example** shows a hybrid approach to model and data parallelism, scaling out training of extremely large models (GPT-3 has 175 billion parameters) across multiple GPUs and nodes
58 |
59 |
60 | ### Optimizing CNNs
61 |
62 | - **The CosmoFlow example** illustrates distributed training of a CNN, leveraging GPU acceleration for performance gains.
63 |
64 |
65 |
66 | ## 💻 How to build
67 |
68 | Compile via:
69 |
70 | `mpicxx communications/gpt-2.cpp -o gpt-2`
71 |
72 | Then run:
73 |
74 | `mpirun -n 32 ./gpt-2`
75 |
76 | Optionally pass the total number of **Transformer layers** and the total number of **pipeline stages** as command-line arguments:
77 |
78 | `mpirun -n 32 ./gpt-2 64 8`
79 |
80 |
81 |
82 | ## 🔧Tools Used
83 |
84 |
88 |
92 |
96 |
100 |
101 | ## 👤Contact
102 |
103 |
104 | [![Email][email]][email-url]
105 | [![Twitter][twitter]][twitter-url]
106 |
107 |
108 |
109 |
110 | [email]: https://img.shields.io/badge/me@vd7.io-FFCA28?style=for-the-badge&logo=Gmail&logoColor=00bbff&color=black
111 | [email-url]: #
112 | [github]: https://img.shields.io/badge/Github-2496ED?style=for-the-badge&logo=github&logoColor=white&color=black
113 | [github-url]: https://github.com/vdutts7/dnn-distributed
114 | [twitter]: https://img.shields.io/badge/Twitter-FFCA28?style=for-the-badge&logo=Twitter&logoColor=00bbff&color=black
115 | [twitter-url]: https://twitter.com/vdutts7/
--------------------------------------------------------------------------------
/communications/gpt-2.cpp:
--------------------------------------------------------------------------------
1 | // C++/MPI proxy • GPT2-large model
2 | // Distributed training (hybrid pipeline x data parallelism)
3 |
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #define RUN_COUNT 256
14 | #define WARM_UP_ITERATIONS 10
15 |
16 | //p2p msg size for GPT-2 with micro-batch size=1 and seq_length=632
17 | #define P2P_MESSAGE_SIZE 808960
18 |
19 | #define BEGINNING_SIZE 85317120
20 | #define INTERMEDIATE_SIZE 19677440
21 | #define ENDING_SIZE 84008960
22 |
23 | #define MESSAGE_AGGREGATION 1
24 |
25 | #ifdef MESSAGE_AGGREGATION
26 | //message aggregation
27 | #define BEGINNING_NUM 1
28 | #define INTERMEDIATE_NUM 1
29 | #define ENDING_NUM 1
30 | int first_layer_grad_sizes[BEGINNING_NUM] = {BEGINNING_SIZE};
31 | int intermediate_layer_grad_sizes[INTERMEDIATE_NUM] = {INTERMEDIATE_SIZE};
32 | int end_layer_grad_sizes[ENDING_NUM] = {ENDING_SIZE};
33 |
34 | #else
35 | #define BEGINNING_NUM 14
36 | #define INTERMEDIATE_NUM 12
37 | #define ENDING_NUM 15
38 | //sizes for the gradients per layer of gpt-2
39 | int first_layer_grad_sizes[BEGINNING_NUM] = {64328960, 1310720, 1280, 4915200, 1638400, 1280, 6553600, 6553600, 1280, 3840, 1280, 1280, 5120, 1280};
40 | int intermediate_layer_grad_sizes[INTERMEDIATE_NUM] = {1280, 4915200, 1638400, 1280, 6553600, 6553600, 1280, 3840, 1280, 1280, 5120, 1280};
41 | int end_layer_grad_sizes[ENDING_NUM] = {1280, 4915200, 1638400, 1280, 6553600, 6553600, 1280, 64328960, 1280, 3840, 1280, 1280, 5120, 1280, 1280};
42 |
43 | #endif
44 |
45 | int run_gpt2_training(int grad_accumulation_steps, int stage_number, int num_grad_per_stage,
46 | int total_stages, int allreduce_group_size,
47 | float **start_stage_grad_ptrs,
48 | float **sum_start_stage_grad_ptrs,
49 | float **finish_stage_grad_ptrs,
50 | float **sum_finish_stage_grad_ptrs,
51 | float **intermediate_stage_grad_ptrs,
52 | float **sum_intermediate_stage_grad_ptrs,
53 | int *stage_grad_sizes,
54 | MPI_Comm p2p_comm, MPI_Comm allreduce_comm){
55 |
56 | float *send_buffer = (float *)calloc(P2P_MESSAGE_SIZE, sizeof(float));
57 | float *recv_buffer = (float *)calloc(P2P_MESSAGE_SIZE, sizeof(float));
58 |
59 | //p2p forward
60 | for(int i=0; i 1){
101 | if(stage_number == 0){
102 | for(int i=0; i
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define WARM_UP 8
13 | #define RUNS 128
14 |
15 | #define NUM_LAYERS 8
16 |
17 |
18 | int fwd_rt_per_layer[NUM_LAYERS] = {6567, 13135, 6567, 3283, 1641, 5, 3, 1};
19 | int bwd_rt_per_layer[NUM_LAYERS] = {2, 6, 10, 3283, 6567, 13135, 26270, 13135};
20 |
21 | #define NUM_CONV_LAYERS 5
22 |
23 | // 2x2 2D spatial decomposition for 3D tensors. Each worker has two neighbors in 2D decomposition!
24 |
25 | // Conv layer halo exchange message sizes in forward
26 | int conv_fwd_halo_sizes[NUM_CONV_LAYERS-1] = {2097152, 1048576, 524288, 262144};
27 |
28 | // Conv layer halo exchange message sizes in backward
29 | int conv_bwd_halo_sizes[NUM_CONV_LAYERS-1] = {131072, 262144, 524288, 1048576};
30 |
31 | #define NUM_DENSE_LAYERS 3
32 |
33 | // Dense layer allgather msg sizes in forward
34 | int dense_fwd_allgather_sizes[NUM_DENSE_LAYERS] = {65536, 256, 128};
35 |
36 | // Dense layer reduce_scatter msg sizes in backward
37 | int dense_bwd_reduce_scatter_sizes[NUM_DENSE_LAYERS] = {128, 256, 65536};
38 |
39 | // Allreduce sizes for gradients with message aggregation
40 | // Aggregate all dense layers: Dense2-0 Conv4 Conv3 Conv2 Conv1 Conv0
41 | int allreduce_sizes[NUM_LAYERS-2] = {1050737, 3539456, 884992, 221312, 55360, 3488};
42 |
43 |
44 |
45 | int run_parallel_model(float** fwd_send_buff0_ptrs,
46 | float** fwd_send_buff1_ptrs,
47 | float** fwd_recv_buff0_ptrs,
48 | float** fwd_recv_buff1_ptrs,
49 | float** bwd_send_buff0_ptrs,
50 | float** bwd_send_buff1_ptrs,
51 | float** bwd_recv_buff0_ptrs,
52 | float** bwd_recv_buff1_ptrs,
53 | float** dense_fwd_allgather_sbuff_ptrs,
54 | float** dense_fwd_allgather_rbuff_ptrs,
55 | float** dense_bwd_rs_sbuff_ptrs,
56 | float** dense_bwd_rs_rbuff_ptrs,
57 | float** grad_ptrs,
58 | float** sum_grad_ptrs,
59 | MPI_Comm model_comm,
60 | MPI_Comm dense_comm){
61 |
62 | // forward (fwd)
63 | int model_group_rank;
64 | MPI_Comm_rank(model_comm, &model_group_rank);
65 | for(int i=0; i=1 && i=NUM_CONV_LAYERS){ // All gather for dense layers
76 | int msg_idx = i-NUM_CONV_LAYERS;
77 | MPI_Allgather(dense_fwd_allgather_sbuff_ptrs[msg_idx], dense_fwd_allgather_sizes[msg_idx], MPI_FLOAT, dense_fwd_allgather_rbuff_ptrs[msg_idx], dense_fwd_allgather_sizes[msg_idx], MPI_FLOAT, model_comm);
78 | }
79 |
80 | usleep(fwd_rt_per_layer[i]); // Compute
81 | }
82 |
83 | // backward (bwd)
84 | MPI_Request grad_allreduce_reqs[NUM_CONV_LAYERS+1];
85 | for(int i=0; i NUM_DENSE_L)
91 | MPI_Testany(NUM_CONV_LAYERS+1, grad_allreduce_reqs, &index, &flag, MPI_STATUSES_IGNORE); // Advance MPI in the background
92 |
93 | usleep(bwd_rt_per_layer[i]); // Compute
94 |
95 | if(i < NUM_DENSE_L){ // Dense layers
96 | MPI_Reduce_scatter_block(dense_bwd_rs_sbuff_ptrs[i], dense_bwd_rs_rbuff_ptrs[i], dense_bwd_reduce_scatter_sizes[i], MPI_FLOAT, MPI_SUM, model_comm);
97 | }
98 | else if(i < NUM_LAYERS-1){ // Conv layers
99 | int msg_idx = i-NUM_DENSE_L;
100 | MPI_Request requests[4];
101 | MPI_Isend(bwd_send_buff0_ptrs[msg_idx], conv_bwd_halo_sizes[msg_idx], MPI_FLOAT, model_group_rank^1, i, model_comm, &requests[0]);
102 | MPI_Isend(bwd_send_buff1_ptrs[msg_idx], conv_bwd_halo_sizes[msg_idx], MPI_FLOAT, model_group_rank^2, i, model_comm, &requests[1]);
103 | MPI_Irecv(bwd_recv_buff0_ptrs[msg_idx], conv_bwd_halo_sizes[msg_idx], MPI_FLOAT, model_group_rank^1, i, model_comm, &requests[2]);
104 | MPI_Irecv(bwd_recv_buff1_ptrs[msg_idx], conv_bwd_halo_sizes[msg_idx], MPI_FLOAT, model_group_rank^2, i, model_comm, &requests[3]);
105 | MPI_Waitall(4, requests, MPI_STATUSES_IGNORE);
106 | }
107 |
108 | if(i == NUM_DENSE_L-1){
109 | MPI_Iallreduce(grad_ptrs[0], sum_grad_ptrs[0], allreduce_sizes[0], MPI_FLOAT, MPI_SUM, dense_comm, &grad_allreduce_reqs[0]);
110 | }
111 | else if(i > NUM_DENSE_L-1){
112 | MPI_Iallreduce(grad_ptrs[i-NUM_DENSE_L+1], sum_grad_ptrs[i-NUM_DENSE_L+1], allreduce_sizes[i-NUM_DENSE_L+1], MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD, &grad_allreduce_reqs[i-NUM_DENSE_L+1]);
113 | }
114 | }
115 |
116 | MPI_Waitall(NUM_CONV_LAYERS+1, grad_allreduce_reqs, MPI_STATUSES_IGNORE);
117 | return 0;
118 | }
119 |
120 | int main(int argc, char *argv[]){
121 | int rank, world_size;
122 |
123 | int model_shards = 4; // Do not change this
124 |
125 | MPI_Init(&argc,&argv);
126 | MPI_Comm_size(MPI_COMM_WORLD, &world_size);
127 | MPI_Comm_rank(MPI_COMM_WORLD, &rank);
128 |
129 | int dense_comm_rank, model_group_rank;
130 | int dense_comm_size, model_group_size;
131 |
132 | // The number of processes should be a multiple of model_shards = 4
133 | assert(world_size % model_shards == 0);
134 | int dense_comm_color = rank % model_shards;
135 |
136 | MPI_Comm dense_comm;
137 | MPI_Comm_split(MPI_COMM_WORLD, dense_comm_color, rank, &dense_comm);
138 |
139 | MPI_Comm_rank(dense_comm, &dense_comm_rank);
140 | MPI_Comm_size(dense_comm, &dense_comm_size);
141 |
142 | MPI_Comm model_comm;
143 | MPI_Comm_split(MPI_COMM_WORLD, dense_comm_rank, rank, &model_comm);
144 | MPI_Comm_rank(model_comm, &model_group_rank);
145 | MPI_Comm_size(model_comm, &model_group_size);
146 |
147 | assert(dense_comm_color == model_group_rank);
148 | assert(model_shards == model_group_size);
149 |
150 | float* fwd_send_buff0_ptrs[NUM_CONV_LAYERS-1];
151 | float* fwd_send_buff1_ptrs[NUM_CONV_LAYERS-1];
152 | float* fwd_recv_buff0_ptrs[NUM_CONV_LAYERS-1];
153 | float* fwd_recv_buff1_ptrs[NUM_CONV_LAYERS-1];
154 |
155 | float* bwd_send_buff0_ptrs[NUM_CONV_LAYERS-1];
156 | float* bwd_send_buff1_ptrs[NUM_CONV_LAYERS-1];
157 | float* bwd_recv_buff0_ptrs[NUM_CONV_LAYERS-1];
158 | float* bwd_recv_buff1_ptrs[NUM_CONV_LAYERS-1];
159 | for(int i=0; i
16 | #include
17 | #include
18 | #include
19 | #include
20 | #include
21 | #include
22 |
23 | #define NUM_RUNS 1
24 | #define WARMUP_ITERATIONS 0
25 |
26 | #define MLP_BOTTOM_SIZE 49536
27 | #define MLP_TOP_SIZE 728065
28 | #define ALL2ALL_EMB_SIZE 262144
29 |
30 | #define FORWARD_BOTTOM_MLP 341
31 | #define FORWARD_TOP_MLP 455
32 | #define FORWARD_INTER 209
33 | #define FORWARD_EMB 95
34 |
35 | void run_custom_dlrm(int num_procs,
36 | float *top_gradient,
37 | float *sum_top_gradient,
38 | float *bottom_gradient,
39 | float *sum_bottom_gradient,
40 | float *fwd_alltoall_send,
41 | float *fwd_alltoall_recv,
42 | float *bwd_alltoall_send,
43 | float *bwd_alltoall_recv) {
44 |
45 | MPI_Request gradient_allreduce_requests[2];
46 | usleep(FORWARD_EMB); // Forward pass
47 | MPI_Alltoall(fwd_alltoall_send, ALL2ALL_EMB_SIZE/num_procs, MPI_FLOAT, fwd_alltoall_recv, ALL2ALL_EMB_SIZE/num_procs, MPI_FLOAT, MPI_COMM_WORLD);
48 |
49 | usleep(FORWARD_BOTTOM_MLP); // Forward pass
50 | usleep(FORWARD_INTER); // Forward pass
51 |
52 | usleep(FORWARD_TOP_MLP); // Forward pass
53 |
54 | usleep(FORWARD_TOP_MLP * 2); // Backward pass
55 | MPI_Iallreduce(top_gradient, sum_top_gradient, MLP_TOP_SIZE, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD, &gradient_allreduce_requests[0]);
56 |
57 | usleep(FORWARD_INTER); // Backward pass
58 | usleep(FORWARD_BOTTOM_MLP * 2); // Backward pass
59 | MPI_Iallreduce(bottom_gradient, sum_bottom_gradient, MLP_BOTTOM_SIZE, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD, &gradient_allreduce_requests[1]);
60 |
61 | MPI_Alltoall(bwd_alltoall_send, ALL2ALL_EMB_SIZE/num_procs, MPI_FLOAT, bwd_alltoall_recv, ALL2ALL_EMB_SIZE/num_procs, MPI_FLOAT, MPI_COMM_WORLD);
62 | usleep(FORWARD_EMB * 2); // Backward pass
63 |
64 | MPI_Waitall(2, gradient_allreduce_requests, MPI_STATUSES_IGNORE);
65 | }
66 |
67 | int main(int argc, char *argv[]) {
68 | int process_rank, total_processes;
69 | double start_time, elapsed_time;
70 |
71 | MPI_Init(&argc, &argv);
72 | MPI_Comm_size(MPI_COMM_WORLD, &total_processes);
73 | MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
74 |
75 | float *top_gradient = (float *)calloc(MLP_TOP_SIZE, sizeof(float));
76 | float *sum_top_gradient = (float *)calloc(MLP_TOP_SIZE, sizeof(float));
77 | float *bottom_gradient = (float *)calloc(MLP_BOTTOM_SIZE, sizeof(float));
78 | float *sum_bottom_gradient = (float *)calloc(MLP_BOTTOM_SIZE, sizeof(float));
79 |
80 | float *fwd_alltoall_send = (float *)calloc(ALL2ALL_EMB_SIZE, sizeof(float));
81 | float *fwd_alltoall_recv = (float *)calloc(ALL2ALL_EMB_SIZE, sizeof(float));
82 | float *bwd_alltoall_send = (float *)calloc(ALL2ALL_EMB_SIZE, sizeof(float));
83 | float *bwd_alltoall_recv = (float *)calloc(ALL2ALL_EMB_SIZE, sizeof(float));
84 |
85 | MPI_Barrier(MPI_COMM_WORLD);
86 |
87 | // Warm-up
88 | for(int warmup_iter = 0; warmup_iter < WARMUP_ITERATIONS; warmup_iter++) {
89 | run_custom_dlrm(total_processes,
90 | top_gradient,
91 | sum_top_gradient,
92 | bottom_gradient,
93 | sum_bottom_gradient,
94 | fwd_alltoall_send,
95 | fwd_alltoall_recv,
96 | bwd_alltoall_send,
97 | bwd_alltoall_recv);
98 | }
99 |
100 | start_time = MPI_Wtime();
101 | for(int iteration = 0; iteration < NUM_RUNS; iteration++) {
102 | run_custom_dlrm(total_processes,
103 | top_gradient,
104 | sum_top_gradient,
105 | bottom_gradient,
106 | sum_bottom_gradient,
107 | fwd_alltoall_send,
108 | fwd_alltoall_recv,
109 | bwd_alltoall_send,
110 | bwd_alltoall_recv);
111 | }
112 | elapsed_time = (MPI_Wtime() - start_time) / NUM_RUNS;
113 |
114 | if (process_rank == 0)
115 | printf("Performance Metrics: Rank = %d, Total Processes = %d, Global Batch Size = %d, DLRM Runtime per Iteration = %f seconds\n", process_rank, total_processes, 2048, elapsed_time);
116 |
117 | MPI_Finalize();
118 | }
--------------------------------------------------------------------------------
/gpt-3.cpp:
--------------------------------------------------------------------------------
1 | // C++/MPI proxy • 175B parameter GPT-3 model
2 | // Distributed training (hybrid of model x data parallelism)
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #define MODEL_PARALLEL_SIZE 96
13 | #define DATA_PARALLEL_SIZE 4
14 | #define P2P_BUFFER_SIZE 25165824
15 | #define FORWARD_COMPUTE_TIME 15915
16 | #define BACKWARD_COMPUTE_TIME 31830
17 |
18 | #define RUNS 128
19 | #define WARM_UP 8
20 | #define NUM_L 96
21 | #define ACC_STEP_SCALE 2
22 | #define MODEL_SHARDS 4
23 |
24 | // Function declarations
25 | void run_forward_pass(int steps_for_accumulation, int stage_index, int total_pipeline_stages,
26 | float *send_buffer_fwd, float *recv_buffer_fwd,
27 | float **buffers_fwd_mp, float **buffers_fwd_mp_reduced,
28 | MPI_Comm comm_pp, MPI_Comm comm_mp);
29 |
30 | void run_backward_pass(int steps_for_accumulation, int stage_index, int total_pipeline_stages,
31 | float *send_buffer_bwd, float *recv_buffer_bwd,
32 | float **buffers_bwd_mp, float **buffers_bwd_mp_reduced,
33 | MPI_Comm comm_pp);
34 |
35 | void aggregate_gradients(float *grad_buffer, float *aggregated_grad_buffer,
36 | MPI_Comm comm_dp);
37 |
38 | int main() {
39 | // Define message sizes and runtime constants
40 | #define MP_ALLREDUCE_SIZE 25165824
41 | #define MOE_ALL2ALL_SIZE 25165824
42 | #define DP_ALLREDUCE_SIZE 452984832
43 | #define FWD_RT 15915
44 | #define BWD_RT 31830
45 | #define BWD_RT_GPIPE 47745
46 |
47 | // Define MPI communicators
48 | MPI_Comm comm_dp, comm_mp, comm_pp;
49 | // Initialize MPI communicators
50 |
51 | // Allocate buffers and arrays
52 | float grad_buffer[DP_ALLREDUCE_SIZE];
53 | float aggregated_grad_buffer[DATA_PARALLEL_SIZE];
54 | float send_buffer_fwd[P2P_BUFFER_SIZE], recv_buffer_fwd[P2P_BUFFER_SIZE];
55 | float send_buffer_bwd[P2P_BUFFER_SIZE], recv_buffer_bwd[P2P_BUFFER_SIZE];
56 | float *buffers_fwd_mp[2], *buffers_fwd_mp_reduced[2];
57 | float *buffers_bwd_mp[2], *buffers_bwd_mp_reduced[2];
58 |
59 | for (int i = 0; i < 2; i++) {
60 | buffers_fwd_mp[i] = new float[MODEL_PARALLEL_SIZE];
61 | buffers_fwd_mp_reduced[i] = new float[MODEL_PARALLEL_SIZE];
62 | buffers_bwd_mp[i] = new float[MODEL_PARALLEL_SIZE];
63 | buffers_bwd_mp_reduced[i] = new float[MODEL_PARALLEL_SIZE];
64 | }
65 |
66 | // Run the pipeline stage
67 | int steps_for_accumulation = 10;
68 | int stage_index = 2;
69 | int total_pipeline_stages = 4;
70 |
71 | run_forward_pass(steps_for_accumulation, stage_index, total_pipeline_stages,
72 | send_buffer_fwd, recv_buffer_fwd,
73 | buffers_fwd_mp, buffers_fwd_mp_reduced,
74 | comm_pp, comm_mp);
75 |
76 | run_backward_pass(steps_for_accumulation, stage_index, total_pipeline_stages,
77 | send_buffer_bwd, recv_buffer_bwd,
78 | buffers_bwd_mp, buffers_bwd_mp_reduced,
79 | comm_pp);
80 |
81 | aggregate_gradients(grad_buffer, aggregated_grad_buffer, comm_dp);
82 |
83 | // Deallocate buffers
84 | for (int i = 0; i < 2; i++) {
85 | delete[] buffers_fwd_mp[i];
86 | delete[] buffers_fwd_mp_reduced[i];
87 | delete[] buffers_bwd_mp[i];
88 | delete[] buffers_bwd_mp_reduced[i];
89 | }
90 |
91 | return 0;
92 | }
93 |
94 | void run_forward_pass(int steps_for_accumulation, int stage_index, int total_pipeline_stages,
95 | float *send_buffer_fwd, float *recv_buffer_fwd,
96 | float **buffers_fwd_mp, float **buffers_fwd_mp_reduced,
97 | MPI_Comm comm_pp, MPI_Comm comm_mp) {
98 |
99 | MPI_Request reqs_fwd[2];
100 |
101 | for (int i = 0; i < 2; i++) {
102 | reqs_fwd[i] = MPI_REQUEST_NULL;
103 | }
104 |
105 | for (int step = 0; step < steps_for_accumulation; step++) {
106 | if (stage_index == 0) {
107 | MPI_Wait(&reqs_fwd[0], MPI_STATUS_IGNORE);
108 | usleep(FORWARD_COMPUTE_TIME); // Emulate computation time
109 | MPI_Isend(send_buffer_fwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index + 1, step, comm_pp, &reqs_fwd[0]);
110 | } else if (stage_index == total_pipeline_stages - 1) {
111 | MPI_Irecv(recv_buffer_fwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index - 1, step, comm_pp, &reqs_fwd[1]);
112 | MPI_Wait(&reqs_fwd[1], MPI_STATUS_IGNORE);
113 | usleep(FORWARD_COMPUTE_TIME); // Emulate computation time
114 | } else {
115 | MPI_Irecv(recv_buffer_fwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index - 1, step, comm_pp, &reqs_fwd[1]);
116 | MPI_Wait(&reqs_fwd[1], MPI_STATUS_IGNORE);
117 | usleep(FORWARD_COMPUTE_TIME); // Emulate computation time
118 | MPI_Isend(send_buffer_fwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index + 1, step, comm_pp, &reqs_fwd[0]);
119 | }
120 |
121 | for (int j = 0; j < 2; j++) {
122 | MPI_Allreduce(buffers_fwd_mp[j], buffers_fwd_mp_reduced[j], MODEL_PARALLEL_SIZE, MPI_FLOAT, MPI_SUM, comm_mp);
123 | }
124 | }
125 | }
126 |
127 | void run_backward_pass(int steps_for_accumulation, int stage_index, int total_pipeline_stages,
128 | float *send_buffer_bwd, float *recv_buffer_bwd,
129 | float **buffers_bwd_mp, float **buffers_bwd_mp_reduced,
130 | MPI_Comm comm_pp) {
131 |
132 | MPI_Request reqs_bwd[2];
133 |
134 | for (int i = 0; i < 2; i++) {
135 | reqs_bwd[i] = MPI_REQUEST_NULL;
136 | }
137 |
138 | for (int step = 0; step < steps_for_accumulation; step++) {
139 | if (stage_index == total_pipeline_stages - 1) {
140 | usleep(BACKWARD_COMPUTE_TIME); // Emulate computation time
141 | MPI_Isend(send_buffer_bwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index - 1, step, comm_pp, &reqs_bwd[0]);
142 | } else if (stage_index == 0) {
143 | MPI_Irecv(recv_buffer_bwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index + 1, step, comm_pp, &reqs_bwd[1]);
144 | MPI_Wait(&reqs_bwd[1], MPI_STATUS_IGNORE);
145 | usleep(BACKWARD_COMPUTE_TIME); // Emulate computation time
146 | } else {
147 | MPI_Irecv(recv_buffer_bwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index + 1, step, comm_pp, &reqs_bwd[1]);
148 | MPI_Wait(&reqs_bwd[1], MPI_STATUS_IGNORE);
149 | usleep(BACKWARD_COMPUTE_TIME); // Emulate computation time
150 | MPI_Isend(send_buffer_bwd, P2P_BUFFER_SIZE, MPI_FLOAT, stage_index - 1, step, comm_pp, &reqs_bwd[0]);
151 | }
152 | }
153 | }
154 |
155 | void aggregate_gradients(float *grad_buffer, float *aggregated_grad_buffer,
156 | MPI_Comm comm_dp) {
157 | // Aggregate gradients across data parallel group
158 | MPI_Allreduce(grad_buffer, aggregated_grad_buffer, DATA_PARALLEL_SIZE, MPI_FLOAT, MPI_SUM, comm_dp);
159 | }
--------------------------------------------------------------------------------