├── .flake8 ├── .gitignore ├── README.md ├── autopipe ├── .gitignore ├── README.md ├── TechnicalOverview_old.md ├── __init__.py ├── analysis │ ├── __init__.py │ ├── analysis_utils.py │ ├── asgd_analysis.py │ ├── deprecated_theoretical.py │ ├── pipedream_complexity.py │ ├── pipeline_partition_analysis.py │ ├── profile_pipeline_stages.py │ ├── profile_replica.py │ └── ssgd_analysis.py ├── autopipe │ ├── __init__.py │ ├── api.py │ ├── cache_utils.py │ ├── compiler │ │ ├── __init__.py │ │ ├── compile_normal_model_function.py │ │ ├── compile_partitioned_model.py │ │ ├── create_pipeline_configuration.py │ │ ├── partition_forward_method.py │ │ ├── partition_init_method.py │ │ ├── state_methods.py │ │ └── utils.py │ ├── model_partitioning │ │ ├── __init__.py │ │ ├── acyclic │ │ │ ├── __init__.py │ │ │ ├── acyclic_partitioning.py │ │ │ ├── data_structures.py │ │ │ └── gpa.py │ │ ├── async_pipeline.py │ │ ├── heuristics.py │ │ ├── metis │ │ │ ├── __init__.py │ │ │ ├── metis_partitioning.py │ │ │ └── post_process.py │ │ ├── mixed_pipe │ │ │ ├── __init__.py │ │ │ ├── assignment.py │ │ │ ├── by_prefix.py │ │ │ ├── centers.py │ │ │ ├── check_cycles.py │ │ │ ├── coarsening.py │ │ │ ├── detect_p_rep.py │ │ │ ├── heap_dict.py │ │ │ ├── partition_mixed_pipe_v1_clusters.py │ │ │ ├── partition_mixed_pipe_v2.py │ │ │ ├── post_process.py │ │ │ ├── refine.py │ │ │ ├── systematic_block_ratio_creation.py │ │ │ └── test_coarsen_prefix.py │ │ ├── pipedream │ │ │ ├── __init__.py │ │ │ ├── pipedream_partition_no_hir.py │ │ │ └── pipedream_partition_no_hir_fixed.py │ │ ├── stage_to_device.py │ │ └── utils.py │ ├── model_profiling │ │ ├── __init__.py │ │ ├── control_flow_graph.py │ │ ├── graph_executor.py │ │ ├── infer_is_contiguous.py │ │ ├── infer_req_grad.py │ │ ├── network_profiler.py │ │ ├── profiler.py │ │ └── tracer.py │ ├── union_find.py │ └── utils.py ├── cmd_parser.py ├── download │ ├── download_cifar.py │ ├── download_glue.py │ ├── download_wikitext.py │ └── make_squad.sh ├── environment.yml ├── misc │ ├── display_partitioning_stats.py │ └── megatron_11b │ │ └── dict.txt ├── partition.py ├── partitioning_scripts │ ├── __init__.py │ └── partition_scripts_utils.py ├── py_sbatch.sh └── tasks │ ├── __init__.py │ ├── bert_squad.py │ ├── cep.py │ ├── dummy_t5.py │ ├── functional_model.py │ ├── glue.py │ ├── gpt2.py │ ├── megatron.py │ ├── new_t5.py │ ├── partitioning_task.py │ ├── t5.py │ ├── transformers_utils.py │ └── vision.py ├── docs ├── AcceleratingMixedPipeWithCudaMPS.md ├── MPI.md ├── MiscOptimizations.md ├── NewModels.md ├── PipeDebug.md ├── PipeList.md └── PitfallsKnownIssuesAndTODOs.md ├── models ├── __init__.py ├── new_t5_example │ ├── README.md │ ├── __init__.py │ ├── check_conversion.py │ ├── check_partitioned.py │ ├── convert_none.py │ ├── eval_new_t5.py │ └── modeling_t5.py ├── normal │ ├── NLP_models │ │ ├── __init__.py │ │ ├── modeling_bert.py │ │ ├── modeling_bert_4_1_converted.py │ │ ├── modeling_bert_old.py │ │ ├── modeling_ctrl.py │ │ ├── modeling_ctrl_tied_weights.py │ │ ├── modeling_gpt2.py │ │ ├── modeling_gpt2_tied_weights.py │ │ ├── modeling_roberta.py │ │ ├── modeling_t5.py │ │ ├── modeling_t5_tied_weights.py │ │ ├── modeling_utils_converted.py │ │ ├── stateless.py │ │ └── utils.py │ ├── __init__.py │ ├── cep.py │ ├── dummy.py │ ├── split_linear.py │ └── vision_models │ │ ├── AlexNet.py │ │ ├── DenseNet.py │ │ ├── GoogleNet.py │ │ ├── Inception.py │ │ ├── LeNet.py │ │ ├── ResNet.py │ │ ├── SqueezeNet.py │ │ ├── UNet.py │ │ ├── VGG.py │ │ ├── WideResNet.py │ │ ├── WideResNet_GN.py │ │ ├── 
__init__.py │ │ └── amoebaNet │ │ ├── __init__.py │ │ ├── genotype.py │ │ └── utils.py └── partitioned │ ├── __init__.py │ ├── bert_base_uncaseds_384_2p_bw12_async_pipedream.py │ ├── bert_base_uncaseds_384_2p_bw12_pipedream.py │ ├── bert_large_uncased_squad_8p.py │ ├── bert_large_uncased_whole_word_maskings_384_2p_bw12_async_pipedream.py │ ├── bert_large_uncased_whole_word_maskings_384_2p_bw12_pipedream.py │ ├── bert_large_uncased_whole_word_maskings_384_4p_bw12_async_pipedream.py │ ├── bert_large_uncased_whole_word_maskings_384_4p_bw12_pipedream.py │ ├── bert_large_uncased_whole_word_maskings_384_8p_bw12_async_pipedream.py │ ├── bert_large_uncased_whole_word_maskings_384_8p_bw12_pipedream.py │ ├── cep_netN50_C20000_4p_bw12_metis.py │ ├── layer_graph_t5_3b_tied_lmheads_320_8_8p_bw12_async_squad1_mpipe.py │ ├── layer_graph_t5_3b_tied_lmheads_320_8_8p_bw12_squad1_pipedream.py │ ├── layer_graph_t5_3b_tied_lmheads_512_4_8p_bw12_async_squad1_mpipe.py │ ├── layer_graph_t5_3b_tied_lmheads_512_4_8p_bw12_squad1_pipedream.py │ ├── layer_graph_t5_3b_tied_lmheads_64_4_8p_bw12_async_squad1_mpipe.py │ ├── layer_graph_t5_3b_tied_lmheads_64_4_8p_bw12_squad1_pipedream.py │ ├── old_gpt2xl_8p_untied.py │ ├── op_t5_3b_tied_lmheads_320_8_8p_bw12_async_squad1_mpipe.py │ ├── op_t5_3b_tied_lmheads_512_4_8p_bw12_async_squad1_mpipe.py │ ├── op_t5_3b_tied_lmheads_64_4_8p_bw12_async_squad1_mpipe.py │ ├── t5_3b_tied_lmheads_320_8_8p_bw12_async_squad1_mpipe.py │ ├── t5_3b_tied_lmheads_320_8_8p_bw12_squad1.py │ ├── t5_3b_tied_lmheads_320_8_8p_bw12_squad1_pipedream.py │ ├── t5_3b_tied_lmheads_320_8_8p_bw12_squad1_virtual_stages.py │ ├── t5_3b_tied_lmheads_512_4_8p_bw12_async_squad1_mpipe.py │ ├── t5_3b_tied_lmheads_512_4_8p_bw12_async_squad1_mpipe_L32.py │ ├── t5_3b_tied_lmheads_512_4_8p_bw12_squad1_acyclic.py │ ├── t5_3b_tied_lmheads_512_4_8p_bw12_squad1_pipedream.py │ ├── t5_3b_tied_lmheads_512_4_8p_bw12_squad1_virtual_stages.py │ ├── t5_3b_tied_lmheads_64_4_8p_bw12_async_squad1_mpipe.py │ ├── t5_3b_tied_lmheads_64_4_8p_bw12_squad1.py │ ├── t5_3b_tied_lmheads_64_4_8p_bw12_squad1_acyclic.py │ ├── t5_3b_tied_lmheads_64_4_8p_bw12_squad1_pipedream.py │ ├── t5_3b_tied_lmheads_64_4_8p_bw12_squad1_virtual_stages.py │ ├── t5_small_tied_lmhead_4p_bw12_async_squad1.py │ ├── t5_small_tied_lmheads_512_4_3p_bw12_squad1_virtual_stages.py │ ├── vit_base_patch16_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.py │ ├── vit_large_patch32_384_in21k_cifar100_384c384_8p_bw12_async_acyclic.py │ ├── vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_async_acyclic.py │ ├── vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.py │ ├── wrn_16x4_c10_p2.py │ └── wrn_28x10_c100_dr03_gnc32_4p_bw12_pipedream.py ├── pipe ├── .gitignore ├── README.md ├── __init__.py ├── configs │ ├── __init__.py │ ├── all_options.json │ ├── bert │ │ ├── squad │ │ │ ├── bert_base_uncased_2p │ │ │ │ ├── hetprofiling │ │ │ │ │ ├── common.json │ │ │ │ │ └── stale.json │ │ │ │ └── pipedream │ │ │ │ │ ├── common.json │ │ │ │ │ └── stale.json │ │ │ ├── bert_large_uncased_wmm │ │ │ │ ├── aggmsnag.json │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ ├── msnag.json │ │ │ │ ├── stale.json │ │ │ │ └── ws_msnag_ga.json │ │ │ ├── bert_large_uncased_wwm_2m │ │ │ │ ├── common.json │ │ │ │ └── stale.json │ │ │ ├── bert_large_uncased_wwm_2p │ │ │ │ ├── aggmsnag.json │ │ │ │ ├── common.json │ │ │ │ ├── ftpipe.json │ │ │ │ ├── gpipe.json │ │ │ │ ├── pipedream.json │ │ │ │ └── stale.json │ │ │ ├── bert_large_uncased_wwm_4p │ │ │ │ ├── aggmsnag.json │ │ │ │ ├── 
common.json │ │ │ │ ├── ftpipe.json │ │ │ │ ├── gpipe.json │ │ │ │ ├── pipedream.json │ │ │ │ └── stale.json │ │ │ └── bert_large_uncased_wwm_8p │ │ │ │ ├── aggmsnag.json │ │ │ │ ├── common.json │ │ │ │ ├── ftpipe.json │ │ │ │ ├── ftpipe_layer.json │ │ │ │ ├── gpipe.json │ │ │ │ ├── pipedream.json │ │ │ │ └── stale.json │ │ └── squad2 │ │ │ └── bert_large_uncased_wmm │ │ │ ├── aggmsnag.json │ │ │ ├── common.json │ │ │ ├── gpipe.json │ │ │ ├── msnag.json │ │ │ ├── stale.json │ │ │ ├── ws.json │ │ │ └── ws_msnag_ga.json │ ├── cep │ │ └── common.json │ ├── cv │ │ ├── cifar10 │ │ │ ├── common.json │ │ │ └── wrn_16x4_c10_p2 │ │ │ │ ├── EXAMPLE.md │ │ │ │ └── stale_nr.json │ │ ├── cifar100 │ │ │ └── wrn28x10 │ │ │ │ ├── README.md │ │ │ │ ├── common.json │ │ │ │ ├── msnag.json │ │ │ │ ├── msnag_optimizer.json │ │ │ │ ├── msnag_ws.json │ │ │ │ ├── no_recomputation │ │ │ │ ├── msnag_nr.json │ │ │ │ ├── msnag_ws.json │ │ │ │ ├── norecomp.json │ │ │ │ ├── stale_nr.json │ │ │ │ ├── ws.json │ │ │ │ ├── ws_ga.json │ │ │ │ └── ws_msnag_ga.json │ │ │ │ ├── stale.json │ │ │ │ ├── stale_optimizer.json │ │ │ │ ├── ws.json │ │ │ │ ├── ws_ga.json │ │ │ │ ├── ws_msnag_ga.json │ │ │ │ └── ws_msnag_ga_jfl.json │ │ └── imagenet │ │ │ └── weight_stashing_msnag_gap_aware.json │ ├── dummy.json │ ├── lm │ │ └── wt2 │ │ │ ├── gpt2 │ │ │ └── tied │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ ├── msnag.json │ │ │ │ ├── msnag_ws.json │ │ │ │ ├── seq.json │ │ │ │ ├── stale.json │ │ │ │ ├── ws.json │ │ │ │ ├── ws_ga.json │ │ │ │ ├── ws_msnag_ga.json │ │ │ │ └── ws_msnag_ga_jfl.json │ │ │ └── gpt2xl │ │ │ ├── tied │ │ │ ├── common.json │ │ │ ├── gpipe.json │ │ │ ├── msnag.json │ │ │ ├── msnag_ws.json │ │ │ ├── seq.json │ │ │ ├── stale.json │ │ │ ├── ws.json │ │ │ ├── ws_msnag_ga.json │ │ │ └── ws_msnag_ga_jfl.json │ │ │ ├── untied │ │ │ ├── aggmsnag.json │ │ │ ├── common.json │ │ │ ├── gpipe.json │ │ │ ├── msnag.json │ │ │ ├── msnag_ws.json │ │ │ ├── seq.json │ │ │ ├── stale.json │ │ │ ├── ws.json │ │ │ ├── ws_msnag_ga.json │ │ │ └── ws_msnag_ga_jfl.json │ │ │ └── untied_s512 │ │ │ ├── common.json │ │ │ └── stale.json │ ├── parse_json_config.py │ ├── python_configs │ │ ├── __init__.py │ │ └── configs.py │ ├── t5 │ │ ├── new_t5_exp │ │ │ ├── mpipe │ │ │ │ ├── boolq │ │ │ │ │ ├── common_layer_graph.json │ │ │ │ │ ├── common_op_graph.json │ │ │ │ │ ├── gpipe_layer_graph.json │ │ │ │ │ ├── gpipe_op_graph.json │ │ │ │ │ ├── stale_layer_graph.json │ │ │ │ │ └── stale_op_graph.json │ │ │ │ ├── multirc │ │ │ │ │ ├── common_layer_graph.json │ │ │ │ │ ├── common_op_graph.json │ │ │ │ │ ├── gpipe_layer_graph.json │ │ │ │ │ ├── gpipe_op_graph.json │ │ │ │ │ ├── stale_layer_graph.json │ │ │ │ │ └── stale_op_graph.json │ │ │ │ ├── rte │ │ │ │ │ ├── common_layer_graph.json │ │ │ │ │ ├── common_op_graph.json │ │ │ │ │ ├── gpipe_layer_graph.json │ │ │ │ │ ├── gpipe_op_graph.json │ │ │ │ │ ├── stale_layer_graph.json │ │ │ │ │ └── stale_op_graph.json │ │ │ │ └── wic │ │ │ │ │ ├── common_layer_graph.json │ │ │ │ │ ├── common_op_graph.json │ │ │ │ │ ├── gpipe_layer_graph.json │ │ │ │ │ ├── gpipe_op_graph.json │ │ │ │ │ ├── stale_layer_graph.json │ │ │ │ │ └── stale_op_graph.json │ │ │ ├── seq │ │ │ │ ├── boolq │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ └── pipedream_stale.json │ │ │ │ ├── multirc │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ └── pipedream_stale.json │ │ │ │ ├── rte │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ └── pipedream_stale.json │ │ │ │ └── wic │ │ │ │ │ 
├── common.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ └── pipedream_stale.json │ │ │ └── seq_op_graph │ │ │ │ ├── boolq │ │ │ │ ├── common.json │ │ │ │ ├── gpipe_new.json │ │ │ │ └── pipedream_stale.json │ │ │ │ ├── multirc │ │ │ │ ├── common.json │ │ │ │ ├── gpipe_new.json │ │ │ │ └── pipedream_stale.json │ │ │ │ ├── rte │ │ │ │ ├── common.json │ │ │ │ ├── gpipe_new.json │ │ │ │ └── pipedream_stale.json │ │ │ │ └── wic │ │ │ │ ├── common.json │ │ │ │ ├── gpipe_new.json │ │ │ │ └── pipedream_stale.json │ │ ├── t5_3b_p8 │ │ │ ├── seq │ │ │ │ ├── boolq │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ ├── pipedream_stale.json │ │ │ │ │ └── stale.json │ │ │ │ ├── cola │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe.json │ │ │ │ │ ├── seq.json │ │ │ │ │ └── stale.json │ │ │ │ ├── multirc │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ ├── pipedream_stale.json │ │ │ │ │ ├── seq.json │ │ │ │ │ └── stale.json │ │ │ │ ├── rte │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ ├── pipedream_stale.json │ │ │ │ │ ├── seq.json │ │ │ │ │ └── stale.json │ │ │ │ ├── rte_super_glue │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ ├── pipedream_stale.json │ │ │ │ │ ├── seq.json │ │ │ │ │ └── stale.json │ │ │ │ └── wic │ │ │ │ │ ├── common.json │ │ │ │ │ ├── gpipe.json │ │ │ │ │ ├── gpipe_new.json │ │ │ │ │ ├── pipedream_stale.json │ │ │ │ │ └── stale.json │ │ │ └── virtual_stages │ │ │ │ ├── boolq │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ ├── gpipe_2.json │ │ │ │ ├── stale.json │ │ │ │ └── vs_stale.json │ │ │ │ ├── multirc │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ └── stale.json │ │ │ │ ├── rte │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ └── stale.json │ │ │ │ ├── superglue_rte │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ └── stale.json │ │ │ │ └── wic │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ └── stale.json │ │ ├── t5_base │ │ │ └── seq │ │ │ │ └── boolq │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ ├── gpipe_new.json │ │ │ │ ├── pipedream_stale.json │ │ │ │ └── stale.json │ │ ├── t5_mpipe │ │ │ ├── L=32 │ │ │ │ ├── common.json │ │ │ │ └── stale.json │ │ │ ├── boolq │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ └── stale.json │ │ │ ├── common.json │ │ │ ├── multirc │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ └── stale.json │ │ │ ├── rte │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ └── stale.json │ │ │ ├── stale.json │ │ │ └── wic │ │ │ │ ├── common.json │ │ │ │ ├── gpipe.json │ │ │ │ └── stale.json │ │ └── t5_small │ │ │ ├── README.md │ │ │ ├── adafactor │ │ │ ├── common.json │ │ │ └── stale.json │ │ │ ├── common.json │ │ │ ├── rte │ │ │ ├── common.json │ │ │ └── stale.json │ │ │ └── stale.json │ └── vit │ │ ├── cifar100_384.json │ │ ├── cifar10_384.json │ │ ├── cv.json │ │ ├── cv_dcgn_global.json │ │ ├── cv_dcgn_local.json │ │ ├── cv_dcgn_local_prop.json │ │ ├── imagenet_384.json │ │ ├── tst_gpipe.json │ │ ├── tst_gpipe_adafactor_cifar100.json │ │ ├── tst_gpipe_cifar100.json │ │ ├── tst_gpipe_dcgn_global.json │ │ ├── tst_gpipe_dcgn_global_cifar100.json │ │ ├── tst_gpipe_dcgn_local.json │ │ ├── tst_gpipe_dcgn_local_prop_cifar100.json │ │ ├── tst_stale.json │ │ ├── vit_base_patch16_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json │ │ ├── vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_async_acyclic.json │ │ └── 
vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json ├── data │ ├── __init__.py │ ├── cep.py │ ├── cv.py │ ├── datasets.py │ ├── download │ │ ├── __init__.py │ │ ├── download_datasets.py │ │ └── download_glue_data.py │ ├── from_args_and_kw.py │ ├── glue.py │ ├── hardcoded_dirs.py │ ├── lm.py │ ├── packing.py │ ├── squad.py │ ├── t5 │ │ ├── __init__.py │ │ ├── analyze_t5_packing.py │ │ ├── new_t5_tfds_eval.py │ │ ├── preproc.py │ │ ├── t5_tfds.py │ │ └── t5_tfds_eval.py │ └── vit.py ├── env_utils │ ├── create_env_new_server_new.sh │ ├── deprecated │ │ ├── Makefile │ │ ├── create_env_new_server.sh │ │ ├── deprecated_create_env.sh │ │ ├── old_environment_mpi1.yml │ │ └── old_environment_mpi2.yml │ ├── docker │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── Dockerfile_from_source │ │ ├── ompi-recipe │ │ │ ├── Dockerfile │ │ │ ├── ompi-cuda │ │ │ │ ├── build.sh │ │ │ │ └── meta.yaml │ │ │ └── ompi_conda.sh │ │ └── pytorch_from_source.sh │ ├── env_add_to_build_from_source.yml │ ├── env_without_mpi.yml │ └── jupyter-lab.sh ├── eval.py ├── experiments │ ├── __init__.py │ ├── analysis │ │ ├── __init__.py │ │ ├── all_results_to_df.py │ │ ├── gen_plots.py │ │ ├── get_results.py │ │ ├── plot.py │ │ ├── plot_sns.py │ │ └── rectangles.ipynb │ ├── experiments.py │ └── t5 │ │ ├── __init__.py │ │ ├── annotation_plot.py │ │ ├── ftpipe_plots.py │ │ ├── l_study.ipynb │ │ ├── nets.ipynb │ │ └── tta_barplot.py ├── main.py ├── misc │ ├── _full_example.py │ ├── deepspeed.py │ ├── filelogger.py │ ├── libmpi.py │ ├── many_isend.py │ ├── mem_on_cpu.py │ ├── mesh_failed_runs │ │ ├── git_log_head.txt │ │ ├── run.sh │ │ ├── run_3b_failed.sh │ │ └── run_3b_failed_recom_omm.sh │ ├── p2p_bw_mat.sh │ ├── print_partition_layers_scopes.py │ ├── replicate_experiments_deprecated.py │ ├── rungrid_bert_squad.py │ ├── sanity_check.py │ ├── test_mpi │ │ ├── README.md │ │ ├── nodes.txt │ │ ├── pytorch_distributed.py │ │ └── pytorch_distributed_cuda_aware.py │ ├── to_partition.sh │ ├── transformers │ │ ├── TODO.md │ │ ├── analyze_res.py │ │ ├── bert-large │ │ │ ├── run.sh │ │ │ └── run_squad.py │ │ ├── run_language_modeling.py │ │ └── run_lm.sh │ ├── tst_ibroadcast.py │ └── tst_isend.py ├── models │ ├── __init__.py │ ├── load_pipeline_weights_to_hf.py │ ├── naive_block_model_parallel.py │ ├── parse_config.py │ ├── registery │ │ ├── __init__.py │ │ ├── cep.py │ │ ├── cv.py │ │ ├── dummy.py │ │ ├── hf.py │ │ ├── model_handler.py │ │ └── vit.py │ ├── simple_partitioning_config.py │ ├── t5_for_generation.py │ ├── transformers_cfg.py │ ├── transformers_utils.py │ └── vit_np_to_pytorch.py ├── optimizers │ ├── __init__.py │ ├── adafactor.py │ ├── adam.py │ ├── adam_record.py │ ├── adam_record_max_lr.py │ ├── adamw.py │ ├── adamw_record.py │ ├── adamw_record_without_step.py │ ├── lr_scheduler.py │ ├── required.py │ ├── sgd.py │ ├── sutskever_modified_sgd.py │ └── utils.py ├── pipeline │ ├── __init__.py │ ├── communication │ │ ├── __init__.py │ │ ├── buffer.py │ │ ├── buffered_comm.py │ │ ├── common_simple_comm.py │ │ ├── grouper.py │ │ ├── interface.py │ │ ├── multiprocessing.py │ │ ├── p2p.py │ │ ├── tags.py │ │ └── wrapper.py │ ├── data_propagation │ │ ├── __init__.py │ │ ├── automatic_prop.py │ │ ├── automatic_prop_non_contig.py │ │ ├── cv_target_prop.py │ │ └── interface.py │ ├── distributed_clip_grad_norm.py │ ├── dp_sim │ │ ├── __init__.py │ │ ├── convert.py │ │ └── simulated_dp_batchnorm.py │ ├── gap_aware │ │ ├── __init__.py │ │ ├── adam_gap_aware.py │ │ ├── adam_gap_aware_max_lr.py │ │ ├── adamw_gap_aware.py │ │ ├── 
gap_aware_hook.py │ │ ├── interface.py │ │ └── sgd_gap_aware.py │ ├── monkey_patch │ │ ├── __init__.py │ │ ├── dummy_forward_monkey_patcher.py │ │ ├── find_modules.py │ │ ├── patch.py │ │ └── utils.py │ ├── partition.py │ ├── partition_manager.py │ ├── replace_inplace.py │ ├── rng_stasher.py │ ├── trainers │ │ ├── __init__.py │ │ ├── bert_squad_trainer.py │ │ ├── cep_trainer.py │ │ ├── cv_trainer.py │ │ ├── gap_aware_trainer.py │ │ ├── glue_trainer.py │ │ ├── grad_norm │ │ │ ├── __init__.py │ │ │ ├── global_grad_norm.py │ │ │ ├── local_grad_norm.py │ │ │ └── local_grad_norm_prop.py │ │ ├── interface.py │ │ ├── lm_trainer.py │ │ ├── statistics │ │ │ ├── __init__.py │ │ │ ├── cv.py │ │ │ ├── gap.py │ │ │ ├── glue.py │ │ │ ├── interface.py │ │ │ ├── lm.py │ │ │ ├── squad.py │ │ │ └── utils.py │ │ ├── t5_trainer.py │ │ └── utils.py │ ├── true_weights_storage.py │ ├── util.py │ ├── weight_prediction │ │ ├── README.md │ │ ├── __init__.py │ │ ├── adafactor.py │ │ ├── adam.py │ │ ├── adamw.py │ │ ├── cow_dict.py │ │ ├── interface.py │ │ ├── sched_aware.py │ │ ├── sgd.py │ │ ├── sgd_wd.py │ │ └── sym_pred_optimizers │ │ │ ├── __init__.py │ │ │ ├── auto_lambdify.py │ │ │ └── sympy_optimizer.py │ ├── weight_stashing │ │ ├── __init__.py │ │ └── weight_stashing.py │ └── work_schedulers │ │ ├── __init__.py │ │ ├── analysis.py │ │ └── schedulers.py ├── prepare_pipeline.py ├── run │ ├── __init__.py │ ├── gpu_queue.py │ └── helper.py └── train.py ├── t5_used_scripts_example ├── run_experiments.sh ├── run_op_vs_layer_exp_wic.sh ├── to_partition_mpipe_layergraph_t5_3b_boolq_multirc.sh ├── to_partition_mpipe_layergraph_t5_3b_rte.sh ├── to_partition_mpipe_layergraph_t5_3b_wic.sh ├── to_partition_mpipe_t5_3b_opgraph_boolq_multirc.sh ├── to_partition_mpipe_t5_3b_opgraph_rte.sh ├── to_partition_mpipe_t5_3b_opgraph_wic.sh ├── to_partition_mpipe_t5_base.sh ├── to_partition_spipe_OP_t5_3b_boolq_multirc.sh ├── to_partition_spipe_OP_t5_3b_rte.sh ├── to_partition_spipe_OP_t5_3b_wic.sh ├── to_partition_spipe_t5_3b_boolq_multirc.sh ├── to_partition_spipe_t5_3b_rte.sh ├── to_partition_spipe_t5_3b_wic.sh ├── to_partition_spipe_t5_base.sh ├── to_run.sh └── to_run_again_wic.sh └── tests ├── __init__.py └── test_our_vit_convert.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ; select = B,C,E,F,P,T4,W,B9 3 | max-line-length = 120 4 | # C408 ignored because we like the dict keyword argument syntax 5 | # E501 is not flexible enough, we're using B950 instead 6 | # ; ignore = 7 | # ; E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303, 8 | # ; # these ignores are from flake8-bugbear; please fix! 9 | # ; B007,B008, 10 | # ; # these ignores are from flake8-comprehensions; please fix! 
11 | # ; C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415 12 | per-file-ignores = __init__.py: F401 13 | exclude = *.pyi,.git,configs,cpp -------------------------------------------------------------------------------- /autopipe/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/* 2 | .ipynb_checkpoints/* 3 | */__pycache__/* 4 | ideas/* 5 | .idea/ 6 | .pytest_cache/** 7 | *.pyc 8 | experiment/stanford_car_dataset_images_in_224x224/* 9 | experiments/stanford-car-dataset-by-classes-folder-224/* 10 | playground.py 11 | *.log 12 | glue_data/** 13 | wikitext-2-raw/** 14 | papers/* 15 | .mypy_cache/ 16 | 17 | old/ 18 | squad1/* 19 | squad2/* 20 | cached_train_bert* 21 | results/ 22 | py_sbatch.sh 23 | glue_ds.* 24 | original_acyclic_partitioning/* 25 | dynamic_acyclic_partitioning/* 26 | datasets/* 27 | megatron/data/* 28 | megatron/megatron_11b.tar.gz 29 | -------------------------------------------------------------------------------- /autopipe/README.md: -------------------------------------------------------------------------------- 1 | # Partitioning 2 | Readme WIP. 3 | 4 | Available algorithms under `autopipe.autopipe.model_partitioning`: 5 | - `mpipe` (mixed-pipe) 6 | - `pipedream` 7 | - `metis` 8 | - `acyclic` 9 | 10 | ## Pitfalls 11 | Sometimes ops are traced with `training=True`, so replace them, e.g.: 12 | 13 | ```bash 14 | sed "s/training=True/training=self.training/" op_* | grep training= 15 | ``` -------------------------------------------------------------------------------- /autopipe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/autopipe/__init__.py -------------------------------------------------------------------------------- /autopipe/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline_partition_analysis import run_analysis 2 | from .analysis_utils import convert_to_analysis_format -------------------------------------------------------------------------------- /autopipe/analysis/pipedream_complexity.py: -------------------------------------------------------------------------------- 1 | """Since PipeDream does an exhaustive search, 2 | for small graphs (no mixed-pipe, a single hardware hierarchy level) 3 | it may be better to just use it 4 | (however, it models communication incorrectly). 5 | Its complexity is (simplified from PipeDream's paper) 6 | L*N^3*m^2 7 | 8 | N - graph nodes (operations/layers) 9 | m - GPUs per level 10 | L - number of levels 11 | 12 | """ 13 | 14 | def pipedream_extimated_time(N, m, L=1): 15 | # compute a multiplicative factor from ResNet-50, PipeDream's largest reported network (8 seconds) 16 | baseline_complexity = 709789824 # resnet (N=177, m=8, L=2) 17 | baseline_seconds = 8 18 | 19 | complexity = L * N**3 * m**2 20 | estimated_time = baseline_seconds * (complexity / baseline_complexity) 21 | 22 | return estimated_time 23 | -------------------------------------------------------------------------------- /autopipe/analysis/profile_replica.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from autopipe.autopipe import move_tensors 4 | from autopipe.autopipe.utils import flatten 5 | 6 | 7 | def cuda_computation_times(model, inputs): 8 | """ measure forward/backward time of a partition on the GPU 9 | """ 10 | if not isinstance(inputs, (tuple, list, dict)): 11 |
inputs = (inputs,) 12 | 13 | 14 | model.cuda() 15 | # now we move inputs to GPU 16 | inputs = move_tensors(inputs, 'cuda') 17 | start = torch.cuda.Event(enable_timing=True) 18 | end = torch.cuda.Event(enable_timing=True) 19 | 20 | torch.cuda.synchronize(device='cuda') 21 | start.record() 22 | if isinstance(inputs, (tuple,list)): 23 | outputs = model(*inputs) 24 | elif isinstance(inputs, dict): 25 | outputs = model(**inputs) 26 | else: 27 | raise NotImplementedError(str(type(inputs))) 28 | 29 | # TODO: can generate targets beforehand to use cross_entropy... 30 | # TODO: replace randn_like with pre-generated tensors 31 | # loss = sum((F.cross_entropy(o, torch.randn_like(o)) for o in filter( 32 | # lambda t: isinstance(t, torch.Tensor) and t.requires_grad, 33 | # flatten(outputs)))) 34 | 35 | loss = sum((o.norm() for o in filter( 36 | lambda t: isinstance(t, torch.Tensor) and t.requires_grad, 37 | flatten(outputs)))) # FIXME: just use real loss. 38 | loss.backward() 39 | end.record() 40 | torch.cuda.synchronize(device='cuda') 41 | fb_time = (start.elapsed_time(end)) 42 | 43 | return fb_time -------------------------------------------------------------------------------- /autopipe/analysis/ssgd_analysis.py: -------------------------------------------------------------------------------- 1 | """FIXME: DEPRECATED, not accurate, probably incorrect""" 2 | import math 3 | 4 | from .profile_replica import cuda_computation_times 5 | 6 | 7 | # NOTE: can do a similar analysis for ZeRO (1,2,3), 8 | # (multiply communication by x1.5 according to what they claim) 9 | 10 | 11 | def run_analysis(sample, model, n_workers, bw_GBps=12, verbose=True): 12 | send_mb = sum([(p.nelement() * p.element_size()) 13 | for p in model.parameters()]) / 1e6 14 | 15 | single_send_time = send_mb / bw_GBps 16 | 17 | # FIXME: this is not correct at all, 18 | # because we can do it with reduce-broadcast 19 | num_sends = n_workers * math.log2(n_workers) 20 | 21 | total_send_time = num_sends * single_send_time 22 | 23 | comp_time = cuda_computation_times(model, sample) 24 | 25 | # NOTE: this is a very naive analysis; 26 | # from PyTorch >1.3 they overlap communication with computation 27 | # (gaining around +30% speedup). 28 | utilization = comp_time / (comp_time + total_send_time) 29 | 30 | expected_speedup = utilization * n_workers 31 | 32 | # TODO: print something...
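# For reference (an illustrative note, not used in the estimate above): a
# bandwidth-optimal ring all-reduce, one way to realize the reduce-broadcast
# mentioned in the FIXME, sends roughly 2 * (n_workers - 1) / n_workers
# model-sizes per worker, overlapped across workers, i.e.
# ring_total_send_time = 2 * (n_workers - 1) / n_workers * single_send_time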
33 | 34 | d = dict(n_workers=n_workers, 35 | send_mb=send_mb, 36 | single_send_time=single_send_time, 37 | num_sends=num_sends, 38 | total_send_time=total_send_time, 39 | comp_time=comp_time, 40 | utilization=utilization, 41 | expected_speedup=expected_speedup) 42 | 43 | return expected_speedup, d 44 | -------------------------------------------------------------------------------- /autopipe/autopipe/__init__.py: -------------------------------------------------------------------------------- 1 | from .cache_utils import compute_and_cache, compute_and_maybe_cache, PickleCache, GraphCache 2 | from .compiler import compile_partitioned_model 3 | from .model_partitioning import metis_partition, acyclic_partition, partition_2dbin_pack, partition_mpipe, \ 4 | analyze_n_clusters, \ 5 | get_weight_functions 6 | from .model_partitioning.async_pipeline import partition_and_match_weights_until_last_partition_is_with_no_recomputation 7 | from .model_profiling import Graph, Node, profile_network, GraphProfiler, trace_module, NodeWeightFunction, \ 8 | EdgeWeightFunction 9 | from .model_profiling.graph_executor import execute_graph, pre_hook_factory, post_hook_factory 10 | from .model_profiling.infer_req_grad import infer_req_grad 11 | from .utils import move_tensors, ExecTimes, FullExecTimes 12 | 13 | 14 | -------------------------------------------------------------------------------- /autopipe/autopipe/compiler/__init__.py: -------------------------------------------------------------------------------- 1 | from .compile_partitioned_model import compile_partitioned_model 2 | -------------------------------------------------------------------------------- /autopipe/autopipe/compiler/compile_normal_model_function.py: -------------------------------------------------------------------------------- 1 | """ This file should be responsible for compiling normal model function""" 2 | 3 | -------------------------------------------------------------------------------- /autopipe/autopipe/model_partitioning/__init__.py: -------------------------------------------------------------------------------- 1 | from .metis import metis_partition 2 | from .acyclic import acyclic_partition 3 | from .mixed_pipe import partition_2dbin_pack, analyze_n_clusters, partition_mpipe 4 | from .heuristics import get_weight_functions 5 | from . 
import utils 6 | 7 | __all__ = ["acyclic_partition", "metis_partition", "partition_2dbin_pack", "partition_mpipe", "analyze_n_clusters", 8 | "get_weight_functions"] 9 | -------------------------------------------------------------------------------- /autopipe/autopipe/model_partitioning/acyclic/__init__.py: -------------------------------------------------------------------------------- 1 | from .acyclic_partitioning import ALGORITHM, acyclic_partition, Objective, META_ALGORITH, Constraint 2 | from .data_structures import QuotientGraph 3 | -------------------------------------------------------------------------------- /autopipe/autopipe/model_partitioning/metis/__init__.py: -------------------------------------------------------------------------------- 1 | from .metis_partitioning import metis_partition 2 | -------------------------------------------------------------------------------- /autopipe/autopipe/model_partitioning/mixed_pipe/__init__.py: -------------------------------------------------------------------------------- 1 | from .partition_mixed_pipe_v1_clusters import partition_2dbin_pack, analyze_n_clusters 2 | from .partition_mixed_pipe_v2 import partition_mpipe 3 | -------------------------------------------------------------------------------- /autopipe/autopipe/model_partitioning/pipedream/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/autopipe/autopipe/model_partitioning/pipedream/__init__.py -------------------------------------------------------------------------------- /autopipe/autopipe/model_profiling/__init__.py: -------------------------------------------------------------------------------- 1 | from .control_flow_graph import Graph, NodeTypes, Node, NodeWeightFunction, EdgeWeightFunction 2 | from .graph_executor import execute_graph, PostHook, PreHook 3 | from .network_profiler import profile_network 4 | from .profiler import GraphProfiler 5 | from .tracer import trace_module, register_new_traced_function, used_namespaces, register_new_explicit_untraced_function 6 | -------------------------------------------------------------------------------- /autopipe/autopipe/model_profiling/infer_is_contiguous.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Dict, Any 2 | 3 | import torch 4 | 5 | from .control_flow_graph import Node, Graph 6 | from .graph_executor import execute_graph, pre_hook_factory, post_hook_factory 7 | from ..utils import nested_map, detach_tensors 8 | 9 | 10 | def infer_is_contiguous(graph: Graph, model: torch.nn.Module, args=None, kwargs=None): 11 | if args is None: 12 | args = () 13 | if kwargs is None: 14 | kwargs = dict() 15 | 16 | with torch.no_grad(): 17 | visitor = Visitor() 18 | execute_graph(model, graph, model_args=args, model_kwargs=kwargs, pre_hook=pre_hook_factory(visitor.prehook), 19 | post_hook=post_hook_factory(visitor.posthook)) 20 | 21 | 22 | class Visitor(): 23 | def prehook(self, node: Node, function: Callable, args: tuple, kwargs: Dict): 24 | for n, a in zip(node.args, args): 25 | # the or statement should not be necessary 26 | n.is_contiguous = n.is_contiguous or Visitor.is_contiguous(a) 27 | 28 | for n, kws in node.kwargs.items(): 29 | v = kwargs[kws[0]] 30 | # the or statement should not be necessary 31 | n.is_contiguous = n.is_contiguous or Visitor.is_contiguous(v) 32 | 33 | return detach_tensors(args), 
detach_tensors(kwargs) 34 | 35 | def posthook(self, node: Node, function: Callable, args: tuple, kwargs: Dict, outputs: Any): 36 | node.is_contiguous = Visitor.is_contiguous(outputs) 37 | 38 | return detach_tensors(outputs) 39 | 40 | @staticmethod 41 | def is_contiguous(ts): 42 | def f(t): 43 | if isinstance(t, torch.Tensor): 44 | return t.is_contiguous() 45 | return False 46 | 47 | return nested_map(f, ts) 48 | -------------------------------------------------------------------------------- /autopipe/autopipe/model_profiling/infer_req_grad.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Dict, Any 2 | 3 | import torch 4 | 5 | from .control_flow_graph import Node, Graph 6 | from .graph_executor import execute_graph, pre_hook_factory, post_hook_factory 7 | from ..utils import nested_map, detach_tensors 8 | 9 | 10 | def infer_req_grad(graph: Graph, model: torch.nn.Module, args=None, kwargs=None): 11 | if args is None: 12 | args = () 13 | if kwargs is None: 14 | kwargs = dict() 15 | 16 | with torch.enable_grad(): 17 | visitor = Visitor() 18 | execute_graph(model, graph, model_args=args, model_kwargs=kwargs, pre_hook=pre_hook_factory(visitor.prehook), 19 | post_hook=post_hook_factory(visitor.posthook)) 20 | 21 | 22 | class Visitor(): 23 | def prehook(self, node: Node, function: Callable, args: tuple, kwargs: Dict): 24 | for n, a in zip(node.args, args): 25 | # the or statement should not be necessary 26 | n.req_grad = n.req_grad or Visitor.req_grad(a) 27 | 28 | for n, kws in node.kwargs.items(): 29 | v = kwargs[kws[0]] 30 | # the or statement should not be necessary 31 | n.req_grad = n.req_grad or Visitor.req_grad(v) 32 | 33 | return detach_tensors(args), detach_tensors(kwargs) 34 | 35 | def posthook(self, node: Node, function: Callable, args: tuple, kwargs: Dict, outputs: Any): 36 | node.req_grad = Visitor.req_grad(outputs) 37 | 38 | return detach_tensors(outputs) 39 | 40 | @staticmethod 41 | def req_grad(ts): 42 | def f(t): 43 | if isinstance(t, torch.Tensor): 44 | return t.requires_grad 45 | return False 46 | 47 | return nested_map(f, ts) 48 | -------------------------------------------------------------------------------- /autopipe/download/download_cifar.py: -------------------------------------------------------------------------------- 1 | from torchvision.datasets import CIFAR10, CIFAR100 2 | 3 | 4 | if __name__ == "__main__": 5 | CIFAR100(root="", download=True, train=True) 6 | CIFAR100(root="", download=True, train=False) 7 | 8 | CIFAR10(root="", download=True, train=True) 9 | CIFAR10(root="", download=True, train=False) -------------------------------------------------------------------------------- /autopipe/download/make_squad.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | if [ ! -d squad1 ] ; then 3 | mkdir squad1 4 | cd squad1 || exit 1 5 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json 6 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json 7 | wget https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py 8 | cd .. 9 | 10 | fi 11 | 12 | if [ ! -d squad2 ] ; then 13 | mkdir squad2 14 | cd squad2 || exit 1 15 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json 16 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json 17 | curl https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ > evaluate-v2.0.py 18 | cd .. 
19 | fi 20 | -------------------------------------------------------------------------------- /autopipe/environment.yml: -------------------------------------------------------------------------------- 1 | name: partitioning 2 | channels: 3 | - pytorch 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - python=3.8 8 | - pytorch=1.6.0 9 | - torchvision=0.7.0 10 | - graphviz 11 | - python-graphviz 12 | - networkx 13 | - tqdm 14 | - scikit-learn 15 | - pip 16 | - cython 17 | - pip: 18 | - git+https://github.com/networkx/networkx-metis.git 19 | - transformers>2.9.1 20 | - datasets 21 | -------------------------------------------------------------------------------- /autopipe/partitioning_scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/autopipe/partitioning_scripts/__init__.py -------------------------------------------------------------------------------- /autopipe/py_sbatch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### 4 | # py_sbatch.sh 5 | # 6 | # This script runs python from within our conda env as a slurm batch job. 7 | # All arguments passed to this script are passed directly to the python 8 | # interpreter. 9 | # 10 | 11 | ### 12 | # Example usage: 13 | # 14 | # Running the prepare-submission command from main.py as a batch job 15 | # ./py_sbatch.sh main.py prepare-submission --id 123456789 16 | # 17 | # Running all notebooks without preparing a submission 18 | # ./py_sbatch.sh main.py run-nb *.ipynb 19 | # 20 | # Running any other python script myscript.py with arguments 21 | # ./py_sbatch.sh myscript.py --arg1 --arg2=val2 22 | # 23 | 24 | ### 25 | # Parameters for sbatch 26 | # 27 | NODE=rishon3 28 | NUM_CORES=16 29 | NUM_GPUS=8 30 | JOB_NAME="jobname" 31 | MAIL_USER="username@campus.technion.ac.il" 32 | MAIL_TYPE=ALL # Valid values are NONE, BEGIN, END, FAIL, REQUEUE, ALL 33 | 34 | ### 35 | # Conda parameters 36 | # 37 | CONDA_HOME=$HOME/miniconda3 38 | CONDA_ENV=base 39 | 40 | sbatch \ 41 | -w $NODE \ 42 | -c $NUM_CORES \ 43 | --gres=gpu:$NUM_GPUS \ 44 | --job-name $JOB_NAME \ 45 | --mail-user $MAIL_USER \ 46 | --mail-type $MAIL_TYPE \ 47 | -o '%x_%j.out' \ 48 | < torch.nn.Module: 33 | return FunctionalModel() 34 | 35 | 36 | def get_input(self, args, analysis=False): 37 | if analysis: 38 | return torch.randn( args.analysis_batch_size ,_MODEL_DIM) 39 | 40 | return torch.randn(args.partitioning_batch_size, _MODEL_DIM) 41 | 42 | 43 | register_task("functional_model", ParsePartitioningT5Opts, DumTFunctionalModelPartitioner) 44 | -------------------------------------------------------------------------------- /autopipe/tasks/partitioning_task.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict 3 | 4 | import torch 5 | 6 | 7 | class PartitioningTask(ABC): 8 | 9 | def __init__(self, args) -> None: 10 | pass 11 | 12 | @property 13 | @abstractmethod 14 | def batch_dim(self) -> int: 15 | pass 16 | 17 | @abstractmethod 18 | def get_model(self, args) -> torch.nn.Module: 19 | pass 20 | 21 | @abstractmethod 22 | def get_input(self, args, analysis=False): 23 | pass 24 | 25 | # TODO maybe we want to always register operator.is and operator.is_not as untraced 26 | def register_functions(self): 27 | """ register explicit_traced/untraced_functions 28 | 29 | for example if we wish to trace 
math.log and not trace operator.is 30 | 31 | then it should be done here 32 | """ 33 | 34 | def update_analysis_kwargs(self, args, config, analysis_kwargs: Dict) -> Dict: 35 | """enable modifications of the analysis_kwargs which are passed to run_analysis; 36 | for example, set stages_on_same_gpu for gpt2 stateless 37 | """ 38 | return analysis_kwargs 39 | 40 | def post_partitioning(self, args, graph, analysis_result, summary): 41 | """ hook which is called after the partitioning process is done""" 42 | -------------------------------------------------------------------------------- /docs/AcceleratingMixedPipeWithCudaMPS.md: -------------------------------------------------------------------------------- 1 | ### Accelerating mixed pipe with CUDA MPS 2 | As $UID, run the following commands: 3 | ``` 4 | ulimit -n 16384 5 | 6 | # export CUDA_VISIBLE_DEVICES=0 # Select GPU 0. 7 | export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps # Select a location that's 8 | accessible to the given $UID 9 | export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log # Select a location that's 10 | accessible to the given $UID 11 | nvidia-cuda-mps-control -d # Start daemon in background 12 | ``` 13 | 14 | To shut down: 15 | ```bash 16 | echo quit | nvidia-cuda-mps-control 17 | ``` -------------------------------------------------------------------------------- /docs/MPI.md: -------------------------------------------------------------------------------- 1 | ## Running with MPI 2 | 3 | - [running with mpi](https://www.open-mpi.org/faq/?category=running) 4 | - especially see [mpi-env-vars](https://www.open-mpi.org/faq/?category=running#mpi-environmental-variables). 5 | 6 | -------------------------------------------------------------------------------- /docs/MiscOptimizations.md: -------------------------------------------------------------------------------- 1 | 2 | ## Misc 3 | 4 | ### Communication Matrix Embedding 5 | - Communication Matrix Embedding with CUDA P2P samples (15% BW improvement for the pipeline). Can use this [script](/misc/p2p_bw_mat.sh). 6 | 7 | ### Binding to nodes 8 | Binding to CPUs which are closer to the GPUs can improve performance. 9 | - use node 0: `numactl --cpunodebind=0` (requirement: sudo apt install numactl) 10 | - checking this: either `lstopo` or `lspci -vvv | less`. 11 | 12 | ### Check your PyTorch build 13 | - To see how PyTorch is compiled, use 14 | ``` 15 | torch.__config__.show() 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/PipeDebug.md: -------------------------------------------------------------------------------- 1 | ## Debugging 2 | 3 | - [debugging mpi python applications with vscode](https://gist.github.com/asroy/ca018117e5dbbf53569b696a8c89204f) 4 | 5 | - Debugging works only when data loading is on the main thread (`num_data_workers=0`).
6 | - Run the same thing with the `--debug` flag, then wait for attachment: 7 | 8 | > > ```bash 9 | > > mpirun -np 2 python main.py --debug 10 | > > ``` 11 | 12 | - If you debug CUDA, you may want to get an accurate trace with: 13 | 14 | > > ```bash 15 | > > CUDA_LAUNCH_BLOCKING=1 mpirun -np 2 python main.py --debug 16 | > > ``` 17 | 18 | - Before you debug, you may want to check whether the error is CUDA-specific and does not reproduce on CPU. 19 | 20 | -------------------------------------------------------------------------------- /docs/PipeList.md: -------------------------------------------------------------------------------- 1 | 2 | ## Available pipes 3 | 4 | Although our [publication](https://www.usenix.org/system/files/atc21-eliad.pdf) refers mainly to 1-2 pipeline approaches for fine-tuning giant models on commodity hardware (mainly, the Pareto frontiers for the discussed setting), the framework we implemented (quite a while before the publication) supports training models of all sizes, for which, of course, different sweet spots apply. 5 | 6 | We implemented many pipeline optimization algorithms to study the tradeoffs of DNN training with asynchronous pipeline-parallelism. 7 | 8 | The following pipeline configurations are available: 9 | 10 | 11 | 12 | - `stale`: no staleness mitigation. 13 | 14 | 15 | - weight prediction (`wp`): {`msnag`, `aggmsnag`} 16 | - supported for the {`sgd`, `adam`, `adamw`} optimizers 17 | - `msnag` is momentum-based weight prediction (see the sketch at the end of this page) 18 | - `aggmsnag` adapts momentum-based weight prediction to gradient accumulation 19 | 20 | - recomputation 21 | - See Table 1 in the [FTPipe paper](https://www.usenix.org/system/files/atc21-eliad.pdf) for the effect on stale pipelines 22 | - no recomputation (`nr` or `norecomp`) 23 | 24 | - weight stashing (`ws`) 25 | 26 | - [Gap Aware](https://arxiv.org/pdf/1909.10802.pdf) staleness mitigation (`ga`) 27 | - for {`sgd`, `adam`, `adamw`} optimizers 28 | - scheduler-aware prediction: making the weight prediction aware of the scheduler. 29 | - gradient aggregation in the pipeline (`step_every`) 30 | 31 | - combinations of mostly all of the above: {`wp`, `ws`, `ga`} 32 | 33 | Note: weight prediction is often called `msnag` in the code. 34 | 35 | 36 | ### Fully-synchronous 37 | 38 | - `gpipe` 39 | - DistributedDataParallel (DDP): SSGD 40 | - Sequential (`seq`): naive inter-layer model parallelism (multi-GPU) 41 | - and, of course, a single GPU for small models. 42 | 43 | 44 | Note: Tied weights are handled (decorated) per use-case.
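To make the `msnag` idea concrete, here is a minimal sketch of momentum-based weight prediction for SGD, under the assumption that each future stale step moves a parameter by roughly `-lr * momentum * buffer`. The helper name `predict_weights` is made up for this illustration; the repo's actual implementations live under `pipe/pipeline/weight_prediction/`.

```python
import torch


def predict_weights(optimizer: torch.optim.SGD, staleness: int):
    """Illustrative sketch only: extrapolate weights `staleness` steps ahead
    using the current momentum buffers. The runtime also restores the true
    weights afterwards (see pipe/pipeline/true_weights_storage.py).
    """
    with torch.no_grad():
        for group in optimizer.param_groups:
            lr, momentum = group["lr"], group["momentum"]
            for p in group["params"]:
                buf = optimizer.state[p].get("momentum_buffer")
                if buf is not None:
                    # approximate each of the `staleness` future updates by
                    # the current momentum direction: p -= lr * momentum * buf
                    p.add_(buf, alpha=-lr * momentum * staleness)
```

Weight stashing (`ws`) is the complementary mechanism: instead of predicting the future weights used for the forward pass, it stores the exact weights that were used, so the delayed backward pass runs with matching parameters.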
-------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/models/__init__.py -------------------------------------------------------------------------------- /models/new_t5_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/models/new_t5_example/__init__.py -------------------------------------------------------------------------------- /models/new_t5_example/convert_none.py: -------------------------------------------------------------------------------- 1 | from autopipe.autopipe.utils import convert_none_checks 2 | 3 | # run from current dir 4 | if __name__ == '__main__': 5 | # convert_none_checks(input_file="pipe/misc/new_t5/modeling_t5.py", output_file="pipe/misc/new_t5/modeling_t5.py") 6 | convert_none_checks(input_file="modeling_t5.py", output_file="modeling_t5.py") -------------------------------------------------------------------------------- /models/new_t5_example/eval_new_t5.py: -------------------------------------------------------------------------------- 1 | # TODO: make it generic... 2 | from pipe.models.load_pipeline_weights_to_hf import HFLoader 3 | from transformers import AutoModel, AutoConfig, AutoTokenizer, T5ForConditionalGeneration 4 | 5 | 6 | class NewT5HFLoader(HFLoader): 7 | def __init__(self, hf_transformers_model_class=T5ForConditionalGeneration): 8 | super().__init__( 9 | hf_transformers_model_class=hf_transformers_model_class) 10 | 11 | def substitue_state_dict_keys_back_to_original(self, training_state_dict): 12 | # TODO: training_state_dict is the original state dict used in our training.
13 | d = dict() 14 | 15 | for k, v in training_state_dict.items(): 16 | # we modified keys from prefix.block.N.layer.M.suffix into prefix.N.M.suffix 17 | # this regex substitution performs the reverse transformation 18 | # new_key = re.sub(r'([0-9]+.)([0-9]+.)', r'block.\1layer.\2', k) 19 | d[k] = v 20 | 21 | # in case we load weights from the tied model 22 | if "shared_embed_weight" in d: 23 | w = d.pop("shared_embed_weight") 24 | d['shared.weight'] = d['encoder.embed_tokens.weight'] = d[ 25 | 'decoder.embed_tokens.weight'] = w 26 | return d 27 | -------------------------------------------------------------------------------- /models/normal/NLP_models/__init__.py: -------------------------------------------------------------------------------- 1 | # from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, 2 | # BertForMaskedLM, BertForNextSentencePrediction, 3 | # BertForSequenceClassification, 4 | # BertForMultipleChoice, BertForTokenClassification, 5 | # BertForQuestionAnswering, load_tf_weights_in_bert, 6 | # ) 7 | # from .modeling_ctrl import CTRLModel, CTRLLMHeadModel 8 | # from .modeling_ctrl_tied_weights import CTRLModel as StatelessCTRLModel, CTRLLMHeadModel as StatelessCTRLLMHeadModel 9 | # from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, 10 | # GPT2DoubleHeadsModel) 11 | # from .modeling_gpt2_tied_weights import (GPT2Model as StatelessGPT2Model, 12 | # GPT2LMHeadModel as StatelessGPT2LMHeadModel, 13 | # GPT2DoubleHeadsModel as StatelessGPT2DoubleHeadsModel) 14 | -------------------------------------------------------------------------------- /models/normal/__init__.py: -------------------------------------------------------------------------------- 1 | # from .vision_models import * 2 | # from .NLP_models import GPT2LMHeadModel, GPT2Model,CTRLLMHeadModel,CTRLModel 3 | # from .NLP_models import StatelessGPT2LMHeadModel, StatelessGPT2Model,StatelessCTRLLMHeadModel,StatelessCTRLModel 4 | # from .NLP_models import (BertModel, BertForPreTraining, 5 | # BertForMaskedLM, BertForNextSentencePrediction, 6 | # BertForSequenceClassification, 7 | # BertForMultipleChoice, BertForTokenClassification, 8 | # BertForQuestionAnswering) 9 | -------------------------------------------------------------------------------- /models/normal/dummy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | import torch.nn as nn 4 | 5 | 6 | class Dummy(nn.Module): 7 | def __init__(self): 8 | super(Dummy, self).__init__() 9 | self.l0 = nn.Linear(100, 100) 10 | self.l1 = nn.Linear(100, 100) 11 | self.l2 = nn.Linear(100, 100) 12 | self.l3 = nn.Linear(100, 100) 13 | 14 | def forward(self, x): 15 | output2 = self.l0(x) 16 | t0 = self.l1(x) 17 | t1 = self.l2(t0) 18 | output0, output1 = self.l3(t1) 19 | 20 | return output1, output0, output2 21 | 22 | 23 | class Stage0(nn.Module): 24 | def __init__(self, layers, tensors): 25 | super(Stage0, self).__init__() 26 | assert 'Dummy/Linear[l0]' in layers 27 | self.l = layers['Dummy/Linear[l0]'] 28 | assert isinstance(self.l, nn.Linear) 29 | 30 | def forward(self, x): 31 | return (self.l(x),) 32 | 33 | 34 | class Stage1(nn.Module): 35 | def __init__(self, layers, tensors): 36 | super(Stage1, self).__init__() 37 | assert 'Dummy/Linear[l1]' in layers 38 | self.l = layers['Dummy/Linear[l1]'] 39 | assert isinstance(self.l, nn.Linear) 40 | 41 | def forward(self, x): 42 | return (self.l(x),) 43 | 44 | 45 | class Stage2(nn.Module): 46 | def 
__init__(self, layers, tensors): 47 | super(Stage2, self).__init__() 48 | assert 'Dummy/Linear[l2]' in layers 49 | self.l = layers['Dummy/Linear[l2]'] 50 | assert isinstance(self.l, nn.Linear) 51 | 52 | def forward(self, x): 53 | return (self.l(x),) 54 | 55 | 56 | class Stage3(nn.Module): 57 | def __init__(self, layers, tensors): 58 | super(Stage3, self).__init__() 59 | assert 'Dummy/Linear[l3]' in layers 60 | self.l = layers['Dummy/Linear[l3]'] 61 | assert isinstance(self.l, nn.Linear) 62 | 63 | def forward(self, x): 64 | x = self.l(x) 65 | return (x, x + 1) 66 | -------------------------------------------------------------------------------- /models/normal/vision_models/LeNet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | __all__ = ["LeNet"] 4 | 5 | 6 | class LeNet(nn.Module): 7 | def __init__(self, num_classes=1000): 8 | super(LeNet, self).__init__() 9 | self.conv1 = nn.Conv2d(3, 6, kernel_size=5) 10 | self.conv2 = nn.Conv2d(6, 16, kernel_size=5) 11 | self.fc1 = nn.Linear(16*5*5, 120) 12 | self.fc2 = nn.Linear(120, 84) 13 | self.fc3 = nn.Linear(84, num_classes) 14 | self.relu1 = nn.ReLU() 15 | self.relu2 = nn.ReLU() 16 | self.relu3 = nn.ReLU() 17 | self.relu4 = nn.ReLU() 18 | self.max_pool2d1 = nn.MaxPool2d(2) 19 | self.max_pool2d2 = nn.MaxPool2d(2) 20 | 21 | def forward(self, x): 22 | x = self.relu1(self.conv1(x)) 23 | x = self.max_pool2d1(x) 24 | x = self.relu2(self.conv2(x)) 25 | x = self.max_pool2d2(x) 26 | x = x.view(x.size(0), -1) 27 | x = self.relu3(self.fc1(x)) 28 | x = self.relu4(self.fc2(x)) 29 | x = self.fc3(x) 30 | return x 31 | -------------------------------------------------------------------------------- /models/normal/vision_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .AlexNet import alexnet 2 | from .VGG import vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19, vgg19_bn 3 | from .ResNet import resnet101, resnet152, resnet18, resnet34, resnet50, resnext101_32x8d, resnext50_32x4d 4 | from .LeNet import LeNet 5 | from .DenseNet import densenet121, densenet161, densenet169, densenet201 6 | from .GoogleNet import GoogLeNet 7 | from .WideResNet import WideResNet 8 | from .Inception import inception_v3 9 | from .SqueezeNet import squeezenet1_0, squeezenet1_1 10 | from .amoebaNet import amoebanetd 11 | from .UNet import UNet 12 | -------------------------------------------------------------------------------- /models/partitioned/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/models/partitioned/__init__.py -------------------------------------------------------------------------------- /pipe/README.md: -------------------------------------------------------------------------------- 1 | # FTPipe 2 | 3 | Pipeline Runtime 4 | 5 | ```bash 6 | python -m pipe.main ... # train models (+eval) (+preprocess) 7 | ``` 8 | 9 | Do use `--help` and the examples to explore. 10 | 11 | ## Get the data 12 | ```bash 13 | python pipe/data/download/download_datasets.py 14 | ``` 15 | Data for T5 tasks is obtained by using the `--mode preproc` 16 | cmd option. 17 | ## Run 18 | 19 | ### Choose a config 20 | See [configs](configs/) for config examples.
21 | 22 | To choose a specific config, add it to the command line: 23 | 24 | ```bash 25 | mpirun -np 2 python -m pipe.main --config $PATH_TO_CONFIG 26 | ``` 27 | Without it, it will run the [dummy config](configs/dummy.json) (created for dev usage). 28 | 29 | ### Preprocess 30 | If data preprocessing is needed, run the selected config with: 31 | ```bash 32 | python -m pipe.main --mode preproc --config $PATH_TO_CONFIG ... 33 | ``` 34 | 35 | ### MPI 36 | 37 | CUDA-aware OpenMPI: 38 | 39 | ```bash 40 | mpirun -np 2 python -m pipe.main --config $PATH_TO_CONFIG 41 | ``` 42 | 43 | ### Multiprocessing 44 | A PoC runtime which can be used with the `--mode mp` cmd option. 45 | 46 | It is supposed to work only for very simple straight pipelines (mostly torchvision models and ViT), and is BUGGY when the configuration gets more exotic (e.g., tied weights). 47 | ```bash 48 | python -m pipe.main --nprocs 2 --mode mp --config $PATH_TO_CONFIG 49 | ``` 50 | -------------------------------------------------------------------------------- /pipe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/__init__.py -------------------------------------------------------------------------------- /pipe/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/configs/__init__.py -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_base_uncased_2p/hetprofiling/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_base_uncased_2p/pipedream/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wmm/aggmsnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 | "type": "aggmsnag", 6 | "args": { 7 | "pred_mem": "clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "weight_stashing": false 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wmm/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "train_batches_limit": -1, 7 | "bs_train": 3, 8 | "bs_test": 3 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wmm/msnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 |
"type": "msnag", 6 | "args": { 7 | "pred_mem": "clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "weight_stashing": false 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wmm/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wmm/ws_msnag_ga.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 | "type": "msnag", 6 | "args": { 7 | "pred_mem": "clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "gap_aware": { 13 | "type": "adam", 14 | "policy": "all_except_last", 15 | "args": { 16 | } 17 | }, 18 | "weight_stashing": true 19 | } 20 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_2m/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_2p/aggmsnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ftpipe.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 | "type": "aggmsnag", 6 | "args": { 7 | "pred_mem": "clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "weight_stashing": false 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_2p/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 12, 6 | "train_batches_limit": -1, 7 | "bs_train": 2, 8 | "bs_test": 1 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_2p/pipedream.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": true 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_2p/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ftpipe.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_4p/aggmsnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ftpipe.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 | "type": "aggmsnag", 6 | "args": { 7 | "pred_mem": 
"clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "weight_stashing": false 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_4p/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 12, 6 | "train_batches_limit": -1, 7 | "bs_train": 2, 8 | "bs_test": 1 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_4p/pipedream.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": true 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_4p/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ftpipe.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_8p/aggmsnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ftpipe.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 | "type": "aggmsnag", 6 | "args": { 7 | "pred_mem": "clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "weight_stashing": false 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_8p/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 12, 6 | "train_batches_limit": -1, 7 | "bs_train": 2, 8 | "bs_test": 1 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_8p/pipedream.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": true 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad/bert_large_uncased_wwm_8p/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ftpipe.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } -------------------------------------------------------------------------------- /pipe/configs/bert/squad2/bert_large_uncased_wmm/aggmsnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 | "type": "aggmsnag", 6 | "args": { 7 | "pred_mem": "clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "weight_stashing": false 13 | } 14 | -------------------------------------------------------------------------------- 
/pipe/configs/bert/squad2/bert_large_uncased_wmm/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "train_batches_limit": -1, 7 | "bs_train": 3, 8 | "bs_test": 3 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad2/bert_large_uncased_wmm/msnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 | "type": "msnag", 6 | "args": { 7 | "pred_mem": "clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "weight_stashing": false 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad2/bert_large_uncased_wmm/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad2/bert_large_uncased_wmm/ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": true 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/bert/squad2/bert_large_uncased_wmm/ws_msnag_ga.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_prediction": { 5 | "type": "msnag", 6 | "args": { 7 | "pred_mem": "clone", 8 | "nag_with_predictor": false, 9 | "sched_aware": true 10 | } 11 | }, 12 | "gap_aware": { 13 | "type": "adam", 14 | "policy": "all_except_last", 15 | "args": { 16 | } 17 | }, 18 | "weight_stashing": true 19 | } 20 | -------------------------------------------------------------------------------- /pipe/configs/cep/common.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/cep/", 3 | "data_dir": "/home_local/saareliad/data", 4 | "out_dir": "results/cep", 5 | "auto_file_name": true, 6 | "out_filename": "cep", 7 | "distributed_backend": "mpi", 8 | "model": "cep_netN50_C20000_4p_bw12_metis", 9 | "stage_to_device_map": [1, 0, 1, 3, 2], 10 | "nprocs": 5, 11 | "shuffle_train": false, 12 | "cep_dataset_kwargs": { 13 | "n": 50, 14 | "k": 11, 15 | "max_samples_num": 5120000 16 | }, 17 | "epochs": 240, 18 | "steps": -1, 19 | "dataset": "cep", 20 | "trainer": { 21 | "type": "cep", 22 | "args": { 23 | } 24 | }, 25 | "statistics": "cv", 26 | "step_every": 16, 27 | "bs_train": 32, 28 | "bs_test": 32, 29 | "num_data_workers": 5, 30 | "optimizer": { 31 | "type": "adamw", 32 | "args": { 33 | "lr": 0.001, 34 | "weight_decay": 1e-2 35 | } 36 | }, 37 | "lr_scheduler": { 38 | "type": "get_constant_schedule_with_warmup", 39 | "preproc_args": { 40 | }, 41 | "args": { 42 | "num_warmup_steps": 0, 43 | "last_epoch": -1 44 | } 45 | }, 46 | "seed_from_cmd": false, 47 | "seed": 42, 48 | "bs_train_from_cmd": false, 49 | "bs_test_from_cmd": false, 50 | "num_chunks": 1, 51 | "verbose_comm": false, 52 | "flush_rate": -1, 53 | 
"work_scheduler": "gpipe", 54 | "cudnn_benchmark": true, 55 | "max_buffers": 1, 56 | "keep_buffers_alive": false, 57 | "train_batches_limit": -1, 58 | "log_frequency": 200, 59 | "dont_drop_last": true, 60 | "test_batches_limit": 0, 61 | "save_checkpoints": true, 62 | "checkpoints_save_name_prefix": "cep", 63 | "checkpoints_save_dir": "/nfs_Disk2/cep/smaller/", 64 | "load_model_one_by_one": false, 65 | "save_checkpoint_every_x_steps": 300000 66 | } 67 | -------------------------------------------------------------------------------- /pipe/configs/cv/cifar10/common.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/cv/", 3 | "data_dir": "/home_local/saareliad/data", 4 | "out_dir":"results/cv/cifar10/2p/", 5 | "auto_file_name": true, 6 | "out_filename": "cv", 7 | "statistics": "cv", 8 | "distributed_backend": "mpi", 9 | "model": "wrn_16x4_p2", 10 | "dataset": "cifar10", 11 | "trainer": { 12 | "type": "cv", 13 | "args": { 14 | } 15 | }, 16 | 17 | "optimizer": { 18 | "type": "sgd1", 19 | "args": { 20 | "lr": 0.1, 21 | "weight_decay": 0.0005, 22 | "momentum": 0.9, 23 | "nesterov": false 24 | } 25 | }, 26 | "bs_train": 128, 27 | "bs_test": 200, 28 | "num_data_workers": 6, 29 | "lr_scheduler": { 30 | "type": "get_multi_step_lr_schedule_with_warmup", 31 | "args": { 32 | "num_warmup_steps": 5, 33 | "milestones": [60, 120, 160], 34 | "gamma": 0.2, 35 | "last_epoch": -1 36 | } 37 | }, 38 | "epochs": 205, 39 | "steps": -1, 40 | "seed_from_cmd": true, 41 | "num_chunks": 1, 42 | "verbose_comm": false, 43 | "flush_rate": -1, 44 | "work_scheduler": "1F1B", 45 | "cudnn_benchmark": true, 46 | "max_buffers": 1, 47 | "step_every": 1, 48 | "keep_buffers_alive": true, 49 | "train_batches_limit":-1, 50 | "log_frequency": 100 51 | } 52 | -------------------------------------------------------------------------------- /pipe/configs/cv/cifar10/wrn_16x4_c10_p2/EXAMPLE.md: -------------------------------------------------------------------------------- 1 | # Simplest example 2 | 3 | ## Environment 4 | (Experimental, unchecked) env without MPI. 5 | ```bash 6 | conda env create -f env_utils/env_without_mpi.yml 7 | conda activate nompi 8 | ``` 9 | 10 | (if you see import errors just install the missing packages.) 11 | 12 | ## Data 13 | use 14 | ```bash 15 | python download/datasets/download_datasets.py 16 | ``` 17 | to get several datasets. 
18 | 19 | or execute just the relevant part in Python: 20 | ```python 21 | from torchvision.datasets import CIFAR10 22 | DATA_DIR='/home_local/saareliad/data' # replace with a path of your own 23 | CIFAR10(root=DATA_DIR, download=True, train=True) 24 | CIFAR10(root=DATA_DIR, download=True, train=False) 25 | ``` 26 | 27 | ## Run 28 | (single machine, optimized streams, no MPI build needed) 29 | ```bash 30 | python main.py --mode mp --config configs/cv/cifar10/wrn_16x4_c10_p2/stale_nr.json --seed 42 31 | ``` 32 | 33 | ### Optional: single GPU 34 | 35 | You can run the pipeline on a single GPU by changing 36 | the relevant line in [configs/cv/cifar10/wrn_16x4_c10_p2/stale_nr.json](configs/cv/cifar10/wrn_16x4_c10_p2/stale_nr.json)\ 37 | to 38 | ```json 39 | "stage_to_device_map": [0, 0], 40 | ``` 41 | 42 | ## Model 43 | Simplest PoC partitioning.\ 44 | Auto-generated: [models/partitioned/wrn_16x4_c10_p2.py](models/partitioned/wrn_16x4_c10_p2.py) 45 | The code that handles reading the config is mostly here: 46 | [models/simple_partitioning_config.py](models/simple_partitioning_config.py) -------------------------------------------------------------------------------- /pipe/configs/cv/cifar10/wrn_16x4_c10_p2/stale_nr.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/cv/", 3 | "data_dir": "/home_local/saareliad/data", 4 | "out_dir":"results/cv/cifar10/2p/", 5 | "auto_file_name": true, 6 | "out_filename": "stale_nr", 7 | "statistics": "cv", 8 | "distributed_backend": "mpi", 9 | "model": "wrn_16x4_c10_p2", 10 | "stage_to_device_map": [0, 1], 11 | "nprocs": 2, 12 | "dataset": "cifar10", 13 | "trainer": { 14 | "type": "cv", 15 | "args": { 16 | } 17 | }, 18 | 19 | "optimizer": { 20 | "type": "sgd1", 21 | "args": { 22 | "lr": 0.1, 23 | "weight_decay": 0.0005, 24 | "momentum": 0.9, 25 | "nesterov": true 26 | } 27 | }, 28 | "bs_train": 128, 29 | "bs_test": 200, 30 | "num_data_workers": 6, 31 | "lr_scheduler": { 32 | "type": "get_multi_step_lr_schedule_with_warmup", 33 | "args": { 34 | "num_warmup_steps": 5, 35 | "milestones": [60, 120, 160], 36 | "gamma": 0.2, 37 | "last_epoch": -1 38 | } 39 | }, 40 | "epochs": 205, 41 | "steps": -1, 42 | "seed_from_cmd": true, 43 | "num_chunks": 1, 44 | "verbose_comm": false, 45 | "flush_rate": -1, 46 | "work_scheduler": "1F1B", 47 | "cudnn_benchmark": true, 48 | "max_buffers": 1, 49 | "step_every": 1, 50 | "keep_buffers_alive": true, 51 | "train_batches_limit":-1, 52 | "log_frequency": 100, 53 | "weight_stashing": false, 54 | "no_recomputation": true 55 | } 56 | -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/README.md: -------------------------------------------------------------------------------- 1 | # All configs here: 2 | "configs/cv/cifar100/wrn28x10/" 3 | 4 | ## Recomputation 5 | 6 | configs/cv/cifar100/wrn28x10/msnag.json 7 | configs/cv/cifar100/wrn28x10/stale.json 8 | configs/cv/cifar100/wrn28x10/ws_msnag_ga_jfl.json 9 | 10 | configs/cv/cifar100/wrn28x10/msnag_ws.json 11 | configs/cv/cifar100/wrn28x10/ws_ga.json 12 | configs/cv/cifar100/wrn28x10/ws_msnag_ga.json 13 | configs/cv/cifar100/wrn28x10/ws.json 14 | 15 | 16 | ## No Recomputation 17 | 18 | configs/cv/cifar100/wrn28x10/no_recomputation/msnag_nr.json 19 | configs/cv/cifar100/wrn28x10/no_recomputation/stale_nr.json 20 | 21 | configs/cv/cifar100/wrn28x10/no_recomputation/msnag_ws.json 22 | configs/cv/cifar100/wrn28x10/no_recomputation/ws_ga.json 23 |
configs/cv/cifar100/wrn28x10/no_recomputation/ws_msnag_ga.json 24 | configs/cv/cifar100/wrn28x10/no_recomputation/ws.json 25 | 26 | # Estimated Time 27 | 28 | total_configs_we_want = (3 + 4 + 2) = 9 29 | 30 | seeds = 5 31 | 32 | estimated_time_per_config = 3 hours 33 | 34 | total time: 35 | 36 | 9 configs * 5 seeds = 45 runs; 45 * 3 = 135 hours = 5.625 days 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/common.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/cv/", 3 | "data_dir": "/home_local/saareliad/data", 4 | "out_dir":"results/cv/", 5 | "auto_file_name": true, 6 | "out_filename": "cv", 7 | "statistics": "cv", 8 | "distributed_backend": "mpi", 9 | "model": "wrn_28x10_c100_dr03_gnc32_4p_bw12_pipedream", 10 | "dataset": "cifar100", 11 | "trainer": { 12 | "type": "cv", 13 | "args": { 14 | } 15 | }, 16 | 17 | "optimizer": { 18 | "type": "sgd1", 19 | "args": { 20 | "lr": 0.1, 21 | "weight_decay": 0.0005, 22 | "momentum": 0.9, 23 | "nesterov": false 24 | } 25 | }, 26 | "bs_train": 128, 27 | "bs_test": 200, 28 | "num_data_workers": 10, 29 | "lr_scheduler": { 30 | "type": "get_multi_step_lr_schedule_with_warmup", 31 | "args": { 32 | "num_warmup_steps": 5, 33 | "milestones": [60, 120, 160], 34 | "gamma": 0.2, 35 | "last_epoch": -1 36 | } 37 | }, 38 | "epochs": 205, 39 | "steps": -1, 40 | "seed_from_cmd": true, 41 | "num_chunks": 1, 42 | "verbose_comm": false, 43 | "flush_rate": -1, 44 | "work_scheduler": "1F1B", 45 | "cudnn_benchmark": true, 46 | "max_buffers": 1, 47 | "step_every": 1, 48 | "keep_buffers_alive": true, 49 | "train_batches_limit":-1, 50 | "log_frequency": 100 51 | } 52 | -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/msnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "common.json", 4 | "msnag_optimizer.json" 5 | ], 6 | "base_config_path_is_relative": true, 7 | "weight_stashing": false 8 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/msnag_optimizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "sgd1", 4 | "args": { 5 | "lr": 0.1, 6 | "weight_decay": 0.0005, 7 | "momentum": 0.9, 8 | "nesterov": false 9 | } 10 | }, 11 | "nesterov_set_for_last_partition": true, 12 | "weight_prediction": { 13 | "type": "aggmsnag", 14 | "args": { 15 | "pred_mem": "clone", 16 | "nag_with_predictor": true 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/msnag_ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "common.json", 4 | "msnag_optimizer.json" 5 | ], 6 | "base_config_path_is_relative": true, 7 | "weight_stashing": true 8 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/no_recomputation/msnag_nr.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "../common.json", 4 | "../msnag_optimizer.json", 5 | "norecomp.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "weight_stashing": false 9 | } --------------------------------------------------------------------------------
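The configs above compose through `base_config_path`: it can name a single base or an ordered list (e.g. `["../common.json", "../msnag_optimizer.json", "norecomp.json"]`), resolved relative to the child file when `base_config_path_is_relative` is true, with later bases and finally the child overriding earlier keys. Below is a minimal sketch of that resolution logic, assuming a shallow key-wise merge; the helper is hypothetical and the actual loader in `pipe/` may differ (e.g., it may deep-merge nested sections such as `"optimizer"`):

```python
import json
import os


def load_config(path):
    """Load a JSON config, recursively folding in its base configs.

    Bases listed earlier are applied first; the child file wins on
    conflicting keys. This is a sketch of the layering convention used
    by the configs above, not the project's actual loader.
    """
    with open(path) as f:
        cfg = json.load(f)
    bases = cfg.pop("base_config_path", None)
    relative = cfg.pop("base_config_path_is_relative", False)
    if bases is None:
        return cfg
    if isinstance(bases, str):
        bases = [bases]
    merged = {}
    for base in bases:
        if relative:
            base = os.path.join(os.path.dirname(path), base)
        merged.update(load_config(base))  # bases may themselves have bases
    merged.update(cfg)  # shallow merge: child keys replace base keys wholesale
    return merged


# e.g., msnag_nr.json above layers norecomp.json over the shared optimizer/common files:
# cfg = load_config("pipe/configs/cv/cifar100/wrn28x10/no_recomputation/msnag_nr.json")
```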
/pipe/configs/cv/cifar100/wrn28x10/no_recomputation/msnag_ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "../common.json", 4 | "../msnag_optimizer.json", 5 | "norecomp.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "weight_stashing": true 9 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/no_recomputation/norecomp.json: -------------------------------------------------------------------------------- 1 | { 2 | "out_filename": "cv_norecomp", 3 | "no_recomputation": true 4 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/no_recomputation/stale_nr.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "../common.json", 4 | "../stale_optimizer.json", 5 | "norecomp.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "weight_stashing": false 9 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/no_recomputation/ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "../common.json", 4 | "../stale_optimizer.json", 5 | "norecomp.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "weight_stashing": true 9 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/no_recomputation/ws_ga.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "../common.json", 4 | "../stale_optimizer.json", 5 | "norecomp.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "gap_aware": { 9 | "type": "sgd1", 10 | "policy": "all_except_last", 11 | "args": { 12 | "big_gamma": 0.999, 13 | "epsilon": 1e-8 14 | } 15 | }, 16 | "weight_stashing": true 17 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/no_recomputation/ws_msnag_ga.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "base_config_path": [ 4 | "../common.json", 5 | "../msnag_optimizer.json", 6 | "norecomp.json" 7 | ], 8 | "base_config_path_is_relative": true, 9 | "gap_aware": { 10 | "type": "sgd1", 11 | "policy": "all_except_last", 12 | "args": { 13 | "big_gamma": 0.999, 14 | "epsilon": 1e-8 15 | } 16 | }, 17 | "weight_stashing": true 18 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "common.json", 4 | "stale_optimizer.json" 5 | ], 6 | "base_config_path_is_relative": true, 7 | "weight_stashing": false, 8 | "train_batches_limit":-1 9 | 10 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/stale_optimizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "type": "sgd1", 4 | "args": { 5 | "lr": 0.1, 6 | "weight_decay": 0.0005, 7 | "momentum": 0.9, 8 | "nesterov": true 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/ws.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "common.json", 4 | "stale_optimizer.json" 5 | ], 6 | "base_config_path_is_relative": true, 7 | "weight_stashing": true 8 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/ws_ga.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "common.json", 4 | "stale_optimizer.json" 5 | ], 6 | "base_config_path_is_relative": true, 7 | "gap_aware": { 8 | "type": "sgd1", 9 | "policy": "all_except_last", 10 | "args": { 11 | "big_gamma": 0.999, 12 | "epsilon": 1e-8 13 | } 14 | }, 15 | "weight_stashing": true 16 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/ws_msnag_ga.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "base_config_path": [ 4 | "common.json", 5 | "msnag_optimizer.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "gap_aware": { 9 | "type": "sgd1", 10 | "policy": "all_except_last", 11 | "args": { 12 | "big_gamma": 0.999, 13 | "epsilon": 1e-8 14 | } 15 | }, 16 | "weight_stashing": true 17 | } -------------------------------------------------------------------------------- /pipe/configs/cv/cifar100/wrn28x10/ws_msnag_ga_jfl.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "common.json", 4 | "msnag_optimizer.json" 5 | ], 6 | "base_config_path_is_relative": true, 7 | "gap_aware": { 8 | "type": "sgd1", 9 | "policy": "all_except_last", 10 | "args": { 11 | "big_gamma": 0.999, 12 | "epsilon": 1e-8 13 | } 14 | }, 15 | "weight_stashing": true, 16 | "gap_aware_just_loss": true, 17 | "no_recomputation": false 18 | } -------------------------------------------------------------------------------- /pipe/configs/cv/imagenet/weight_stashing_msnag_gap_aware.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/", 3 | "out_dir": "results/imagenet", 4 | "data_dir": "/home_local/saareliad/data/imagenet/", 5 | "auto_file_name": true, 6 | "out_filename": "e1", 7 | "distributed_backend": "mpi", 8 | "statistics": "cv", 9 | "model": "resnet50_imagenet_p8", 10 | "dataset": "imagenet", 11 | "trainer": { 12 | "type": "cv", 13 | "args": { 14 | } 15 | }, 16 | "bs_train": 128, 17 | "bs_test": 128, 18 | "num_data_workers": 10, 19 | "optimizer": { 20 | "type": "sgd1", 21 | "args": { 22 | "lr": 0.1, 23 | "weight_decay": 0.0001, 24 | "momentum": 0.9, 25 | "nesterov": false 26 | } 27 | }, 28 | "lr_scheduler": { 29 | "type": "get_multi_step_lr_schedule_with_warmup", 30 | "args": { 31 | "num_warmup_steps": 5, 32 | "milestones": [30, 60, 90], 33 | "gamma": 0.1, 34 | "last_epoch": -1 35 | } 36 | }, 37 | "weight_prediction": { 38 | "type": "msnag", 39 | "args": { 40 | "pred_mem": "clone", 41 | "nag_with_predictor": true 42 | } 43 | }, 44 | "gap_aware": { 45 | "type": "sgd1", 46 | "policy": "all_except_last", 47 | "args": { 48 | "big_gamma": 0.999, 49 | "epsilon": 1e-8 50 | } 51 | }, 52 | "epochs": 100, 53 | "steps": -1, 54 | "seed": 42, 55 | "num_chunks": 1, 56 | "verbose_comm": false, 57 | "flush_rate": -1, 58 | "weight_stashing": true, 59 | "work_scheduler": "1F1B", 60 | "seed_from_cmd": true, 61 | "nesterov_set_for_last_partition": true, 62 | "no_recomputation": false, 63 | "keep_buffers_alive": true, 64 | 
"max_buffers": 1, 65 | "cudnn_benchmark": true 66 | } 67 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/common.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/debug/", 3 | "data_dir": "/home_local/saareliad/data", 4 | "out_dir": "results/lm/gpt2/tied/", 5 | "auto_file_name": true, 6 | "out_filename": "tied", 7 | "distributed_backend": "mpi", 8 | "model": "gpt2_p4_lm_tied", 9 | "stage_to_device_map": [0, 1, 2, 3, 0], 10 | "model_name_or_path": "gpt2", 11 | "dataset": "wt2", 12 | "statistics": "lm_loss_per_batch", 13 | "trainer": { 14 | "type": "lm", 15 | "args": { 16 | } 17 | }, 18 | "bs_train": 4, 19 | "bs_test": 4, 20 | "train_seq_len": 1024, 21 | "valid_seq_len": 1024, 22 | "test_seq_len": 1024, 23 | "num_data_workers": 10, 24 | "optimizer": { 25 | "type": "adamw", 26 | "args": { 27 | "lr": 5e-5, 28 | "weight_decay": 0, 29 | "eps": 1e-8 30 | } 31 | }, 32 | "lr_scheduler": { 33 | "type": "get_linear_schedule_with_warmup", 34 | "preproc_args": { 35 | "num_training_steps": "epochs_to_steps", 36 | "num_warmup_steps": "epochs_to_steps" 37 | }, 38 | "args": { 39 | "num_warmup_steps": 0, 40 | "num_training_steps": -1, 41 | "last_epoch": -1 42 | } 43 | }, 44 | "epochs": 3, 45 | "steps": -1, 46 | "seed_from_cmd": true, 47 | "num_chunks": 1, 48 | "verbose_comm": false, 49 | "flush_rate": -1, 50 | "work_scheduler": "1F1B", 51 | "cudnn_benchmark": true, 52 | "max_buffers": 1, 53 | "step_every": 1, 54 | "train_batches_limit": -1, 55 | "log_frequency": 20, 56 | "overwrite_cache": true, 57 | "keep_buffers_alive": false, 58 | "dont_drop_last": true, 59 | "stateless_tied": true, 60 | "nprocs": 5 61 | } 62 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 4, 6 | "overwrite_cache": false, 7 | "train_batches_limit": -1, 8 | "bs_train": 1, 9 | "bs_test": 4, 10 | "epochs": 3 11 | } 12 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/msnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "weight_stashing": false 14 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/msnag_ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "weight_stashing": true 14 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/seq.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | 
"base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": false, 6 | "work_scheduler": "SEQ", 7 | "no_recomputation": true, 8 | "out_filename": "seq_tied" 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": false 6 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": true 6 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/ws_ga.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "gap_aware": { 6 | "type": "adamw", 7 | "policy": "all_except_last", 8 | "args": { 9 | } 10 | }, 11 | "weight_stashing": true 12 | } 13 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/ws_msnag_ga.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "gap_aware": { 14 | "type": "adamw", 15 | "policy": "all_except_last", 16 | "args": { 17 | } 18 | }, 19 | "weight_stashing": true 20 | } 21 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2/tied/ws_msnag_ga_jfl.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ws_msnag_ga.json", 3 | "base_config_path_is_relative": true, 4 | "gap_aware_just_loss": true 5 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/common.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/debug/", 3 | "data_dir": "/home_local/saareliad/data", 4 | "out_dir": "results/lm/gpt2xl/tied/", 5 | "statistics": "lm_loss_per_batch", 6 | "auto_file_name": true, 7 | "out_filename": "tied_wd_wa", 8 | "distributed_backend": "mpi", 9 | "model": "gpt2_xl_p8_lm_tied", 10 | "model_name_or_path": "gpt2-xl", 11 | "dataset": "wt2", 12 | "trainer": { 13 | "type": "lm", 14 | "args": { 15 | } 16 | }, 17 | "bs_train": 1, 18 | "bs_test": 1, 19 | "train_seq_len": 1024, 20 | "valid_seq_len": 1024, 21 | "test_seq_len": 1024, 22 | "num_data_workers": 10, 23 | "optimizer": { 24 | "type": "adamw", 25 | "args": { 26 | "lr": 5e-5, 27 | "weight_decay": 0.01, 28 | "eps": 1e-8 29 | } 30 | }, 31 | "lr_scheduler": { 32 | "type": "get_linear_schedule_with_warmup", 33 | "preproc_args": { 34 | "num_training_steps": "epochs_to_steps" 35 | }, 36 | "args": { 37 | "num_warmup_steps": 200, 38 | "num_training_steps": -1, 39 | "last_epoch": 
-1 40 | } 41 | }, 42 | "epochs": 2, 43 | "steps": -1, 44 | "seed_from_cmd": true, 45 | "num_chunks": 1, 46 | "verbose_comm": false, 47 | "flush_rate": -1, 48 | "work_scheduler": "1F1B", 49 | "cudnn_benchmark": true, 50 | "max_buffers": 1, 51 | "step_every": 1, 52 | "train_batches_limit": -1, 53 | "dont_drop_last": true, 54 | "keep_buffers_alive": false, 55 | "log_frequency": 80, 56 | "stateless_tied": true, 57 | "stage_to_device_map": [0, 1, 2, 3, 4, 5, 6, 7, 0], 58 | "overwrite_cache": false 59 | } 60 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "overwrite_cache": true, 7 | "train_batches_limit": -1, 8 | "bs_train": 1, 9 | "bs_test": 4, 10 | "epochs": 3 11 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/msnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "weight_stashing": false 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/msnag_ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "weight_stashing": true 14 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/seq.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": false, 6 | "work_scheduler": "SEQ", 7 | "no_recomputation": true, 8 | "out_filename": "seq_wd", 9 | "dont_drop_last": true, 10 | "overwrite_cache": true 11 | } 12 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": false 6 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": true 6 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/ws_msnag_ga.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 
| "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "gap_aware": { 14 | "type": "adamw", 15 | "policy": "all_except_last", 16 | "args": { 17 | } 18 | }, 19 | "weight_stashing": true 20 | } 21 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/tied/ws_msnag_ga_jfl.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ws_msnag_ga.json", 3 | "base_config_path_is_relative": true, 4 | "gap_aware_just_loss": true 5 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/aggmsnag.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "aggmsnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "weight_stashing": false 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/common.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/debug/", 3 | "data_dir": "/home_local/saareliad/data", 4 | "out_dir": "results/DEBUG1/lm/gpt2xl/untied/wd", 5 | "statistics": "lm_loss_per_batch", 6 | "auto_file_name": true, 7 | "out_filename": "wd", 8 | "distributed_backend": "mpi", 9 | "model": "old_gpt2xl_8p_untied", 10 | "model_name_or_path": "gpt2-xl", 11 | "dataset": "wt2", 12 | "trainer": { 13 | "type": "lm", 14 | "args": { 15 | } 16 | }, 17 | "bs_train": 1, 18 | "bs_test": 1, 19 | "train_seq_len": 1024, 20 | "valid_seq_len": 1024, 21 | "test_seq_len": 1024, 22 | "num_data_workers": 10, 23 | "optimizer": { 24 | "type": "adamw", 25 | "args": { 26 | "lr": 5e-5, 27 | "weight_decay": 0.01, 28 | "eps": 1e-8 29 | } 30 | }, 31 | "lr_scheduler": { 32 | "type": "get_linear_schedule_with_warmup", 33 | "preproc_args": { 34 | "num_training_steps": "epochs_to_steps", 35 | "num_warmup_steps": "epochs_to_steps" 36 | }, 37 | "args": { 38 | "num_warmup_steps": 0, 39 | "num_training_steps": -1, 40 | "last_epoch": -1 41 | } 42 | }, 43 | "epochs": 1, 44 | "steps": -1, 45 | "seed_from_cmd": true, 46 | "num_chunks": 1, 47 | "verbose_comm": false, 48 | "flush_rate": -1, 49 | "work_scheduler": "1F1B", 50 | "cudnn_benchmark": true, 51 | "max_buffers": 1, 52 | "step_every": 1, 53 | "train_batches_limit": -1, 54 | "dont_drop_last": true, 55 | "keep_buffers_alive": false, 56 | "log_frequency": 80, 57 | "overwrite_cache": true 58 | } 59 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "overwrite_cache": true, 7 | "train_batches_limit": -1, 8 | "bs_train": 1, 9 | "bs_test": 4, 10 | "epochs": 1 11 | 12 | } 13 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/msnag.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "weight_stashing": false 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/msnag_ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "weight_stashing": true 14 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/seq.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": false, 6 | "work_scheduler": "SEQ", 7 | "no_recomputation": true, 8 | "out_filename": "seq_wd", 9 | "dont_drop_last": true, 10 | "overwrite_cache": true 11 | } 12 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": false, 6 | "train_batches_limit":-1 7 | } 8 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/ws.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": true 6 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/ws_msnag_ga.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_prediction": { 6 | "type": "msnag", 7 | "args": { 8 | "pred_mem": "clone", 9 | "nag_with_predictor": false, 10 | "sched_aware": true 11 | } 12 | }, 13 | "gap_aware": { 14 | "type": "adamw", 15 | "policy": "all_except_last", 16 | "args": { 17 | } 18 | }, 19 | "weight_stashing": true 20 | } 21 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied/ws_msnag_ga_jfl.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "ws_msnag_ga.json", 3 | "base_config_path_is_relative": true, 4 | "gap_aware_just_loss": true 5 | } -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied_s512/common.json: -------------------------------------------------------------------------------- 1 | { 2 | "logdir": "logs/debug/", 3 | "data_dir": "/home_local/saareliad/data", 4 | "out_dir": 
"results/new_gpt2xl/lm/gpt2xl_b512/untied/wd/wa", 5 | "statistics": "lm_loss_per_batch", 6 | "auto_file_name": true, 7 | "out_filename": "wd", 8 | "distributed_backend": "mpi", 9 | "model": "new_gpt2_xl_tied_lm_p8_seq_512", 10 | "model_name_or_path": "gpt2-xl", 11 | "dataset": "wt2", 12 | "trainer": { 13 | "type": "lm", 14 | "args": { 15 | } 16 | }, 17 | "bs_train": 2, 18 | "bs_test": 2, 19 | "train_seq_len": 512, 20 | "valid_seq_len": 512, 21 | "test_seq_len": 512, 22 | "num_data_workers": 10, 23 | "optimizer": { 24 | "type": "adamw", 25 | "args": { 26 | "lr": 5e-5, 27 | "weight_decay": 0.01, 28 | "eps": 1e-8 29 | } 30 | }, 31 | "lr_scheduler": { 32 | "type": "get_linear_schedule_with_warmup", 33 | "preproc_args": { 34 | "num_training_steps": "epochs_to_steps", 35 | "num_warmup_steps": "ratio_from_num_training_steps" 36 | }, 37 | "args": { 38 | "num_warmup_steps": 0.06, 39 | "num_training_steps": -1, 40 | "last_epoch": -1 41 | } 42 | }, 43 | "epochs": 1, 44 | "steps": -1, 45 | "seed_from_cmd": true, 46 | "num_chunks": 1, 47 | "verbose_comm": false, 48 | "flush_rate": -1, 49 | "work_scheduler": "1F1B", 50 | "cudnn_benchmark": true, 51 | "max_buffers": 1, 52 | "step_every": 8, 53 | "train_batches_limit": -1, 54 | "dont_drop_last": true, 55 | "keep_buffers_alive": false, 56 | "log_frequency": 80, 57 | "overwrite_cache": true 58 | } 59 | -------------------------------------------------------------------------------- /pipe/configs/lm/wt2/gpt2xl/untied_s512/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "statistics": "lm_loss_per_batch", 5 | "weight_stashing": false, 6 | "train_batches_limit":-1 7 | } 8 | -------------------------------------------------------------------------------- /pipe/configs/python_configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/configs/python_configs/__init__.py -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/boolq/gpipe_layer_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_layer_graph.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/mpipe/checkpoints/t5/3b/boolq/gpipe_layer/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/boolq/gpipe_op_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_op_graph.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/mpipe/checkpoints/t5/3b/boolq/gpipe_op/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/boolq/stale_layer_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_layer_graph.json", 3 | 
"base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/mnt/qnap/saareliad/mpipe/checkpoints/t5/3b/boolq/stale_layer/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/boolq/stale_op_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_op_graph.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/mnt/qnap/saareliad/mpipe/checkpoints/t5/3b/boolq/stale_op/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/multirc/gpipe_layer_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_layer_graph.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 1, 7 | "bs_test": 1, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/mpipe/checkpoints/t5/3b/multirc/gpipe_layer/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/multirc/gpipe_op_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_op_graph.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 1, 7 | "bs_test": 1, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/mpipe/checkpoints/t5/3b/multirc/gpipe_op/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/multirc/stale_layer_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_layer_graph.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/mnt/qnap/saareliad/mpipe/checkpoints/t5/3b/multirc/stale_layer/" 8 | } 9 | 10 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/multirc/stale_op_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_op_graph.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/mnt/qnap/saareliad/mpipe/checkpoints/t5/3b/multirc/stale_op/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/rte/gpipe_layer_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_layer_graph.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": 
"/nfs_Disk2/mpipe/checkpoints/t5/3b/rte/gpipe_layer/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/rte/gpipe_op_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_op_graph.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/rte/gpipe_op/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/rte/stale_layer_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_layer_graph.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/rte/stale_layer/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/rte/stale_op_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_op_graph.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/rte/stale_op/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/wic/gpipe_layer_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_layer_graph.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 16, 7 | "bs_test": 16, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/wic/gpipe_layer/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/wic/gpipe_op_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_op_graph.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 16, 7 | "bs_test": 16, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/wic/gpipe_op/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/wic/stale_layer_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common_layer_graph.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/wic/stale_layer/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/mpipe/wic/stale_op_graph.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"base_config_path": "common_op_graph.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/wic/stale_op/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq/boolq/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/no_virtual_stages/checkpoints/t5/3b/boolq/gpipe/", 10 | "save_checkpoints": true, 11 | "epochs": 2, 12 | "steps": -1, 13 | "dont_drop_last": false 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq/boolq/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "dont_drop_last": false, 7 | "checkpoints_save_name_prefix": "stale", 8 | "checkpoints_save_dir": "/mnt/qnap/saareliad/no_virtual_stages/checkpoints/t5/3b/boolq/stale/", 9 | "step_every": 10, 10 | "bs_train": 2 11 | } 12 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq/multirc/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 1, 7 | "bs_test": 1, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/no_virtual_stages/checkpoints/t5/3b/multirc/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 1, 12 | "dont_drop_last": false, 13 | "steps": -1 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq/multirc/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "new_stale_seq", 7 | "checkpoints_save_dir": "/mnt/qnap/saareliad/no_virtual_stages/checkpoints/t5/3b/multirc/stale/", 8 | "dont_drop_last": false, 9 | "step_every": 4, 10 | "bs_train": 2 11 | } 12 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq/rte/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "new_gpipe", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/saare/checkpoints/no_virtual_stages/checkpoints/t5/3b/rte/gpipe/", 10 | "epochs": 6, 11 | "steps": -1, 12 | "save_checkpoints": false 13 | } 14 | -------------------------------------------------------------------------------- 
/pipe/configs/t5/new_t5_exp/seq/rte/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "checkpoints_save_name_prefix": "stale_adafactor", 6 | "checkpoints_save_dir": "/mnt/qnap/saareliad/no_virtual_stages/checkpoints/t5/3b/rte/stale/", 7 | "step_every": 10, 8 | "bs_train": 4, 9 | "bs_test": 4 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq/wic/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 16, 7 | "bs_test": 16, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/no_virtual_stages/checkpoints/t5/3b/wic/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 12, 12 | "steps": -1 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq/wic/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/mnt/qnap/saareliad/no_virtual_stages/checkpoints/t5/3b/wic/stale/", 8 | "bs_train": 32, 9 | "step_every": 4 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq_op_graph/boolq/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/seq_op/checkpoints/t5/3b/boolq/gpipe/", 10 | "save_checkpoints": true, 11 | "epochs": -1, 12 | "steps": 3200, 13 | "dont_drop_last": false 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq_op_graph/boolq/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "dont_drop_last": false, 7 | "checkpoints_save_name_prefix": "stale", 8 | "checkpoints_save_dir": "/mnt/qnap/saareliad/seq_op/checkpoints/t5/3b/boolq/stale/", 9 | "step_every": 5, 10 | "bs_train": 4 11 | } 12 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq_op_graph/multirc/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 1, 7 | "bs_test": 1, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/seq_op/checkpoints/t5/3b/multirc/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 2, 12 | "dont_drop_last": false, 13 | "steps": -1 
14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq_op_graph/multirc/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "new_stale_seq", 7 | "checkpoints_save_dir": "/mnt/qnap/saareliad/seq_op/checkpoints/t5/3b/multirc/stale/", 8 | "dont_drop_last": false, 9 | "step_every": 2, 10 | "bs_train": 4 11 | } 12 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq_op_graph/rte/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "new_gpipe", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/saare/checkpoints/seq_op/checkpoints/t5/3b/rte/gpipe/", 10 | "epochs": 6, 11 | "steps": -1, 12 | "save_checkpoints": false 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq_op_graph/rte/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "checkpoints_save_name_prefix": "stale_adafactor", 6 | "checkpoints_save_dir": "/mnt/qnap/saareliad/seq_op/checkpoints/t5/3b/rte/stale/", 7 | "step_every": 5, 8 | "bs_train": 8, 9 | "bs_test": 8 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq_op_graph/wic/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 16, 6 | "bs_train": 8, 7 | "bs_test": 8, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/mnt/qnap/saareliad/seq_op/checkpoints/t5/3b/wic/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 12, 12 | "steps": -1 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/t5/new_t5_exp/seq_op_graph/wic/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/mnt/qnap/saareliad/seq_op/checkpoints/t5/3b/wic/stale/", 8 | "bs_train": 32, 9 | "step_every": 4 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/boolq/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/no_virtual_stages/checkpoints/t5/3b/boolq/gpipe/", 10 | "save_checkpoints": 
false, 11 | "epochs": 2, 12 | "steps": -1 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/boolq/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/nfs_Disk2/no_virtual_stages/checkpoints/t5/3b/boolq/gpipe/", 10 | "model": "t5_3b_tied_lmheads_512_4_8p_bw12_squad1_pipedream", 11 | "save_checkpoints": true, 12 | "epochs": -1, 13 | "steps": 3200, 14 | "dont_drop_last": false 15 | } 16 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/boolq/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "model": "t5_3b_tied_lmheads_512_4_8p_bw12_squad1_pipedream", 7 | "dont_drop_last": true, 8 | "step_every": 5, 9 | "bs_train": 4 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/boolq/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/cola/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 32, 7 | "bs_test": 32, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/saare/checkpoints/t5/3b/cola/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/cola/seq.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "SEQ", 6 | "no_recomputation": true, 7 | "out_filename": "seq" 8 | } 9 | 10 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/cola/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/multirc/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 1, 7 | "bs_test": 1, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/no_virtual_stages/checkpoints/t5/3b/multirc/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 2, 12 | "steps": -1 13 | } 14 | 
-------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/multirc/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 1, 7 | "bs_test": 1, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/nfs_Disk2/no_virtual_stages/checkpoints/t5/3b/multirc/gpipe/", 10 | "model": "t5_3b_tied_lmheads_512_4_8p_bw12_squad1_pipedream", 11 | "save_checkpoints": false, 12 | "epochs": 2, 13 | "steps": -1 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/multirc/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "new_stale_seq", 7 | "model": "t5_3b_tied_lmheads_512_4_8p_bw12_squad1_pipedream", 8 | "dont_drop_last": true, 9 | "step_every": 2, 10 | "bs_train": 4 11 | } 12 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/multirc/seq.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "SEQ", 6 | "no_recomputation": true, 7 | "bs_train": 1 8 | } 9 | 10 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/multirc/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/saare/checkpoints/t5/3b/rte/seq/", 10 | "save_checkpoints": false, 11 | "epochs": 12, 12 | "steps": -1 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "new_gpipe", 9 | "checkpoints_save_dir": "/nfs_Disk2/saare/checkpoints/t5/3b/rte/seq/", 10 | "model": "t5_3b_tied_lmheads_320_8_8p_bw12_squad1_pipedream", 11 | "epochs": 6, 12 | "steps": -1, 13 | "save_checkpoints": false 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 
| "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "model": "t5_3b_tied_lmheads_320_8_8p_bw12_squad1_pipedream", 6 | "step_every": 4, 7 | "bs_train": 10, 8 | "save_checkpoint_every_x_steps": 500 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte/seq.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "SEQ", 6 | "no_recomputation": true, 7 | "out_filename": "seq" 8 | } 9 | 10 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte_super_glue/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/saare/checkpoints/t5/3b/super_glue_rte/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte_super_glue/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "new_gpipe", 9 | "checkpoints_save_dir": "/nfs_Disk2/saare/checkpoints/t5/3b/super_glue_rte/", 10 | "model": "t5_3b_tied_lmheads_320_8_8p_bw12_squad1_pipedream", 11 | "epochs": 6, 12 | "steps": -1, 13 | "save_checkpoints": false 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte_super_glue/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "model": "t5_3b_tied_lmheads_320_8_8p_bw12_squad1_pipedream", 6 | "step_every": 4, 7 | "bs_train": 10, 8 | "save_checkpoint_every_x_steps": 500 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte_super_glue/seq.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "SEQ", 6 | "no_recomputation": true, 7 | "out_filename": "seq" 8 | } 9 | 10 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/rte_super_glue/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | 
-------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/wic/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 16, 7 | "bs_test": 16, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/no_virtual_stages/checkpoints/t5/3b/wic/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 12, 12 | "steps": -1 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/wic/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 16, 7 | "bs_test": 16, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/nfs_Disk2/no_virtual_stages/checkpoints/t5/3b/wic/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 12, 12 | "steps": -1, 13 | "model": "t5_3b_tied_lmheads_64_4_8p_bw12_squad1_pipedream" 14 | } 15 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/wic/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "model": "t5_3b_tied_lmheads_64_4_8p_bw12_squad1_pipedream", 7 | "bs_train": 64, 8 | "step_every": 2 9 | } 10 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/seq/wic/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/boolq/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/virtual_stages/checkpoints/t5/3b/boolq/gpipe/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/boolq/gpipe_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/virtual_stages/checkpoints/t5/3b/boolq/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 2, 12 | "steps": -1 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/boolq/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": 
"common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/boolq/vs_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "virtual_stages_1f1b", 6 | "supremum_staleness": 13 7 | } 8 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/multirc/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 1, 7 | "bs_test": 1, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/virtual_stages/checkpoints/t5/3b/boolq/gpipe/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/multirc/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/rte/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/virtual_stages/checkpoints/t5/3b/rte/gpipe/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/rte/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/superglue_rte/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/virtual_stages/checkpoints/t5/3b/gpipe/superglue_rte/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/superglue_rte/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/wic/gpipe.json: -------------------------------------------------------------------------------- 1 | { 
2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 16, 7 | "bs_test": 16, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/virtual_stages/checkpoints/t5/3b/wic/gpipe/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_3b_p8/virtual_stages/wic/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_base/seq/boolq/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/no_virtual_stages/checkpoints/t5/3b/boolq/gpipe/", 10 | "save_checkpoints": false, 11 | "epochs": 2, 12 | "steps": -1 13 | } 14 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_base/seq/boolq/gpipe_new.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_new", 9 | "checkpoints_save_dir": "/nfs_Disk2/no_virtual_stages/checkpoints/t5/base/boolq/gpipe/", 10 | "model": "t5_base_tied_lmheads_512_4_8p_bw12_squad1_pipedream", 11 | "save_checkpoints": true, 12 | "epochs": -1, 13 | "steps": 3200, 14 | "dont_drop_last": true 15 | } 16 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_base/seq/boolq/pipedream_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "model": "t5_base_tied_lmheads_512_4_8p_bw12_squad1_pipedream", 7 | "dont_drop_last": true, 8 | "step_every": 5, 9 | "bs_train": 4 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_base/seq/boolq/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/L=32/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/boolq/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 
| "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 2, 7 | "bs_test": 2, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/boolq/gpipe/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/boolq/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/boolq/stale/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/multirc/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 1, 7 | "bs_test": 1, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/boolq/gpipe/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/multirc/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/multirc/stale/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/rte/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 10, 6 | "bs_train": 4, 7 | "bs_test": 4, 8 | "checkpoints_save_name_prefix": "gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/rte/gpipe/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/rte/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/rte/stale/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b" 6 | } 7 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/wic/gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "work_scheduler": "GPIPE", 5 | "step_every": 8, 6 | "bs_train": 16, 7 | "bs_test": 16, 8 | "checkpoints_save_name_prefix": 
"gpipe_adafactor", 9 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/wic/gpipe/" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_mpipe/wic/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false, 5 | "work_scheduler": "1f1b", 6 | "checkpoints_save_name_prefix": "stale_adafactor", 7 | "checkpoints_save_dir": "/nfs_Disk2/mpipe/checkpoints/t5/3b/wic/stale/" 8 | } 9 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_small/README.md: -------------------------------------------------------------------------------- 1 | # t5-small, TODO: 2 | this whole thing was used as internal test and has deprecated use of 3 | a t5_squad dataset (adapted from huggingface example, which I found inaccurate) -------------------------------------------------------------------------------- /pipe/configs/t5/t5_small/adafactor/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_small/rte/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/t5/t5_small/stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": "common.json", 3 | "base_config_path_is_relative": true, 4 | "weight_stashing": false 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/vit/cifar100_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "/home_local/saareliad/data", 3 | "dataset": "cifar100_384", 4 | "epochs": -1 5 | } 6 | -------------------------------------------------------------------------------- /pipe/configs/vit/cifar10_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "/home_local/saareliad/data", 3 | "dataset": "cifar10_384", 4 | "epochs": -1, 5 | "steps": 10000 6 | } -------------------------------------------------------------------------------- /pipe/configs/vit/cv.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "type": "cv_per_step_lr_scheduler", 4 | "args": { 5 | } 6 | }, 7 | "statistics": "cv" 8 | } -------------------------------------------------------------------------------- /pipe/configs/vit/cv_dcgn_global.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "type": "cv_per_step_lr_scheduler_global_grad_norm", 4 | "args": { 5 | "always_calc_grad_norm": false, 6 | "max_grad_norm": 1.0 7 | } 8 | }, 9 | "statistics": "cv_grad_norm" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/vit/cv_dcgn_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | 
"type": "cv_per_step_lr_scheduler_local_grad_norm", 4 | "args": { 5 | "always_calc_grad_norm": false, 6 | "max_grad_norm": 1.0 7 | } 8 | }, 9 | "statistics": "cv_grad_norm" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/vit/cv_dcgn_local_prop.json: -------------------------------------------------------------------------------- 1 | { 2 | "trainer": { 3 | "type": "cv_per_step_lr_scheduler_local_grad_norm_prop", 4 | "args": { 5 | "always_calc_grad_norm": false, 6 | "max_grad_norm": 1.0 7 | } 8 | }, 9 | "statistics": "cv_grad_norm" 10 | } 11 | -------------------------------------------------------------------------------- /pipe/configs/vit/imagenet_384.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "/home_local/saareliad/data/imagenet/", 3 | "dataset": "imagenet_384", 4 | "epochs": -1, 5 | "steps": 20000 6 | } -------------------------------------------------------------------------------- /pipe/configs/vit/tst_gpipe.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "cv.json", 4 | "imagenet_384.json", 5 | "vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "logdir": "logs/vit/imagenet/", 9 | "out_dir": "results/vit/imagenet/", 10 | "auto_file_name": true, 11 | "out_filename": "tst_vit", 12 | "distributed_backend": "mpi", 13 | "bs_train": 32, 14 | "bs_test": 32, 15 | "num_data_workers": 10, 16 | "dont_drop_last": true, 17 | "step_every": 16, 18 | "log_frequency": 2000, 19 | "optimizer": { 20 | "type": "sgd1", 21 | "args": { 22 | "lr": 0.03, 23 | "weight_decay": 0, 24 | "momentum": 0.9, 25 | "nesterov": true 26 | } 27 | }, 28 | "lr_scheduler": { 29 | "type": "get_cosine_schedule_with_warmup", 30 | "preproc_args": { 31 | }, 32 | "args": { 33 | "num_warmup_steps": 500, 34 | "num_training_steps": 20000, 35 | "num_cycles": 0.5, 36 | "last_epoch": -1 37 | } 38 | }, 39 | "weight_stashing": false, 40 | "work_scheduler": "gpipe", 41 | "cudnn_benchmark": true, 42 | "seed_from_cmd": false, 43 | "seed": 42 44 | } -------------------------------------------------------------------------------- /pipe/configs/vit/tst_gpipe_adafactor_cifar100.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "cv.json", 4 | "cifar100_384.json", 5 | "vit_base_patch16_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "logdir": "logs/vit/cifar100/adafactor/", 9 | "out_dir": "results/vit/cifar100/", 10 | "auto_file_name": true, 11 | "out_filename": "fast_adafactor", 12 | "distributed_backend": "mpi", 13 | "bs_train": 32, 14 | "bs_test": 32, 15 | "num_data_workers": 10, 16 | "dont_drop_last": false, 17 | "step_every": 16, 18 | "log_frequency": 100000, 19 | "optimizer": { 20 | "type": "adafactor", 21 | "args": { 22 | "lr": 0.03, 23 | "weight_decay": 0, 24 | "beta1": 0.9, 25 | "scale_parameter": true, 26 | "relative_step": false 27 | } 28 | }, 29 | "steps": 1000, 30 | "lr_scheduler": { 31 | "type": "get_cosine_schedule_with_warmup", 32 | "preproc_args": { 33 | }, 34 | "args": { 35 | "num_warmup_steps": 100, 36 | "num_training_steps": 1000, 37 | "num_cycles": 0.5, 38 | "last_epoch": -1 39 | } 40 | }, 41 | "weight_stashing": false, 42 | "work_scheduler": "gpipe", 43 | "cudnn_benchmark": true, 44 | "seed_from_cmd": false, 45 | "seed": 42 46 | 
} 47 | -------------------------------------------------------------------------------- /pipe/configs/vit/tst_gpipe_cifar100.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "cv.json", 4 | "cifar100_384.json", 5 | "vit_base_patch16_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "logdir": "logs/vit/cifar100/no_grad_norm/", 9 | "out_dir": "results/vit/cifar100/", 10 | "auto_file_name": true, 11 | "out_filename": "no_grad_norm_no_nesterov_meanstd05", 12 | "distributed_backend": "mpi", 13 | "bs_train": 32, 14 | "bs_test": 32, 15 | "num_data_workers": 10, 16 | "dont_drop_last": false, 17 | "step_every": 16, 18 | "log_frequency": 100000, 19 | "optimizer": { 20 | "type": "sgd2", 21 | "args": { 22 | "lr": 0.03, 23 | "weight_decay": 0, 24 | "momentum": 0.9, 25 | "nesterov": false 26 | } 27 | }, 28 | "steps": 1000, 29 | "lr_scheduler": { 30 | "type": "get_cosine_schedule_with_warmup", 31 | "preproc_args": { 32 | }, 33 | "args": { 34 | "num_warmup_steps": 100, 35 | "num_training_steps": 1000, 36 | "num_cycles": 0.5, 37 | "last_epoch": -1 38 | } 39 | }, 40 | "weight_stashing": false, 41 | "work_scheduler": "gpipe", 42 | "cudnn_benchmark": true, 43 | "seed_from_cmd": false, 44 | "seed": 42 45 | } 46 | -------------------------------------------------------------------------------- /pipe/configs/vit/tst_gpipe_dcgn_global.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "cv_dcgn_global.json", 4 | "imagenet_384.json", 5 | "vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "logdir": "logs/vit/imagenet/global/", 9 | "out_dir": "results/vit/imagenet/", 10 | "auto_file_name": true, 11 | "out_filename": "tst_vit_dcgn_global_no_nesterov_meanstd05", 12 | "distributed_backend": "mpi", 13 | "bs_train": 32, 14 | "bs_test": 128, 15 | "num_data_workers": 10, 16 | "dont_drop_last": false, 17 | "step_every": 16, 18 | "log_frequency": 200000, 19 | "optimizer": { 20 | "type": "sgd2", 21 | "args": { 22 | "lr": 0.03, 23 | "weight_decay": 0, 24 | "momentum": 0.9, 25 | "nesterov": false 26 | } 27 | }, 28 | "lr_scheduler": { 29 | "type": "get_cosine_schedule_with_warmup", 30 | "preproc_args": { 31 | }, 32 | "args": { 33 | "num_warmup_steps": 500, 34 | "num_training_steps": 20000, 35 | "num_cycles": 0.5, 36 | "last_epoch": -1 37 | } 38 | }, 39 | "weight_stashing": false, 40 | "work_scheduler": "gpipe", 41 | "cudnn_benchmark": true, 42 | "seed_from_cmd": false, 43 | "seed": 42 44 | } 45 | -------------------------------------------------------------------------------- /pipe/configs/vit/tst_gpipe_dcgn_global_cifar100.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "cv_dcgn_global.json", 4 | "cifar100_384.json", 5 | "vit_base_patch16_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "logdir": "logs/vit/cifar100/global/", 9 | "out_dir": "results/vit/cifar100/", 10 | "auto_file_name": true, 11 | "out_filename": "fast_dcgn_global_no_nesterov_meanstd05", 12 | "distributed_backend": "mpi", 13 | "bs_train": 32, 14 | "bs_test": 32, 15 | "num_data_workers": 10, 16 | "dont_drop_last": false, 17 | "step_every": 16, 18 | "log_frequency": 100000, 19 | "optimizer": { 20 | "type": "sgd2", 21 | "args": { 22 | "lr": 0.03, 23 | 
"weight_decay": 0, 24 | "momentum": 0.9, 25 | "nesterov": false 26 | } 27 | }, 28 | "steps": 1000, 29 | "lr_scheduler": { 30 | "type": "get_cosine_schedule_with_warmup", 31 | "preproc_args": { 32 | }, 33 | "args": { 34 | "num_warmup_steps": 100, 35 | "num_training_steps": 1000, 36 | "num_cycles": 0.5, 37 | "last_epoch": -1 38 | } 39 | }, 40 | "weight_stashing": false, 41 | "work_scheduler": "gpipe", 42 | "cudnn_benchmark": true, 43 | "seed_from_cmd": false, 44 | "seed": 42 45 | } 46 | -------------------------------------------------------------------------------- /pipe/configs/vit/tst_gpipe_dcgn_local.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "cv_dcgn_local.json", 4 | "imagenet_384.json", 5 | "vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "logdir": "logs/vit/imagenet/", 9 | "out_dir": "results/vit/imagenet/", 10 | "auto_file_name": true, 11 | "out_filename": "tst_vit_dcgn_local", 12 | "distributed_backend": "mpi", 13 | "bs_train": 4, 14 | "bs_test": 128, 15 | "num_data_workers": 10, 16 | "dont_drop_last": true, 17 | "step_every": 128, 18 | "log_frequency": 200000, 19 | "optimizer": { 20 | "type": "sgd1", 21 | "args": { 22 | "lr": 0.03, 23 | "weight_decay": 0, 24 | "momentum": 0.9, 25 | "nesterov": true 26 | } 27 | }, 28 | "lr_scheduler": { 29 | "type": "get_cosine_schedule_with_warmup", 30 | "preproc_args": { 31 | }, 32 | "args": { 33 | "num_warmup_steps": 500, 34 | "num_training_steps": 20000, 35 | "num_cycles": 0.5, 36 | "last_epoch": -1 37 | } 38 | }, 39 | "weight_stashing": false, 40 | "work_scheduler": "gpipe", 41 | "cudnn_benchmark": true, 42 | "seed_from_cmd": false, 43 | "seed": 42 44 | } 45 | -------------------------------------------------------------------------------- /pipe/configs/vit/tst_gpipe_dcgn_local_prop_cifar100.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "cv_dcgn_local_prop.json", 4 | "cifar100_384.json", 5 | "vit_base_patch16_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json" 6 | ], 7 | "base_config_path_is_relative": true, 8 | "logdir": "logs/vit/cifar100/local_prop/", 9 | "out_dir": "results/vit/cifar100/", 10 | "auto_file_name": true, 11 | "out_filename": "fast_dcgn_local_prop_no_nesterov_meanstd05", 12 | "distributed_backend": "mpi", 13 | "bs_train": 32, 14 | "bs_test": 32, 15 | "num_data_workers": 10, 16 | "dont_drop_last": false, 17 | "step_every": 16, 18 | "log_frequency": 100000, 19 | "optimizer": { 20 | "type": "sgd2", 21 | "args": { 22 | "lr": 0.03, 23 | "weight_decay": 0, 24 | "momentum": 0.9, 25 | "nesterov": false 26 | } 27 | }, 28 | "steps": 1000, 29 | "lr_scheduler": { 30 | "type": "get_cosine_schedule_with_warmup", 31 | "preproc_args": { 32 | }, 33 | "args": { 34 | "num_warmup_steps": 100, 35 | "num_training_steps": 1000, 36 | "num_cycles": 0.5, 37 | "last_epoch": -1 38 | } 39 | }, 40 | "weight_stashing": false, 41 | "work_scheduler": "gpipe", 42 | "cudnn_benchmark": true, 43 | "seed_from_cmd": false, 44 | "seed": 42 45 | } 46 | -------------------------------------------------------------------------------- /pipe/configs/vit/tst_stale.json: -------------------------------------------------------------------------------- 1 | { 2 | "base_config_path": [ 3 | "cv.json", 4 | "imagenet_384.json", 5 | "vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_async_acyclic.json" 6 | ], 7 | "base_config_path_is_relative": 
true, 8 | "logdir": "logs/vit/imagenet/", 9 | "out_dir": "results/vit/imagenet/", 10 | "auto_file_name": true, 11 | "out_filename": "tst_vit", 12 | "distributed_backend": "mpi", 13 | "bs_train": 128, 14 | "bs_test": 128, 15 | "num_data_workers": 10, 16 | "dont_drop_last": true, 17 | "step_every": 4, 18 | "optimizer": { 19 | "type": "sgd1", 20 | "args": { 21 | "lr": 0.03, 22 | "weight_decay": 0, 23 | "momentum": 0.9, 24 | "nesterov": true 25 | } 26 | }, 27 | "lr_scheduler": { 28 | "type": "get_cosine_schedule_with_warmup", 29 | "preproc_args": { 30 | }, 31 | "args": { 32 | "num_warmup_steps": 500, 33 | "num_training_steps": 20000, 34 | "num_cycles": 0.5, 35 | "last_epoch": -1 36 | } 37 | }, 38 | "weight_stashing": false, 39 | "work_scheduler": "1F1B", 40 | "cudnn_benchmark": true, 41 | "seed_from_cmd": false, 42 | "seed": 42 43 | } -------------------------------------------------------------------------------- /pipe/configs/vit/vit_base_patch16_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "vit_base_patch16_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic", 3 | "stage_to_device_map": [0,1,2,3,4,5,6,7], 4 | "nprocs": 8 5 | } -------------------------------------------------------------------------------- /pipe/configs/vit/vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_async_acyclic.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_async_acyclic", 3 | "stage_to_device_map": [0,1,2,3,4,5,6,7], 4 | "nprocs": 8 5 | } -------------------------------------------------------------------------------- /pipe/configs/vit/vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "vit_large_patch32_384_in21k_imagenet_384c384_8p_bw12_gpipe_acyclic", 3 | "stage_to_device_map": [0,1,2,3,4,5,6,7], 4 | "nprocs": 8 5 | } -------------------------------------------------------------------------------- /pipe/data/__init__.py: -------------------------------------------------------------------------------- 1 | # TODO: be explicit 2 | from typing import Optional, Dict, Any 3 | 4 | from .datasets import * 5 | from . 
import cv, lm, cep, vit, squad 6 | from .from_args_and_kw import * 7 | # Now, import all so the available datasets will be loaded 8 | from .t5 import t5_tfds 9 | 10 | 11 | def is_explicit_non_seperated_dataset(args): 12 | return "_nonsep" in args.data_propagator 13 | 14 | 15 | def get_dataloaders(args, 16 | pipe_config: Optional[PipelineConfig] = None, 17 | dataset_keywords: Optional[Dict[str, Any]] = None): 18 | if dataset_keywords is None: 19 | dataset_keywords = dict() 20 | # TODO: replicated 21 | if not is_explicit_non_seperated_dataset(args): 22 | train_dl, test_dl, samplers, extra = get_separate_dls_from_args( 23 | args, 24 | pipe_config=pipe_config, 25 | verbose=False, 26 | dataset_keywords=dataset_keywords, 27 | shuffle_train=getattr(args, "shuffle_train", True) 28 | ) 29 | else: 30 | raise NotImplementedError("now deprecated") 31 | return train_dl, test_dl, samplers, extra 32 | -------------------------------------------------------------------------------- /pipe/data/cep.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import TensorDataset 2 | 3 | from models.normal.cep import Dataset 4 | from pipe.data import CommonDatasetHandler, register_dataset, register_hardcoded_just_xy_dataset 5 | 6 | 7 | def _get_separated_dataset(just, DATA_DIR, args, **dataset_keywords): 8 | if just is None: 9 | return TensorDataset(), TensorDataset() 10 | return Dataset(**args.cep_dataset_kwargs, just=just), Dataset(**args.cep_dataset_kwargs, just=just) 11 | 12 | 13 | class SEP_CEP_DatasetHandler(CommonDatasetHandler): 14 | def __init__(self, **kw): 15 | super().__init__() 16 | train_ds, test_ds = _get_separated_dataset(**kw) 17 | self.train_ds = train_ds 18 | self.test_ds = test_ds 19 | 20 | def get_train_ds(self, **kw): 21 | return self.train_ds 22 | 23 | def get_test_ds(self, **kw): 24 | return self.test_ds # TODO 25 | 26 | def get_validation_ds(self, **kw): 27 | raise NotImplementedError() 28 | 29 | def get_modify_trainer_fn(self): 30 | pass 31 | 32 | def modify_dataloader_keywords(self, dataloader_keywords): 33 | return dataloader_keywords 34 | 35 | 36 | register_dataset("cep", SEP_CEP_DatasetHandler) 37 | register_hardcoded_just_xy_dataset("cep") 38 | -------------------------------------------------------------------------------- /pipe/data/download/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/data/download/__init__.py -------------------------------------------------------------------------------- /pipe/data/hardcoded_dirs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Fallback to this dataset dir if no other dir is given as an argument to functions. 
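# (For illustration: os.path.expanduser('~/.pytorch-datasets') resolves to e.g. /home/<user>/.pytorch-datasets.)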
4 | DEFAULT_DATA_DIR = os.path.expanduser('~/.pytorch-datasets') 5 | IMAGENET_ROOT_DIR = "/home_local/saareliad/data/imagenet/" 6 | # WIKI2_DATA_DIR = DATA_DIR/wikitext-2-raw 7 | -------------------------------------------------------------------------------- /pipe/data/t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/data/t5/__init__.py -------------------------------------------------------------------------------- /pipe/env_utils/deprecated/Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /bin/bash 2 | # 3 | # The SHELL enables us to source stuff. 4 | # 5 | ######################################## 6 | # Below is stuff for pytorch v1.3 # 7 | # (a compiled nightly version) # 8 | ######################################## 9 | .PHONY: env 10 | env: 11 | # Install initial "easy" requirements from file. 12 | conda env create -f environment.yml 13 | conda activate msnag 14 | # solves some problems of conda feature tracking. 15 | conda config --env --add channels saareliad 16 | conda config --env --add pinned_packages saareliad::pytorch 17 | # Install pytorch: 18 | # optional, remove previous installation 19 | # conda uninstall pytorch -y # In case it installed the normal pytorch somehow 20 | conda install -c saareliad pytorch -y 21 | python -c"import torch" 22 | 23 | # Install torchvision: 24 | # Note: we need to do it after we installed pytorch. 25 | # (1) Install the faster pillow-simd with AVX2 support. 26 | # we can check if we have AVX2 support with: grep avx2 /proc/cpuinfo 27 | pip uninstall pillow 28 | CC="cc -mavx2" pip install -U --force-reinstall pillow-simd 29 | # (2) Install torchvision from source. 30 | pip install git+https://github.com/pytorch/vision.git@v0.5.0 31 | 32 | # Note: torchvision has to be built with the same cuda as pytorch. (currently: 10.1) 33 | # if it does not work, just do 34 | # pip install torchvision==0.5 --no-dependencies, but we won't have AVX2. 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /pipe/env_utils/deprecated/old_environment_mpi1.yml: -------------------------------------------------------------------------------- 1 | name: msnag 2 | channels: 3 | - saareliad # should be explicitly set first, to gain priority over pytorch, however anaconda has channel priority problems (*) 4 | - pytorch 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - python=3.7 9 | - pip 10 | # - pytorch # (*) therefore I removed this, and added a script in the makefile. 11 | # - torchvision # Remove it too, as it requires pytorch... 12 | - se-msnag1 # cuda aware openmpi 13 | - cudatoolkit 14 | - jupyterlab 15 | - numpy 16 | - scikit-learn 17 | - ipython 18 | - jupyter 19 | - pandas 20 | - cython 21 | - pytest 22 | - tqdm 23 | - sympy 24 | # - pybind11 # For binding c++ code to python, may be used to increase performance. 
25 | - graphviz # For partitioning visualization 26 | - python-graphviz # For partitioning visualization 27 | - networkx # For partitioning 28 | - seaborn 29 | - pip: 30 | - git+https://github.com/networkx/networkx-metis.git # For partitioning 31 | - matplotlib 32 | - nbmerge 33 | - pytest-xdist 34 | - flake8 35 | - autopep8 36 | - rope 37 | - click 38 | - transformers 39 | - ptvsd 40 | - t5 41 | - datasets 42 | -------------------------------------------------------------------------------- /pipe/env_utils/deprecated/old_environment_mpi2.yml: -------------------------------------------------------------------------------- 1 | name: msnag2 2 | channels: 3 | - saareliad # should be explicitly set first, to gain priority over pytorch, however anaconda has channel priority problems (*) 4 | - pytorch 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - python=3.8 9 | - pip 10 | - se-msnag2 # cuda aware openmpi 11 | - magma-cuda102 12 | - numpy 13 | - ninja 14 | - pyyaml 15 | - mkl 16 | - mkl-include 17 | - setuptools 18 | - cmake 19 | - cffi 20 | - jupyterlab 21 | - scikit-learn 22 | - ipython 23 | - jupyter 24 | - pandas 25 | - cython 26 | - pytest 27 | - tqdm 28 | - sympy 29 | - pybind11 # For binding c++ code to python, may be used to increase performance. 30 | - graphviz # For partitioning visualization 31 | - python-graphviz # For partitioning visualization 32 | - networkx # For partitioning 33 | - seaborn 34 | - pip: 35 | - git+https://github.com/networkx/networkx-metis.git # For partitioning 36 | - matplotlib 37 | - nbmerge 38 | - pytest-xdist 39 | - flake8 40 | - autopep8 41 | - rope 42 | - click 43 | - transformers 44 | - ptvsd 45 | - t5 46 | - datasets 47 | -------------------------------------------------------------------------------- /pipe/env_utils/docker/.gitignore: -------------------------------------------------------------------------------- 1 | pytorch-conda-recipe 2 | pytorch-recipe -------------------------------------------------------------------------------- /pipe/env_utils/docker/Dockerfile_from_source: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu16.04 2 | ARG PYTHON_VERSION=3.8 3 | ARG WITH_TORCHVISION=1 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | build-essential \ 6 | cmake \ 7 | git \ 8 | curl \ 9 | ca-certificates \ 10 | libjpeg-dev \ 11 | libpng-dev && \ 12 | rm -rf /var/lib/apt/lists/* 13 | 14 | 15 | RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ 16 | chmod +x ~/miniconda.sh && \ 17 | ~/miniconda.sh -b -p /opt/conda && \ 18 | rm ~/miniconda.sh && \ 19 | /opt/conda/bin/conda install -y python=$PYTHON_VERSION pip numpy pyyaml scipy ipython mkl mkl-include ninja cython && \ 20 | /opt/conda/bin/conda install -y -c pytorch magma-cuda102 && \ 21 | /opt/conda/bin/conda install -y -c saareliad se-msnag2 && \ 22 | /opt/conda/bin/conda clean -ya 23 | ENV PATH /opt/conda/bin:$PATH 24 | # This must be done before pip so that requirements.txt is available 25 | WORKDIR /opt/pytorch 26 | COPY . . 27 | 28 | RUN git submodule sync && git submodule update --init --recursive 29 | RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ 30 | CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ 31 | pip install -v . 
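# For illustration: TORCH_CUDA_ARCH_LIST above builds native kernels for
# sm_35/sm_52/sm_60/sm_61/sm_70, and "7.0+PTX" also embeds PTX so that newer
# GPUs can JIT-compile; TORCH_NVCC_FLAGS' "-Xfatbin -compress-all" compresses
# the resulting fat binaries.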
32 | 33 | ENV CC "cc -mavx2" 34 | RUN pip uninstall -y pillow 35 | RUN pip install -U --force-reinstall pillow-simd 36 | 37 | RUN if [ "$WITH_TORCHVISION" = "1" ] ; then git clone https://github.com/pytorch/vision.git && cd vision && pip install -v . ; else echo "building without torchvision" ; fi 38 | 39 | WORKDIR /workspace 40 | RUN chmod -R a+w . -------------------------------------------------------------------------------- /pipe/env_utils/docker/ompi-recipe/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG baseImage=nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 2 | FROM $baseImage 3 | # pytorch args 4 | # ARG PYTHON_VERSION=3.7 5 | # ARG WITH_TORCHVISION=1 6 | 7 | # building ompi 8 | RUN apt-get update && \ 9 | apt-get install -y --no-install-recommends \ 10 | bzip2 \ 11 | ca-certificates \ 12 | curl \ 13 | wget && \ 14 | rm -rf /var/lib/apt/lists/* 15 | 16 | RUN wget "https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh" -O /opt/miniconda.sh && \ 17 | chmod +x /opt/miniconda.sh && \ 18 | /opt/miniconda.sh -b -p /opt/conda && \ 19 | /opt/conda/bin/conda update -n base conda && \ 20 | rm /opt/miniconda.sh 21 | 22 | ENV PATH /opt/conda/bin:${PATH} 23 | 24 | RUN conda install -c anaconda \ 25 | anaconda-client \ 26 | conda-build \ 27 | conda-verify && \ 28 | conda clean -ya 29 | 30 | COPY . /opt/ompi-recipe 31 | 32 | WORKDIR /opt/ompi-recipe 33 | -------------------------------------------------------------------------------- /pipe/env_utils/docker/ompi-recipe/ompi-cuda/build.sh: -------------------------------------------------------------------------------- 1 | export CC=`basename $CC` 2 | export CXX=`basename $CXX` 3 | export LIBRARY_PATH=$PREFIX/lib 4 | 5 | pushd ompi && \ 6 | ./configure --prefix=$PREFIX \ 7 | --disable-dependency-tracking \ 8 | --disable-mpi-fortran \ 9 | --disable-wrapper-rpath \ 10 | --disable-wrapper-runpath \ 11 | --with-cuda \ 12 | --with-wrapper-cflags="-I$PREFIX/include" \ 13 | --with-wrapper-cxxflags="-I$PREFIX/include" \ 14 | --with-wrapper-ldflags="-L$PREFIX/lib -Wl,-rpath,$PREFIX/lib" && \ 15 | make -j${CPU_COUNT} all && \ 16 | make install && \ 17 | popd 18 | 19 | #--with-sge \ 20 | #--with-slurm \ 21 | -------------------------------------------------------------------------------- /pipe/env_utils/docker/ompi-recipe/ompi-cuda/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "se-msnag2" %} 2 | {% set ompiVersion = "4.0.3" %} 3 | {% set ompiVersionShort = "4.0" %} 4 | {% set cudaVersion = '.'.join(environ.get('CUDA_VERSION', '10.2').split('.')[:2]) %} 5 | 6 | package: 7 | name: {{ name }} 8 | version: {{ ompiVersion }} 9 | 10 | source: 11 | - url: https://download.open-mpi.org/release/open-mpi/v{{ ompiVersionShort }}/openmpi-{{ ompiVersion }}.tar.gz 12 | folder: ompi 13 | 14 | build: 15 | number: 0 16 | noarch: generic 17 | string: cuda{{ cudaVersion }} 18 | 19 | requirements: 20 | build: 21 | - ca-certificates 22 | - cmake 23 | - git 24 | - make 25 | - zlib 26 | - {{ compiler('c') }} 27 | - {{ compiler('cxx') }} 28 | run: 29 | - {{ compiler('c') }} 30 | - {{ compiler('cxx') }} 31 | 32 | test: 33 | commands: 34 | - mpiexec --version 35 | - orte-info 36 | 37 | about: 38 | summary: "No free meals until we publish :)" 39 | 40 | extra: 41 | recipe-maintainers: 42 | - saareliad 43 | -------------------------------------------------------------------------------- /pipe/env_utils/docker/ompi-recipe/ompi_conda.sh:
-------------------------------------------------------------------------------- 1 | # without docker: 2 | # wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.3.tar.gz 3 | # tar -xvzf openmpi-4.0.3.tar.gz ompi 4 | 5 | docker build -t conda:ompi-dev . && docker run --rm -it --runtime nvidia conda:ompi-dev /bin/bash 6 | # inside-container# anaconda login 7 | # inside-container# conda build ompi-cuda 8 | # inside-container# anaconda upload /path/to/ompi-cuda/package.tar.gz 9 | # anaconda upload /opt/conda/conda-bld/noarch/se-msnag2-4.0.3-cuda10.2.tar.bz2 10 | 11 | 12 | 13 | # # The path is determined by the PREFIX env var (?) 14 | # # export PREFIX='' 15 | # Or somewhere in conda build: /opt/conda/... see dockerfile. -------------------------------------------------------------------------------- /pipe/env_utils/env_add_to_build_from_source.yml: -------------------------------------------------------------------------------- 1 | name: pt2 2 | channels: 3 | - defaults 4 | dependencies: 5 | - scikit-learn # here we start 6 | - jupyter 7 | - jupyterlab 8 | - pandas 9 | - tqdm 10 | - sympy 11 | - networkx 12 | - matplotlib 13 | - pip 14 | - pip: 15 | - ptvsd 16 | - flake8 17 | - autopep8 18 | - transformers 19 | - yapf 20 | - t5==0.7.1 21 | - datasets 22 | - git+https://github.com/networkx/networkx-metis.git # For partitioning 23 | - timm 24 | - sortedcollections 25 | - graphviz 26 | - adjustText 27 | - seaborn 28 | 29 | 30 | 31 | #prefix: /home_local/saareliad/miniconda3/envs/py38 32 | 33 | -------------------------------------------------------------------------------- /pipe/env_utils/env_without_mpi.yml: -------------------------------------------------------------------------------- 1 | name: nompi 2 | channels: 3 | - pytorch 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - python=3.8 8 | - pytorch=1.8.1 9 | - torchvision=0.9.1 10 | - scikit-learn # here we start 11 | - jupyter 12 | - jupyterlab 13 | - pandas 14 | - tqdm 15 | - sympy 16 | - networkx 17 | - matplotlib 18 | - scipy 19 | - ipython 20 | - numpy 21 | - pip 22 | - pip: 23 | - ptvsd 24 | - flake8 25 | - autopep8 26 | - transformers 27 | - yapf 28 | - t5==0.7.1 29 | - datasets 30 | - seaborn 31 | - timm 32 | - sortedcollections 33 | - graphviz 34 | - adjustText 35 | - git+https://github.com/networkx/networkx-metis.git # For partitioning -------------------------------------------------------------------------------- /pipe/env_utils/jupyter-lab.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ### 4 | # jupyter-lab.sh 5 | # 6 | # This script is intended to help you run jupyter lab on servers.
7 | # 8 | # Example usage: 9 | # 10 | # To run on the gateway machine (limited resources, no GPU): 11 | # ./jupyter-lab.sh 12 | # 13 | # To run on a compute node: 14 | # srun -c 2 --gres=gpu:1 --pty jupyter-lab.sh 15 | # 16 | 17 | ### 18 | # Conda parameters 19 | # 20 | HH=$HOME 21 | test "$(hostname)" == 'ninja1' && HH=/home_local/$USER 22 | test "$(hostname)" == 'ninja2' && HH=/home_local/$USER 23 | test "$(hostname)" == 'ninja4' && HH=/home_local/$USER 24 | test "$(hostname)" == 'rambo1' && HH=/home_local/$USER 25 | test "$(hostname)" == 'rambo2' && HH=/home_local/$USER 26 | test "$(hostname)" == 'rambo3' && HH=/home_local/$USER 27 | test "$(hostname)" == 'rambo4' && HH=/home_local/$USER 28 | test "$(hostname)" == 'rambo5' && HH=/home_local/$USER 29 | 30 | CONDA_HOME=$HH/miniconda3 31 | 32 | CONDA_ENV=py38 33 | 34 | unset XDG_RUNTIME_DIR 35 | source $CONDA_HOME/etc/profile.d/conda.sh 36 | conda activate $CONDA_ENV 37 | 38 | jupyter lab --no-browser --ip=$(hostname -I | cut -d' ' -f1) --port-retries=100 39 | 40 | -------------------------------------------------------------------------------- /pipe/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | from .experiments import load_experiment, load_experiment_for_update, save_experiment -------------------------------------------------------------------------------- /pipe/experiments/analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/experiments/analysis/__init__.py -------------------------------------------------------------------------------- /pipe/experiments/t5/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/experiments/t5/__init__.py -------------------------------------------------------------------------------- /pipe/misc/deepspeed.py: -------------------------------------------------------------------------------- 1 | # need to change 2 | # https://github.com/microsoft/DeepSpeed/blob/01726ce2b8ec1adbffae7974b5bfe600962c2043/deepspeed/runtime/engine.py#L545 3 | # to support other optimizers (adagrad) 4 | 5 | 6 | # do they support fp32? 7 | # https://github.com/microsoft/DeepSpeed/issues/109 8 | 9 | 10 | # { 11 | # "train_batch_size": 8, 12 | # "gradient_accumulation_steps": 1, 13 | # "steps_per_print": 1, 14 | # "zero_optimization": true, 15 | # "fp32_allreduce": true, 16 | # "optimizer": { 17 | # "type": "Adam", 18 | # "params": { 19 | # "lr": 0.0001 20 | # } 21 | # }, 22 | # 23 | # "fp16": { 24 | # "enabled": false 25 | # } 26 | # } 27 | # 28 | 29 | -------------------------------------------------------------------------------- /pipe/misc/mesh_failed_runs/git_log_head.txt: -------------------------------------------------------------------------------- 1 | commit 1c70f92b22159d55d2f57eb5e34dbd78f067799f Author: Adam Roberts Date: Mon Aug 31 11:52:25 2020 -0700 Allow additional gin configs to be passed to MtfModel for parsing after the operative config.
PiperOrigin-RevId: 329347298 commit 3effd60e0fb052cd6519c279c7f026ee9f1a0975 Author: Sharan Narang 2 | -------------------------------------------------------------------------------- /pipe/misc/mesh_failed_runs/run.sh: -------------------------------------------------------------------------------- 1 | t5_mesh_transformer \ 2 | --model_dir="t5_3b_recompute_fp32" \ 3 | --gin_file="dataset.gin" \ 4 | --gin_param="utils.run.mesh_shape = 'model:8,batch:1'" \ 5 | --gin_param="utils.run.mesh_devices = ['gpu:0', 'gpu:1', 'gpu:2', 'gpu:3', 'gpu:4', 'gpu:5', 'gpu:6', 'gpu:7']" \ 6 | --gin_param="MIXTURE_NAME = 'glue_rte_v002'" \ 7 | --gin_param="run.train_steps = 1004000" \ 8 | --gin_param="tokens_per_batch=12800" \ 9 | --gin_param="inputs_length = 320" \ 10 | --gin_param="targets_length = 8" \ 11 | --gin_param="pack_or_pad.pack = False" \ 12 | --gin_param="serialize_num_microbatches.tokens_per_microbatch_per_replica = 320" \ 13 | --gin_param="encoder/LayerStack.recompute_grads = True" \ 14 | --gin_param="decoder/LayerStack.recompute_grads = True" \ 15 | --gin_file="learning_rate_schedules/constant_0_001.gin" \ 16 | --gin_file="gs://t5-data/pretrained_models/3B/operative_config.gin" 17 | 18 | 19 | 20 | # --gin_param="run.sequence_length = {'inputs': 320, 'targets': 8}" 21 | # --gin_param="get_variable_dtype.activation_dtype = 'float32'" \ 22 | # --gin_param="get_variable_dtype.master_dtype = 'float32'" \ 23 | # --gin_param="get_variable_dtype.slice_dtype = 'float32'" \ 24 | -------------------------------------------------------------------------------- /pipe/misc/mesh_failed_runs/run_3b_failed.sh: -------------------------------------------------------------------------------- 1 | t5_mesh_transformer \ 2 | --model_dir="model" \ 3 | --gin_file="dataset.gin" \ 4 | --gin_param="utils.run.mesh_shape = 'model:8,batch:1'" \ 5 | --gin_param="utils.run.mesh_devices = ['gpu:0', 'gpu:1', 'gpu:2', 'gpu:3', 'gpu:4', 'gpu:5', 'gpu:6', 'gpu:7']" \ 6 | --gin_param="MIXTURE_NAME = 'glue_rte_v002'" \ 7 | --gin_param="run.train_steps = 1004000" \ 8 | --gin_param="tokens_per_batch=12800" \ 9 | --gin_param="inputs_length = 320" \ 10 | --gin_param="targets_length = 8" \ 11 | --gin_param="pack_or_pad.pack = False" \ 12 | --gin_param="serialize_num_microbatches.tokens_per_microbatch_per_replica = 1280" \ 13 | --gin_file="learning_rate_schedules/constant_0_001.gin" \ 14 | --gin_file="gs://t5-data/pretrained_models/3B/operative_config.gin" 15 | 16 | 17 | 18 | # --gin_param="run.sequence_length = {'inputs': 320, 'targets': 8}" 19 | -------------------------------------------------------------------------------- /pipe/misc/mesh_failed_runs/run_3b_failed_recom_omm.sh: -------------------------------------------------------------------------------- 1 | t5_mesh_transformer \ 2 | --model_dir="t5_3b_recompute_fp32" \ 3 | --gin_file="dataset.gin" \ 4 | --gin_param="utils.run.mesh_shape = 'model:8,batch:1'" \ 5 | --gin_param="utils.run.mesh_devices = ['gpu:0', 'gpu:1', 'gpu:2', 'gpu:3', 'gpu:4', 'gpu:5', 'gpu:6', 'gpu:7']" \ 6 | --gin_param="MIXTURE_NAME = 'glue_rte_v002'" \ 7 | --gin_param="run.train_steps = 1004000" \ 8 | --gin_param="tokens_per_batch=12800" \ 9 | --gin_param="inputs_length = 320" \ 10 | --gin_param="targets_length = 8" \ 11 | --gin_param="pack_or_pad.pack = False" \ 12 | --gin_param="serialize_num_microbatches.tokens_per_microbatch_per_replica = 1280" \ 13 | --gin_param="encoder/LayerStack.recompute_grads = True" \ 14 | --gin_param="decoder/LayerStack.recompute_grads = True" \ 15 |
--gin_file="learning_rate_schedules/constant_0_001.gin" \ 16 | --gin_file="gs://t5-data/pretrained_models/3B/operative_config.gin" 17 | 18 | 19 | 20 | # --gin_param="run.sequence_length = {'inputs': 320, 'targets': 8}" 21 | # --gin_param="get_variable_dtype.activation_dtype = 'float32'" \ 22 | # --gin_param="get_variable_dtype.master_dtype = 'float32'" \ 23 | # --gin_param="get_variable_dtype.slice_dtype = 'float33'" \ 24 | -------------------------------------------------------------------------------- /pipe/misc/p2p_bw_mat.sh: -------------------------------------------------------------------------------- 1 | DIR=/usr/local/cuda/samples/1_Utilities/p2pBandwidthLatencyTest/ 2 | 3 | cd ${DIR} 4 | sudo make 5 | ./p2pBandwidthLatencyTest 6 | cd - 7 | -------------------------------------------------------------------------------- /pipe/misc/print_partition_layers_scopes.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | 3 | # from models.partitioned.t5_3b_tied_lmheads_320_8_8p_bw12_squad1_virtual_stages import * 4 | # from models.partitioned.t5_3b_tied_lmheads_64_4_8p_bw12_squad1_acyclic import * 5 | from models.partitioned.t5_3b_tied_lmheads_64_4_8p_bw12_squad1_pipedream import * 6 | 7 | if __name__ == '__main__': 8 | 9 | 10 | for i, v in list(locals().items()): 11 | i: str 12 | if not i.startswith("Partition"): 13 | continue 14 | print(i) 15 | pprint(v.LAYER_SCOPES) 16 | pprint(v.TENSORS) 17 | print() 18 | 19 | -------------------------------------------------------------------------------- /pipe/misc/test_mpi/README.md: -------------------------------------------------------------------------------- 1 | ## Test MPI run 2 | [adapted from here](https://medium.com/@esaliya/pytorch-distributed-with-mpi-acb84b3ae5fd) 3 | Test if pytorch has openmpi backend 4 | 5 | ``` 6 | mpirun -np 2 python pytorch_distributed.py 7 | ``` 8 | 9 | ## Test for multiple machines 10 | on each machine: 11 | ``` 12 | mpirun --hostfile nodes.txt --map-by node -np 2 python pytorch_distributed.py 13 | ``` 14 | testing for ninja4 and ninja2 (in this order) 15 | 16 | TODO: play with init to make it work (did not work OOTB) 17 | -------------------------------------------------------------------------------- /pipe/misc/test_mpi/nodes.txt: -------------------------------------------------------------------------------- 1 | 132.68.36.205 2 | 132.68.36.203 3 | -------------------------------------------------------------------------------- /pipe/misc/test_mpi/pytorch_distributed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | def run(rank, size, hostname): 8 | print(f"I am {rank} of {size} in {hostname}") 9 | tensor = torch.zeros(1) 10 | if rank == 0: 11 | tensor += 1 12 | # Send the tensor to process 1 13 | dist.send(tensor=tensor, dst=1) 14 | else: 15 | # Receive tensor from process 0 16 | dist.recv(tensor=tensor, src=0) 17 | print('Rank ', rank, ' has data ', tensor[0]) 18 | 19 | 20 | def init_processes(rank, size, hostname, fn, backend='tcp'): 21 | """ Initialize the distributed environment. 
""" 22 | dist.init_process_group(backend, rank=rank, world_size=size) 23 | fn(rank, size, hostname) 24 | 25 | 26 | if __name__ == "__main__": 27 | world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) 28 | world_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) 29 | hostname = socket.gethostname() 30 | init_processes(world_rank, world_size, hostname, run, backend='mpi') 31 | -------------------------------------------------------------------------------- /pipe/misc/test_mpi/pytorch_distributed_cuda_aware.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | def run(rank, size, hostname): 8 | print(f"I am {rank} of {size} in {hostname}") 9 | tensor = torch.zeros(1).cuda() 10 | if rank == 0: 11 | tensor += 1 12 | # Send the tensor to process 1 13 | dist.send(tensor=tensor, dst=1) 14 | else: 15 | # Receive tensor from process 0 16 | dist.recv(tensor=tensor, src=0) 17 | print('Rank ', rank, ' has data ', tensor[0]) 18 | 19 | 20 | def init_processes(rank, size, hostname, fn, backend='tcp'): 21 | """ Initialize the distributed environment. """ 22 | dist.init_process_group(backend, rank=rank, world_size=size) 23 | fn(rank, size, hostname) 24 | 25 | 26 | if __name__ == "__main__": 27 | world_size = int(os.environ['OMPI_COMM_WORLD_SIZE']) 28 | world_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) 29 | hostname = socket.gethostname() 30 | init_processes(world_rank, world_size, hostname, run, backend='mpi') 31 | -------------------------------------------------------------------------------- /pipe/misc/transformers/TODO.md: -------------------------------------------------------------------------------- 1 | # TODOs 2 | * try with the exact same model we use 3 | * try without tied wieghts 4 | * try without grad clip 5 | * try with weight decay 6 | -------------------------------------------------------------------------------- /pipe/misc/transformers/analyze_res.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch import tensor # for parsing. 3 | import warnings 4 | from pprint import pprint 5 | 6 | relative_dir_names = [f"epoch_{i}" for i in range(3)] + [""] 7 | dirs = [os.path.join(os.getcwd(), d) for d in relative_dir_names] 8 | files = [os.path.join(d, "eval_results.txt") for d in dirs] 9 | 10 | epoch_to_ppl = {} 11 | for epoch, file in enumerate(files): 12 | with open(file, "r") as f: 13 | perplexity = None 14 | exec(f.read()) 15 | if perplexity is None: 16 | pass 17 | # s = "perplexity is None, epoch:{epoch}".format(epoch=epoch) 18 | # warnings.warn(s) 19 | else: 20 | epoch_to_ppl[epoch] = perplexity.item() 21 | 22 | pprint(epoch_to_ppl) 23 | -------------------------------------------------------------------------------- /pipe/misc/transformers/bert-large/run.sh: -------------------------------------------------------------------------------- 1 | export SQUAD_DIR=/home_local/saareliad/data/squad2/ 2 | export OMP_NUM_THREADS=10 3 | # MODEL="deepset/bert-large-uncased-whole-word-masking-squad2" # its not finetuned... 
4 | MODEL="bert-large-uncased-whole-word-masking" 5 | function eval(){ 6 | python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \ 7 | --model_type bert \ 8 | --model_name_or_path ${MODEL} \ 9 | --do_eval \ 10 | --do_lower_case \ 11 | --train_file $SQUAD_DIR/train-v2.0.json \ 12 | --predict_file $SQUAD_DIR/dev-v2.0.json \ 13 | --learning_rate 3e-5 \ 14 | --num_train_epochs 2 \ 15 | --max_seq_length 384 \ 16 | --doc_stride 128 \ 17 | --output_dir ./wwm_uncased_finetuned_squad2/ \ 18 | --per_gpu_eval_batch_size=16 19 | } 20 | 21 | function train(){ 22 | python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \ 23 | --model_type bert \ 24 | --model_name_or_path ${MODEL} \ 25 | --do_eval \ 26 | --do_train \ 27 | --do_lower_case \ 28 | --train_file $SQUAD_DIR/train-v2.0.json \ 29 | --predict_file $SQUAD_DIR/dev-v2.0.json \ 30 | --learning_rate 3e-5 \ 31 | --num_train_epochs 2 \ 32 | --max_seq_length 384 \ 33 | --doc_stride 128 \ 34 | --output_dir ./wwm_uncased_finetuned_squad2/ \ 35 | --per_gpu_train_batch_size=3 \ 36 | --per_gpu_eval_batch_size=3 37 | } 38 | 39 | train 40 | -------------------------------------------------------------------------------- /pipe/misc/tst_ibroadcast.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | BACKEND = 'gloo' 7 | CUDA = True 8 | 9 | if __name__ == "__main__": 10 | dist.init_process_group(BACKEND, init_method="env://", world_size=2) 11 | pg = dist.new_group(ranks=[0, 1], backend=BACKEND) 12 | pg2 = dist.new_group(ranks=[0, 1], backend=BACKEND) 13 | shape = (10, 10, 1000) 14 | 15 | res = torch.ones(*shape) 16 | buff = torch.zeros(*shape) 17 | if CUDA: 18 | res = res.cuda() 19 | buff = buff.cuda() 20 | 21 | start = time.time() 22 | if dist.get_rank() == 0: 23 | tensor = torch.ones(*shape).cuda() 24 | o = dist.broadcast(tensor, 0, async_op=True, group=pg) 25 | o2 = dist.broadcast(buff, 1, async_op=True, group=pg2) 26 | 27 | else: 28 | tensor = buff 29 | tensor2 = buff.clone() + 6 30 | o2 = dist.broadcast(tensor2, 1, async_op=True, group=pg2) 31 | o = dist.broadcast(tensor, 0, async_op=True, group=pg) 32 | 33 | if dist.get_rank() == 1: 34 | o.wait() 35 | end = time.time() 36 | print(end - start) 37 | 38 | print(torch.sum(tensor)) 39 | print(tensor.dtype) 40 | assert torch.all(tensor == res) 41 | print("Done") 42 | end = time.time() 43 | print(end - start) 44 | 45 | if dist.get_rank() == 0: 46 | o2.wait() 47 | print("Done o2", "got", buff.sum()) 48 | 49 | """ 50 | python -m torch.distributed.launch --nnodes 1 --nproc_per_node 2 --node_rank 0 tst_ibroadcast.py 51 | 52 | """ 53 | -------------------------------------------------------------------------------- /pipe/misc/tst_isend.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | """ Will test MPI with this later...
""" 5 | 6 | BACKAND = 'gloo' 7 | CUDA = False 8 | 9 | if __name__ == "__main__": 10 | dist.init_process_group(BACKAND, init_method="env://", world_size=2) 11 | shape = (10, 10, 10) 12 | if dist.get_rank() == 0: 13 | if CUDA: 14 | o = dist.isend(torch.ones(*shape).cuda(), 1, tag=4) 15 | o2 = dist.isend(torch.ones(*shape).mul_(2), 1, tag=6) 16 | else: 17 | if CUDA: 18 | tensor = torch.zeros(*shape) 19 | tensor2 = torch.zeros(*shape) 20 | 21 | if CUDA: 22 | o = dist.irecv(tensor, 0, tag=4) 23 | 24 | o2 = dist.irecv(tensor2, 0, tag=6) 25 | 26 | if dist.get_rank() == 1: 27 | if CUDA: 28 | o.wait() 29 | o2.wait() 30 | if CUDA: 31 | print("tensor", torch.sum(tensor), tensor.dtype) 32 | 33 | print("tensor2", torch.sum(tensor2), tensor2.dtype) 34 | 35 | if CUDA: 36 | assert torch.all(tensor == torch.ones(*shape).cuda()) 37 | 38 | assert torch.all(tensor2 == torch.ones(*shape).mul_(2)) 39 | print("Done") 40 | 41 | """ 42 | python -m torch.distributed.launch --nnodes 1 --nproc_per_node 2 --node_rank 0 misc/tst_isend.py 43 | 44 | """ 45 | -------------------------------------------------------------------------------- /pipe/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import transformers_utils 2 | from . import transformers_cfg 3 | from . import parse_config 4 | from .registery import AVAILABLE_MODELS, register_model -------------------------------------------------------------------------------- /pipe/models/registery/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | 4 | from . import model_handler 5 | from .model_handler import AVAILABLE_MODELS, register_model 6 | 7 | 8 | # from . import cv 9 | # from . import hf 10 | # from . import vit 11 | # from . import cep 12 | # from . 
import dummy 13 | 14 | def _import_handlers_from_dir(tasks_dir=os.path.dirname(__file__), 15 | module_name='.models.registery.', package="pipe"): 16 | """ Automatically import any Python files in the tasks directory 17 | in order to automatically register all available tasks 18 | Args: 19 | tasks_dir: task dir to import from 20 | """ 21 | 22 | for file in os.listdir(tasks_dir): 23 | path = os.path.join(tasks_dir, file) 24 | if ( 25 | not file.startswith('_') 26 | and not file.startswith('.') 27 | and (file.endswith('.py') or os.path.isdir(path)) 28 | ): 29 | task_name = file[:file.find('.py')] if file.endswith('.py') else file 30 | 31 | importlib.import_module(module_name + task_name, package=package) 32 | 33 | 34 | _import_handlers_from_dir() 35 | -------------------------------------------------------------------------------- /pipe/models/registery/cep.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from models.normal.cep import Net 4 | from pipe.models.registery.model_handler import CommonModelHandler 5 | 6 | 7 | def get_cep_model(n=50, k=11, c=500, n_split=4): 8 | model = Net(n, c, n_split=n_split) 9 | return model 10 | 11 | 12 | class CEPModelHandler(CommonModelHandler): 13 | def __init__(self, normal_model_fn, *args, **kw): 14 | super().__init__(*args, **kw) 15 | self.normal_model_fn = normal_model_fn 16 | 17 | def _get_normal_model_instance(self, *args, **kwargs): 18 | return self.normal_model_fn(*args, **kwargs) 19 | 20 | 21 | CEPModelHandler( 22 | normal_model_fn=functools.partial(get_cep_model, n=50, c=500, n_split=4)).register_autogenerated( 23 | generated_file_name_or_path="cep_netN50_C500_4p_bw12_metis") 24 | 25 | CEPModelHandler( 26 | normal_model_fn=functools.partial(get_cep_model, n=50, c=20000, n_split=4)).register_autogenerated( 27 | generated_file_name_or_path="cep_netN50_C20000_4p_bw12_metis") 28 | -------------------------------------------------------------------------------- /pipe/models/registery/dummy.py: -------------------------------------------------------------------------------- 1 | from pipe.models.registery.model_handler import CommonModelHandler 2 | from autopipe.tasks.dummy_t5 import DumT5Partitioner, T5Tokenizer 3 | from types import SimpleNamespace 4 | 5 | class DummyModelHandler(CommonModelHandler): 6 | def __init__(self, *args, **kw): 7 | super().__init__(*args, **kw) 8 | 9 | def _get_normal_model_instance(self, *args, **kwargs): 10 | if self.normal_model_instance is None: 11 | 12 | args = SimpleNamespace() 13 | p = DumT5Partitioner(args) 14 | args.lmhead = True 15 | args.stateless_tied = True 16 | args.precompute_masks = False 17 | self.normal_model_instance = p.get_model(args) 18 | self.tokenizer = p.tokenizer 19 | self.config = p.config 20 | 21 | return self.normal_model_instance 22 | 23 | def get_extra(self, *args, **kw): 24 | return dict(config=self.config, tokenizer=self.tokenizer) 25 | 26 | 27 | DummyModelHandler().register_autogenerated("DUMMY_LAYERSt5_base_tied_lmheads_512_4_2p_bw12_squad1_pipedream") 28 | DummyModelHandler().register_autogenerated("DUMMY_t5_base_tied_lmheads_512_4_2p_bw12_squad1_pipedream") 29 | DummyModelHandler().register_autogenerated("DUMMY_LAYERSt5_base_tied_lmheads_512_4_2p_bw12_squad1_mpipe") 30 | DummyModelHandler().register_autogenerated("DUMMY_t5_base_tied_lmheads_512_4_2p_bw12_squad1_mpipe") 31 | -------------------------------------------------------------------------------- /pipe/optimizers/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .adafactor import Adafactor 2 | from .adam import Adam 3 | from .adam_record import Adam as AdamGA 4 | from .adamw import AdamW 5 | from .adamw_record import AdamW as AdamWGA 6 | # the .data parameter update is the only change (pytorch 1.5) 7 | from .sgd import SGD as PytorchSGD 8 | from .sutskever_modified_sgd import SGD as SutskeverSGD 9 | 10 | # from .utils import linear_lr_scaling 11 | 12 | AVAILBALE_OPTIMIZERS = { 13 | 'sgd1': PytorchSGD, 14 | 'sgd2': SutskeverSGD, 15 | 'adam': Adam, 16 | 'adamw': AdamW, 17 | 'adam_record_step': AdamGA, 18 | 'adamw_record_step': AdamWGA, 19 | 'adafactor': Adafactor 20 | } 21 | -------------------------------------------------------------------------------- /pipe/optimizers/required.py: -------------------------------------------------------------------------------- 1 | """ hack - copied from Pytorch because the API hides it - so modified models will need fewer parameters""" 2 | 3 | class _RequiredParameter(object): 4 | """Singleton class representing a required parameter for an Optimizer.""" 5 | def __repr__(self): 6 | return "<required parameter>" 7 | 8 | required = _RequiredParameter() -------------------------------------------------------------------------------- /pipe/optimizers/utils.py: -------------------------------------------------------------------------------- 1 | 2 | def linear_lr_scaling(bs_train, BASE_LR, BASE_BS_TRAIN, downscale=False): 3 | 4 | if bs_train < BASE_BS_TRAIN: 5 | if not downscale: 6 | return BASE_LR 7 | else: 8 | lr = BASE_LR / (BASE_BS_TRAIN / bs_train) 9 | else: 10 | lr = BASE_LR * (bs_train / BASE_BS_TRAIN) 11 | 12 | assert(lr > 0) 13 | return lr 14 | -------------------------------------------------------------------------------- /pipe/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .communication import CommunicationHandlerBase, get_auto_comm_handler_cls 2 | from .partition_manager import SinglePartitionManager 3 | from .true_weights_storage import TrueWeightsStorage 4 | -------------------------------------------------------------------------------- /pipe/pipeline/communication/__init__.py: -------------------------------------------------------------------------------- 1 | from .interface import CommunicationHandlerBase 2 | from .p2p import P2PCommunicationHandler 3 | 4 | # from .common_simple_comm import SimpleCommBase 5 | # from .bcast import BCASTCommunicationHandler 6 | # from .replicated import P2PRankIO as ReplicatedCommunicationHandler 7 | # from .replicated import create_replicated_comm_handler_args 8 | 9 | # TODO: We want to support hybrid comm 10 | # TODO: Add replicated comm. 11 | # TODO: gloo will raise NotImplementedError, it can be used for testing though. 12 | 13 | 14 | __all__ = [ 15 | "get_auto_comm_handler_cls", "CommunicationHandlerBase", 16 | "P2PCommunicationHandler", 17 | ] 18 | 19 | # "ReplicatedCommunicationHandler" 20 | 21 | from enum import Enum, auto 22 | 23 | 24 | class CommPolicy(Enum): 25 | P2P = auto() 26 | BCAST = auto() 27 | 28 | 29 | def to_policy(backend, cpu): 30 | assert backend in {'nccl', 'gloo', 'mpi'} 31 | 32 | if backend == 'mpi' or cpu: 33 | return CommPolicy.P2P 34 | 35 | raise NotImplementedError() 36 | # return CommPolicy.BCAST 37 | 38 | 39 | # TODO: add replicated somehow.
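# Illustrative usage sketch (an assumed call pattern, for clarity): with an MPI backend on GPU,
# to_policy('mpi', cpu=False) yields CommPolicy.P2P, so get_auto_comm_handler_cls('mpi', False)
# returns P2PCommunicationHandler from the mapping below; the constructor arguments of the
# returned class depend on the concrete handler implementation.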
40 | POLICY_TO_COMM = { 41 | CommPolicy.P2P: P2PCommunicationHandler, 42 | # CommPolicy.BCAST: BCASTCommunicationHandler, 43 | } 44 | 45 | 46 | def get_auto_comm_handler_cls(backend, cpu): 47 | return POLICY_TO_COMM[to_policy(backend, cpu)] 48 | -------------------------------------------------------------------------------- /pipe/pipeline/communication/grouper.py: -------------------------------------------------------------------------------- 1 | from itertools import zip_longest 2 | 3 | __all__ = ["grouper"] 4 | 5 | 6 | # Creating iteration tool for "Double Buffers" 7 | 8 | 9 | def zip_discard_compr(*iterables, sentinel=object()): 10 | # https://stackoverflow.com/questions/38054593/zip-longest-without-fillvalue 11 | return [[entry for entry in iterable if entry is not sentinel] 12 | for iterable in zip_longest(*iterables, fillvalue=sentinel)] 13 | 14 | 15 | def grouper(iterable, n): 16 | """Collect data into *non fixed-length* chunks or blocks 17 | (changed the one in the itertools recipes) 18 | """ 19 | # grouper('ABCDEFG', 3) --> ABC DEF G 20 | args = [iter(iterable)] * n 21 | return zip_discard_compr(*args) 22 | 23 | # Fixed recved: 24 | # [torch.cat(group) for group in grouper(x, num_chunks)] 25 | 26 | # [torch.cat(group) for group in grouper(x, self.comm_handler.num_chunks)] -------------------------------------------------------------------------------- /pipe/pipeline/data_propagation/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from .automatic_prop import AutomaticPipelinePropagator 4 | from .automatic_prop_non_contig import AutomaticPipelinePropagatorNonContig 5 | from .interface import PipelineDataPropagator 6 | 7 | AVAILABLE_PROPAGATORS = { 8 | 'auto': AutomaticPipelinePropagator, # HACK: has a call for contiguous. 9 | 'auto_non_contig': AutomaticPipelinePropagatorNonContig 10 | } 11 | 12 | 13 | def get_propagator_cls(args) -> Type[PipelineDataPropagator]: 14 | propagator_cls = AVAILABLE_PROPAGATORS.get(args.data_propagator) 15 | if propagator_cls is None: 16 | raise NotImplementedError( 17 | f"args.data_propagator={args.data_propagator}, AVAILABLE_PROPAGATORS={AVAILABLE_PROPAGATORS.keys()}") 18 | 19 | return propagator_cls 20 | -------------------------------------------------------------------------------- /pipe/pipeline/data_propagation/automatic_prop_non_contig.py: -------------------------------------------------------------------------------- 1 | from pipe.pipeline.data_propagation.automatic_prop import PipelineDataPropagator 2 | 3 | 4 | class AutomaticPipelinePropagatorNonContig(PipelineDataPropagator): 5 | 6 | def __init__(self, *args, **kw): 7 | super().__init__() 8 | 9 | def pack_send_context(self, model_out, *ctx): 10 | # ctx here is just the label y, in case we send it in the pipeline. 11 | # otherwise, it just returns model_out.
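# An illustrative example of this packing (the values are assumptions): with
# model_out = (activations,) and ctx = (y,), the call returns (activations, y),
# i.e. the label is forwarded through the pipeline alongside the stage outputs,
# without detaching tensors or making them contiguous (hence "NonContig").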
12 | # return tuple(x.detach().contiguous() if isinstance(x, torch.Tensor) else x for x in chain(model_out, ctx)) 13 | return *model_out, *ctx 14 | -------------------------------------------------------------------------------- /pipe/pipeline/data_propagation/cv_target_prop.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .interface import PipelineDataPropagator 4 | 5 | 6 | class CVTargetInPipePropagator(PipelineDataPropagator): 7 | def __init__(self, device, is_last_partition, is_first_partition): 8 | super().__init__() 9 | self.device = device 10 | 11 | # Determine unpack_cls 12 | if is_last_partition: 13 | self.unpack_cls = self.unpack_data_for_last_partition 14 | elif is_first_partition: 15 | self.unpack_cls = self.unpack_data_for_first_partition 16 | else: 17 | self.unpack_cls = self.unpack_data_for_mid_partition 18 | 19 | def unpack_data_for_partition(self, data): 20 | # assert len(data) == 2 21 | return self.unpack_cls(data) 22 | 23 | def unpack_data_for_last_partition(self, data): 24 | *x, y = data 25 | # x = x.to(self.device, non_blocking=True) 26 | with torch.no_grad(): 27 | y = y.to(self.device, non_blocking=True) 28 | return x, y 29 | 30 | def unpack_data_for_first_partition(self, data): 31 | x, y = data 32 | with torch.no_grad(): 33 | x = x.to(self.device, non_blocking=True) 34 | # Note: we don't send the y to GPU if we don't use it in this partition. 35 | return x, y 36 | 37 | def unpack_data_for_mid_partition(self, data): 38 | # x will already be on our device :) 39 | # we don't need the y. 40 | # try: 41 | *x, y = data 42 | # FIXME 43 | 44 | return x, y 45 | # x, y = data 46 | # x = x.to(self.device, non_blocking=True) 47 | # Note: we don't send the y to GPU if we don't use it in this partition. 48 | # return x, y 49 | 50 | def pack_send_context(self, model_out, *ctx): 51 | # ctx here is just the label y 52 | return (*model_out, *ctx) 53 | -------------------------------------------------------------------------------- /pipe/pipeline/data_propagation/interface.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Tuple, Any 3 | 4 | 5 | class PipelineDataPropagator(abc.ABC): 6 | """ 7 | Class describing how to handle data loaded or passed through the pipeline. 8 | 9 | Usage: 10 | 11 | # Get data: 12 | (1) 13 | from_prev_stage = (...) # get it from somewhere 14 | to_stage, to_somewhere_else = propagator.preload_from_dataloader(dlitr) 15 | x = (*to_stage, *from_prev_stage) 16 | 17 | # Run the model: 18 | (2) 19 | x, *ctx = propagator.unpack_data_for_partition(data) 20 | model_out = model(x, ...) 21 | 22 | # Unify outside context 23 | (3) 24 | ctx = (*ctx, *to_somewhere_else) 25 | 26 | # Send Data: 27 | (4) 28 | t = propagator.pack_send_context(model_out, *ctx) 29 | send(t) ... 30 | """ 31 | 32 | def __init__(self, *args, **kw): 33 | pass 34 | 35 | # @staticmethod 36 | @abc.abstractmethod 37 | def unpack_data_for_partition(self, data) -> Tuple[Tuple[Any], Tuple[Any]]: 38 | """ In case we send labels in pipeline: extract them from the output. 39 | For last partition: extract what is loaded for outside loss and statistics (e.g: batch size, ...)
40 | """ 41 | pass 42 | 43 | # @staticmethod 44 | @abc.abstractmethod 45 | def pack_send_context(self, model_out, *ctx) -> Tuple[Any]: 46 | pass 47 | 48 | def preload_from_dataloader(self, dlitr) -> Tuple[Tuple[Any], Tuple[Any]]: 49 | if dlitr is None: 50 | return (), () 51 | else: 52 | raise NotImplementedError() 53 | -------------------------------------------------------------------------------- /pipe/pipeline/dp_sim/__init__.py: -------------------------------------------------------------------------------- 1 | from .convert import convert_to_num_gpus 2 | from .simulated_dp_batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d 3 | -------------------------------------------------------------------------------- /pipe/pipeline/gap_aware/__init__.py: -------------------------------------------------------------------------------- 1 | # Here will come implementation for GAP aware. 2 | # https://arxiv.org/abs/1909.10802 3 | # We can apply it if one of the following happends: 4 | # 1. we stash the parameters theta we did forwad on (so we could calculate the gap) 5 | # 2. the gap is easy (e.g the gradient) 6 | 7 | from .adam_gap_aware import AdamGapAware, get_adam_gap_aware_cls 8 | from .adamw_gap_aware import AdamWGapAware, get_adamw_gap_aware_cls 9 | from .interface import GapAwareBase 10 | from .sgd_gap_aware import GapAware, get_sgd_gap_aware_cls 11 | 12 | SUPPORTED_GAP_AWARE_POLICIES = { 13 | 'almost_last_partition', 'all_except_last', 14 | 'all_except_last_two' 15 | } 16 | 17 | # TODO: adamw 18 | -------------------------------------------------------------------------------- /pipe/pipeline/monkey_patch/__init__.py: -------------------------------------------------------------------------------- 1 | # from .patch import dummy_forward_monkeypatch 2 | from .dummy_forward_monkey_patcher import DummyForwardMonkeyPatcher 3 | -------------------------------------------------------------------------------- /pipe/pipeline/monkey_patch/find_modules.py: -------------------------------------------------------------------------------- 1 | def find_modules(module, module_name, module_instance, found): 2 | """ 3 | Recursively find all instances of a specific module inside a module. 4 | 5 | Arguments: 6 | module {nn.Module} -- Module to search on 7 | module_name {str} -- Name of the model to search on in the currect context (used to output access string) 8 | module_instance {nn.Module} -- Class of the module to search 9 | found {list} -- List to append results to. 10 | 11 | Result will be [(access_string, model),...] inside 'found'. 12 | 13 | # Adapted from facebook XLM repo 14 | 15 | Examples: 16 | 17 | 1. Example of finding inside a class comprehended of MODEL_NAMES: 18 | ``` 19 | for name in self.MODEL_NAMES: 20 | find_modules(getattr(self, name), 21 | f'self.{name}', HashingMemory, self.memory_list) 22 | ``` 23 | 24 | 2. 
Example finding PKMLayer inside txl: 25 | ``` 26 | from find_modules import find_modules 27 | found = [] 28 | find_modules(model, 'model', PKMLayer, found) 29 | print([t[0] for t in found]) 30 | ``` 31 | """ 32 | 33 | if isinstance(module, module_instance): 34 | found.append((module_name, module)) 35 | else: 36 | for name, child in module.named_children(): 37 | name = ('%s[%s]' if name.isdigit() 38 | else '%s.%s') % (module_name, name) 39 | find_modules(child, name, module_instance, found) 40 | -------------------------------------------------------------------------------- /pipe/pipeline/trainers/grad_norm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/pipeline/trainers/grad_norm/__init__.py -------------------------------------------------------------------------------- /pipe/pipeline/trainers/grad_norm/local_grad_norm.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | import torch 4 | from torch.nn.utils import clip_grad_norm_ 5 | 6 | from pipe.pipeline.trainers.interface import ScheduledOptimizationStepMultiPartitionTrainer 7 | from pipe.pipeline.trainers.utils import calc_local_total_norm 8 | 9 | 10 | def local_grad_norm_mixin_trainer_factory(trainer_cls: Type[ScheduledOptimizationStepMultiPartitionTrainer]): 11 | class GradNormMixedTrainer(trainer_cls): 12 | def __init__(self, *args, max_grad_norm=None, always_calc_grad_norm=False, **kw): 13 | super().__init__(*args, **kw) 14 | self.always_calc_grad_norm = always_calc_grad_norm 15 | self.max_grad_norm = max_grad_norm 16 | 17 | def step_on_computed_grads(self, old_lrs=None): 18 | self._grad_norm() 19 | return super().step_on_computed_grads(old_lrs=old_lrs) 20 | 21 | def _grad_norm(self): 22 | total_norm = None 23 | if self.max_grad_norm: 24 | with torch.no_grad(): 25 | total_norm = clip_grad_norm_(self.model.parameters(), 26 | self.max_grad_norm, 27 | norm_type=2) 28 | elif self.always_calc_grad_norm: 29 | with torch.no_grad(): 30 | total_norm = calc_local_total_norm(self.model.parameters(), norm_type=2) 31 | 32 | if total_norm and self.statistics.has_statistic("local_grad_norm"): 33 | self.statistics.update_on_batch("local_grad_norm", total_norm.item(), 1) 34 | 35 | return GradNormMixedTrainer 36 | -------------------------------------------------------------------------------- /pipe/pipeline/trainers/statistics/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from .cv import CVStats, NormCVstats, CVDistanceNorm, CVDistance 4 | from .glue import GlueStats, NormGluestats, GlueDistanceNorm, GlueDistance 5 | from .interface import Stats 6 | from .lm import LMStats, NormLMstats, LMDistanceNorm, LMDistance 7 | from .squad import SquadStats, NormSquadstats, SquadDistanceNorm, SquadDistance 8 | 9 | # TODO: remove the "record_loss_per_batch", it is mostly unused. 10 | # TODO: option to record every X batches, when the epoch is giant.
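# Illustrative usage sketch (an assumed call, based on the registry defined below): a name
# carrying the "_loss_per_batch" suffix resolves to the same class but enables per-batch
# loss recording, e.g. get_statistics("squad_loss_per_batch", ...) builds SquadStats with
# record_loss_per_batch=True.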
11 | 12 | # TODO: change the way of getting statistics 13 | 14 | AVAILBALE_STATS = {} 15 | 16 | 17 | def register_statistics(name: str, stats_cls: Type[Stats]): 18 | AVAILBALE_STATS[name] = stats_cls 19 | AVAILBALE_STATS[name + "_loss_per_batch"] = stats_cls 20 | 21 | 22 | def get_statistics(name: str, *args, **kw) -> Stats: 23 | record_loss_per_batch = "loss_per_batch" in name 24 | st_cls = AVAILBALE_STATS.get(name) 25 | return st_cls(*args, record_loss_per_batch=record_loss_per_batch, **kw) 26 | 27 | 28 | register_statistics("cv", CVStats) 29 | register_statistics("cv_grad_norm", NormCVstats) 30 | register_statistics("cv_theta_dist", CVDistance) 31 | register_statistics("cv_dist_grad_norm", CVDistanceNorm) 32 | 33 | register_statistics("lm", LMStats) 34 | register_statistics("lm_grad_norm", NormLMstats) 35 | register_statistics("lm_theta_dist", LMDistance) 36 | register_statistics("lm_dist_grad_norm", LMDistanceNorm) 37 | 38 | register_statistics("squad", SquadStats) 39 | register_statistics("squad_grad_norm", NormSquadstats) 40 | register_statistics("squad_theta_dist", SquadDistance) 41 | register_statistics("squad_dist_grad_norm", SquadDistanceNorm) 42 | 43 | register_statistics("glue", GlueStats) 44 | register_statistics("glue_grad_norm", NormGluestats) 45 | register_statistics("glue_theta_dist", GlueDistance) 46 | register_statistics("glue_dist_grad_norm", GlueDistanceNorm) 47 | -------------------------------------------------------------------------------- /pipe/pipeline/trainers/statistics/gap.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | import torch 4 | from torch.optim import Optimizer 5 | 6 | from pipe.pipeline.trainers.statistics import Stats 7 | 8 | 9 | def try_record_real_gap_from_current(statistics: Stats, 10 | optimizer: Optimizer, 11 | real_theta, 12 | pre_computed_gap=None, 13 | gap_name="gap"): 14 | """ calculates gap between model parameters and a given set of parameters, real_theta 15 | real_theta: Given set of parameters. 
TODO: rename 16 | """ 17 | if statistics.has_statistic(gap_name): 18 | if pre_computed_gap is None: 19 | if real_theta is None: 20 | gap = 0 21 | else: 22 | with torch.no_grad(): 23 | gap = sum([ 24 | torch.dist(a, b, p=2).item() for a, b in zip( 25 | chain.from_iterable([[p for p in pg['params']] 26 | for pg in 27 | optimizer.param_groups]), 28 | chain.from_iterable(real_theta)) 29 | ]) 30 | else: 31 | gap = pre_computed_gap 32 | 33 | statistics.update_on_batch(gap_name, gap, 1) 34 | return gap -------------------------------------------------------------------------------- /pipe/pipeline/trainers/statistics/utils.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value""" 3 | 4 | def __init__(self): 5 | self.reset() 6 | 7 | def reset(self): 8 | self.avg = 0 9 | self.sum = 0 10 | self.count = 0 11 | # self.record = [] 12 | 13 | def update(self, val, n=1): 14 | self.sum += val * n 15 | self.count += n 16 | 17 | def get_avg(self): 18 | return self.sum / self.count 19 | 20 | 21 | class AccuracyMeter(AverageMeter): 22 | def __init__(self): 23 | super().__init__() 24 | 25 | def update(self, val, n=1): 26 | """ just to support adding num correct instead of accuracy """ 27 | self.sum += val 28 | self.count += n 29 | 30 | def get_avg(self): 31 | return (self.sum / self.count) * 100 32 | -------------------------------------------------------------------------------- /pipe/pipeline/trainers/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch._six import inf 3 | 4 | 5 | def calc_local_total_norm(parameters, norm_type=2): 6 | """ Exactly like clip_grad_norm_, but without the clip. 7 | # See https://github.com/pytorch/pytorch/blob/master/torch/nn/utils/clip_grad.py 8 | """ 9 | if isinstance(parameters, torch.Tensor): 10 | parameters = [parameters] 11 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 12 | norm_type = float(norm_type) 13 | if norm_type == inf: 14 | total_norm = max(p.grad.detach().abs().max() for p in parameters) 15 | else: 16 | total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type) for p in parameters]), norm_type) 17 | # clip_coef = max_norm / (total_norm + 1e-6) 18 | # if clip_coef < 1: 19 | # for p in parameters: 20 | # p.grad.detach().mul_(clip_coef) 21 | return total_norm 22 | 23 | 24 | def calc_local_total_norm_wo_sqrt(parameters, norm_type=2): 25 | """ Exactly like clip_grad_norm_, but without the clip.
26 | # See https://github.com/pytorch/pytorch/blob/master/torch/nn/utils/clip_grad.py 27 | """ 28 | if isinstance(parameters, torch.Tensor): 29 | parameters = [parameters] 30 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 31 | norm_type = float(norm_type) 32 | if norm_type == inf: 33 | raise NotImplementedError() 34 | else: 35 | total_norm = torch.stack([torch.vdot(v, v) for v in [p.grad.detach().view(-1) for p in parameters]]).sum() 36 | # clip_coef = max_norm / (total_norm + 1e-6) 37 | # if clip_coef < 1: 38 | # for p in parameters: 39 | # p.grad.detach().mul_(clip_coef) 40 | return total_norm 41 | -------------------------------------------------------------------------------- /pipe/pipeline/weight_prediction/README.md: -------------------------------------------------------------------------------- 1 | # Counting experiments 2 | 3 | ## Counting `WeightPredictor`s 4 | 5 | For every combination of 6 | 7 | prediction_mem {clone, linear} 8 | optimization algorithm (e.g. {PYTORCH_SGD, TENSORFLOW_SGD, ADAM, WADAM, ...}) 9 | 10 | We have a WeightPredictor 11 | 12 | ## Counting `FixFunction`s 13 | 14 | for every combination of 15 | 16 | optimization algorithm (e.g. {PYTORCH_SGD, TENSORFLOW_SGD, ADAM, WADAM, ...}) 17 | 18 | we have several ways for predicting 19 | 20 | {ms_nag, just_multiply} 21 | 22 | ## Calculation with numbers 23 | 24 | opt = 3 (sgd1, sgd2, wadam) 25 | pred_mem = 2 26 | pred_alg = 2 27 | 28 | ### Total prediction runs (12) 29 | 30 | pred_runs: 2*3*2 = 12 31 | 32 | ### Total to compare (12) 33 | 34 | # Stale weights (3) 35 | # weight stashing (3) 36 | # Fully sync (3) 37 | # GPipe/DP (3) 38 | 39 | Total of 24 runs per net/dataset/.... 40 | 41 | ## Then we vary 42 | 43 | pipe length... 44 | flush rate... 45 | ... 46 | -------------------------------------------------------------------------------- /pipe/pipeline/weight_prediction/cow_dict.py: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/csernazs/cowdict/blob/master/cowdict/cowdict.py 2 | from collections.abc import MutableMapping 3 | 4 | 5 | class CowDict(MutableMapping): 6 | def __init__(self, base: dict): 7 | self.base = base 8 | self.dict = {} 9 | self.deleted_keys = set() 10 | 11 | def __getitem__(self, key): 12 | if key in self.deleted_keys: 13 | raise KeyError(key) 14 | 15 | try: 16 | return self.dict[key] 17 | except KeyError: 18 | return self.base[key] 19 | 20 | def __setitem__(self, key, value): 21 | try: 22 | self.deleted_keys.remove(key) 23 | except KeyError: 24 | pass 25 | 26 | self.dict[key] = value 27 | 28 | def __delitem__(self, key): 29 | if key in self.base: 30 | try: 31 | del self.dict[key] 32 | except KeyError: 33 | pass 34 | 35 | self.deleted_keys.add(key) 36 | 37 | elif key in self.dict: 38 | del self.dict[key] 39 | self.deleted_keys.add(key) 40 | else: 41 | raise KeyError(key) 42 | 43 | def __len__(self): 44 | return len(set(self.dict.keys()).union(set(self.base.keys())) - self.deleted_keys) 45 | 46 | def __iter__(self): 47 | 48 | for key in self.dict: 49 | if key not in self.deleted_keys: 50 | yield key 51 | 52 | for key in self.base: 53 | if key not in self.dict and key not in self.deleted_keys: 54 | yield key 55 | 56 | def __repr__(self): 57 | retval = ["{"] 58 | for key, value in self.items(): 59 | retval.append(repr(key)) 60 | retval.append(": ") 61 | retval.append(repr(value)) 62 | retval.append(", ") 63 | 64 | del retval[-1] 65 | retval.append("}") 66 | return "".join(retval) 67 | 
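# A minimal usage sketch for CowDict (illustrative; the values below are assumptions):
# the base dict is never mutated, while writes and deletes live in an overlay.
# base = {'lr': 0.1, 'momentum': 0.9}
# cow = CowDict(base)
# cow['lr'] = 0.01  # the write goes to the overlay; base is untouched
# del cow['momentum']  # the deletion is recorded in deleted_keys; base is untouched
# assert base == {'lr': 0.1, 'momentum': 0.9}
# assert dict(cow) == {'lr': 0.01}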
-------------------------------------------------------------------------------- /pipe/pipeline/weight_prediction/interface.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class WeightPredictor(abc.ABC): 5 | def __init__(self, optimizer, 6 | fix_fn=None, scheduler=None, nag_with_predictor=False, true_weights_storage=None): 7 | self.optimizer = optimizer 8 | self.fix_fn = fix_fn 9 | self.scheduler = scheduler 10 | self.nag_with_predictor = nag_with_predictor 11 | if nag_with_predictor: 12 | print("-I- Doing NAG with predictor") 13 | self.true_weights_storage = true_weights_storage 14 | 15 | def setup(self, n_steps): 16 | if n_steps == 0 and self.nag_with_predictor: 17 | n_steps = 1 18 | self.n_steps = n_steps 19 | 20 | @abc.abstractmethod 21 | def forward(self): 22 | raise NotImplementedError() 23 | 24 | @abc.abstractmethod 25 | def revert(self): 26 | raise NotImplementedError() 27 | 28 | 29 | class FixFunction(abc.ABC): 30 | @abc.abstractmethod 31 | def __call__(self, p: WeightPredictor, pg): 32 | # WeightPredictor is used mainly to get sched from.... 33 | raise NotImplementedError() 34 | -------------------------------------------------------------------------------- /pipe/pipeline/weight_prediction/sym_pred_optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .auto_lambdify import auto_lambdify 2 | from .sympy_optimizer import WDSympySGD, WDSympySGDMsnag, NormalSympyAdam, run_and_display_sim, run_sim 3 | -------------------------------------------------------------------------------- /pipe/pipeline/weight_stashing/__init__.py: -------------------------------------------------------------------------------- 1 | from .weight_stashing import WeightStasher, WeightStashingCachePolicy 2 | -------------------------------------------------------------------------------- /pipe/run/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/pipe/run/__init__.py -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_boolq_multirc.sh: -------------------------------------------------------------------------------- 1 | rm new_trace_cache_t53b_512_4_lg new_prof_cache_t53b_512_4_lg_ftpipe 2 | python -m autopipe.partition new_t5 \ 3 | --model_name_or_path \ 4 | t5-3b \ 5 | --t5_task \ 6 | squad1 \ 7 | --lmhead \ 8 | --n_iter \ 9 | 10 \ 10 | --analysis_batch_size \ 11 | 8 \ 12 | --partitioning_batch_size \ 13 | 8 \ 14 | --ct \ 15 | new_trace_cache_t53b_512_4_lg \ 16 | --cp \ 17 | new_prof_cache_t53b_512_4_lg_ftpipe \ 18 | --stateless_tied \ 19 | --lmhead \ 20 | --n_partitions \ 21 | 8 \ 22 | --L \ 23 | 16 \ 24 | --max_seq_length \ 25 | 512 \ 26 | --answer_max_seq_length \ 27 | 4 \ 28 | --partitioning_method \ 29 | mpipe \ 30 | --preset \ 31 | ftpipe \ 32 | --dont_use_async_meta_alg \ 33 | --save_memory_mode \ 34 | --disable_op_profiling \ 35 | --special_blocks \ 36 | T5Block \ 37 | --basic_blocks \ 38 | T5Block \ 39 | --output_file \ 40 | layer_graph_ 41 | # > partitioning_output_mpipe_t53b_512_4_lg_ftpipe.txt 42 | 43 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_rte.sh: -------------------------------------------------------------------------------- 1 | python -m 
autopipe.partition new_t5 \ 2 | --model_name_or_path \ 3 | t5-3b \ 4 | --t5_task \ 5 | squad1 \ 6 | --lmhead \ 7 | --n_iter \ 8 | 10 \ 9 | --analysis_batch_size \ 10 | 8 \ 11 | --partitioning_batch_size \ 12 | 8 \ 13 | --ct \ 14 | new_trace_cache_t53b_320_8_lg \ 15 | --cp \ 16 | new_prof_cache_t53b_320_8_lg_ftpipe \ 17 | --stateless_tied \ 18 | --lmhead \ 19 | --n_partitions \ 20 | 8 \ 21 | --L \ 22 | 16 \ 23 | --max_seq_length \ 24 | 320 \ 25 | --answer_max_seq_length \ 26 | 8 \ 27 | --partitioning_method \ 28 | mpipe \ 29 | --preset \ 30 | ftpipe \ 31 | --dont_use_async_meta_alg \ 32 | --save_memory_mode \ 33 | --disable_op_profiling \ 34 | --special_blocks \ 35 | T5Block \ 36 | --basic_blocks \ 37 | T5Block \ 38 | --output_file \ 39 | layer_graph_ 40 | #> partitioning_output_mpipe_t53b_320_8_lg_ftpipe.txt 41 | 42 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_wic.sh: -------------------------------------------------------------------------------- 1 | # NOTE: reducing size to fit in memory (?) it used to work before. 2 | rm new_trace_cache_t53b_64_4_lg new_prof_cache_t53b_64_4_lg_ftpipe 3 | python -m autopipe.partition new_t5 \ 4 | --model_name_or_path \ 5 | t5-3b \ 6 | --t5_task \ 7 | squad1 \ 8 | --lmhead \ 9 | --n_iter \ 10 | 10 \ 11 | --analysis_batch_size \ 12 | 64 \ 13 | --partitioning_batch_size \ 14 | 64 \ 15 | --ct \ 16 | new_trace_cache_t53b_64_4_lg \ 17 | --cp \ 18 | new_prof_cache_t53b_64_4_lg_ftpipe \ 19 | --stateless_tied \ 20 | --lmhead \ 21 | --n_partitions \ 22 | 8 \ 23 | --L \ 24 | 16 \ 25 | --max_seq_length \ 26 | 64 \ 27 | --answer_max_seq_length \ 28 | 4 \ 29 | --partitioning_method \ 30 | mpipe \ 31 | --preset \ 32 | ftpipe \ 33 | --dont_use_async_meta_alg \ 34 | --save_memory_mode \ 35 | --disable_op_profiling \ 36 | --special_blocks \ 37 | T5Block \ 38 | --basic_blocks \ 39 | T5Block \ 40 | --output_file \ 41 | layer_graph_ 42 | # > partitioning_output_mpipe_t53b_64_4_lg_ftpipe.txt 43 | 44 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_boolq_multirc.sh: -------------------------------------------------------------------------------- 1 | # NOTE: reduced size to fit in memory. 
2 | rm new_trace_cache_t53b_512_4_op new_prof_cache_t53b_512_4_op_ftpipe 3 | python -m autopipe.partition new_t5 \ 4 | --model_name_or_path \ 5 | t5-3b \ 6 | --t5_task \ 7 | squad1 \ 8 | --lmhead \ 9 | --n_iter \ 10 | 5 \ 11 | --analysis_batch_size \ 12 | 4 \ 13 | --partitioning_batch_size \ 14 | 2 \ 15 | --ct \ 16 | new_trace_cache_t53b_512_4_op \ 17 | --cp \ 18 | new_prof_cache_t53b_512_4_op_ftpipe \ 19 | --stateless_tied \ 20 | --lmhead \ 21 | --n_partitions \ 22 | 8 \ 23 | --L \ 24 | 15 \ 25 | 16 \ 26 | 17 \ 27 | --max_seq_length \ 28 | 512 \ 29 | --answer_max_seq_length \ 30 | 4 \ 31 | --partitioning_method \ 32 | mpipe \ 33 | --preset \ 34 | ftpipe \ 35 | --dont_use_async_meta_alg \ 36 | --save_memory_mode \ 37 | --special_blocks \ 38 | T5Block \ 39 | --output_file \ 40 | op_ 41 | # --basic_blocks \ 42 | # T5Block 43 | 44 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_rte.sh: -------------------------------------------------------------------------------- 1 | python -m autopipe.partition new_t5 \ 2 | --model_name_or_path \ 3 | t5-3b \ 4 | --t5_task \ 5 | squad1 \ 6 | --lmhead \ 7 | --n_iter \ 8 | 10 \ 9 | --analysis_batch_size \ 10 | 8 \ 11 | --partitioning_batch_size \ 12 | 8 \ 13 | --ct \ 14 | new_trace_cache_t53b_320_8_op \ 15 | --cp \ 16 | new_prof_cache_t53b_320_8_op_ftpipe \ 17 | --stateless_tied \ 18 | --lmhead \ 19 | --n_partitions \ 20 | 8 \ 21 | --L \ 22 | 16 \ 23 | --max_seq_length \ 24 | 320 \ 25 | --answer_max_seq_length \ 26 | 8 \ 27 | --partitioning_method \ 28 | mpipe \ 29 | --preset \ 30 | ftpipe \ 31 | --dont_use_async_meta_alg \ 32 | --save_memory_mode \ 33 | --special_blocks \ 34 | T5Block \ 35 | --output_file \ 36 | op_ 37 | # --output_file \ 38 | # lg \ 39 | # --basic_blocks \ 40 | # T5Block 41 | 42 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_wic.sh: -------------------------------------------------------------------------------- 1 | rm new_prof_cache_t53b_64_4_op_ftpipe new_trace_cache_t53b_64_4_op 2 | python -m autopipe.partition new_t5 \ 3 | --model_name_or_path \ 4 | t5-3b \ 5 | --t5_task \ 6 | squad1 \ 7 | --lmhead \ 8 | --n_iter \ 9 | 10 \ 10 | --analysis_batch_size \ 11 | 32 \ 12 | --partitioning_batch_size \ 13 | 32 \ 14 | --ct \ 15 | new_trace_cache_t53b_64_4_op \ 16 | --cp \ 17 | new_prof_cache_t53b_64_4_op_ftpipe \ 18 | --stateless_tied \ 19 | --lmhead \ 20 | --n_partitions \ 21 | 8 \ 22 | --L \ 23 | 16 \ 24 | --max_seq_length \ 25 | 64 \ 26 | --answer_max_seq_length \ 27 | 4 \ 28 | --partitioning_method \ 29 | mpipe \ 30 | --preset \ 31 | ftpipe \ 32 | --dont_use_async_meta_alg \ 33 | --save_memory_mode \ 34 | --special_blocks \ 35 | T5Block \ 36 | --output_file \ 37 | op_ 38 | 39 | # --output_file \ 40 | # lg \ 41 | # --basic_blocks \ 42 | # T5Block 43 | 44 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_mpipe_t5_base.sh: -------------------------------------------------------------------------------- 1 | python -m autopipe.partition new_t5 \ 2 | --model_name_or_path \ 3 | t5-base \ 4 | --t5_task \ 5 | squad1 \ 6 | --lmhead \ 7 | --n_iter \ 8 | 1 \ 9 | --analysis_batch_size \ 10 | 2 \ 11 | --partitioning_batch_size \ 12 | 2 \ 13 | --stateless_tied \ 14 | --lmhead \ 15 | --n_partitions \ 16 | 4 \ 17 | --L \ 18 | 4 \ 19 | 8 \ 20 | 12 \ 21 | 16 \ 22 | --max_seq_length \ 23 | 512 \ 24 | 
--answer_max_seq_length \ 25 | 4 \ 26 | --partitioning_method \ 27 | mpipe \ 28 | --save_memory_mode \ 29 | --output_file \ 30 | lg \ 31 | --special_blocks \ 32 | T5Block \ 33 | --basic_blocks \ 34 | T5Block 35 | 36 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_spipe_OP_t5_3b_boolq_multirc.sh: -------------------------------------------------------------------------------- 1 | rm new_prof_cache_t53b_512_4_op new_trace_cache_t53b_512_4_op 2 | python -m autopipe.partition new_t5 \ 3 | --model_name_or_path \ 4 | t5-3b \ 5 | --t5_task \ 6 | squad1 \ 7 | --lmhead \ 8 | --n_iter \ 9 | 5 \ 10 | --analysis_batch_size \ 11 | 32 \ 12 | --partitioning_batch_size \ 13 | 32 \ 14 | --ct \ 15 | new_trace_cache_t53b_512_4_op \ 16 | --cp \ 17 | new_prof_cache_t53b_512_4_op \ 18 | --stateless_tied \ 19 | --lmhead \ 20 | --n_partitions \ 21 | 8 \ 22 | --max_seq_length \ 23 | 512 \ 24 | --answer_max_seq_length \ 25 | 4 \ 26 | --partitioning_method \ 27 | pipedream \ 28 | --preset \ 29 | pipedream \ 30 | --dont_use_async_meta_alg \ 31 | --save_memory_mode \ 32 | --special_blocks \ 33 | T5Block \ 34 | --output_file \ 35 | op_graph_ 36 | 37 | # --basic_blocks \ 38 | # T5Block 39 | 40 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_spipe_OP_t5_3b_rte.sh: -------------------------------------------------------------------------------- 1 | rm new_prof_cache_t53b_320_8_op new_trace_cache_t53b_320_8_op 2 | python -m autopipe.partition new_t5 \ 3 | --model_name_or_path \ 4 | t5-3b \ 5 | --t5_task \ 6 | squad1 \ 7 | --lmhead \ 8 | --n_iter \ 9 | 5 \ 10 | --analysis_batch_size \ 11 | 32 \ 12 | --partitioning_batch_size \ 13 | 32 \ 14 | --ct \ 15 | new_trace_cache_t53b_320_8_op \ 16 | --cp \ 17 | new_prof_cache_t53b_320_8_op \ 18 | --stateless_tied \ 19 | --lmhead \ 20 | --n_partitions \ 21 | 8 \ 22 | --max_seq_length \ 23 | 320 \ 24 | --answer_max_seq_length \ 25 | 8 \ 26 | --partitioning_method \ 27 | pipedream \ 28 | --preset \ 29 | pipedream \ 30 | --dont_use_async_meta_alg \ 31 | --save_memory_mode \ 32 | --special_blocks \ 33 | T5Block \ 34 | --output_file \ 35 | op_graph_ 36 | 37 | # --basic_blocks \ 38 | # T5Block 39 | 40 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_spipe_OP_t5_3b_wic.sh: -------------------------------------------------------------------------------- 1 | rm new_prof_cache_t53b_64_4_op new_trace_cache_t53b_64_4_op 2 | python -m autopipe.partition new_t5 \ 3 | --model_name_or_path \ 4 | t5-3b \ 5 | --t5_task \ 6 | squad1 \ 7 | --lmhead \ 8 | --n_iter \ 9 | 5 \ 10 | --analysis_batch_size \ 11 | 8 \ 12 | --partitioning_batch_size \ 13 | 8 \ 14 | --ct \ 15 | new_trace_cache_t53b_64_4_op \ 16 | --cp \ 17 | new_prof_cache_t53b_64_4_op \ 18 | --stateless_tied \ 19 | --lmhead \ 20 | --n_partitions \ 21 | 8 \ 22 | --max_seq_length \ 23 | 64 \ 24 | --answer_max_seq_length \ 25 | 4 \ 26 | --partitioning_method \ 27 | pipedream \ 28 | --preset \ 29 | pipedream \ 30 | --dont_use_async_meta_alg \ 31 | --save_memory_mode \ 32 | --special_blocks \ 33 | T5Block \ 34 | --output_file \ 35 | op_graph_ 36 | 37 | # --basic_blocks \ 38 | # T5Block 39 | 40 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_spipe_t5_3b_boolq_multirc.sh: -------------------------------------------------------------------------------- 1 | python -m
autopipe.partition new_t5 \ 2 | --model_name_or_path \ 3 | t5-3b \ 4 | --t5_task \ 5 | squad1 \ 6 | --lmhead \ 7 | --n_iter \ 8 | 10 \ 9 | --analysis_batch_size \ 10 | 4 \ 11 | --partitioning_batch_size \ 12 | 4 \ 13 | --ct \ 14 | new_trace_cache_t53b_512_4_lg \ 15 | --cp \ 16 | new_prof_cache_t53b_512_4_lg \ 17 | --stateless_tied \ 18 | --lmhead \ 19 | --n_partitions \ 20 | 8 \ 21 | --max_seq_length \ 22 | 512 \ 23 | --answer_max_seq_length \ 24 | 4 \ 25 | --partitioning_method \ 26 | pipedream \ 27 | --preset \ 28 | pipedream \ 29 | --disable_op_profiling \ 30 | --dont_use_async_meta_alg \ 31 | --save_memory_mode \ 32 | --special_blocks \ 33 | T5Block \ 34 | --basic_blocks \ 35 | T5Block \ 36 | --output_file \ 37 | layer_graph_ 38 | 39 | # --basic_blocks \ 40 | # T5Block 41 | 42 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_spipe_t5_3b_rte.sh: -------------------------------------------------------------------------------- 1 | rm new_trace_cache_t53b_320_8_lg new_prof_cache_t53b_320_8_lg 2 | python -m autopipe.partition new_t5 \ 3 | --model_name_or_path \ 4 | t5-3b \ 5 | --t5_task \ 6 | squad1 \ 7 | --lmhead \ 8 | --n_iter \ 9 | 10 \ 10 | --analysis_batch_size \ 11 | 8 \ 12 | --partitioning_batch_size \ 13 | 8 \ 14 | --ct \ 15 | new_trace_cache_t53b_320_8_lg \ 16 | --cp \ 17 | new_prof_cache_t53b_320_8_lg \ 18 | --stateless_tied \ 19 | --lmhead \ 20 | --n_partitions \ 21 | 8 \ 22 | --max_seq_length \ 23 | 320 \ 24 | --answer_max_seq_length \ 25 | 8 \ 26 | --partitioning_method \ 27 | pipedream \ 28 | --preset \ 29 | pipedream \ 30 | --disable_op_profiling \ 31 | --dont_use_async_meta_alg \ 32 | --save_memory_mode \ 33 | --special_blocks \ 34 | T5Block \ 35 | --basic_blocks \ 36 | T5Block \ 37 | --output_file \ 38 | layer_graph_ 39 | 40 | # --basic_blocks \ 41 | # T5Block 42 | 43 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_spipe_t5_3b_wic.sh: -------------------------------------------------------------------------------- 1 | rm new_trace_cache_t53b_64_4_lg new_prof_cache_t53b_64_4_lg 2 | python -m autopipe.partition new_t5 \ 3 | --model_name_or_path \ 4 | t5-3b \ 5 | --t5_task \ 6 | squad1 \ 7 | --lmhead \ 8 | --n_iter \ 9 | 10 \ 10 | --analysis_batch_size \ 11 | 64 \ 12 | --partitioning_batch_size \ 13 | 64 \ 14 | --ct \ 15 | new_trace_cache_t53b_64_4_lg \ 16 | --cp \ 17 | new_prof_cache_t53b_64_4_lg \ 18 | --stateless_tied \ 19 | --lmhead \ 20 | --n_partitions \ 21 | 8 \ 22 | --max_seq_length \ 23 | 64 \ 24 | --answer_max_seq_length \ 25 | 4 \ 26 | --partitioning_method \ 27 | pipedream \ 28 | --preset \ 29 | pipedream \ 30 | --disable_op_profiling \ 31 | --dont_use_async_meta_alg \ 32 | --save_memory_mode \ 33 | --special_blocks \ 34 | T5Block \ 35 | --basic_blocks \ 36 | T5Block \ 37 | --output_file \ 38 | layer_graph_ 39 | 40 | # --basic_blocks \ 41 | # T5Block 42 | 43 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_partition_spipe_t5_base.sh: -------------------------------------------------------------------------------- 1 | python -m autopipe.partition new_t5 \ 2 | --model_name_or_path \ 3 | t5-base \ 4 | --t5_task \ 5 | squad1 \ 6 | --lmhead \ 7 | --n_iter \ 8 | 10 \ 9 | --analysis_batch_size \ 10 | 4 \ 11 | --partitioning_batch_size \ 12 | 4 \ 13 | --ct \ 14 | new_trace_cache_t5base_512_4_lg \ 15 | --cp \ 16 | new_prof_cache_t5base_512_4_lg \ 17 | --stateless_tied \ 
18 | --lmhead \ 19 | --n_partitions \ 20 | 8 \ 21 | --max_seq_length \ 22 | 512 \ 23 | --answer_max_seq_length \ 24 | 4 \ 25 | --partitioning_method \ 26 | pipedream \ 27 | --preset \ 28 | pipedream \ 29 | --disable_op_profiling \ 30 | --dont_use_async_meta_alg \ 31 | --save_memory_mode \ 32 | --special_blocks \ 33 | T5Block \ 34 | --basic_blocks \ 35 | T5Block 36 | # --output_file \ 37 | # lg \ 38 | # --basic_blocks \ 39 | # T5Block 40 | 41 | -------------------------------------------------------------------------------- /t5_used_scripts_example/to_run.sh: -------------------------------------------------------------------------------- 1 | 2 | rm new_trace_cache_t53b_* 3 | rm new_prof_cache_t53b_* 4 | 5 | ### mpipe (layers graph) 6 | bash t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_boolq_multirc.sh 7 | bash t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_rte.sh 8 | bash t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_wic.sh 9 | 10 | ### mpipe (op graph) 11 | bash t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_boolq_multirc.sh 12 | bash t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_rte.sh 13 | bash t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_wic.sh 14 | 15 | 16 | ### spipe (layers graph) 17 | bash t5_used_scripts_example/to_partition_spipe_t5_3b_boolq_multirc.sh 18 | bash t5_used_scripts_example/to_partition_spipe_t5_3b_rte.sh 19 | bash t5_used_scripts_example/to_partition_spipe_t5_3b_wic.sh 20 | 21 | 22 | # TODO: gpipe: partition with smaller micro batch. 23 | 24 | #to_partition_mpipe_t5_base 25 | #to_partition_spipe_t5 26 | #to_partition_spipe_t5_base -------------------------------------------------------------------------------- /t5_used_scripts_example/to_run_again_wic.sh: -------------------------------------------------------------------------------- 1 | 2 | rm new_trace_cache_t53b_* 3 | rm new_prof_cache_t53b_* 4 | rm prof_cache_t53b_64_4_lg_ftpipe 5 | rm new_prof_cache_t53b_64_4_lg_ftpipe 6 | rm new_trace_cache_t53b_64_4_lg 7 | 8 | ### mpipe (layers graph) 9 | # bash t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_boolq_multirc.sh 10 | # bash t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_rte.sh 11 | bash t5_used_scripts_example/to_partition_mpipe_layergraph_t5_3b_wic.sh # FAILED: size? 12 | 13 | ### mpipe (op graph) 14 | #bash t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_boolq_multirc.sh # FAILED: MEM 15 | #bash t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_rte.sh 16 | bash t5_used_scripts_example/to_partition_mpipe_t5_3b_opgraph_wic.sh 17 | 18 | 19 | ### spipe (layers graph) 20 | #bash t5_used_scripts_example/to_partition_spipe_t5_3b_boolq_multirc.sh 21 | #bash t5_used_scripts_example/to_partition_spipe_t5_3b_rte.sh 22 | bash t5_used_scripts_example/to_partition_spipe_t5_3b_wic.sh 23 | 24 | 25 | ### spipe (op graph) 26 | bash t5_used_scripts_example/to_partition_spipe_OP_t5_3b_wic.sh 27 | 28 | # TODO: gpipe: partition with smaller micro batch. 29 | 30 | #to_partition_mpipe_t5_base 31 | #to_partition_spipe_t5 32 | #to_partition_spipe_t5_base -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saareliad/FTPipe/986014df3bb985595f22139ab355a48ca00dc12f/tests/__init__.py --------------------------------------------------------------------------------
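The `WeightPredictor` / `FixFunction` pair in pipe/pipeline/weight_prediction/interface.py (listed above) is abstract, so a concrete sketch may help clarify how the pieces fit together. The sketch below is illustrative only: `SGDMomentumPredictor`, `LinearFix`, and the local `_backup` list are names invented for this example, and the repository's real predictors restore weights through `true_weights_storage` and compute momentum-decay-aware coefficients in their fix functions. It is nonetheless a working predictor for plain `torch.optim.SGD` with momentum:

from pipe.pipeline.weight_prediction.interface import WeightPredictor, FixFunction


class LinearFix(FixFunction):
    # Hypothetical coefficient: treat each of the n predicted steps as one full momentum step.
    def __call__(self, p: WeightPredictor, pg):
        return float(p.n_steps)


class SGDMomentumPredictor(WeightPredictor):
    # Sketch: predict w_hat = w - lr * coeff * momentum_buffer, then restore on revert().
    def forward(self):
        if not self.n_steps:
            return
        self._backup = []  # simplified stand-in for true_weights_storage
        for pg in self.optimizer.param_groups:
            coeff = self.fix_fn(self, pg) if self.fix_fn is not None else float(self.n_steps)
            for p in pg['params']:
                buf = self.optimizer.state[p].get('momentum_buffer')
                if buf is None:
                    continue  # no momentum accumulated yet (before the first optimizer step)
                self._backup.append((p, p.detach().clone()))
                p.data.add_(buf, alpha=-pg['lr'] * coeff)

    def revert(self):
        for p, saved in getattr(self, '_backup', []):
            p.data.copy_(saved)
        self._backup = []

Intended use per micro-batch, as the interface suggests (assuming `n_steps` measures staleness in optimizer steps): call `setup(staleness)` once the expected staleness is known, `forward()` to install the predicted weights before the forward/backward pass, and `revert()` to restore the true weights before the real optimizer step.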