├── .gitignore ├── .gitmodules ├── CHANGELOG.rst ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── NOTICE ├── README.md ├── byteps.exp ├── byteps.lds ├── byteps ├── __init__.py ├── __version__.py ├── _keras │ ├── __init__.py │ └── callbacks.py ├── common │ ├── __init__.py │ ├── common.cc │ ├── common.h │ ├── communicator.cc │ ├── communicator.h │ ├── compressor │ │ ├── common.h │ │ ├── compressor.cc │ │ ├── compressor.h │ │ ├── compressor_registry.cc │ │ ├── compressor_registry.h │ │ ├── error_feedback.cc │ │ ├── error_feedback.h │ │ ├── impl │ │ │ ├── dgc.cc │ │ │ ├── dgc.h │ │ │ ├── efsignsgd.cc │ │ │ ├── efsignsgd.h │ │ │ ├── nesterov_momentum.cc │ │ │ ├── nesterov_momentum.h │ │ │ ├── none.cc │ │ │ ├── none.h │ │ │ ├── onebit.cc │ │ │ ├── onebit.h │ │ │ ├── randomk.cc │ │ │ ├── randomk.h │ │ │ ├── test_error_feedback.cc │ │ │ ├── test_error_feedback.h │ │ │ ├── topk.cc │ │ │ ├── topk.h │ │ │ ├── vanilla_error_feedback.cc │ │ │ └── vanilla_error_feedback.h │ │ ├── momentum.cc │ │ ├── momentum.h │ │ └── utils.h │ ├── core_loops.cc │ ├── core_loops.h │ ├── cpu_reducer.cc │ ├── cpu_reducer.h │ ├── global.cc │ ├── global.h │ ├── half.h │ ├── logging.cc │ ├── logging.h │ ├── nccl_manager.cc │ ├── nccl_manager.h │ ├── operations.cc │ ├── operations.h │ ├── ready_table.cc │ ├── ready_table.h │ ├── scheduled_queue.cc │ ├── scheduled_queue.h │ ├── shared_memory.cc │ ├── shared_memory.h │ └── thread_pool.h ├── compressor_microbenchmark │ ├── Makefile │ ├── bandwidthtest.cu │ ├── common.h │ ├── compressor.h │ ├── dgc.cc │ ├── dgc.h │ ├── efsignSGD.cc │ ├── efsignSGD.h │ ├── log │ ├── onebit.cc │ ├── onebit.h │ ├── randomk.cc │ ├── randomk.h │ ├── test.cc │ ├── topk.cc │ └── topk.h ├── keras │ ├── __init__.py │ └── callbacks.py ├── misc │ ├── __init__.py │ └── imagenet18 │ │ └── __init__.py ├── mxnet │ ├── __init__.py │ ├── adapter.cc │ ├── adapter.h │ ├── compression.py │ ├── cuda_util.cc │ ├── cuda_util.h │ ├── ops.cc │ ├── ops.h │ ├── ops.py │ ├── ready_event.cc │ ├── ready_event.h │ ├── tensor_util.cc │ ├── tensor_util.h │ └── util.h ├── server │ ├── __init__.py │ ├── queue.h │ ├── server.cc │ └── server.h ├── sparse_cpu_microbenchmark │ ├── Makefile │ ├── log_145608 │ ├── log_145608_omp16 │ ├── log_439926 │ ├── log_439926_fp16 │ ├── log_439926_omp16 │ ├── log_439926_omp8 │ ├── test.cc │ └── test_fp16.cc ├── tensorflow │ ├── __init__.py │ ├── compression.py │ ├── distribute │ │ ├── __init__.py │ │ ├── cross_device_ops.py │ │ └── mirrored_strategy.py │ ├── keras │ │ ├── __init__.py │ │ └── callbacks.py │ ├── mergeComp │ │ ├── __init__.py │ │ ├── communicator │ │ │ ├── pool_allgather.py │ │ │ ├── pool_allreduce.py │ │ │ ├── pool_byteps.py │ │ │ └── pool_ps.py │ │ ├── compressor │ │ │ ├── pooldgc.py │ │ │ ├── poolefsignsgd.py │ │ │ ├── poolfp16.py │ │ │ ├── poolnone.py │ │ │ ├── poolonebit.py │ │ │ ├── poolqsgd.py │ │ │ ├── poolrandomk.py │ │ │ ├── poolsignsgd.py │ │ │ ├── poolsignum.py │ │ │ ├── poolterngrad.py │ │ │ └── pooltopk.py │ │ ├── helper.py │ │ ├── memory │ │ │ ├── dgc.py │ │ │ ├── memory_layer.py │ │ │ ├── memory_pool.py │ │ │ ├── none.py │ │ │ ├── pooldgc.py │ │ │ ├── poolnone.py │ │ │ ├── poolresidual.py │ │ │ └── residual.py │ │ ├── scheduler │ │ │ └── scheduler.py │ │ └── util.py │ ├── ops.cc │ ├── ops.h │ ├── ops.py │ ├── sparse_optimizer.py │ ├── synthetic_benchmark_tf2.py │ └── util.py └── torch │ ├── __init__.py │ ├── adapter.cc │ ├── adapter.h │ ├── benchmark_byteps.py │ ├── compression.py │ ├── cross_barrier.py │ ├── cuda_util.cc │ ├── cuda_util.h │ ├── examples │ ├── BERT 
│ │ ├── README.md │ │ ├── dataset │ │ │ ├── checkpoint │ │ │ │ ├── bert_base_config.json │ │ │ │ └── bert_config.json │ │ │ ├── squad │ │ │ │ └── v1.1 │ │ │ │ │ ├── dev-v1.1.json │ │ │ │ │ ├── evaluate-v1.1.py │ │ │ │ │ └── train-v1.1.json │ │ │ └── vocab.txt │ │ ├── modeling.py │ │ ├── optimization.py │ │ ├── run_baseline.sh │ │ ├── run_espresso.sh │ │ ├── run_squad.py │ │ ├── schedulers.py │ │ ├── tokenization.py │ │ └── utils.py │ ├── README.md │ ├── compressor_benchmark.py │ ├── extract.py │ ├── gpt-2 │ │ ├── README.md │ │ ├── gpt-2 │ │ │ ├── .circleci │ │ │ │ ├── config.yml │ │ │ │ └── deploy.sh │ │ │ ├── .coveragerc │ │ │ ├── .github │ │ │ │ ├── ISSUE_TEMPLATE │ │ │ │ │ ├── ---new-benchmark.md │ │ │ │ │ ├── --new-model-addition.md │ │ │ │ │ ├── bug-report.md │ │ │ │ │ ├── feature-request.md │ │ │ │ │ ├── migration.md │ │ │ │ │ └── question-help.md │ │ │ │ └── stale.yml │ │ │ ├── .gitignore │ │ │ ├── CONTRIBUTING.md │ │ │ ├── LICENSE │ │ │ ├── MANIFEST.in │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ ├── deploy_multi_version_doc.sh │ │ │ ├── docker │ │ │ │ └── Dockerfile │ │ │ ├── docs │ │ │ │ ├── Makefile │ │ │ │ ├── README.md │ │ │ │ └── source │ │ │ │ │ ├── _static │ │ │ │ │ ├── css │ │ │ │ │ │ ├── Calibre-Light.ttf │ │ │ │ │ │ ├── Calibre-Medium.otf │ │ │ │ │ │ ├── Calibre-Regular.otf │ │ │ │ │ │ ├── Calibre-Thin.otf │ │ │ │ │ │ ├── code-snippets.css │ │ │ │ │ │ └── huggingface.css │ │ │ │ │ └── js │ │ │ │ │ │ ├── custom.js │ │ │ │ │ │ └── huggingface_logo.svg │ │ │ │ │ ├── benchmarks.md │ │ │ │ │ ├── bertology.rst │ │ │ │ │ ├── conf.py │ │ │ │ │ ├── converting_tensorflow_models.rst │ │ │ │ │ ├── examples.md │ │ │ │ │ ├── glossary.rst │ │ │ │ │ ├── imgs │ │ │ │ │ ├── transformers_logo_name.png │ │ │ │ │ ├── warmup_constant_schedule.png │ │ │ │ │ ├── warmup_cosine_hard_restarts_schedule.png │ │ │ │ │ ├── warmup_cosine_schedule.png │ │ │ │ │ ├── warmup_cosine_warm_restarts_schedule.png │ │ │ │ │ └── warmup_linear_schedule.png │ │ │ │ │ ├── index.rst │ │ │ │ │ ├── installation.md │ │ │ │ │ ├── main_classes │ │ │ │ │ ├── configuration.rst │ │ │ │ │ ├── model.rst │ │ │ │ │ ├── optimizer_schedules.rst │ │ │ │ │ ├── processors.rst │ │ │ │ │ └── tokenizer.rst │ │ │ │ │ ├── migration.md │ │ │ │ │ ├── model_doc │ │ │ │ │ ├── albert.rst │ │ │ │ │ ├── auto.rst │ │ │ │ │ ├── bert.rst │ │ │ │ │ ├── camembert.rst │ │ │ │ │ ├── ctrl.rst │ │ │ │ │ ├── distilbert.rst │ │ │ │ │ ├── flaubert.rst │ │ │ │ │ ├── gpt.rst │ │ │ │ │ ├── gpt2.rst │ │ │ │ │ ├── roberta.rst │ │ │ │ │ ├── transformerxl.rst │ │ │ │ │ ├── xlm.rst │ │ │ │ │ ├── xlmroberta.rst │ │ │ │ │ └── xlnet.rst │ │ │ │ │ ├── model_sharing.md │ │ │ │ │ ├── multilingual.rst │ │ │ │ │ ├── notebooks.rst │ │ │ │ │ ├── pretrained_models.rst │ │ │ │ │ ├── quickstart.md │ │ │ │ │ ├── serialization.rst │ │ │ │ │ └── torchscript.rst │ │ │ ├── examples │ │ │ │ ├── README.md │ │ │ │ ├── benchmarks.py │ │ │ │ ├── contrib │ │ │ │ │ ├── README.md │ │ │ │ │ ├── run_camembert.py │ │ │ │ │ ├── run_openai_gpt.py │ │ │ │ │ ├── run_swag.py │ │ │ │ │ └── run_transfo_xl.py │ │ │ │ ├── distillation │ │ │ │ │ ├── README.md │ │ │ │ │ ├── distiller.py │ │ │ │ │ ├── grouped_batch_sampler.py │ │ │ │ │ ├── lm_seqs_dataset.py │ │ │ │ │ ├── requirements.txt │ │ │ │ │ ├── run_squad_w_distillation.py │ │ │ │ │ ├── train.py │ │ │ │ │ ├── training_configs │ │ │ │ │ │ ├── distilbert-base-multilingual-cased.json │ │ │ │ │ │ ├── distilbert-base-uncased.json │ │ │ │ │ │ ├── distilgpt2.json │ │ │ │ │ │ └── distilroberta-base.json │ │ │ │ │ └── utils.py │ │ │ │ ├── hans │ │ │ │ │ ├── 
hans_processors.py │ │ │ │ │ ├── test_hans.py │ │ │ │ │ └── utils_hans.py │ │ │ │ ├── mm-imdb │ │ │ │ │ ├── run_mmimdb.py │ │ │ │ │ └── utils_mmimdb.py │ │ │ │ ├── pplm │ │ │ │ │ ├── README.md │ │ │ │ │ ├── imgs │ │ │ │ │ │ ├── headfigure.png │ │ │ │ │ │ └── wooly.png │ │ │ │ │ ├── pplm_classification_head.py │ │ │ │ │ ├── run_pplm.py │ │ │ │ │ └── run_pplm_discrim_train.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── run_bertology.py │ │ │ │ ├── run_generation.py │ │ │ │ ├── run_glue.py │ │ │ │ ├── run_lm_finetuning.py │ │ │ │ ├── run_lm_finetuning_bps.py │ │ │ │ ├── run_lm_finetuning_bytecomp.py │ │ │ │ ├── run_multiple_choice.py │ │ │ │ ├── run_ner.py │ │ │ │ ├── run_squad.py │ │ │ │ ├── run_tf_glue.py │ │ │ │ ├── run_tf_ner.py │ │ │ │ ├── run_xnli.py │ │ │ │ ├── summarization │ │ │ │ │ ├── README.md │ │ │ │ │ ├── configuration_bertabs.py │ │ │ │ │ ├── convert_bertabs_original_pytorch_checkpoint.py │ │ │ │ │ ├── modeling_bertabs.py │ │ │ │ │ ├── requirements.txt │ │ │ │ │ ├── run_summarization.py │ │ │ │ │ ├── test_utils_summarization.py │ │ │ │ │ └── utils_summarization.py │ │ │ │ ├── test_examples.py │ │ │ │ ├── tests_samples │ │ │ │ │ ├── .gitignore │ │ │ │ │ ├── MRPC │ │ │ │ │ │ ├── dev.tsv │ │ │ │ │ │ └── train.tsv │ │ │ │ │ └── SQUAD │ │ │ │ │ │ ├── dev-v2.0.json │ │ │ │ │ │ └── train-v2.0.json │ │ │ │ ├── utils_multiple_choice.py │ │ │ │ └── utils_ner.py │ │ │ ├── hubconf.py │ │ │ ├── notebooks │ │ │ │ ├── Comparing-PT-and-TF-models.ipynb │ │ │ │ ├── Comparing-TF-and-PT-models-MLM-NSP.ipynb │ │ │ │ ├── Comparing-TF-and-PT-models-SQuAD.ipynb │ │ │ │ └── Comparing-TF-and-PT-models.ipynb │ │ │ ├── setup.cfg │ │ │ ├── setup.py │ │ │ ├── src │ │ │ │ └── transformers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── commands │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── convert.py │ │ │ │ │ ├── download.py │ │ │ │ │ ├── run.py │ │ │ │ │ ├── serving.py │ │ │ │ │ ├── train.py │ │ │ │ │ └── user.py │ │ │ │ │ ├── configuration_albert.py │ │ │ │ │ ├── configuration_auto.py │ │ │ │ │ ├── configuration_bert.py │ │ │ │ │ ├── configuration_camembert.py │ │ │ │ │ ├── configuration_ctrl.py │ │ │ │ │ ├── configuration_distilbert.py │ │ │ │ │ ├── configuration_flaubert.py │ │ │ │ │ ├── configuration_gpt2.py │ │ │ │ │ ├── configuration_mmbt.py │ │ │ │ │ ├── configuration_openai.py │ │ │ │ │ ├── configuration_roberta.py │ │ │ │ │ ├── configuration_t5.py │ │ │ │ │ ├── configuration_transfo_xl.py │ │ │ │ │ ├── configuration_utils.py │ │ │ │ │ ├── configuration_xlm.py │ │ │ │ │ ├── configuration_xlm_roberta.py │ │ │ │ │ ├── configuration_xlnet.py │ │ │ │ │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ │ │ │ │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_pytorch_checkpoint_to_tf2.py │ │ │ │ │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── metrics │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── squad_metrics.py │ │ │ │ │ └── processors │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── glue.py │ │ │ │ │ │ ├── squad.py │ │ │ │ │ │ ├── utils.py │ │ │ │ │ │ └── xnli.py │ │ 
│ │ │ ├── file_utils.py │ │ │ │ │ ├── hf_api.py │ │ │ │ │ ├── modelcard.py │ │ │ │ │ ├── modeling_albert.py │ │ │ │ │ ├── modeling_auto.py │ │ │ │ │ ├── modeling_bert.py │ │ │ │ │ ├── modeling_camembert.py │ │ │ │ │ ├── modeling_ctrl.py │ │ │ │ │ ├── modeling_distilbert.py │ │ │ │ │ ├── modeling_encoder_decoder.py │ │ │ │ │ ├── modeling_flaubert.py │ │ │ │ │ ├── modeling_gpt2.py │ │ │ │ │ ├── modeling_mmbt.py │ │ │ │ │ ├── modeling_openai.py │ │ │ │ │ ├── modeling_roberta.py │ │ │ │ │ ├── modeling_t5.py │ │ │ │ │ ├── modeling_tf_albert.py │ │ │ │ │ ├── modeling_tf_auto.py │ │ │ │ │ ├── modeling_tf_bert.py │ │ │ │ │ ├── modeling_tf_camembert.py │ │ │ │ │ ├── modeling_tf_ctrl.py │ │ │ │ │ ├── modeling_tf_distilbert.py │ │ │ │ │ ├── modeling_tf_gpt2.py │ │ │ │ │ ├── modeling_tf_openai.py │ │ │ │ │ ├── modeling_tf_pytorch_utils.py │ │ │ │ │ ├── modeling_tf_roberta.py │ │ │ │ │ ├── modeling_tf_t5.py │ │ │ │ │ ├── modeling_tf_transfo_xl.py │ │ │ │ │ ├── modeling_tf_transfo_xl_utilities.py │ │ │ │ │ ├── modeling_tf_utils.py │ │ │ │ │ ├── modeling_tf_xlm.py │ │ │ │ │ ├── modeling_tf_xlm_roberta.py │ │ │ │ │ ├── modeling_tf_xlnet.py │ │ │ │ │ ├── modeling_transfo_xl.py │ │ │ │ │ ├── modeling_transfo_xl_utilities.py │ │ │ │ │ ├── modeling_utils.py │ │ │ │ │ ├── modeling_xlm.py │ │ │ │ │ ├── modeling_xlm_roberta.py │ │ │ │ │ ├── modeling_xlnet.py │ │ │ │ │ ├── optimization.py │ │ │ │ │ ├── optimization_tf.py │ │ │ │ │ ├── pipelines.py │ │ │ │ │ ├── tokenization_albert.py │ │ │ │ │ ├── tokenization_auto.py │ │ │ │ │ ├── tokenization_bert.py │ │ │ │ │ ├── tokenization_bert_japanese.py │ │ │ │ │ ├── tokenization_camembert.py │ │ │ │ │ ├── tokenization_ctrl.py │ │ │ │ │ ├── tokenization_distilbert.py │ │ │ │ │ ├── tokenization_flaubert.py │ │ │ │ │ ├── tokenization_gpt2.py │ │ │ │ │ ├── tokenization_openai.py │ │ │ │ │ ├── tokenization_roberta.py │ │ │ │ │ ├── tokenization_t5.py │ │ │ │ │ ├── tokenization_transfo_xl.py │ │ │ │ │ ├── tokenization_utils.py │ │ │ │ │ ├── tokenization_xlm.py │ │ │ │ │ ├── tokenization_xlm_roberta.py │ │ │ │ │ └── tokenization_xlnet.py │ │ │ ├── templates │ │ │ │ ├── adding_a_new_example_script │ │ │ │ │ ├── README.md │ │ │ │ │ ├── run_xxx.py │ │ │ │ │ └── utils_xxx.py │ │ │ │ └── adding_a_new_model │ │ │ │ │ ├── README.md │ │ │ │ │ ├── configuration_xxx.py │ │ │ │ │ ├── convert_xxx_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── modeling_tf_xxx.py │ │ │ │ │ ├── modeling_xxx.py │ │ │ │ │ ├── tests │ │ │ │ │ ├── test_modeling_tf_xxx.py │ │ │ │ │ ├── test_modeling_xxx.py │ │ │ │ │ └── test_tokenization_xxx.py │ │ │ │ │ └── tokenization_xxx.py │ │ │ ├── tests │ │ │ │ ├── __init__.py │ │ │ │ ├── fixtures │ │ │ │ │ ├── dummy-config.json │ │ │ │ │ ├── empty.txt │ │ │ │ │ ├── input.txt │ │ │ │ │ ├── sample_text.txt │ │ │ │ │ └── test_sentencepiece.model │ │ │ │ ├── test_configuration_auto.py │ │ │ │ ├── test_configuration_common.py │ │ │ │ ├── test_doc_samples.py │ │ │ │ ├── test_hf_api.py │ │ │ │ ├── test_model_card.py │ │ │ │ ├── test_modeling_albert.py │ │ │ │ ├── test_modeling_auto.py │ │ │ │ ├── test_modeling_bert.py │ │ │ │ ├── test_modeling_common.py │ │ │ │ ├── test_modeling_ctrl.py │ │ │ │ ├── test_modeling_distilbert.py │ │ │ │ ├── test_modeling_encoder_decoder.py │ │ │ │ ├── test_modeling_gpt2.py │ │ │ │ ├── test_modeling_openai.py │ │ │ │ ├── test_modeling_roberta.py │ │ │ │ ├── test_modeling_t5.py │ │ │ │ ├── test_modeling_tf_albert.py │ │ │ │ ├── test_modeling_tf_auto.py │ │ │ │ ├── test_modeling_tf_bert.py │ │ │ │ ├── test_modeling_tf_common.py │ │ │ │ ├── 
test_modeling_tf_ctrl.py │ │ │ │ ├── test_modeling_tf_distilbert.py │ │ │ │ ├── test_modeling_tf_gpt2.py │ │ │ │ ├── test_modeling_tf_openai_gpt.py │ │ │ │ ├── test_modeling_tf_roberta.py │ │ │ │ ├── test_modeling_tf_t5.py │ │ │ │ ├── test_modeling_tf_transfo_xl.py │ │ │ │ ├── test_modeling_tf_xlm.py │ │ │ │ ├── test_modeling_tf_xlnet.py │ │ │ │ ├── test_modeling_transfo_xl.py │ │ │ │ ├── test_modeling_xlm.py │ │ │ │ ├── test_modeling_xlnet.py │ │ │ │ ├── test_optimization.py │ │ │ │ ├── test_optimization_tf.py │ │ │ │ ├── test_pipelines.py │ │ │ │ ├── test_tokenization_albert.py │ │ │ │ ├── test_tokenization_auto.py │ │ │ │ ├── test_tokenization_bert.py │ │ │ │ ├── test_tokenization_bert_japanese.py │ │ │ │ ├── test_tokenization_common.py │ │ │ │ ├── test_tokenization_ctrl.py │ │ │ │ ├── test_tokenization_distilbert.py │ │ │ │ ├── test_tokenization_gpt2.py │ │ │ │ ├── test_tokenization_openai.py │ │ │ │ ├── test_tokenization_roberta.py │ │ │ │ ├── test_tokenization_t5.py │ │ │ │ ├── test_tokenization_transfo_xl.py │ │ │ │ ├── test_tokenization_utils.py │ │ │ │ ├── test_tokenization_xlm.py │ │ │ │ ├── test_tokenization_xlnet.py │ │ │ │ └── utils.py │ │ │ ├── transformers-cli │ │ │ ├── utils │ │ │ │ ├── download_glue_data.py │ │ │ │ └── link_tester.py │ │ │ └── valohai.yaml │ │ ├── run_baseline.sh │ │ ├── run_espresso.sh │ │ └── run_prepare.sh │ ├── json_parser.py │ ├── lstm │ │ ├── README.md │ │ ├── data.py │ │ ├── getdata.sh │ │ ├── main.py │ │ ├── model.py │ │ ├── run_baseline.sh │ │ ├── run_espresso.sh │ │ ├── splitcross.py │ │ └── utils.py │ ├── resnet101 │ │ ├── README.md │ │ ├── main.py │ │ ├── run_baseline.sh │ │ └── run_espresso.sh │ ├── run_nvlink_models.sh │ ├── run_pcie_models.sh │ ├── test_compressor_cpu.py │ ├── ugatit │ │ ├── LICENSE │ │ ├── README.md │ │ ├── UGATIT.py │ │ ├── assets │ │ │ ├── ablation.png │ │ │ ├── discriminator.png │ │ │ ├── generator.png │ │ │ ├── kid.png │ │ │ ├── teaser.png │ │ │ └── user_study.png │ │ ├── dataset.py │ │ ├── main.py │ │ ├── networks.py │ │ ├── requirements.txt │ │ ├── run_baseline.sh │ │ ├── run_espresso.sh │ │ └── utils.py │ └── vgg16 │ │ ├── README.md │ │ ├── main.py │ │ ├── run_baseline.sh │ │ └── run_espresso.sh │ ├── handle_manager.cc │ ├── handle_manager.h │ ├── json_parser.py │ ├── launch_bps.py │ ├── mergeComp │ ├── __init__.py │ ├── communicator │ │ ├── DDPbackend.py │ │ ├── ddp_allgather.py │ │ ├── ddp_allgather_twolayer.py │ │ ├── ddp_fp16.py │ │ ├── ddp_hipress.py │ │ ├── ddp_hipress_resnet.py │ │ ├── global_comm_comp.py │ │ ├── inter_comm_comp.py │ │ ├── intra_comm_comp.py │ │ ├── intra_comm_comp_byteps.py │ │ ├── pool_allreduce.py │ │ └── pool_bytecomp.py │ ├── compressor │ │ ├── pooldgc.py │ │ ├── poolefsignsgd.py │ │ ├── poolfp16.py │ │ ├── poolint8.py │ │ ├── poolnone.py │ │ ├── poolonebit.py │ │ ├── poolqsgd.py │ │ ├── poolrandomk.py │ │ ├── poolsignsgd.py │ │ ├── poolsignum.py │ │ ├── poolterngrad.py │ │ └── pooltopk.py │ ├── helper.py │ ├── memory │ │ ├── dgc.py │ │ ├── efsignsgd.py │ │ ├── memory_layer.py │ │ ├── memory_pool.py │ │ ├── none.py │ │ ├── pooldgc.py │ │ ├── poolnone.py │ │ ├── poolresidual.py │ │ ├── residual.py │ │ └── topk.py │ ├── scheduler │ │ ├── README.md │ │ ├── bert │ │ │ └── nvlink_randomk_cpu.json │ │ ├── gpt2 │ │ │ └── nvlink_efsignsgd_cpu.json │ │ ├── lstm │ │ │ └── pcie_efsignsgd_two_cpu.json │ │ ├── model_tensor.py │ │ ├── resnet101 │ │ │ └── pcie_dgc_cpu.json │ │ ├── run_all_models.sh │ │ ├── scheduler.py │ │ ├── simulator_espresso.py │ │ ├── simulator_logs │ │ │ ├── bert │ │ │ │ └── 
pcie_randomk_cpu.json │ │ │ ├── gpt2 │ │ │ │ └── pcie_efsignsgd_cpu.json │ │ │ ├── lstm │ │ │ │ └── pcie_efsignsgd_two_cpu.json │ │ │ ├── resnet101 │ │ │ │ └── pcie_dgc_two_cpu.json │ │ │ ├── ugatit │ │ │ │ └── pcie_dgc_cpu.json │ │ │ └── vgg16 │ │ │ │ └── pcie_randomk_two_cpu.json │ │ ├── ugatit │ │ │ ├── nvlink_dgc_cpu.json │ │ │ └── nvlink_randomk_cpu.json │ │ └── vgg16 │ │ │ ├── pcie_efsignsgd_cpu.json │ │ │ └── pcie_randomk_two_cpu.json │ ├── util.py │ └── util_cpu.py │ ├── ops.cc │ ├── ops.h │ ├── ops.py │ ├── parallel │ ├── __init__.py │ └── distributed.py │ ├── ready_event.cc │ ├── ready_event.h │ ├── requirements.txt │ ├── run_byteps_ddp.sh │ ├── run_byteps_test_cpu.sh │ ├── run_byteps_test_gpu.sh │ ├── sparse_optimizer.py │ ├── test_ddp.py │ ├── test_torch.py │ └── utils.py ├── docker ├── Dockerfile └── README.md ├── docs ├── DistributedDataParallel.md ├── MirroredStrategy.md ├── architecture.md ├── best-practice.md ├── cross-barrier.md ├── env.md ├── faq.md ├── gradient-compression.md ├── performance.md ├── rationale.md ├── run-on-k8s.md ├── running.md ├── step-by-step-tutorial.md ├── timeline.md └── troubleshooting.md ├── espresso_EuroSys23.pdf ├── espresso_EuroSys_AE.pdf ├── example ├── README.md ├── keras │ ├── keras_imagenet_resnet50.py │ ├── keras_mnist.py │ ├── keras_mnist_advanced.py │ └── keras_synthetic_benchmark_tf2.py ├── mxnet │ ├── common │ │ ├── __init__.py │ │ ├── data.py │ │ ├── data_byteps.py │ │ ├── find_mxnet.py │ │ ├── fit.py │ │ ├── fit_byteps.py │ │ ├── modelzoo.py │ │ └── util.py │ ├── data │ │ ├── caltech256.sh │ │ └── imagenet1k-val.sh │ ├── symbols │ │ ├── README.md │ │ ├── __init__.py │ │ ├── alexnet.py │ │ ├── googlenet.py │ │ ├── inception-bn.py │ │ ├── inception-resnet-v2.py │ │ ├── inception-v3.py │ │ ├── inception-v4.py │ │ ├── lenet.py │ │ ├── mlp.py │ │ ├── mobilenet.py │ │ ├── mobilenetv2.py │ │ ├── resnet-v1.py │ │ ├── resnet.py │ │ ├── resnetv1.py │ │ ├── resnext.py │ │ └── vgg.py │ ├── train_cifar100_byteps_gc.py │ ├── train_gluon_imagenet_byteps_gc.py │ ├── train_gluon_mnist_byteps.py │ ├── train_gluon_mnist_byteps_gc.py │ └── train_imagenet_byteps.py ├── pytorch │ ├── benchmark_byteps.py │ ├── benchmark_byteps_ddp.py │ ├── benchmark_cross_barrier_byteps.py │ ├── elastic_benchmark_byteps.py │ ├── mnist-distributed.py │ ├── test_bytecomp_pytorch.py │ ├── train_imagenet_resnet50_byteps.py │ ├── train_imagenet_resnet_byteps_ddp.py │ └── train_mnist_byteps.py └── tensorflow │ ├── synthetic_benchmark.py │ ├── synthetic_benchmark_tf2.py │ ├── tensorflow2_keras_mnist.py │ ├── tensorflow2_mnist.py │ ├── tensorflow2_mnist_bps_MirroredStrategy.py │ ├── tensorflow_keras_mnist.py │ └── tensorflow_mnist.py ├── install.sh ├── launcher ├── README.md ├── dist_launcher.py └── launch.py ├── pre_setup.py ├── setup.py └── tests ├── meta_test.py ├── run_byteps_test.sh ├── test_dithering.py ├── test_mxnet.py ├── test_onebit.py ├── test_randomk.py ├── test_tensorflow_keras.py ├── test_topk.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | .vscode 3 | *.gz 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | bin/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually 
these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | # pycharm 112 | .idea 113 | 114 | # mac 115 | .DS_Store 116 | 117 | # for development 118 | scripts/ 119 | exps/ 120 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/ps-lite"] 2 | path = 3rdparty/ps-lite 3 | url = https://github.com/bytedance/ps-lite 4 | branch = byteps 5 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2 | Changelog for BytePS 3 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 4 | 0.2.4 (2020-06) 5 | ------------------ 6 | * Fix compatibility issue with tf2 + standalone keras 7 | * Add support for tensorflow.keras 8 | * Improve robustness of broadcast 9 | 10 | 11 | 0.2.3 (2020-05) 12 | ------------------ 13 | * Add DistributedDataParallel module for PyTorch 14 | * Fix the problem of different CPU tensor using the same name 15 | * Add skip_synchronize api for PyTorch 16 | * Add the option for lazy/non-lazy init 17 | 18 | 19 | 0.2.0 (2020-02) 20 | ------------------ 21 | * Largely improve RDMA performance by enforcing page aligned memory. 22 | * Add IPC support for RDMA. Now support colocating servers and workers without sacrificing much performance. 23 | * Fix a hanging bug in BytePS server. 24 | * Fix RDMA-related segmentation fault problem during fork() (e.g., used by PyTorch data loader). 25 | * New feature: Enable mixing use of colocate and non-colocate servers, along with a smart tensor allocation strategy. 26 | * New feature: Add ``bpslaunch`` as the command to launch tasks. 27 | * Add support for pip install: ``pip3 install byteps`` 28 | 29 | 30 | 0.1.0 (2019-12) 31 | ------------------ 32 | * First official release. 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guidelines 2 | 3 | First of all, thanks for taking the time to contribute! 
4 | 5 | Please refer to the following guidelines to contribute new functionality or bug fixes: 6 | 7 | 1. Use [autopep8](https://github.com/hhatto/autopep8) to format the Python code. 8 | 2. Use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to format C++ code. Changes to BytePS C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). 9 | 3. Add unit tests for any new code you write. 10 | 4. Run unit tests in both CI and GPU environments. 11 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include */* LICENSE byteps.lds byteps.exp 2 | prune .git 3 | prune dist 4 | recursive-include * *.cc *.h 5 | prune __pycache__ 6 | prune 3rdparty 7 | graft 3rdparty/ps-lite 8 | prune 3rdparty/ps-lite/build 9 | prune 3rdparty/ps-lite/deps 10 | exclude 3rdparty/ps-lite/tests/test_benchmark 11 | exclude 3rdparty/ps-lite/tests/test_benchmark.d 12 | exclude 3rdparty/ps-lite/tests/test_ipc_benchmark 13 | exclude 3rdparty/ps-lite/tests/test_ipc_benchmark.d 14 | 15 | include pre_setup.py pre_setup_local.py zeromq-4.1.4.tar.gz ucx.zip 16 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | BytePS includes derived work from the following: 2 | 3 | Horovod 4 | Copyright 2018 Uber Technologies, Inc. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | kennethreitz/setup.py 19 | Copyright 2019 Kenneth Reitz 20 | 21 | Permission is hereby granted, free of charge, to any person obtaining 22 | a copy of this software and associated documentation files (the 23 | "Software"), to deal in the Software without restriction, including 24 | without limitation the rights to use, copy, modify, merge, publish, 25 | distribute, sublicense, and/or sell copies of the Software, and to 26 | permit persons to whom the Software is furnished to do so, subject to 27 | the following conditions: 28 | 29 | The above copyright notice and this permission notice shall be included 30 | in all copies or substantial portions of the Software. 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 33 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 34 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 35 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 36 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 37 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 38 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reproduce Espresso Experiments 2 | 3 | This repository contains the system code and scripts that help run the Espresso experiments from our EuroSys '23 paper. 4 | 5 | ## Prerequisites 6 | 7 | - CUDA == 11.1 8 | - PyTorch >= 1.8.0 9 | - NCCL >= 2.8.3 10 | 11 | ## What machines? 12 | 13 | All of our experiments require multiple GPU machines. 14 | We expect each GPU machine to have eight V100 GPUs, each with 32GB of GPU memory. 15 | NVLink-based (100Gbps TCP) and PCIe-only (25Gbps TCP) GPU machines are needed for Figure 11 and Figure 12. 16 | If these resources are not available, it is fine to run the experiments with 4 GPU machines and to use 25Gbps TCP for the NVLink-based experiments. 17 | It is also fine to run the PCIe-only experiments on NVLink-based machines with `export NCCL_P2P_DISABLE=1`. 18 | 19 | 20 | ## Installation 21 | 22 | Install Espresso on each GPU machine. Please make sure the machines can successfully install [BytePS](https://github.com/bytedance/byteps). 23 | 24 | ```bash 25 | # In case you need to install PyTorch 26 | pip3 install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html 27 | 28 | git clone https://github.com/zhuangwang93/Espresso.git --recursive 29 | cd Espresso 30 | 31 | # install dependencies and datasets for training 32 | bash install.sh 33 | ``` 34 | 35 | ## AE file 36 | 37 | Follow the instructions in [espresso_EuroSys_AE.pdf](https://github.com/zhuangwang93/Espresso/blob/master/espresso_EuroSys_AE.pdf) to reproduce the experimental results in our paper. 38 | 39 | 40 | ## End-to-end training throughput 41 | 42 | The DNN models used by Espresso are in [examples](https://github.com/zhuangwang93/Espresso/tree/master/byteps/torch/examples). 43 | There are six DNN models, and each of them has a folder under `examples/`. 44 | 45 | Follow the instructions in this directory to reproduce the experimental results in Figure 11 and Figure 12. 46 | -------------------------------------------------------------------------------- /byteps.exp: -------------------------------------------------------------------------------- 1 | *byteps* 2 | # PyTorch binding 3 | *PyInit* 4 | *initc_lib* 5 | -------------------------------------------------------------------------------- /byteps.lds: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | *byteps*; 4 | # PyTorch binding 5 | *PyInit*; 6 | *initc_lib*; 7 | local: *; 8 | }; 9 | -------------------------------------------------------------------------------- /byteps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/__init__.py -------------------------------------------------------------------------------- /byteps/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 2, 5) 2 | 3 | __version__ = '.'.join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /byteps/common/compressor/compressor_registry.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H 17 | #define BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H 18 | 19 | #include "compressor.h" 20 | #include "utils.h" 21 | 22 | namespace byteps { 23 | namespace common { 24 | namespace compressor { 25 | 26 | class CompressorRegistry { 27 | public: 28 | // constructor of compressor 29 | using ctor_t = std::function( 30 | const kwargs_t& kwargs, size_t size, DataType dtype)>; 31 | 32 | using map_t = std::unordered_map; 33 | 34 | struct Register { 35 | Register(std::string name, ctor_t ctor); 36 | }; 37 | 38 | static ctor_t Find(const std::string& name); 39 | 40 | static std::unique_ptr Create(const kwargs_t& kwargs, size_t size, 41 | DataType dtype); 42 | 43 | private: 44 | static map_t _ctor_map; 45 | 46 | CompressorRegistry() = delete; 47 | ~CompressorRegistry() = delete; 48 | }; 49 | 50 | } // namespace compressor 51 | } // namespace common 52 | } // namespace byteps 53 | 54 | #endif // BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H -------------------------------------------------------------------------------- /byteps/common/compressor/error_feedback.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "error_feedback.h" 17 | 18 | namespace byteps { 19 | namespace common { 20 | namespace compressor { 21 | 22 | tensor_t ErrorFeedback::Compress(tensor_t grad) { 23 | 24 | // 1. grad <- grad + error 25 | UpdateGradient(grad); 26 | 27 | // 2. c <- Compress(grad) 28 | auto compressed = _cptr->Compress(grad); 29 | 30 | // 3. 
e <- grad - Decompress(c) 31 | // postpone the UpdateError() to avoid the blocking 32 | UpdateError(grad, compressed); 33 | 34 | return compressed; 35 | } 36 | 37 | tensor_t ErrorFeedback::Decompress(tensor_t compressed) { 38 | // directly forward to internal compressor 39 | return _cptr->Decompress(compressed); 40 | } 41 | 42 | 43 | void ErrorFeedback::UpdateError(tensor_t corrected, tensor_t compressed) { 44 | tensor_t error{_error.get(), _size, corrected.dtype}; 45 | _cptr->FastUpdateError(error, corrected, compressed); 46 | } 47 | 48 | } // namespace compressor 49 | } // namespace common 50 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/common/compressor/impl/nesterov_momentum.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H 17 | #define BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H 18 | 19 | #include "../momentum.h" 20 | 21 | namespace byteps { 22 | namespace common { 23 | namespace compressor { 24 | 25 | /*! 26 | * \brief Nesterov Momentum Compressor 27 | * 28 | * paper: A method for solving the convex programming problem with convergence 29 | * rate $O (1/k^2)$ 30 | * 31 | * m_t <- \mu m_{t-1} + g_t 32 | * g_t <- \mu m_t + g_t 33 | * 34 | */ 35 | class NesterovMomentumCompressor : public Momentum { 36 | public: 37 | NesterovMomentumCompressor(size_t size, DataType dtype, 38 | std::unique_ptr cptr, float mu) 39 | : Momentum(size, dtype, std::move(cptr), mu){}; 40 | virtual ~NesterovMomentumCompressor() = default; 41 | 42 | protected: 43 | void UpdateMom(tensor_t grad) override; 44 | void UpdateGradient(tensor_t grad) override; 45 | }; 46 | 47 | } // namespace compressor 48 | } // namespace common 49 | } // namespace byteps 50 | 51 | #endif // BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H -------------------------------------------------------------------------------- /byteps/common/compressor/impl/test_error_feedback.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // ============================================================================= 15 | 16 | #include 17 | 18 | #include "../compressor_registry.h" 19 | #include "test_error_feedback.h" 20 | 21 | namespace byteps { 22 | namespace common { 23 | namespace compressor { 24 | namespace { 25 | CompressorRegistry::Register reg( 26 | "test_ef", 27 | [](const kwargs_t& kwargs, size_t size, 28 | DataType dtype) -> std::unique_ptr { 29 | // register cptr 30 | auto kwargs_clone = kwargs; 31 | kwargs_clone.erase("ef_type"); 32 | auto cptr = CompressorRegistry::Create(kwargs_clone, size, dtype); 33 | BPS_CHECK_NE(cptr, nullptr); 34 | return std::unique_ptr( 35 | new TestErrorFeedbackCompressor(size, dtype, std::move(cptr))); 36 | }); 37 | } 38 | 39 | TestErrorFeedbackCompressor::TestErrorFeedbackCompressor( 40 | size_t size, DataType dtype, std::unique_ptr cptr) 41 | : ErrorFeedback(size, dtype, std::move(cptr)) {} 42 | 43 | void TestErrorFeedbackCompressor::UpdateGradient(tensor_t grad) { 44 | this->_cpu_reducer->sum(grad.data, _error.get(), grad.size, 45 | static_cast(grad.dtype)); 46 | } 47 | 48 | } // namespace compressor 49 | } // namespace common 50 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/common/compressor/momentum.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "momentum.h" 17 | 18 | namespace byteps { 19 | namespace common { 20 | namespace compressor { 21 | 22 | tensor_t Momentum::Compress(tensor_t grad) { 23 | // 1. m_t = \mu * m_{t-1} + g_t 24 | UpdateMom(grad); 25 | 26 | // 2. p_t = \mu m_t + g_t 27 | UpdateGradient(grad); 28 | 29 | // 3. compress 30 | return _cptr->Compress(grad); 31 | } 32 | 33 | tensor_t Momentum::Decompress(tensor_t compressed) { 34 | // directly forward to internal compressor 35 | return _cptr->Decompress(compressed); 36 | } 37 | 38 | } // namespace compressor 39 | } // namespace common 40 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/common/core_loops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_CORE_LOOPS_H 17 | #define BYTEPS_CORE_LOOPS_H 18 | 19 | namespace byteps { 20 | namespace common { 21 | 22 | void CoordinateNoneLoop(); 23 | 24 | void CoordinateReduceLoop(); 25 | void CoordinateIntraReduceLoop(); 26 | void CoordinateIntraGatherLoop(); 27 | void CoordinateIntraBroadcastLoop(); 28 | void CoordinateIntraReducescatterLoop(); 29 | void CoordinateIntraAllgatherLoop(); 30 | void CoordinateIntraAlltoallLoop(); 31 | 32 | void CoordinateBroadcastLoop(); 33 | 34 | void CoordinatePushLoop(); 35 | 36 | void PcieReduceLoop(); 37 | 38 | void RootNcclLoop(); 39 | 40 | void NonRootNcclLoop(); 41 | 42 | void SyncNcclLoop(); 43 | 44 | void CopyDevice2HostLoop(); 45 | void CompressCopyDevice2HostLoop(); 46 | 47 | void CompressLoop(); 48 | 49 | void PushLoop(); 50 | 51 | void PullLoop(); 52 | 53 | void DecompressLoop(); 54 | 55 | void RootCopyHost2DeviceLoop(); 56 | 57 | void NonRootCopyListenLoop(); 58 | 59 | void NonRootCopyHost2DeviceLoop(); 60 | 61 | } // namespace common 62 | } // namespace byteps 63 | 64 | #endif // BYTEPS_CORE_LOOPS_H 65 | -------------------------------------------------------------------------------- /byteps/common/ready_table.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "ready_table.h" 17 | 18 | #include "logging.h" 19 | 20 | namespace byteps { 21 | namespace common { 22 | 23 | // below are methods for accessing/modifying the _ready_table 24 | bool ReadyTable::IsKeyReady(uint64_t key) { 25 | std::lock_guard<std::mutex> lock(_table_mutex); 26 | return _ready_table[key] == (_ready_count); 27 | } 28 | 29 | int ReadyTable::AddReadyCount(uint64_t key) { 30 | std::lock_guard<std::mutex> lock(_table_mutex); 31 | BPS_CHECK_LT(_ready_table[key], _ready_count) 32 | << _table_name << ": " << _ready_table[key] << ", " << (_ready_count); 33 | return ++_ready_table[key]; 34 | } 35 | 36 | int ReadyTable::SetReadyCount(uint64_t key, int cnt) { 37 | std::lock_guard<std::mutex> lock(_table_mutex); 38 | return _ready_table[key] = cnt; 39 | } 40 | 41 | void ReadyTable::ClearReadyCount(uint64_t key) { 42 | std::lock_guard<std::mutex> lock(_table_mutex); 43 | _ready_table[key] = 0; 44 | } 45 | 46 | } // namespace common 47 | } // namespace byteps 48 | -------------------------------------------------------------------------------- /byteps/common/ready_table.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_READY_TABLE_H 17 | #define BYTEPS_READY_TABLE_H 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace byteps { 25 | namespace common { 26 | 27 | class ReadyTable { 28 | public: 29 | ReadyTable(int ready_count, const char* name) { 30 | _ready_count = ready_count; 31 | _table_name = std::string(name); 32 | } 33 | // methods to access or modify the _ready_table 34 | bool IsKeyReady(uint64_t key); 35 | int AddReadyCount(uint64_t key); 36 | int SetReadyCount(uint64_t key, int cnt); 37 | void ClearReadyCount(uint64_t key); 38 | 39 | private: 40 | // (key, ready_signal_count) pair, only valid for root device 41 | std::unordered_map _ready_table; 42 | // use this mutex to access/modify the _ready_table 43 | std::mutex _table_mutex; 44 | int _ready_count; 45 | std::string _table_name; 46 | }; 47 | 48 | } // namespace common 49 | } // namespace byteps 50 | 51 | #endif // BYTEPS_READY_TABLE_H 52 | -------------------------------------------------------------------------------- /byteps/common/scheduled_queue.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_SCHEDULED_QUEUE_H 17 | #define BYTEPS_SCHEDULED_QUEUE_H 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "common.h" 24 | #include "ready_table.h" 25 | 26 | namespace byteps { 27 | namespace common { 28 | 29 | class BytePSScheduledQueue { 30 | public: 31 | BytePSScheduledQueue(QueueType type); 32 | QueueType getQueueType() { return _qt; } 33 | void addTask(std::shared_ptr); 34 | void recorderTs(std::shared_ptr); 35 | std::shared_ptr getTask(); 36 | std::shared_ptr getTask(uint64_t key); 37 | uint32_t pendingSize(); 38 | void reportFinish(int size); 39 | void reset(uint64_t key, int cnt); 40 | 41 | private: 42 | // TODO: use priority queue or heap 43 | std::vector> _sq; 44 | std::mutex _mutex; 45 | uint64_t _credits; 46 | bool _is_scheduled; 47 | QueueType _qt; 48 | ReadyTable *_rt; 49 | }; 50 | 51 | } // namespace common 52 | } // namespace byteps 53 | 54 | #endif // BYTEPS_SCHEDULED_QUEUE_H 55 | -------------------------------------------------------------------------------- /byteps/common/shared_memory.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_SHARED_MEMORY_H 17 | #define BYTEPS_SHARED_MEMORY_H 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include "logging.h" 30 | 31 | namespace byteps { 32 | namespace common { 33 | 34 | class BytePSSharedMemory { 35 | public: 36 | BytePSSharedMemory() {} 37 | 38 | ~BytePSSharedMemory() { 39 | for (auto &it : _key_shm_addr) { 40 | CUDA_CALL(cudaHostUnregister(it.second)); 41 | munmap(it.second, _key_shm_size[it.first]); 42 | shm_unlink(it.first.c_str()); 43 | } 44 | 45 | BPS_LOG(DEBUG) << "Clear shared memory: all BytePS shared memory " 46 | "released/unregistered."; 47 | } 48 | 49 | void *openSharedMemory(const std::string &prefix, uint64_t key, size_t size); 50 | std::vector openPcieSharedMemory(uint64_t key, size_t size); 51 | 52 | private: 53 | std::unordered_map _key_shm_addr; 54 | std::unordered_map _key_shm_size; 55 | 56 | std::mutex _shm_mu; 57 | }; 58 | 59 | } // namespace common 60 | } // namespace byteps 61 | 62 | #endif // BYTEPS_SHARED_MEMORY_H 63 | -------------------------------------------------------------------------------- /byteps/compressor_microbenchmark/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CFLAGS = -g -Wall -O2 3 | LDFLAGS = -fopenmp 4 | 5 | TARGET = test 6 | DEPS = common.h compressor.h 7 | OBJ = efsignSGD.o onebit.o randomk.o dgc.o 8 | 9 | %.o: %.cc $(DEPS) 10 | $(CC) $(LDFLAGS) -c -o $@ $< 11 | 12 | 13 | $(TARGET): $(OBJ) test.o 14 | $(CC) $(LDFLAGS) -o $@ $^ $(CFLAGS) 15 | 16 | all: $(TARGET) 17 | 18 | .PHONY: clean 19 | 20 | clean: 21 | rm -f *.o $(TARGET) -------------------------------------------------------------------------------- /byteps/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/misc/__init__.py -------------------------------------------------------------------------------- /byteps/mxnet/adapter.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #if HAVE_CUDA 18 | #include "cuda.h" 19 | #endif 20 | 21 | #include "adapter.h" 22 | #include "cuda_util.h" 23 | #include "tensor_util.h" 24 | 25 | namespace byteps { 26 | namespace mxnet { 27 | 28 | 29 | template 30 | MXTensor::MXTensor(T* tensor) : tensor_(tensor) {} 31 | 32 | template 33 | const DataType MXTensor::dtype() const { 34 | return TensorUtil::GetDType(tensor_); 35 | } 36 | 37 | template 38 | const TensorShape MXTensor::shape() const { 39 | auto shape = TensorUtil::GetShape(tensor_); 40 | if (shape.dims() == 0) { 41 | // Tensor with empty shape is a Tensor with no values in MXNet, unlike a 42 | // constant in TensorFlow. So, we inject a dummy zero dimension to make sure 43 | // that the number-of-elements calculation is correct. 44 | shape.AddDim(0); 45 | } 46 | return shape; 47 | } 48 | 49 | template 50 | const void* MXTensor::data() const { 51 | return TensorUtil::GetData(tensor_); 52 | } 53 | 54 | template 55 | int64_t MXTensor::size() const { 56 | return TensorUtil::GetSize(tensor_); 57 | } 58 | 59 | template class MXTensor; 60 | 61 | } // namespace mxnet 62 | } // namespace byteps 63 | -------------------------------------------------------------------------------- /byteps/mxnet/adapter.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_ADAPTER_H 18 | #define BYTEPS_MXNET_ADAPTER_H 19 | 20 | #include 21 | #include "../common/common.h" 22 | 23 | namespace byteps { 24 | namespace mxnet { 25 | 26 | using namespace byteps::common; 27 | 28 | template 29 | class MXTensor : public Tensor { 30 | public: 31 | MXTensor(T* tensor); 32 | virtual const DataType dtype() const override; 33 | virtual const TensorShape shape() const override; 34 | virtual const void* data() const override; 35 | virtual int64_t size() const override; 36 | 37 | protected: 38 | T* tensor_; 39 | }; 40 | 41 | inline void ThrowIfError(const Status& status) { 42 | if (!status.ok()) { 43 | throw dmlc::Error(status.reason()); 44 | } 45 | } 46 | 47 | } // namespace mxnet 48 | } // namespace byteps 49 | 50 | #endif // BYTEPS_MXNET_ADAPTER_H 51 | -------------------------------------------------------------------------------- /byteps/mxnet/cuda_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #if HAVE_CUDA 18 | #include 19 | #include "cuda_runtime.h" 20 | #endif 21 | 22 | #include "../common/common.h" 23 | #include "cuda_util.h" 24 | #include "util.h" 25 | 26 | namespace byteps { 27 | namespace mxnet { 28 | 29 | with_device::with_device(int device) { 30 | if (device == CPU_DEVICE_ID) { 31 | restore_device_ = CPU_DEVICE_ID; 32 | } else { 33 | #if HAVE_CUDA 34 | CUDA_CALL(cudaGetDevice(&restore_device_)); 35 | CUDA_CALL(cudaSetDevice(device)); 36 | #else 37 | throw std::logic_error( 38 | "Internal error. Requested device context manager " 39 | "with GPU device but not compiled with CUDA."); 40 | #endif 41 | } 42 | } 43 | 44 | with_device::~with_device() { 45 | #if HAVE_CUDA 46 | if (restore_device_ != CPU_DEVICE_ID) { 47 | CUDA_CALL(cudaSetDevice(restore_device_)); 48 | } 49 | #endif 50 | } 51 | 52 | } // namespace mxnet 53 | } // namespace byteps 54 | -------------------------------------------------------------------------------- /byteps/mxnet/cuda_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_CUDA_UTIL_H 18 | #define BYTEPS_MXNET_CUDA_UTIL_H 19 | 20 | namespace byteps { 21 | namespace mxnet { 22 | 23 | class with_device { 24 | public: 25 | with_device(int device); 26 | ~with_device(); 27 | 28 | private: 29 | int restore_device_; 30 | }; 31 | 32 | } // namespace mxnet 33 | } // namespace byteps 34 | 35 | #endif // BYTEPS_MXNET_CUDA_UTIL_H 36 | -------------------------------------------------------------------------------- /byteps/mxnet/ops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_OPS_H 18 | #define BYTEPS_MXNET_OPS_H 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "../common/common.h" 26 | 27 | namespace byteps { 28 | namespace mxnet { 29 | 30 | using namespace byteps::common; 31 | 32 | typedef ::mxnet::Engine Engine; 33 | typedef ::mxnet::NDArray NDArray; 34 | typedef ::mxnet::Engine::CallbackOnComplete Callback; 35 | 36 | extern "C" int byteps_mxnet_push_pull_async(NDArray* input, char* name, 37 | int version, int priority, 38 | bool is_average); 39 | 40 | extern "C" void byteps_mxnet_declare_tensor(char* name, int num_args, 41 | char** args_keys, 42 | char** args_vals); 43 | 44 | } // namespace mxnet 45 | } // namespace byteps 46 | 47 | #endif // BYTEPS_MXNET_OPS_H 48 | -------------------------------------------------------------------------------- /byteps/mxnet/ready_event.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #include 18 | 19 | #if HAVE_CUDA 20 | #include 21 | 22 | #include "ready_event.h" 23 | 24 | namespace byteps { 25 | namespace mxnet { 26 | 27 | template 28 | MXReadyEvent::MXReadyEvent(NDArray* tensor) : tensor_(tensor) { 29 | assert(tensor->ctx().real_dev_id() != CPU_DEVICE_ID); 30 | } 31 | 32 | template 33 | MXReadyEvent::~MXReadyEvent() {} 34 | 35 | template 36 | bool MXReadyEvent::Ready() const { 37 | return true; 38 | } 39 | 40 | template class MXReadyEvent; 41 | 42 | } // namespace mxnet 43 | } // namespace byteps 44 | #endif 45 | -------------------------------------------------------------------------------- /byteps/mxnet/ready_event.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
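Aside on the two extern "C" entry points declared in byteps/mxnet/ops.h above: these are what the Python layer ultimately drives (the shipped bindings live in byteps/mxnet/ops.py and ops.cc). Purely as a hedged illustration, byteps_mxnet_declare_tensor could be exercised through ctypes roughly as below; the library name, the key/value pairs, and the wrapper itself are assumptions, not taken from the repo.

```python
import ctypes

lib = ctypes.CDLL("libbyteps.so", ctypes.RTLD_GLOBAL)  # assumed library name/path

def declare_tensor(name, params):
    """Hypothetical helper: params is a dict of string keys/values."""
    keys = (ctypes.c_char_p * len(params))(*[k.encode() for k in params])
    vals = (ctypes.c_char_p * len(params))(*[str(v).encode() for v in params.values()])
    lib.byteps_mxnet_declare_tensor(ctypes.c_char_p(name.encode()),
                                    ctypes.c_int(len(params)), keys, vals)

declare_tensor("byteps.gradient_0", {"compressor": "topk", "k": 1000})  # illustrative values
```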
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_READY_EVENT_H 18 | #define BYTEPS_MXNET_READY_EVENT_H 19 | 20 | #include 21 | 22 | #if HAVE_CUDA 23 | #include 24 | #include 25 | #include 26 | #include "cuda_runtime.h" 27 | 28 | #include "../common/common.h" 29 | 30 | namespace byteps { 31 | namespace mxnet { 32 | 33 | using namespace byteps::common; 34 | typedef ::mxnet::NDArray NDArray; 35 | 36 | template 37 | class MXReadyEvent : public ReadyEvent { 38 | public: 39 | MXReadyEvent(NDArray* tensor); 40 | ~MXReadyEvent(); 41 | virtual bool Ready() const override; 42 | 43 | private: 44 | NDArray* tensor_; 45 | }; 46 | 47 | } // namespace mxnet 48 | } // namespace byteps 49 | #endif 50 | 51 | #endif // BYTEPS_MXNET_READY_EVENT_H 52 | -------------------------------------------------------------------------------- /byteps/mxnet/util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_UTIL_H 18 | #define BYTEPS_MXNET_UTIL_H 19 | 20 | #if HAVE_CUDA 21 | 22 | #include 23 | 24 | /*! 25 | * \brief Protected CUDA call. 26 | * \param func Expression to call. 27 | * 28 | * It checks for CUDA errors after invocation of the expression. 29 | */ 30 | #define CUDA_CALL(func) \ 31 | { \ 32 | cudaError_t e = (func); \ 33 | CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ 34 | << "CUDA: " << cudaGetErrorString(e); \ 35 | } 36 | 37 | #endif // HAVE_CUDA 38 | 39 | #endif // BYTEPS_MXNET_UTIL_H 40 | -------------------------------------------------------------------------------- /byteps/server/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ctypes 17 | import os 18 | from byteps.common import get_ext_suffix 19 | 20 | 21 | def run(): 22 | dll_path = os.path.join(os.path.dirname(__file__), 23 | 'c_lib' + get_ext_suffix()) 24 | SERVER_LIB_CTYPES = ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL) 25 | SERVER_LIB_CTYPES.byteps_server() 26 | 27 | run() 28 | -------------------------------------------------------------------------------- /byteps/sparse_cpu_microbenchmark/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CFLAGS = -g -Wall -O2 3 | LDFLAGS = -fopenmp 4 | 5 | TARGET = test 6 | 7 | %.o: %.cc $(DEPS) 8 | $(CC) $(LDFLAGS) -c -o $@ $< 9 | 10 | 11 | $(TARGET): $(OBJ) test.o 12 | $(CC) $(LDFLAGS) -o $@ $^ $(CFLAGS) 13 | 14 | all: $(TARGET) 15 | 16 | .PHONY: clean 17 | 18 | clean: 19 | rm -f *.o $(TARGET) -------------------------------------------------------------------------------- /byteps/tensorflow/distribute/__init__.py: -------------------------------------------------------------------------------- 1 | from . mirrored_strategy import MirroredStrategy 2 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/communicator/pool_allgather.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mergeComp_dl.torch import Communicator 3 | from horovod.torch import allgather, allgather_async, synchronize 4 | import time 5 | import horovod.torch as hvd 6 | 7 | 8 | class PoolAllgather(Communicator): 9 | def __init__(self, compressor, memory): 10 | super().__init__(compressor, memory) 11 | self.world_size = hvd.size() 12 | self.name = "PoolAllGather" 13 | 14 | 15 | def async_send(self, tensors_compressed, ctx): 16 | if tensors_compressed is None: 17 | return 18 | 19 | handles = [] 20 | for i, tensor_compressed in enumerate(tensors_compressed): 21 | handle = allgather_async(tensor_compressed, ctx[0] + str(i)) 22 | handles.append(handle) 23 | 24 | return handles 25 | 26 | 27 | def wait_receive(self, handles, ctx): 28 | tensors_compressed = [] 29 | for h in handles: 30 | tensor_compressed = synchronize(h) 31 | tensors_compressed.append(tensor_compressed.chunk(self.world_size)) 32 | 33 | tensors_decompressed = [] 34 | if len(tensors_compressed) == 1: 35 | for tensor in tensors_compressed[0]: 36 | tensors_decompressed.append(self.compressor.decompress([tensor], ctx)) 37 | elif len(tensors_compressed) == 2: 38 | for tensor, meta in zip(tensors_compressed[0], tensors_compressed[1]): 39 | tensors_decompressed.append(self.compressor.decompress((tensor, meta), ctx)) 40 | 41 | tensors_decompressed = self.memory.aggregate(tensors_decompressed) 42 | return tensors_decompressed 43 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/communicator/pool_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mergeComp_dl.torch 
import Communicator 3 | from horovod.torch import allreduce_async, synchronize 4 | from horovod.torch.mpi_ops import Average 5 | 6 | 7 | class PoolAllreduce(Communicator): 8 | def __init__(self, compressor, memory): 9 | super().__init__(compressor, memory) 10 | self.name = "PoolAllReduce" 11 | 12 | 13 | def async_send(self, tensors_compressed, ctx): 14 | # assert only one tensor in tensors_compressed for allreduce 15 | return allreduce_async(tensors_compressed[0], name=ctx[0], op=Average) 16 | 17 | 18 | def wait_receive(self, handle, ctx): 19 | output = [synchronize(handle)] 20 | return [self.compressor.decompress(output, ctx)] 21 | 22 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/communicator/pool_byteps.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from byteps.torch.ops import push_pull_async_inplace as byteps_push_pull 4 | from byteps.torch.ops import synchronize 5 | import sys 6 | sys.path.append("../..") 7 | from mergeComp import Communicator 8 | 9 | class PoolBytePS(Communicator): 10 | def __init__(self, compressor, memory): 11 | super().__init__(compressor, memory) 12 | self.name = "PoolBytePS" 13 | 14 | 15 | def async_send(self, tensors_compressed, ctx): 16 | return byteps_push_pull(tensors_compressed[0], average=False, name=ctx[0]) 17 | 18 | 19 | def wait_receive(self, handle, ctx): 20 | output = [synchronize(handle)] 21 | return [self.compressor.decompress(output, ctx)] -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolefsignsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | from mergeComp_dl.torch.util import packbits, unpackbits 5 | 6 | 7 | class PoolEFSignSGDCompressor(Compressor): 8 | def __init__(self): 9 | super().__init__() 10 | self.name = "EFSignSGD" 11 | self.quantization = True 12 | 13 | 14 | def compress(self, tensor, name, ctx, server=False): 15 | """Encoding and compressing the signs """ 16 | numel = tensor.numel() 17 | 18 | sign_encode = tensor >= 0 19 | mean = tensor.abs().mean().reshape((1,)) 20 | 21 | int8_tensor, size = packbits(sign_encode) 22 | tensor_compressed = int8_tensor, mean 23 | 24 | ctx = (name, numel) 25 | return tensor_compressed, ctx 26 | 27 | 28 | def decompress(self, tensor_compressed, ctx, server=False): 29 | """Decoding the signs to float format """ 30 | int8_tensor, mean = tensor_compressed 31 | mean = mean[0] 32 | name, numel = ctx 33 | 34 | sign_decode = unpackbits(int8_tensor, numel) 35 | sign_decode = sign_decode.type(torch.float32) * 2 - 1 36 | return mean * sign_decode 37 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolfp16.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Compressor 5 | 6 | 7 | class PoolFP16Compressor(Compressor): 8 | """Compress all floating point gradients to 16-bit.""" 9 | def __init__(self): 10 | super().__init__() 11 | self.name = "PoolFP16" 12 | self.quantization = False 13 | 14 | def compress(self, tensor, name, start=0): 15 | """Downcasts the tensor to 16-bit.""" 16 | dtype = tensor.dtype 17 | tensor_compressed = tensor 18 | if dtype.is_floating: 19 | # Only allow compression from other 
floating point types 20 | tensor_compressed = tf.cast(tensor, dtype=tf.float16) 21 | ctx = (name, dtype) 22 | return [tensor_compressed], ctx 23 | 24 | 25 | def decompress(self, tensors, ctx): 26 | """Upcasts the tensor to the initialization dtype.""" 27 | tensor_compressed = tensors[0] 28 | _, dtype = ctx 29 | tensor_decompressed = tensor_compressed 30 | #print("[decompress] before", ctx, torch.sum(tensor_compressed)) 31 | if dtype.is_floating: 32 | tensor_decompressed = tf.cast(tensor_compressed, dtype=dtype) 33 | #print("[decompress] after", ctx, torch.sum(tensor_compressed)) 34 | return tensor_decompressed 35 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolnone.py: -------------------------------------------------------------------------------- 1 | from mergeComp_dl.torch import Compressor 2 | 3 | 4 | class PoolNoneCompressor(Compressor): 5 | """Default no-op compression.""" 6 | def __init__(self): 7 | super().__init__() 8 | self.name = "PoolNone" 9 | self.quantization = False 10 | 11 | def compress(self, tensor, name, start=None, server=False): 12 | ctx = (name, tensor.numel()) 13 | return [tensor], ctx 14 | 15 | def decompress(self, tensors, ctx, server=False): 16 | return tensors[0] 17 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolonebit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | from mergeComp_dl.torch.util import packbits, unpackbits 5 | 6 | 7 | class PoolOneBitCompressor(Compressor): 8 | def __init__(self): 9 | super().__init__() 10 | self.name = "PoolOneBit" 11 | self.quantization = False 12 | 13 | 14 | def compress(self, tensor, name, ctx, server=False): 15 | numel = tensor.numel() 16 | 17 | mask0 = tensor < 0 18 | sum0 = torch.sum(tensor[mask0]) 19 | num0 = torch.sum(mask0).float() 20 | mean0 = sum0 / num0 if num0 > 0 else sum0 21 | mean0 = mean0.reshape((1,)) 22 | 23 | mask1 = ~mask0 24 | sum1 = torch.sum(tensor[mask1]) 25 | num1 = numel - num0 26 | mean1 = sum1 / num1 if num1 > 0 else sum1 27 | mean1 = mean1.reshape((1,)) 28 | 29 | means = torch.cat((mean0, mean1)) 30 | 31 | int8_tensor, size = packbits(mask0) 32 | tensor_compressed = int8_tensor, means 33 | 34 | ctx = (name, numel) 35 | return tensor_compressed, ctx 36 | 37 | 38 | def decompress(self, tensor_compressed, ctx, server=False): 39 | int8_tensor, means = tensor_compressed 40 | mean0, mean1 = means[0], means[1] 41 | name, numel = ctx 42 | 43 | uint8_tensor = unpackbits(int8_tensor, numel) 44 | 45 | tensor_decompressed = uint8_tensor * mean0 + ~uint8_tensor * mean1 46 | return tensor_decompressed -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolqsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | 5 | 6 | class PoolQSGDCompressor(Compressor): 7 | 8 | def __init__(self, quantum_num): 9 | super().__init__() 10 | self.name = "PoolQSGD" 11 | self.quantization = True 12 | self.quantum_num = quantum_num 13 | 14 | 15 | def compress(self, tensor, name, ctx, server=False): 16 | shape = tensor.size() 17 | tensor = tensor.flatten() 18 | norm = tensor.norm().reshape((1,)) 19 | 20 | abs_gradient = tensor.abs() 21 | 22 | level_float = self.quantum_num / norm * abs_gradient 23 | 
previous_level = level_float.floor() 24 | prob = torch.empty_like(tensor).uniform_() 25 | is_next_level = (prob < (level_float - previous_level)).type(torch.float32) 26 | new_level = (previous_level + is_next_level) 27 | 28 | sign = tensor.sign() 29 | tensor_compressed = (new_level * sign).type(torch.int16) 30 | tensor_compressed = tensor_compressed.type(torch.int8 if self.quantum_num < 128 else torch.half) 31 | tensor_compressed = tensor_compressed, norm 32 | 33 | ctx = (name, shape) 34 | return tensor_compressed, ctx 35 | 36 | 37 | def decompress(self, tensor_compressed, ctx, server=False): 38 | tensor, norm = tensor_compressed 39 | norm = norm[0] 40 | decode_output = tensor.type(torch.float32) 41 | tensor_decompressed = norm / self.quantum_num * decode_output 42 | return tensor_decompressed 43 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolrandomk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | 5 | 6 | def sparsify(tensor, compress_ratio): 7 | numel = tensor.numel() 8 | k = max(1, int(numel * compress_ratio)) 9 | indices = torch.randperm(numel, device=tensor.device)[:k] 10 | values = tensor[indices] 11 | return values, indices.type(torch.int32) 12 | 13 | 14 | class PoolRandomKCompressor(Compressor): 15 | def __init__(self, compress_ratio): 16 | super().__init__() 17 | self.name = "RandomK" 18 | self.quantization = False 19 | self.compress_ratio = compress_ratio 20 | 21 | 22 | def compress(self, tensor, name, start): 23 | tensors = sparsify(tensor, self.compress_ratio) 24 | ctx = name, tensor.numel(), tensor.size() 25 | return tensors, ctx 26 | 27 | 28 | def decompress(self, tensors, ctx): 29 | name, numel, size = ctx 30 | values, indices = tensors 31 | tensor_decompressed = torch.zeros(numel, dtype=values.dtype, device=values.device) 32 | tensor_decompressed.scatter_(0, indices.type(torch.int64), values) 33 | return tensor_decompressed 34 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolsignsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | from mergeComp_dl.torch import Compressor 5 | from mergeComp_dl.torch.util import packbits, unpackbits 6 | 7 | 8 | class PoolSignSGDCompressor(Compressor): 9 | def __init__(self): 10 | super().__init__(average=False) 11 | self.name = "PoolSignSGD" 12 | self.quantization = True 13 | 14 | 15 | def compress(self, tensor, name, ctx, server=False): 16 | numel = tensor.numel() 17 | 18 | sign_encode = tensor >= 0 19 | mean = tensor.abs().mean().reshape((1,)) 20 | 21 | int8_tensor, size = packbits(sign_encode) 22 | tensor_compressed = int8_tensor, mean 23 | 24 | ctx = (name, numel) 25 | return tensor_compressed, ctx 26 | 27 | 28 | def decompress(self, tensor_compressed, ctx, server=False): 29 | """Decoding the signs to float format """ 30 | int8_tensor, mean = tensor_compressed 31 | mean = mean[0] 32 | name, numel = ctx 33 | 34 | sign_decode = unpackbits(int8_tensor, numel) 35 | sign_decode = sign_decode.type(torch.float32) * 2 - 1 36 | 37 | return mean * sign_decode -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolsignum.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | 
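Aside on the sign compressors above (PoolSignSGDCompressor, and PoolSignumCompressor which follows): they keep only the sign of each gradient entry plus its mean magnitude, and the cupy-based packbits call then stores eight signs per byte. Below is a minimal, cupy-free sketch of the same encode/decode round trip in plain PyTorch (illustration only; the bit-packing step is omitted).

```python
import torch

def sign_compress(tensor):
    # keep the sign pattern and one scale: the mean absolute value
    signs = tensor >= 0
    mean = tensor.abs().mean()
    return signs, mean

def sign_decompress(signs, mean):
    # signs back to {-1, +1}, rescaled by the stored magnitude
    return (signs.float() * 2 - 1) * mean

x = torch.randn(1 << 16)
x_hat = sign_decompress(*sign_compress(x))
# every reconstructed entry has the same magnitude: the mean |gradient|
assert x_hat.abs().max() == x_hat.abs().min()
```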
from mergeComp_dl.torch import Compressor 5 | from mergeComp_dl.torch.util import packbits, unpackbits 6 | 7 | 8 | class PoolSignumCompressor(Compressor): 9 | def __init__(self, momentum): 10 | super().__init__(average=False) 11 | self.name = "PoolSignNum" 12 | self.quantization = True 13 | self.momentum = momentum 14 | self.momentums = {} 15 | 16 | 17 | def compress(self, tensor, name, ctx, server=False): 18 | """Encoding and compressing the signs """ 19 | numel = tensor.numel() 20 | mean = tensor.abs().mean().reshape((1,)) 21 | 22 | # update tensor by momentum 23 | if name in self.momentums: 24 | tensor = (1.0 - self.momentum) * tensor + self.momentum * self.momentums[name] 25 | self.momentums[name] = tensor 26 | sign_encode = tensor >= 0 27 | 28 | int8_tensor, size = packbits(sign_encode) 29 | tensor_compressed = int8_tensor, mean 30 | 31 | ctx = (name, numel) 32 | return tensor_compressed, ctx 33 | 34 | 35 | def decompress(self, tensor_compressed, ctx, server=False): 36 | """Decoding the signs to float format """ 37 | int8_tensor, _ = tensor_compressed 38 | name, numel = ctx 39 | 40 | sign_decode = unpackbits(int8_tensor, numel) 41 | return sign_decode.type(torch.float32) * 2 - 1 42 | 43 | 44 | def aggregate(self, tensors): 45 | """Aggregate a list of tensors.""" 46 | agged_tensor = sum(tensors) 47 | agged_tensor = agged_tensor >= 0 48 | agged_tensor = agged_tensor * 2.0 - 1.0 49 | return [agged_tensor] 50 | 51 | 52 | def clean(self): 53 | self.momentums = {} -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolterngrad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | from mergeComp_dl.torch import Compressor 5 | from mergeComp_dl.torch.util import pack2bits, unpack2bits 6 | 7 | 8 | class PoolTernGradCompressor(Compressor): 9 | def __init__(self): 10 | super().__init__() 11 | self.name = "PoolTernGrad" 12 | self.quantization = True 13 | 14 | 15 | def compress(self, tensor, name, ctx, server=False): 16 | numel = tensor.numel() 17 | abs_gradient = tensor.abs() 18 | scalar = abs_gradient.max() 19 | sign_gradient = tensor.sign() * scalar 20 | 21 | try: 22 | rnd_sample = torch.empty_like(tensor).cuda().uniform_(0, scalar.item()) 23 | except: 24 | rnd_sample = torch.zeros_like(tensor).cuda() 25 | 26 | sign_gradient[rnd_sample >= abs_gradient] = 0 27 | 28 | mask = sign_gradient.sign() > 0 29 | tern_tensor = sign_gradient.sign() + 1 # {-1, 0, 1} + 1 30 | print(tern_tensor.sum()) 31 | 32 | int8_tensor, size = pack2bits(mask, tern_tensor) 33 | tensor_compressed = int8_tensor, scalar.flatten() 34 | 35 | ctx = (name, numel) 36 | return tensor_compressed, ctx 37 | 38 | 39 | def decompress(self, tensor_compressed, ctx, server=False): 40 | int8_tensor, scalar = tensor_compressed 41 | name, numel = ctx 42 | 43 | tern_tensor = unpack2bits(int8_tensor, numel) 44 | print(tern_tensor.sum()) 45 | 46 | sign = tern_tensor.type(torch.float32) - 1 # {0, 1, 2} - 1 47 | return sign * scalar -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/pooltopk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | 5 | 6 | def sparsify(tensor, compress_ratio): 7 | k = max(1, int(tensor.numel() * compress_ratio)) 8 | _, indices = torch.topk(tensor.abs(), k) 9 | values = tensor[indices] 10 | return values, 
indices.type(torch.int32) 11 | 12 | 13 | def desparsify(tensors, numel): 14 | values, indices = tensors 15 | tensor_decompressed = torch.zeros(numel, dtype=values.dtype, device=values.device) 16 | tensor_decompressed.scatter_(0, indices.type(torch.int64), values) 17 | return tensor_decompressed 18 | 19 | 20 | class PoolTopKCompressor(Compressor): 21 | 22 | def __init__(self, compress_ratio): 23 | super().__init__() 24 | self.name = "PoolTopK" 25 | self.quantization = False 26 | self.compress_ratio = compress_ratio 27 | 28 | 29 | def compress(self, tensor, name, start): 30 | tensors = sparsify(tensor, self.compress_ratio) 31 | ctx = (name, tensor.numel(), tensor.size()) 32 | return tensors, ctx 33 | 34 | 35 | def decompress(self, tensors, ctx): 36 | """Decompress by filling empty slots with zeros and reshape back using the original shape""" 37 | name, numel, size = ctx 38 | tensor_decompressed = desparsify(tensors, numel) 39 | return tensor_decompressed 40 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/memory/none.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import horovod.torch as hvd 3 | 4 | from .memory_layer import MemoryLayer 5 | 6 | 7 | class NoneMemory(MemoryLayer): 8 | def __init__(self, named_parameters): 9 | self.world_size = hvd.size() 10 | super().__init__(named_parameters) 11 | 12 | def compensate(self, tensor, name): 13 | """Update the tensor with the residuals.""" 14 | grad = self.get_grad(name) 15 | residual = self.get_velocity(name) 16 | residual.copy_(grad) 17 | 18 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 19 | """Update the residuals.""" 20 | pass 21 | 22 | def reduce(self, ctx, name): 23 | reduction = self.get_reduction(name) 24 | reduction.zero_() 25 | for c in ctx: 26 | reduction.add_(c/self.world_size) 27 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/memory/poolnone.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import byteps.tensorflow as bps 3 | 4 | from .memory_pool import MemoryPool 5 | 6 | 7 | class PoolNoneMemory(MemoryPool): 8 | def __init__(self, named_parameters, fusion_num=2): 9 | self.world_size = bps.size() 10 | super().__init__(named_parameters, fusion_num) 11 | 12 | def compensate(self, tensor, name): 13 | """Update the tensor with the residuals.""" 14 | grad = self.get_grad(name) 15 | residual = self.get_velocity(name) 16 | residual.assign(grad) 17 | 18 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 19 | """Update the residuals.""" 20 | pass 21 | 22 | def reduce(self, ctx, name): 23 | reduction = self.get_reduction(name) 24 | #reduction -= reduction 25 | # TODO:for compression algorithms with allreduce, the received results have been averaged already. 26 | # Probably there is no need to divide c with self.world_size. 
27 | 28 | #print(len(reduction), len(ctx)) 29 | reduction.assign(ctx) -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/memory/poolresidual.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import byteps.tensorflow as bps 3 | 4 | from .memory_pool import MemoryPool 5 | 6 | 7 | class PoolResidualMemory(MemoryPool): 8 | #TODO: tune beta and gamma to increase accurary 9 | def __init__(self, named_parameters, fusion_num=2, beta=0.9, gamma=1.0): 10 | self.beta = beta 11 | self.gamma = gamma 12 | self.world_size = hvd.size() 13 | super().__init__(named_parameters, fusion_num) 14 | 15 | 16 | def compensate(self, tensor, name): 17 | """vec stores the residuals""" 18 | grad = self.get_grad(name) 19 | residual = self.get_velocity(name) 20 | #residual.add_(grad) 21 | residual.mul_(self.beta).add_(self.gamma*grad) 22 | 23 | 24 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 25 | """Update the residuals.""" 26 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 27 | residual = self.get_velocity(name) 28 | residual.assign(tensor.view(-1) - tensor_decompressed) 29 | 30 | 31 | def reduce(self, ctx, name): 32 | reduction = self.get_reduction(name) 33 | reduction.zero_() 34 | for c in ctx: 35 | #reduction.add_(c) 36 | reduction.add_(c/self.world_size) 37 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/memory/residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import horovod.torch as hvd 3 | 4 | from .memory_layer import MemoryLayer 5 | 6 | 7 | class ResidualMemory(MemoryLayer): 8 | def __init__(self, named_parameters, beta=0.9, gamma=1.0): 9 | self.beta = beta 10 | self.gamma = gamma 11 | self.world_size = hvd.size() 12 | super().__init__(named_parameters) 13 | 14 | 15 | def compensate(self, tensor, name): 16 | """vec stores the residuals""" 17 | grad = self.get_grad(name) 18 | residual = self.get_velocity(name) 19 | residual.mul_(self.beta).add_(self.gamma*grad) 20 | 21 | 22 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 23 | """Update the residuals.""" 24 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 25 | residual = self.get_velocity(name) 26 | residual.assign(tensor.view(-1) - tensor_decompressed) 27 | 28 | 29 | def reduce(self, ctx, name): 30 | reduction = self.get_reduction(name) 31 | reduction.zero_() 32 | for c in ctx: 33 | reduction.add_(c/self.world_size) -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.dlpack import to_dlpack 3 | from torch.utils.dlpack import from_dlpack 4 | import cupy 5 | 6 | 7 | def torch2cupy(tensor): 8 | return cupy.fromDlpack(to_dlpack(tensor)) 9 | 10 | 11 | def cupy2torch(cupy_tensor): 12 | return from_dlpack(cupy_tensor.toDlpack()) 13 | 14 | 15 | def packbits(array): 16 | return cupy2torch(cupy.packbits(torch2cupy(array))), array.numel() 17 | 18 | 19 | def unpackbits(array, size): 20 | return cupy2torch(cupy.unpackbits(torch2cupy(array))[:size]) 21 | 22 | 23 | def pack2bits(first, second): 24 | data = torch.cat((first, second.type(torch.bool)), 0) 25 | return cupy2torch(cupy.packbits(torch2cupy(data))), first.numel() 26 | 27 | 28 | def 
unpack2bits(array, size): 29 | decode = cupy2torch(cupy.unpackbits(torch2cupy(array))) 30 | first = decode[:size] 31 | second = decode[size:2*size] 32 | second[first > 0] = 2 33 | 34 | return second 35 | 36 | 37 | -------------------------------------------------------------------------------- /byteps/tensorflow/ops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_TENSORFLOW_OPS_H 17 | #define BYTEPS_TENSORFLOW_OPS_H 18 | 19 | #include 20 | 21 | #include "tensorflow/core/framework/op.h" 22 | #include "tensorflow/core/framework/op_kernel.h" 23 | #include "tensorflow/core/framework/shape_inference.h" 24 | 25 | #define EIGEN_USE_THREADS 26 | #include "tensorflow/stream_executor/stream.h" 27 | 28 | #include "../common/operations.h" 29 | 30 | namespace byteps { 31 | namespace tensorflow { 32 | 33 | class TFReadyEvent : public common::ReadyEvent { 34 | public: 35 | TFReadyEvent(::tensorflow::DeviceContext* device_context); 36 | bool Ready() const override; 37 | 38 | private: 39 | std::shared_ptr event_; 40 | }; 41 | 42 | class TFTensor : public common::Tensor { 43 | public: 44 | TFTensor(::tensorflow::Tensor& tensor); 45 | virtual const common::DataType dtype() const override; 46 | virtual const common::TensorShape shape() const override; 47 | virtual const void* data() const override; 48 | virtual int64_t size() const override; 49 | 50 | protected: 51 | ::tensorflow::Tensor tensor_; 52 | }; 53 | 54 | extern "C" void byteps_tensorflow_declare_tensor(char* name); 55 | 56 | } // namespace tensorflow 57 | } // namespace byteps 58 | 59 | #endif // BYTEPS_TENSORFLOW_OPS_H 60 | -------------------------------------------------------------------------------- /byteps/tensorflow/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
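Aside on the packbits/unpackbits/pack2bits helpers in byteps/tensorflow/mergeComp/util.py above: they move data through DLPack into cupy, so they expect GPU tensors and an installed cupy. A usage sketch follows; the import path is an assumption (other files in this repo import them as mergeComp_dl.torch.util).

```python
import torch
from mergeComp.util import packbits, unpackbits  # import path is an assumption

grad = torch.randn(1 << 20, device="cuda")   # helpers go through cupy, so GPU tensors only
mask = grad >= 0                             # one boolean per element
packed, numel = packbits(mask)               # int8 tensor, eight signs per byte
recovered = unpackbits(packed, numel)        # back to one 0/1 value per element
assert bool((recovered.bool() == mask).all())
```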
14 | # ============================================================================== 15 | from distutils.version import LooseVersion 16 | 17 | import tensorflow as tf 18 | 19 | 20 | if LooseVersion(tf.__version__) >= LooseVersion("1.9.0"): 21 | from tensorflow.python.eager import context 22 | _has_eager = True 23 | else: 24 | _has_eager = False 25 | 26 | 27 | def _executing_eagerly(): 28 | """Returns true if eager execution is supported and enabled.""" 29 | return _has_eager and context.in_eager_mode() 30 | -------------------------------------------------------------------------------- /byteps/torch/adapter.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_ADAPTER_H 18 | #define BYTEPS_TORCH_ADAPTER_H 19 | 20 | #include 21 | #include 22 | 23 | #include "../common/common.h" 24 | 25 | namespace byteps { 26 | namespace torch { 27 | 28 | using namespace byteps::common; 29 | 30 | class TorchTensor : public Tensor { 31 | public: 32 | TorchTensor(::torch::Tensor tensor); 33 | virtual const DataType dtype() const override; 34 | virtual const TensorShape shape() const override; 35 | virtual const void* data() const override; 36 | virtual int64_t size() const override; 37 | 38 | protected: 39 | ::torch::Tensor tensor_; 40 | }; 41 | 42 | void ThrowIfError(Status status); 43 | 44 | } // namespace torch 45 | } // namespace byteps 46 | 47 | #endif // BYTEPS_TORCH_ADAPTER_H 48 | -------------------------------------------------------------------------------- /byteps/torch/cuda_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
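Aside on TorchTensor in byteps/torch/adapter.h above: it exposes a gradient to the BytePS core as a dtype, a shape, a data pointer, and a byte count. The same four pieces of information are visible from Python; the mapping below is an illustration, not taken from the implementation.

```python
import torch

t = torch.randn(2, 3)

dtype = t.dtype                          # roughly what dtype() reports as a common::DataType
shape = tuple(t.shape)                   # roughly shape() as a common::TensorShape
data_ptr = t.data_ptr()                  # roughly data()
nbytes = t.numel() * t.element_size()    # roughly size(), presumably in bytes
```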
14 | // ============================================================================= 15 | 16 | #if HAVE_CUDA 17 | #include 18 | #include "cuda_runtime.h" 19 | #endif 20 | 21 | #include "../common/common.h" 22 | #include "cuda_util.h" 23 | 24 | namespace byteps { 25 | namespace torch { 26 | 27 | with_device::with_device(int device) { 28 | if (device == CPU_DEVICE_ID) { 29 | restore_device_ = CPU_DEVICE_ID; 30 | } else { 31 | #if HAVE_CUDA 32 | THCudaCheck(cudaGetDevice(&restore_device_)); 33 | THCudaCheck(cudaSetDevice(device)); 34 | #else 35 | throw std::logic_error( 36 | "Internal error. Requested device context manager " 37 | "with GPU device but not compiled with CUDA."); 38 | #endif 39 | } 40 | } 41 | 42 | with_device::~with_device() { 43 | #if HAVE_CUDA 44 | if (restore_device_ != CPU_DEVICE_ID) { 45 | THCudaCheck(cudaSetDevice(restore_device_)); 46 | } 47 | #endif 48 | } 49 | 50 | } // namespace torch 51 | } // namespace byteps 52 | -------------------------------------------------------------------------------- /byteps/torch/cuda_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_CUDA_UTIL_H 18 | #define BYTEPS_TORCH_CUDA_UTIL_H 19 | 20 | #include "../common/common.h" 21 | 22 | namespace byteps { 23 | namespace torch { 24 | 25 | class with_device { 26 | public: 27 | with_device(int device); 28 | ~with_device(); 29 | 30 | private: 31 | int restore_device_ = CPU_DEVICE_ID; 32 | }; 33 | 34 | } // namespace torch 35 | } // namespace byteps 36 | 37 | #endif // BYTEPS_TORCH_CUDA_UTIL_H 38 | -------------------------------------------------------------------------------- /byteps/torch/examples/BERT/dataset/checkpoint/bert_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } -------------------------------------------------------------------------------- /byteps/torch/examples/BERT/dataset/checkpoint/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1024, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 4096, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 16, 10 | "num_hidden_layers": 24, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } -------------------------------------------------------------------------------- /byteps/torch/examples/BERT/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import byteps.torch as bps 15 | 16 | def get_rank(): 17 | return bps.rank() 18 | 19 | def is_main_process(): 20 | return get_rank() == 0 21 | 22 | def format_step(step): 23 | if isinstance(step, str): 24 | return step 25 | s = "" 26 | if len(step) > 0: 27 | s += "Training Epoch: {} ".format(step[0]) 28 | if len(step) > 1: 29 | s += "Training Iteration: {} ".format(step[1]) 30 | if len(step) > 2: 31 | s += "Validation Iteration: {} ".format(step[2]) 32 | return s -------------------------------------------------------------------------------- /byteps/torch/examples/README.md: -------------------------------------------------------------------------------- 1 | # End-to-end training throughput 2 | 3 | ## How to run 4 | 5 | `run_nvlink_models.sh` and `run_pcie_models.sh` are two scripts to run the experiments for E1 (NVLink-based experiments) and E2 (PCIe-only experiments) 6 | 7 | Set ifname and DMLC_PS_ROOT_URI in the two scripts. 
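The two scripts read ifname and DMLC_PS_ROOT_URI (defined just below), and the GPT-2 README later in this tree additionally exports DMLC_NUM_WORKER and DMLC_WORKER_ID. The scripts set these from the shell; purely as a hedged illustration, the same variables can also be set from Python before BytePS is imported. The concrete values and the extra port variable below are assumptions, and any other DMLC_* variables the scripts export are omitted.

```python
import os

os.environ.setdefault("DMLC_PS_ROOT_URI", "10.0.0.1")  # IP of the root machine (ID 0)
os.environ.setdefault("DMLC_PS_ROOT_PORT", "1234")     # assumption: a port is also required
os.environ.setdefault("DMLC_NUM_WORKER", "2")          # number of GPU machines (WORKERS)
os.environ.setdefault("DMLC_WORKER_ID", "0")           # this machine's ID, 0..WORKERS-1

import byteps.torch as bps
bps.init()
print("rank", bps.rank(), "of", bps.size())
```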
8 | 9 | ifname: the network interface card name, e.g., eth0, eth2 10 | 11 | DMLC_PS_ROOT_URI: the IP address of the root machine. Any machine involved in training can serve as the root machine and its ID is 0. 12 | 13 | For E1, run the command on each NVLink-based GPU machines 14 | ```bash 15 | bash run_nvlink_models.sh WORKERS ID 16 | ``` 17 | 18 | For E2, run the command on each PCIe-only GPU machines 19 | ```bash 20 | bash run_pcie_models.sh WORKERS ID 21 | ``` 22 | 23 | WORKERS: the number of GPU machines in the training 24 | 25 | ID: the id of a machine. machines have distinct IDs that start from 0 to WORKERS-1 26 | 27 | ## Results 28 | 29 | The results of model training throughput with different systems are logged in model_log. 30 | The metrics are `images/sec` or `tokens/sec`. 31 | Check the logs after the completion of training. 32 | 33 | 34 | ## For trace of time gaps 35 | 36 | Add the following environment variables in Shell scripts 37 | 38 | ```bash 39 | export BYTEPS_TRACE_ON=1 40 | export BYTEPS_TRACE_START_STEP=10 41 | export BYTEPS_TRACE_END_STEP=20 42 | export BYTEPS_TRACE_DIR=trace 43 | ``` 44 | 45 | Make sure there is a folder named `trace/0/` and then run the training without applying compression algorithms. Extract the time gaps with `byteps/torch/examples/json_parser.py` and remember to change the input file in json_parse.py. 46 | -------------------------------------------------------------------------------- /byteps/torch/examples/extract.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | 5 | 6 | folder = sys.argv[1] 7 | keyword = "Training speed" 8 | 9 | files = os.listdir(folder) 10 | 11 | for filename in files: 12 | path = os.path.join(folder, filename) 13 | print(filename) 14 | speeds = [] 15 | with open(path, 'r') as fp: 16 | lines = fp.readlines() 17 | for line in lines: 18 | if keyword in line: 19 | print(line.strip('\n')) 20 | speed = line.split()[-2] 21 | speeds.append(float(speed)) 22 | 23 | speeds = np.array(speeds) 24 | print("avg: {:.3f}\t std: {:.3f}".format(np.mean(speeds), np.std(speeds))) 25 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/README.md: -------------------------------------------------------------------------------- 1 | # GPT-2 2 | 3 | You can run the model step by step as follows. 4 | The dataset and the dependencies are all set after "bash install.sh". 5 | Go to "How to run" directly. 6 | 7 | ## Download the dataset 8 | ```bash 9 | mkdir ~/data 10 | cd ~/data 11 | wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip 12 | unzip wikitext-2-raw-v1.zip 13 | ``` 14 | **Note**: the default location of the dataset is ~/data, and the dataset for GPT2 is in ~/data/wikitext-2-raw 15 | 16 | 17 | ## Install dependencies 18 | ```bash 19 | bash run_prepare.sh 20 | ``` 21 | 22 | ## How to run 23 | **Note**: Make sure the dataset is in the right location and it runs on NVLink-based GPU machines. 24 | Set ifname in run_espresso.sh and run_baseline.sh. 25 | 26 | ifname: the network interface card name, e.g., eth0, eth2 27 | 28 | ```bash 29 | export DMLC_PS_ROOT_URI="ip" 30 | export DMLC_NUM_WORKER=WORKERS 31 | export DMLC_WORKER_ID=WORKER_ID 32 | ``` 33 | 34 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 35 | 36 | WORKERS: the number of GPU machines in the training 37 | 38 | ID: the id of a machine. 
machines have distinct IDs that start from 0 39 | 40 | 41 | ### Espresso 42 | Run on each machine 43 | ```bash 44 | bash run_espresso.sh 45 | ``` 46 | 47 | ### Baselines 48 | Run on each machine 49 | ```bash 50 | bash run_baseline.sh 51 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.circleci/deploy.sh: -------------------------------------------------------------------------------- 1 | cd docs 2 | 3 | function deploy_doc(){ 4 | echo "Creating doc at commit $1 and pushing to folder $2" 5 | git checkout $1 6 | if [ ! -z "$2" ] 7 | then 8 | if [ -d "$dir/$2" ]; then 9 | echo "Directory" $2 "already exists" 10 | else 11 | echo "Pushing version" $2 12 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2 13 | fi 14 | else 15 | echo "Pushing master" 16 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 17 | fi 18 | } 19 | 20 | deploy_doc "master" 21 | deploy_doc "b33a385" v1.0.0 22 | deploy_doc "fe02e45" v1.1.0 23 | deploy_doc "89fd345" v1.2.0 24 | deploy_doc "fc9faa8" v2.0.0 25 | deploy_doc "3ddce1d" v2.1.1 26 | deploy_doc "3616209" v2.2.0 27 | deploy_doc "d0f8b9a" v2.3.0 28 | deploy_doc "6664ea9" v2.4.0 -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=transformers 3 | omit = 4 | # skip convertion scripts from testing for now 5 | */convert_* 6 | */__main__.py 7 | [report] 8 | exclude_lines = 9 | pragma: no cover 10 | raise 11 | except 12 | register_parameter -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/---new-benchmark.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F5A5 New benchmark" 3 | about: Benchmark a part of this library and share your results 4 | title: "[Benchmark]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🖥 Benchmarking `transformers` 11 | 12 | ## Benchmark 13 | 14 | Which part of `transformers` did you benchmark? 15 | 16 | ## Set-up 17 | 18 | What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use? 19 | 20 | ## Results 21 | 22 | Put your results here! 
23 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/--new-model-addition.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F31F New model addition" 3 | about: Submit a proposal/request to implement a new Transformer-based model 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🌟 New model addition 11 | 12 | ## Model description 13 | 14 | 15 | 16 | ## Open source status 17 | 18 | * [ ] the model implementation is available: (give details) 19 | * [ ] the model weights are available: (give details) 20 | * [ ] who are the authors: (mention them, if possible by @gh-username) 21 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41B Bug Report" 3 | about: Submit a bug report to help us improve transformers 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🐛 Bug 11 | 12 | ## Information 13 | 14 | Model I am using (Bert, XLNet ...): 15 | 16 | Language I am using the model on (English, Chinese ...): 17 | 18 | The problem arises when using: 19 | * [ ] the official example scripts: (give details below) 20 | * [ ] my own modified scripts: (give details below) 21 | 22 | The tasks I am working on is: 23 | * [ ] an official GLUE/SQUaD task: (give the name) 24 | * [ ] my own task or dataset: (give details below) 25 | 26 | ## To reproduce 27 | 28 | Steps to reproduce the behavior: 29 | 30 | 1. 31 | 2. 32 | 3. 33 | 34 | 37 | 38 | ## Expected behavior 39 | 40 | 41 | 42 | ## Environment 43 | 44 | * OS: 45 | * Python version: 46 | * PyTorch version: 47 | * `transformers` version (or branch): 48 | * Using GPU ? 49 | * Distributed or parallel setup ? 
50 | * Any other relevant information: 51 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680 Feature request" 3 | about: Submit a proposal/request for a new transformers feature 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🚀 Feature request 11 | 12 | 14 | 15 | ## Motivation 16 | 17 | 20 | 21 | ## Your contribution 22 | 23 | 26 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/migration.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers" 3 | about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 📚 Migration 11 | 12 | ## Information 13 | 14 | 15 | 16 | Model I am using (Bert, XLNet ...): 17 | 18 | Language I am using the model on (English, Chinese ...): 19 | 20 | The problem arises when using: 21 | * [ ] the official example scripts: (give details below) 22 | * [ ] my own modified scripts: (give details below) 23 | 24 | The tasks I am working on is: 25 | * [ ] an official GLUE/SQUaD task: (give the name) 26 | * [ ] my own task or dataset: (give details below) 27 | 28 | ## Details 29 | 30 | 35 | 36 | ## Environment 37 | 38 | * OS: 39 | * Python version: 40 | * PyTorch version: 41 | * `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch): 42 | * `transformers` version (or branch): 43 | * Using GPU? 44 | * Distributed or parallel setup? 45 | * Any other relevant information: 46 | 47 | ## Checklist 48 | 49 | - [ ] I have read the migration guide in the readme. 50 | ([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers); 51 | [pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers)) 52 | - [ ] I checked if a related official extension example runs on my machine. 53 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/question-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓ Questions & Help" 3 | about: Post your general questions on Stack Overflow tagged huggingface-transformers 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # ❓ Questions & Help 11 | 12 | 23 | 24 | ## Details 25 | 26 | 27 | 29 | **A link to original question on Stack Overflow**: -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. 
Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style test test-examples 2 | 3 | # Check that source code meets quality standards 4 | 5 | quality: 6 | black --check --line-length 119 --target-version py35 examples templates tests src utils 7 | isort --check-only --recursive examples templates tests src utils 8 | flake8 examples templates tests src utils 9 | 10 | # Format source code automatically 11 | 12 | style: 13 | black --line-length 119 --target-version py35 examples templates tests src utils 14 | isort --recursive examples templates tests src utils 15 | 16 | # Run tests for the library 17 | 18 | test: 19 | python -m pytest -n auto --dist=loadfile -s -v ./tests/ 20 | 21 | # Run tests for examples 22 | 23 | test-examples: 24 | python -m pytest -n auto --dist=loadfile -s -v ./examples/ 25 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/deploy_multi_version_doc.sh: -------------------------------------------------------------------------------- 1 | cd docs 2 | 3 | function deploy_doc(){ 4 | echo "Creating doc at commit $1 and pushing to folder $2" 5 | git checkout $1 6 | if [ ! -z "$2" ] 7 | then 8 | echo "Pushing version" $2 9 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2 10 | else 11 | echo "Pushing master" 12 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 13 | fi 14 | } 15 | 16 | deploy_doc "master" 17 | deploy_doc "b33a385" v1.0.0 18 | deploy_doc "fe02e45" v1.1.0 19 | deploy_doc "89fd345" v1.2.0 20 | deploy_doc "fc9faa8" v2.0.0 21 | deploy_doc "3ddce1d" v2.1.1 22 | deploy_doc "f2f3294" v2.2.0 23 | deploy_doc "d0f8b9a" v2.3.0 24 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:latest 2 | 3 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 4 | 5 | RUN pip install transformers 6 | 7 | WORKDIR /workspace -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 
11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow { 11 | color: #6670FF; 12 | } -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/bertology.rst: -------------------------------------------------------------------------------- 1 | BERTology 2 | --------- 3 | 4 | There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are: 5 | 6 | 7 | * BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950 8 | * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 9 | * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. 
Manning: https://arxiv.org/abs/1906.04341 10 | 11 | In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): 12 | 13 | 14 | * accessing all the hidden-states of BERT/GPT/GPT-2, 15 | * accessing all the attention weights for each head of BERT/GPT/GPT-2, 16 | * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650. 17 | 18 | To help you understand and use these features, we have added a specific example script: `bertology.py `_ while extract information and prune a model pre-trained on GLUE. 19 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/transformers_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/transformers_logo_name.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_constant_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_constant_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_linear_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_linear_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/main_classes/configuration.rst: 
-------------------------------------------------------------------------------- 1 | Configuration 2 | ---------------------------------------------------- 3 | 4 | The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PretrainedConfig`` 7 | ~~~~~~~~~~~~~~~~~~~~~ 8 | 9 | .. autoclass:: transformers.PretrainedConfig 10 | :members: 11 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/main_classes/model.rst: -------------------------------------------------------------------------------- 1 | Models 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PreTrainedModel`` also implements a few methods which are common among all the models to: 7 | 8 | - resize the input token embeddings when new tokens are added to the vocabulary 9 | - prune the attention heads of the model. 10 | 11 | ``PreTrainedModel`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.PreTrainedModel 15 | :members: 16 | 17 | ``TFPreTrainedModel`` 18 | ~~~~~~~~~~~~~~~~~~~~~ 19 | 20 | .. autoclass:: transformers.TFPreTrainedModel 21 | :members: 22 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/main_classes/optimizer_schedules.rst: -------------------------------------------------------------------------------- 1 | Optimizer 2 | ---------------------------------------------------- 3 | 4 | The ``.optimization`` module provides: 5 | 6 | - an optimizer with weight decay fixed that can be used to fine-tuned models, and 7 | - several schedules in the form of schedule objects that inherit from ``_LRSchedule``: 8 | - a gradient accumulation class to accumulate the gradients of multiple batches 9 | 10 | ``AdamW`` 11 | ~~~~~~~~~~~~~~~~ 12 | 13 | .. autoclass:: transformers.AdamW 14 | :members: 15 | 16 | ``AdamWeightDecay`` 17 | ~~~~~~~~~~~~~~~~~~~ 18 | 19 | .. autoclass:: transformers.AdamWeightDecay 20 | :members: 21 | 22 | .. autofunction:: transformers.create_optimizer 23 | 24 | Schedules 25 | ---------------------------------------------------- 26 | 27 | Learning Rate Schedules 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29 | .. autofunction:: transformers.get_constant_schedule 30 | 31 | 32 | .. autofunction:: transformers.get_constant_schedule_with_warmup 33 | 34 | .. image:: /imgs/warmup_constant_schedule.png 35 | :target: /imgs/warmup_constant_schedule.png 36 | :alt: 37 | 38 | 39 | .. autofunction:: transformers.get_cosine_schedule_with_warmup 40 | 41 | .. image:: /imgs/warmup_cosine_schedule.png 42 | :target: /imgs/warmup_cosine_schedule.png 43 | :alt: 44 | 45 | 46 | .. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup 47 | 48 | .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png 49 | :target: /imgs/warmup_cosine_hard_restarts_schedule.png 50 | :alt: 51 | 52 | 53 | 54 | .. autofunction:: transformers.get_linear_schedule_with_warmup 55 | 56 | .. 
image:: /imgs/warmup_linear_schedule.png 57 | :target: /imgs/warmup_linear_schedule.png 58 | :alt: 59 | 60 | ``Warmup`` 61 | ~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.WarmUp 64 | :members: 65 | 66 | Gradient Strategies 67 | ---------------------------------------------------- 68 | 69 | ``GradientAccumulator`` 70 | ~~~~~~~~~~~~~~~~~~~~~~~ 71 | 72 | .. autoclass:: transformers.GradientAccumulator 73 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/main_classes/tokenizer.rst: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers: 7 | 8 | - tokenizing, converting tokens to ids and back and encoding/decoding, 9 | - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...), 10 | - managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization) 11 | 12 | ``PreTrainedTokenizer`` 13 | ~~~~~~~~~~~~~~~~~~~~~~~~ 14 | 15 | .. autoclass:: transformers.PreTrainedTokenizer 16 | :members: 17 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/model_doc/auto.rst: -------------------------------------------------------------------------------- 1 | AutoModels 2 | ----------- 3 | 4 | In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method. 5 | 6 | AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary: 7 | 8 | Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``). 9 | 10 | 11 | ``AutoConfig`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.AutoConfig 15 | :members: 16 | 17 | 18 | ``AutoTokenizer`` 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.AutoTokenizer 22 | :members: 23 | 24 | 25 | ``AutoModel`` 26 | ~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.AutoModel 29 | :members: 30 | 31 | 32 | ``AutoModelForPreTraining`` 33 | ~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.AutoModelForPreTraining 36 | :members: 37 | 38 | 39 | ``AutoModelWithLMHead`` 40 | ~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.AutoModelWithLMHead 43 | :members: 44 | 45 | 46 | ``AutoModelForSequenceClassification`` 47 | ~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.AutoModelForSequenceClassification 50 | :members: 51 | 52 | 53 | ``AutoModelForQuestionAnswering`` 54 | ~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. autoclass:: transformers.AutoModelForQuestionAnswering 57 | :members: 58 | 59 | 60 | ``AutoModelForTokenClassification`` 61 | ~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | .. 
autoclass:: transformers.AutoModelForTokenClassification 64 | :members: 65 | 66 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/model_sharing.md: -------------------------------------------------------------------------------- 1 | # Model upload and sharing 2 | 3 | Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the CLI that's built-in to the library. 4 | 5 | **First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then: 6 | 7 | ```shell 8 | transformers-cli login 9 | # log in using the same credentials as on huggingface.co 10 | ``` 11 | Upload your model: 12 | ```shell 13 | transformers-cli upload ./path/to/pretrained_model/ 14 | 15 | # ^^ Upload folder containing weights/tokenizer/config 16 | # saved via `.save_pretrained()` 17 | 18 | transformers-cli upload ./config.json [--filename folder/foobar.json] 19 | 20 | # ^^ Upload a single file 21 | # (you can optionally override its filename, which can be nested inside a folder) 22 | ``` 23 | 24 | Your model will then be accessible through its identifier, a concatenation of your username and the folder name above: 25 | ```python 26 | "username/pretrained_model" 27 | ``` 28 | 29 | Anyone can load it from code: 30 | ```python 31 | tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model") 32 | model = AutoModel.from_pretrained("username/pretrained_model") 33 | ``` 34 | 35 | Finally, list all your files on S3: 36 | ```shell 37 | transformers-cli s3 ls 38 | # List all your S3 objects. 39 | ``` 40 | 41 | You can also delete files: 42 | 43 | ```shell 44 | transformers-cli s3 rm … 45 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/notebooks.rst: -------------------------------------------------------------------------------- 1 | Notebooks 2 | ================================================ 3 | 4 | We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. 5 | 6 | 7 | * 8 | The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models. 9 | 10 | * 11 | The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models. 12 | 13 | * 14 | The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model. 15 | 16 | Please follow the instructions given in the notebooks to run and modify them. 
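A minimal sketch of the hidden-state and attention access that these notebooks and the BERTology features above build on (not a file from the repository; it assumes a `transformers` version whose `from_pretrained` accepts the `output_hidden_states`/`output_attentions` flags and that the `bert-base-uncased` weights can be downloaded):

```python
import torch
from transformers import BertModel, BertTokenizer

# Hypothetical example: inspect per-layer hidden states and attention weights.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained(
    "bert-base-uncased", output_hidden_states=True, output_attentions=True
)
model.eval()

input_ids = torch.tensor([tokenizer.encode("Who was Jim Henson ?", add_special_tokens=True)])
with torch.no_grad():
    outputs = model(input_ids)

# With both flags enabled, the output tuple ends with the hidden states and attentions.
hidden_states, attentions = outputs[-2], outputs[-1]
for layer, (h, a) in enumerate(zip(hidden_states[1:], attentions)):
    print(layer, h.shape, a.shape)  # (batch, seq, hidden) and (batch, heads, seq, seq)
```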
17 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/contrib/README.md: -------------------------------------------------------------------------------- 1 | # Community contributed examples 2 | 3 | This folder contains examples which are not actively maintained (mostly contributed by the community). 4 | 5 | Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working. 6 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/contrib/run_camembert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from transformers.modeling_camembert import CamembertForMaskedLM 4 | from transformers.tokenization_camembert import CamembertTokenizer 5 | 6 | 7 | def fill_mask(masked_input, model, tokenizer, topk=5): 8 | # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py 9 | assert masked_input.count("<mask>") == 1  # the input must contain exactly one <mask> token 10 | input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1 11 | logits = model(input_ids)[0] # The last hidden-state is the first element of the output tuple 12 | masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item() 13 | logits = logits[0, masked_index, :] 14 | prob = logits.softmax(dim=0) 15 | values, indices = prob.topk(k=topk, dim=0) 16 | topk_predicted_token_bpe = " ".join( 17 | [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))] 18 | ) 19 | masked_token = tokenizer.mask_token 20 | topk_filled_outputs = [] 21 | for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")): 22 | predicted_token = predicted_token_bpe.replace("\u2581", " ") 23 | if " {0}".format(masked_token) in masked_input: 24 | topk_filled_outputs.append( 25 | ( 26 | masked_input.replace(" {0}".format(masked_token), predicted_token), 27 | values[index].item(), 28 | predicted_token, 29 | ) 30 | ) 31 | else: 32 | topk_filled_outputs.append( 33 | (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,) 34 | ) 35 | return topk_filled_outputs 36 | 37 | 38 | tokenizer = CamembertTokenizer.from_pretrained("camembert-base") 39 | model = CamembertForMaskedLM.from_pretrained("camembert-base") 40 | model.eval() 41 | 42 | masked_input = "Le camembert est <mask> :)" 43 | print(fill_mask(masked_input, model, tokenizer, topk=3)) 44 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.3 7 | scipy==1.3.1 8 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true,
13 | "vocab_size": 119547 14 | } 15 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/examples/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/examples/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/requirements.txt: 
-------------------------------------------------------------------------------- 1 | tensorboardX 2 | tensorboard 3 | scikit-learn 4 | seqeval 5 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/summarization/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/tests_samples/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | cache* 3 | temp* 4 | !*.tsv 5 | !*.json 6 | !.gitignore -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/tests_samples/MRPC/dev.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 8 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/tests_samples/MRPC/train.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . 
The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 8 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | ensure_newline_before_comments = True 3 | force_grid_wrap = 0 4 | include_trailing_comma = True 5 | known_first_party = transformers 6 | known_third_party = 7 | absl 8 | fairseq 9 | fastprogress 10 | git 11 | h5py 12 | MeCab 13 | nltk 14 | numpy 15 | packaging 16 | PIL 17 | psutil 18 | seqeval 19 | sklearn 20 | tensorboardX 21 | tensorflow 22 | tensorflow_datasets 23 | torch 24 | torchtext 25 | torchvision 26 | 27 | line_length = 119 28 | lines_after_imports = 2 29 | multi_line_output = 3 30 | use_parentheses = True 31 | 32 | [flake8] 33 | ignore = E203, E501, W503 34 | max-line-length = 119 35 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- 
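For reference, the `DownloadCommand` defined just above boils down to two `from_pretrained` calls. A hedged Python sketch of what `transformers-cli download --cache-dir /tmp/models bert-base-uncased` would do (the model name and cache path are only examples, not taken from the repository):

```python
from transformers import AutoModel, AutoTokenizer

# Mirrors DownloadCommand.run(): download (or re-download) a model and its tokenizer
# into a chosen cache directory. The values below are illustrative placeholders.
model_name = "bert-base-uncased"   # positional `model` argument
cache_dir = "/tmp/models"          # corresponds to --cache-dir
force = False                      # corresponds to --force

AutoModel.from_pretrained(model_name, cache_dir=cache_dir, force_download=force)
AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, force_download=force)
```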
/byteps/torch/examples/gpt-2/gpt-2/src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 37 | """ 38 | 39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 40 | model_type = "camembert" 41 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 
36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | squad_convert_examples_to_features, 20 | xnli_output_modes, 21 | xnli_processors, 22 | xnli_tasks_num_labels, 23 | ) 24 | 25 | 26 | if is_sklearn_available(): 27 | from .metrics import glue_compute_metrics, xnli_compute_metrics 28 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 9 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/templates/adding_a_new_example_script/README.md: -------------------------------------------------------------------------------- 1 | # How to add a new example script in 🤗Transformers 2 | 3 | This folder provides a template for adding a new example script implementing a training or inference task with the models in the 🤗Transformers library. 4 | 5 | Currently, only PyTorch examples are provided; they are adaptations of the library's SQuAD examples, which implement single-GPU and distributed training with gradient accumulation and mixed precision (using NVIDIA's apex library) to cover a reasonable range of use cases.
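The GLUE helpers re-exported by `data/processors/__init__.py` above (`glue_processors`, `glue_convert_examples_to_features`) are what the example scripts use to turn TSV files such as the MRPC samples shown earlier into model inputs. A minimal, hedged sketch (the data directory is hypothetical and assumed to contain the `dev.tsv` layout shown above):

```python
from transformers import BertTokenizer
from transformers.data.processors.glue import glue_convert_examples_to_features, glue_processors

# Read MRPC-style sentence pairs from a directory containing dev.tsv (path is illustrative).
processor = glue_processors["mrpc"]()
examples = processor.get_dev_examples("/path/to/MRPC")

# Convert the sentence pairs into token ids, masks and labels for a BERT-style model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")
print(len(features), features[0].input_ids[:10])
```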
6 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/tests/__init__.py -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/dummy-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "roberta" 3 | } -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/empty.txt -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/input.txt: -------------------------------------------------------------------------------- 1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer 2 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/test_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/test_sentencepiece.model -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/test_tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from transformers.tokenization_distilbert import DistilBertTokenizer 18 | 19 | from .test_tokenization_bert import BertTokenizationTest 20 | from .utils import slow 21 | 22 | 23 | class DistilBertTokenizationTest(BertTokenizationTest): 24 | 25 | tokenizer_class = DistilBertTokenizer 26 | 27 | def get_tokenizer(self, **kwargs): 28 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 29 | 30 | @slow 31 | def test_sequence_builders(self): 32 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 33 | 34 | text = tokenizer.encode("sequence builders", add_special_tokens=False) 35 | text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) 36 | 37 | encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) 38 | encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) 39 | 40 | assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] 41 | assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ 42 | tokenizer.sep_token_id 43 | ] 44 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/test_tokenization_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc.. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import unittest 18 | 19 | from transformers import PreTrainedTokenizer 20 | from transformers.tokenization_gpt2 import GPT2Tokenizer 21 | 22 | from .utils import slow 23 | 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | self.assertIsInstance(special_tok, str) 36 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 37 | self.assertIsInstance(special_tok_id, int) 38 | 39 | @slow 40 | def test_pretrained_tokenizers(self): 41 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 42 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.download import DownloadCommand 5 | from transformers.commands.run import RunCommand 6 | from transformers.commands.user import UserCommands 7 | from transformers.commands.convert import ConvertCommand 8 | from transformers.commands.serving import ServeCommand 9 | 10 | if __name__ == '__main__': 11 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli []') 12 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 13 | 14 | # Register commands 15 | ConvertCommand.register_subcommand(commands_parser) 16 | DownloadCommand.register_subcommand(commands_parser) 17 | RunCommand.register_subcommand(commands_parser) 18 | ServeCommand.register_subcommand(commands_parser) 19 | UserCommands.register_subcommand(commands_parser) 20 | 21 | # Let's go 22 | args = parser.parse_args() 23 | 24 | if not hasattr(args, 'func'): 25 | parser.print_help() 26 | exit(1) 27 | 28 | # Run 29 | service = args.func(args) 30 | service.run() 31 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/run_prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # before running this, should make sure the data has been downloaded from: 4 | # https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/ 5 | 6 | export DATA_DIR=${DATA_DIR:-~/data} 7 | export TRAIN_FILE=${TRAIN_FILE:-$DATA_DIR/wikitext-2-raw/wiki.train.raw} 8 | export TEST_FILE=${TRAIN_FILE:-$DATA_DIR/wikitext-2-raw/wiki.test.raw} 9 | export DISTRIBUTED_FRAMEWORK=${DISTRIBUTED_FRAMEWORK:-byteps} 10 | 11 | THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 12 | 13 | cd ~ 14 | cd $THIS_DIR/gpt-2 15 | pip3 install . 
16 | pip3 install -r ./examples/requirements.txt 17 | 18 | # export NCCL_P2P_DISABLE=1 19 | 20 | # for the first run, you can do a dryrun to 21 | # prepare/download necessary configurations and models 22 | # in order to avoid multi-write conflicts 23 | cd $THIS_DIR/gpt-2/examples 24 | python3 -m torch.distributed.launch --nproc_per_node 8 run_lm_finetuning.py \ 25 | --output_dir=output \ 26 | --model_type=gpt2 \ 27 | --model_name_or_path=gpt2 \ 28 | --do_train \ 29 | --save_steps 1000000 \ 30 | --overwrite_output_dir \ 31 | --num_train_epochs 1 \ 32 | --per_gpu_eval_batch_size 4 \ 33 | --train_data_file=$TRAIN_FILE -------------------------------------------------------------------------------- /byteps/torch/examples/json_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Opening JSON file 4 | f = open('trace/0/tensor_ready.json',) 5 | 6 | # returns JSON object as 7 | # a dictionary 8 | data = json.load(f) 9 | 10 | # Iterating through the json 11 | # list 12 | time_points = {} 13 | for item in data['TensorReadyTraceEvents']: 14 | name = item["name"] 15 | ts = item["ts"] 16 | if name not in time_points: 17 | time_points[name] = [ts] 18 | else: 19 | time_points[name].append(ts) 20 | 21 | iters = 0 22 | for key, value in time_points.items(): 23 | if iters == 0: 24 | iters = len(value) 25 | value.sort() 26 | 27 | 28 | timelines = [[] for _ in range(iters)] 29 | time_gaps = [[0] for _ in range(iters)] 30 | 31 | for key, value in time_points.items(): 32 | for i, time in enumerate(value): 33 | timelines[i].append(time) 34 | 35 | for i, timeline in enumerate(timelines): 36 | timeline.sort() 37 | for j in range(len(timeline) - 1): 38 | gap = timeline[j+1] - timeline[j] 39 | time_gaps[i].append(gap) 40 | 41 | 42 | # Closing file 43 | f.close() 44 | for time_gap in time_gaps: 45 | print(sum(time_gap), time_gap) 46 | -------------------------------------------------------------------------------- /byteps/torch/examples/lstm/README.md: -------------------------------------------------------------------------------- 1 | # LSTM 2 | 3 | You can run the model step by step as follows. 4 | The dataset and the dependencies are all set after "bash install.sh". 5 | Go to "How to run" directly. 6 | 7 | 8 | ## Download the dataset 9 | ```bash 10 | bash getdata.sh 11 | ``` 12 | The default location of the dataset is ~/data, and the dataset for LSTM is in ~/data/wikitext-2 13 | 14 | 15 | ## How to run 16 | **Note**: Make sure it runs on NVLink-based GPU machines. 17 | Set ifname in run_espresso.sh and run_baseline.sh. 18 | 19 | ifname: the network interface card name, e.g., eth0, eth2 20 | 21 | ```bash 22 | export DMLC_PS_ROOT_URI="ip" 23 | export DMLC_NUM_WORKER=WORKERS 24 | export DMLC_WORKER_ID=WORKER_ID 25 | ``` 26 | 27 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 28 | 29 | WORKERS: the number of GPU machines in the training 30 | 31 | ID: the id of a machine. 
machines have distinct IDs that start from 0 32 | 33 | 34 | ### Espresso 35 | Run on each machine 36 | ```bash 37 | bash run_espresso.sh 38 | ``` 39 | 40 | ### Baselines 41 | Run on each machine 42 | ```bash 43 | bash run_baseline.sh 44 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/lstm/getdata.sh: -------------------------------------------------------------------------------- 1 | echo "=== Acquiring datasets ===" 2 | echo "---" 3 | mkdir -p save 4 | 5 | cd ~/data 6 | 7 | echo "- Downloading WikiText-2 (WT2)" 8 | wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 9 | unzip -q wikitext-2-v1.zip 10 | cd wikitext-2 11 | mv wiki.train.tokens train.txt 12 | mv wiki.valid.tokens valid.txt 13 | mv wiki.test.tokens test.txt -------------------------------------------------------------------------------- /byteps/torch/examples/lstm/run_espresso.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="`dirname $0`" 4 | set -x 5 | 6 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 7 | ifname=$1 8 | 9 | compress_ratio=0.01 10 | gpus=0,1,2,3,4,5,6,7 11 | export DMLC_ENABLE_RDMA=${DMLC_ENABLE_RDMA:-0} 12 | export DMLC_INTERFACE=${ifname} 13 | export NCCL_IB_DISABLE=1 14 | export NCCL_IB_GID_INDEX=3 15 | export NCCL_IB_HCA=mlx5_0 16 | export NCCL_SOCKET_IFNAME=${ifname} 17 | # export DMLC_NUM_WORKER=$1 18 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 19 | export DMLC_NODE_HOST="$(/sbin/ip -o -4 addr list ${ifname} | awk '{print $4}' | cut -d/ -f1)" 20 | export DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-12213} 21 | export NVIDIA_VISIBLE_DEVICES=${gpus} 22 | export BYTEPS_FORCE_DISTRIBUTED=0 23 | export BYTEPS_COMPRESSOR_k=${compress_ratio} 24 | export OMP_NUM_THREADS=4 25 | export TEST_TYPE=${TEST_TYPE:=torch} 26 | export NCCL_DEBUG=VERSION 27 | # Ensure the NCCL_BUFFSIZE is larger than the message size of the compressed tensors 28 | export NCCL_BUFFSIZE=16777216 29 | # export DMLC_WORKER_ID=$2 30 | 31 | IFS=', ' read -ra a <<< $gpus; 32 | gpus_per_node=${#a[@]} 33 | declare -p a; 34 | 35 | model='LSTM' 36 | DISTRIBUTED_ARGS="--nproc_per_node ${gpus_per_node} --nnodes ${DMLC_NUM_WORKER} --node_rank ${DMLC_WORKER_ID} --master_addr ${DMLC_PS_ROOT_URI} --master_port 12345" 37 | 38 | export BYTEPS_PARTITION_BYTES=4096000 39 | pkill -9 python3 40 | 41 | export NCCL_P2P_DISABLE=1 42 | 43 | for compressor in "efsignsgd" 44 | do 45 | pkill -9 python3 46 | export BYTEPS_INTER_COMPRESSOR=${compressor} 47 | scheduler_file="../../mergeComp/scheduler/lstm/pcie_${compressor}_two_cpu" 48 | BENCHMARK_ARGS="--compress --compressor ${compressor} --memory efsignsgd --comm espresso --compress-ratio ${compress_ratio} --scheduler-file ${scheduler_file} --scheduler-type -1" 49 | python3 -m torch.distributed.launch $DISTRIBUTED_ARGS $path/main.py --model ${model} --epochs 2 $BENCHMARK_ARGS 50 | sleep 5 51 | done -------------------------------------------------------------------------------- /byteps/torch/examples/lstm/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def repackage_hidden(h): 5 | """Wraps hidden states in new Tensors, 6 | to detach them from their history.""" 7 | if isinstance(h, torch.Tensor): 8 | return h.detach() 9 | else: 10 | return tuple(repackage_hidden(v) for v in h) 11 | 12 | 13 | def batchify(data, bsz, args): 14 | # Work out 
how cleanly we can divide the dataset into bsz parts. 15 | nbatch = data.size(0) // bsz 16 | # Trim off any extra elements that wouldn't cleanly fit (remainders). 17 | data = data.narrow(0, 0, nbatch * bsz) 18 | # Evenly divide the data across the bsz batches. 19 | data = data.view(bsz, -1).t().contiguous() 20 | if args.cuda: 21 | data = data.cuda() 22 | return data 23 | 24 | 25 | def get_batch(source, i, args, seq_len=None, evaluation=False): 26 | seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i) 27 | data = source[i:i+seq_len] 28 | target = source[i+1:i+1+seq_len].view(-1) 29 | return data, target -------------------------------------------------------------------------------- /byteps/torch/examples/resnet101/README.md: -------------------------------------------------------------------------------- 1 | # ResNet101 2 | 3 | You can run the model step by step as follows. 4 | The dataset and the dependencies are all set after "bash install.sh". 5 | Go to "How to run" directly. 6 | 7 | ## Download the dataset 8 | ```bash 9 | cd ~/data 10 | # we use a small dataset from ImageNet 11 | wget https://s3.amazonaws.com/fast-ai-imageclas/imagewang.tgz 12 | tar xf imagewang.tgz 13 | ``` 14 | The default location of the dataset is ~/data, and the dataset for VGG16 and ResNet101 is in ~/data/imagewang 15 | 16 | ## How to run 17 | **Note**: Make sure it runs on NVLink-based GPU machines. 18 | Set ifname in run_espresso.sh and run_baseline.sh. 19 | 20 | ifname: the network interface card name, e.g., eth0, eth2 21 | 22 | ```bash 23 | export DMLC_PS_ROOT_URI="ip" 24 | export DMLC_NUM_WORKER=WORKERS 25 | export DMLC_WORKER_ID=WORKER_ID 26 | ``` 27 | 28 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 29 | 30 | WORKERS: the number of GPU machines in the training 31 | 32 | ID: the id of a machine. 
machines have distinct IDs that start from 0 33 | 34 | 35 | ### Espresso 36 | Run on each machine 37 | ```bash 38 | bash run_espresso.sh 39 | ``` 40 | 41 | ### Baselines 42 | Run on each machine 43 | ```bash 44 | bash run_baseline.sh 45 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/resnet101/run_espresso.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="`dirname $0`" 4 | set -x 5 | 6 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 7 | ifname=$1 8 | 9 | compress_ratio=0.01 10 | gpus=0,1,2,3,4,5,6,7 11 | export DMLC_ENABLE_RDMA=${DMLC_ENABLE_RDMA:-0} 12 | export DMLC_INTERFACE=${ifname} 13 | export NCCL_IB_DISABLE=1 14 | export NCCL_IB_GID_INDEX=3 15 | export NCCL_IB_HCA=mlx5_0 16 | export NCCL_SOCKET_IFNAME=${ifname} 17 | # export DMLC_NUM_WORKER=$1 18 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 19 | export DMLC_NODE_HOST="$(/sbin/ip -o -4 addr list ${ifname} | awk '{print $4}' | cut -d/ -f1)" 20 | export DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-12213} 21 | export NVIDIA_VISIBLE_DEVICES=${gpus} 22 | export BYTEPS_FORCE_DISTRIBUTED=0 23 | export OMP_NUM_THREADS=4 24 | export TEST_TYPE=${TEST_TYPE:=torch} 25 | export NCCL_DEBUG=VERSION 26 | # Ensure the NCCL_BUFFSIZE is larger than the message size of the compressed tensors 27 | export NCCL_BUFFSIZE=16777216 28 | # export DMLC_WORKER_ID=$2 29 | 30 | IFS=', ' read -ra a <<< $gpus; 31 | gpus_per_node=${#a[@]} 32 | declare -p a; 33 | 34 | DISTRIBUTED_ARGS="--nproc_per_node ${gpus_per_node} --nnodes ${DMLC_NUM_WORKER} --node_rank ${DMLC_WORKER_ID} --master_addr ${DMLC_PS_ROOT_URI} --master_port 12345" 35 | 36 | export BYTEPS_PARTITION_BYTES=4096000 37 | pkill -9 python3 38 | export NCCL_P2P_DISABLE=1 39 | 40 | for model in "resnet101" 41 | do 42 | for compressor in "dgc" 43 | do 44 | pkill -9 python3 45 | scheduler_file="../../mergeComp/scheduler/${model}/pcie_dgc_cpu" 46 | export BYTEPS_INTER_COMPRESSOR=${compressor} 47 | BENCHMARK_ARGS="--compress --compressor ${compressor} --memory topk --comm espresso --compress-ratio ${compress_ratio} --scheduler-file ${scheduler_file} --scheduler-type -1" 48 | $GDB python3 -m torch.distributed.launch $DISTRIBUTED_ARGS $path/main.py --model ${model} --epochs 5 --batch-size 32 --speed_test $BENCHMARK_ARGS 49 | sleep 5 50 | done 51 | done -------------------------------------------------------------------------------- /byteps/torch/examples/run_nvlink_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 4 | ifname="eth0" 5 | export DMLC_PS_ROOT_URI="10.188.181.156" 6 | export DMLC_NUM_WORKER=$1 7 | export DMLC_WORKER_ID=$2 8 | 9 | # BERT-base 10 | cd BERT/ 11 | bash run_baseline.sh ${ifname} | tee -a ../bert_log 12 | bash run_espresso.sh ${ifname} | tee -a ../bert_log 13 | 14 | # GPT-2 15 | cd ../gpt-2/ 16 | bash run_prepare.sh 17 | bash run_baseline.sh ${ifname} | tee -a ../gpt2_log 18 | bash run_espresso.sh ${ifname} | tee -a ../gpt2_log 19 | 20 | 21 | # UGATIT 22 | cd ../ugatit/ 23 | bash run_baseline.sh ${ifname} | tee -a ../ugatit_log 24 | bash run_espresso.sh ${ifname} | tee -a ../ugatit_log -------------------------------------------------------------------------------- /byteps/torch/examples/run_pcie_models.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 4 | ifname="eth0" 5 | export DMLC_PS_ROOT_URI="10.188.181.156" 6 | export DMLC_NUM_WORKER=$1 7 | export DMLC_WORKER_ID=$2 8 | 9 | # VGG16 10 | cd vgg16/ 11 | bash run_baseline.sh ${ifname} | tee -a ../vgg16_log 12 | bash run_espresso.sh ${ifname} | tee -a ../vgg16_log 13 | 14 | # LSTM 15 | cd ../lstm/ 16 | bash run_baseline.sh ${ifname} | tee -a ../lstm_log 17 | bash run_espresso.sh ${ifname} | tee -a ../lstm_log 18 | 19 | 20 | # ResNet101 21 | cd ../resnet101/ 22 | bash run_baseline.sh ${ifname} | tee -a ../resnet101_log 23 | bash run_espresso.sh ${ifname} | tee -a ../resnet101_log -------------------------------------------------------------------------------- /byteps/torch/examples/test_compressor_cpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | from mergeComp_dl.torch.compressor.poolsignsgd import PoolSignSGDCompressor 4 | from mergeComp_dl.torch.compressor.pooldgc import PoolDgcCompressor 5 | from mergeComp_dl.torch.compressor.pooltopk import PoolTopKCompressor 6 | from mergeComp_dl.torch.compressor.poolfp16 import PoolFP16Compressor 7 | 8 | #compressor = PoolSignSGDCompressor() 9 | #compressor = PoolDgcCompressor(0.01) 10 | #compressor = PoolTopKCompressor(0.01) 11 | compressor = PoolFP16Compressor() 12 | 13 | base_size = 2 ** 10 14 | device = torch.device("cpu") 15 | 16 | kwargs = {'dtype': torch.float32, 17 | 'device': device, 18 | 'requires_grad': False} 19 | 20 | name = "test" 21 | size_list = [] 22 | compress_latency = [] 23 | decompress_latency = [] 24 | 25 | runs = 20 26 | 27 | for i in range(0, 18, 1): 28 | ctx = None 29 | size = base_size * 2 ** i 30 | size_list.append(10 + i) 31 | compress_time, decompress_time = 0, 0 32 | 33 | for _ in range(0, runs): 34 | tensor = torch.rand(size, **kwargs) 35 | torch.cuda.synchronize() 36 | start_time = time.time() 37 | compressed_tensor, ctx = compressor.compress(tensor, name, ctx) 38 | torch.cuda.synchronize() 39 | end_time = time.time() 40 | 41 | compress_time += end_time-start_time 42 | 43 | #print("Compress, size:", size, "time:", end_time-start_time) 44 | 45 | torch.cuda.synchronize() 46 | start_time = time.time() 47 | decompressed = compressor.decompress(compressed_tensor, ctx) 48 | torch.cuda.synchronize() 49 | end_time = time.time() 50 | decompress_time += end_time-start_time 51 | 52 | #print("Decompress, size:", size, "time:", end_time-start_time) 53 | 54 | compress_latency.append(round(compress_time*1000/runs, 2)) 55 | decompress_latency.append(round(decompress_time*1000/runs, 2)) 56 | 57 | print(size_list) 58 | print(compress_latency) 59 | print(decompress_latency) -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Hyeonwoo Kang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/README.md: -------------------------------------------------------------------------------- 1 | # UGATIT 2 | 3 | Our script is based on [Official PyTorch Implementation of UGATIT](https://github.com/znxlwm/UGATIT-pytorch) and you can find the dataset [here](https://drive.google.com/file/d/1xOWj1UVgp6NKMT3HbPhBbtq2A4EDkghF/view). 4 | 5 | 6 | You can run the model step by step as follows. 7 | The dataset and the dependencies are all set after "bash install.sh". 8 | Go to "How to run" directly. 9 | 10 | ## Download the dataset 11 | 12 | ```bash 13 | cd ~/data 14 | gdown 1xOWj1UVgp6NKMT3HbPhBbtq2A4EDkghF 15 | mkdir selfie2anime && unzip selfie2anime.zip -d selfie2anime 16 | ``` 17 | The default location of the dataset is ~/data, and the dataset for UGATIT is in ~/data/selfie2anime 18 | 19 | 20 | ## Install dependencies 21 | ```bash 22 | sudo apt-get update && sudo apt-get install libgl1 -y 23 | pip3 install opencv-python 24 | ``` 25 | 26 | ## How to run 27 | **Note**: Make sure the dataset is in the right location and it runs on NVLink-based GPU machines. 28 | Set ifname in run_espresso.sh and run_baseline.sh. 29 | 30 | ifname: the network interface card name, e.g., eth0, eth2 31 | 32 | ```bash 33 | export DMLC_PS_ROOT_URI="ip" 34 | export DMLC_NUM_WORKER=WORKERS 35 | export DMLC_WORKER_ID=WORKER_ID 36 | ``` 37 | 38 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 39 | 40 | WORKERS: the number of GPU machines in the training 41 | 42 | ID: the id of a machine. 
machines have distinct IDs that start from 0 43 | 44 | 45 | ### Espresso 46 | Run on each machine 47 | ```bash 48 | bash run_espresso.sh 49 | ``` 50 | 51 | ### Baselines 52 | Run on each machine 53 | ```bash 54 | bash run_baseline.sh 55 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/ablation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/ablation.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/discriminator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/discriminator.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/generator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/generator.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/kid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/kid.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/teaser.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/user_study.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/user_study.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==3.3.1.11 2 | torchvision==0.3.0 3 | -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/run_espresso.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="`dirname $0`" 4 | set -x 5 | 6 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 7 | ifname=$1 8 | 9 | compress_ratio=0.01 10 | gpus=0,1,2,3,4,5,6,7 11 | export DMLC_ENABLE_RDMA=${DMLC_ENABLE_RDMA:-0} 12 | export DMLC_INTERFACE=${ifname} 13 | export NCCL_IB_DISABLE=1 14 | export NCCL_IB_GID_INDEX=3 15 | export NCCL_IB_HCA=mlx5_0 16 | export NCCL_SOCKET_IFNAME=${ifname} 17 | # export DMLC_NUM_WORKER=$1 18 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 19 | export DMLC_NODE_HOST="$(/sbin/ip -o -4 addr list ${ifname} | awk '{print $4}' | cut -d/ -f1)" 20 | export 
DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-12213} 21 | export NVIDIA_VISIBLE_DEVICES=${gpus} 22 | export BYTEPS_FORCE_DISTRIBUTED=0 23 | export OMP_NUM_THREADS=4 24 | export NCCL_DEBUG=VERSION 25 | # Ensure the NCCL_BUFFSIZE is larger than the message size of the compressed tensors 26 | export NCCL_BUFFSIZE=16777216 27 | # export DMLC_WORKER_ID=$2 28 | 29 | IFS=', ' read -ra a <<< $gpus; 30 | gpus_per_node=${#a[@]} 31 | declare -p a; 32 | 33 | model='ugatit' 34 | DISTRIBUTED_ARGS="--nproc_per_node ${gpus_per_node} --nnodes ${DMLC_NUM_WORKER} --node_rank ${DMLC_WORKER_ID} --master_addr ${DMLC_PS_ROOT_URI} --master_port 12345" 35 | 36 | export NCCL_P2P_DISABLE=0 37 | export BYTEPS_PARTITION_BYTES=4096000 38 | 39 | # echo "Espresso" 40 | for compressor in "dgc" 41 | do 42 | pkill -9 python3 43 | export BYTEPS_INTER_COMPRESSOR=${compressor} 44 | scheduler_file="../../mergeComp/scheduler/${model}/nvlink_dgc_cpu" 45 | BENCHMARK_ARGS="--compress --compressor ${compressor} --memory topk --comm espresso --compress-ratio ${compress_ratio} --scheduler-file ${scheduler_file}" 46 | python3 -m torch.distributed.launch $DISTRIBUTED_ARGS $path/main.py --dataset selfie2anime --dataset_dir "$HOME/data" $BENCHMARK_ARGS 47 | sleep 5 48 | done -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/utils.py: -------------------------------------------------------------------------------- 1 | from scipy import misc 2 | import os, cv2, torch 3 | import numpy as np 4 | 5 | def load_test_data(image_path, size=256): 6 | img = misc.imread(image_path, mode='RGB') 7 | img = misc.imresize(img, [size, size]) 8 | img = np.expand_dims(img, axis=0) 9 | img = preprocessing(img) 10 | 11 | return img 12 | 13 | def preprocessing(x): 14 | x = x/127.5 - 1 # -1 ~ 1 15 | return x 16 | 17 | def save_images(images, size, image_path): 18 | return imsave(inverse_transform(images), size, image_path) 19 | 20 | def inverse_transform(images): 21 | return (images+1.) 
/ 2 22 | 23 | def imsave(images, size, path): 24 | return misc.imsave(path, merge(images, size)) 25 | 26 | def merge(images, size): 27 | h, w = images.shape[1], images.shape[2] 28 | img = np.zeros((h * size[0], w * size[1], 3)) 29 | for idx, image in enumerate(images): 30 | i = idx % size[1] 31 | j = idx // size[1] 32 | img[h*j:h*(j+1), w*i:w*(i+1), :] = image 33 | 34 | return img 35 | 36 | def check_folder(log_dir): 37 | if not os.path.exists(log_dir): 38 | os.makedirs(log_dir) 39 | return log_dir 40 | 41 | def str2bool(x): 42 | return x.lower() in ('true') 43 | 44 | def cam(x, size = 256): 45 | x = x - np.min(x) 46 | cam_img = x / np.max(x) 47 | cam_img = np.uint8(255 * cam_img) 48 | cam_img = cv2.resize(cam_img, (size, size)) 49 | cam_img = cv2.applyColorMap(cam_img, cv2.COLORMAP_JET) 50 | return cam_img / 255.0 51 | 52 | def imagenet_norm(x): 53 | mean = [0.485, 0.456, 0.406] 54 | std = [0.299, 0.224, 0.225] 55 | mean = torch.FloatTensor(mean).unsqueeze(0).unsqueeze(2).unsqueeze(3).to(x.device) 56 | std = torch.FloatTensor(std).unsqueeze(0).unsqueeze(2).unsqueeze(3).to(x.device) 57 | return (x - mean) / std 58 | 59 | def denorm(x): 60 | return x * 0.5 + 0.5 61 | 62 | def tensor2numpy(x): 63 | return x.detach().cpu().numpy().transpose(1,2,0) 64 | 65 | def RGB2BGR(x): 66 | return cv2.cvtColor(x, cv2.COLOR_RGB2BGR) -------------------------------------------------------------------------------- /byteps/torch/examples/vgg16/README.md: -------------------------------------------------------------------------------- 1 | # VGG16 2 | 3 | 4 | You can run the model step by step as follows. 5 | The dataset and the dependencies are all set after "bash install.sh". 6 | Go to "How to run" directly. 7 | 8 | ## Download the dataset 9 | ```bash 10 | cd ~/data 11 | # we use a small dataset from ImageNet 12 | wget https://s3.amazonaws.com/fast-ai-imageclas/imagewang.tgz 13 | tar xf imagewang.tgz 14 | ``` 15 | The default location of the dataset is ~/data, and the dataset for VGG16 and ResNet101 is in ~/data/imagewang 16 | 17 | **Note**: Make sure it runs on NVLink-based GPU machines. 18 | Set ifname in run_espresso.sh and run_baseline.sh. 19 | 20 | ifname: the network interface card name, e.g., eth0, eth2 21 | 22 | ```bash 23 | export DMLC_PS_ROOT_URI="ip" 24 | export DMLC_NUM_WORKER=WORKERS 25 | export DMLC_WORKER_ID=WORKER_ID 26 | ``` 27 | 28 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 29 | 30 | WORKERS: the number of GPU machines in the training 31 | 32 | ID: the id of a machine. 
machines have distinct IDs that start from 0 33 | 34 | 35 | ### Espresso 36 | Run on each machine 37 | ```bash 38 | bash run_espresso.sh 39 | ``` 40 | 41 | ### Baselines 42 | Run on each machine 43 | ```bash 44 | bash run_baseline.sh 45 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/vgg16/run_espresso.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="`dirname $0`" 4 | set -x 5 | 6 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 7 | ifname="eth0" 8 | 9 | compress_ratio=0.01 10 | gpus=0,1,2,3,4,5,6,7 11 | export DMLC_ENABLE_RDMA=${DMLC_ENABLE_RDMA:-0} 12 | export DMLC_INTERFACE=${ifname} 13 | export NCCL_IB_DISABLE=1 14 | export NCCL_IB_GID_INDEX=3 15 | export NCCL_IB_HCA=mlx5_0 16 | export NCCL_SOCKET_IFNAME=${ifname} 17 | # export DMLC_NUM_WORKER=$1 18 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 19 | export DMLC_NODE_HOST="$(/sbin/ip -o -4 addr list ${ifname} | awk '{print $4}' | cut -d/ -f1)" 20 | export DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-12213} 21 | export NVIDIA_VISIBLE_DEVICES=${gpus} 22 | export BYTEPS_FORCE_DISTRIBUTED=0 23 | export OMP_NUM_THREADS=4 24 | export TEST_TYPE=${TEST_TYPE:=torch} 25 | export NCCL_DEBUG=VERSION 26 | # Ensure the NCCL_BUFFSIZE is larger than the message size of the compressed tensors 27 | export NCCL_BUFFSIZE=16777216 28 | # export DMLC_WORKER_ID=$2 29 | 30 | IFS=', ' read -ra a <<< $gpus; 31 | gpus_per_node=${#a[@]} 32 | declare -p a; 33 | 34 | DISTRIBUTED_ARGS="--nproc_per_node ${gpus_per_node} --nnodes ${DMLC_NUM_WORKER} --node_rank ${DMLC_WORKER_ID} --master_addr ${DMLC_PS_ROOT_URI} --master_port 12345" 35 | 36 | export BYTEPS_PARTITION_BYTES=4096000 37 | pkill -9 python3 38 | export NCCL_P2P_DISABLE=1 39 | 40 | for model in "vgg16" 41 | do 42 | for compressor in "randomk" 43 | do 44 | pkill -9 python3 45 | scheduler_file="../../mergeComp/scheduler/${model}/pcie_randomk_two_cpu" 46 | export BYTEPS_INTER_COMPRESSOR=${compressor} 47 | BENCHMARK_ARGS="--compress --compressor ${compressor} --memory topk --comm espresso --compress-ratio ${compress_ratio} --scheduler-file ${scheduler_file} --scheduler-type -1" 48 | $GDB python3 -m torch.distributed.launch $DISTRIBUTED_ARGS $path/main.py --model ${model} --epochs 5 --batch-size 32 --speed_test $BENCHMARK_ARGS 49 | sleep 5 50 | done 51 | done -------------------------------------------------------------------------------- /byteps/torch/handle_manager.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_HANDLE_MANAGER_H 18 | #define BYTEPS_TORCH_HANDLE_MANAGER_H 19 | 20 | #include <atomic> 21 | #include <memory> 22 | #include <mutex> 23 | #include <unordered_map> 24 | 25 | #include "../common/common.h" 26 | 27 | namespace byteps { 28 | namespace torch { 29 | 30 | using namespace byteps::common; 31 | 32 | class HandleManager { 33 | public: 34 | int AllocateHandle(); 35 | void MarkDone(int handle, const Status& status); 36 | bool PollHandle(int handle); 37 | std::shared_ptr<Status> ReleaseHandle(int handle); 38 | 39 | private: 40 | std::atomic_int last_handle_; 41 | std::unordered_map<int, std::shared_ptr<Status>> results_; 42 | std::mutex mutex_; 43 | }; 44 | 45 | } // namespace torch 46 | } // namespace byteps 47 | 48 | #endif // BYTEPS_TORCH_HANDLE_MANAGER_H 49 | -------------------------------------------------------------------------------- /byteps/torch/json_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Opening JSON file 4 | f = open('traces/0/tensor_ready.json',) 5 | 6 | # returns JSON object as 7 | # a dictionary 8 | data = json.load(f) 9 | 10 | # Iterating through the json 11 | # list 12 | time_points = {} 13 | for item in data['TensorReadyTraceEvents']: 14 | name = item["name"] 15 | ts = item["ts"] 16 | if name not in time_points: 17 | time_points[name] = [ts] 18 | else: 19 | time_points[name].append(ts) 20 | 21 | iters = 0 22 | for key, value in time_points.items(): 23 | if iters == 0: 24 | iters = len(value) 25 | value.sort() 26 | 27 | 28 | timelines = [[] for _ in range(iters)] 29 | time_gaps = [[0] for _ in range(iters)] 30 | 31 | for key, value in time_points.items(): 32 | for i, time in enumerate(value): 33 | timelines[i].append(time) 34 | 35 | for i, timeline in enumerate(timelines): 36 | timeline.sort() 37 | for j in range(len(timeline) - 1): 38 | gap = timeline[j+1] - timeline[j] 39 | time_gaps[i].append(gap) 40 | 41 | 42 | # Closing file 43 | f.close() 44 | for time_gap in time_gaps: 45 | print(sum(time_gap), time_gap) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/communicator/ddp_fp16.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import byteps.torch as bps 3 | import sys 4 | from time import time 5 | sys.path.append("../..") 6 | from mergeComp import Communicator 7 | 8 | 9 | class DDPFP16(Communicator): 10 | def __init__(self, fp16_compressor, memory, DDPbackend, profile=False): 11 | super().__init__(fp16_compressor, fp16_compressor, memory) 12 | self.allreduce = DDPbackend.global_allreduce 13 | self.name = "DDPFP16" 14 | self.world_size = bps.size() 15 | self.comm_stream = torch.cuda.Stream(priority=-1) 16 | self.handles = {} 17 | self.shapes = {} 18 | self.profile = profile 19 | self.compress_overhead = 0 20 | self.decompress_overhead = 0 21 | self.iteration = -1 22 | 23 | 24 | def async_send(self, tensor, name): 25 | with torch.cuda.stream(self.comm_stream): 26 | self.handles[name] = self.allreduce(tensor.type(torch.float16)) 27 | return [-1], (name,) 28 | 29 | 30 | def wait_receive(self, handle, ctx): 31 | name = ctx[0] 32 | torch.cuda.current_stream().wait_stream(self.comm_stream) 33 | return self.handles[name].type(torch.float32) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/communicator/pool_allreduce.py:
-------------------------------------------------------------------------------- 1 | import torch 2 | from mergeComp.torch import Communicator 3 | from horovod.torch import allreduce_async, synchronize 4 | from horovod.torch.mpi_ops import Average 5 | 6 | 7 | class PoolAllreduce(Communicator): 8 | def __init__(self, compressor, memory): 9 | super().__init__(compressor, memory) 10 | self.name = "PoolAllReduce" 11 | 12 | 13 | def async_send(self, tensors_compressed, ctx): 14 | # assert only one tensor in tensors_compressed for allreduce 15 | return allreduce_async(tensors_compressed[0], name=ctx[0], op=Average) 16 | 17 | 18 | def wait_receive(self, handle, ctx): 19 | output = [synchronize(handle)] 20 | return [self.compressor.decompress(output, ctx)] 21 | 22 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolfp16.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Compressor 5 | 6 | 7 | class PoolFP16Compressor(Compressor): 8 | """Compress all floating point gradients to 16-bit.""" 9 | def __init__(self): 10 | super().__init__() 11 | self.name = "PoolFP16" 12 | self.quantization = False 13 | 14 | def compress(self, tensor, name): 15 | """Downcasts the tensor to 16-bit.""" 16 | dtype = tensor.dtype 17 | numel = tensor.numel() 18 | tensor_compressed = tensor 19 | if dtype.is_floating_point: 20 | # Only allow compression from other floating point types 21 | tensor_compressed = tensor.type(torch.float16) 22 | ctx = (name, numel, dtype) 23 | return [tensor_compressed], ctx 24 | 25 | 26 | def decompress(self, tensors, ctx): 27 | """Upcasts the tensor to the initialization dtype.""" 28 | tensor_compressed = tensors[0] 29 | name, numel, dtype = ctx 30 | tensor_decompressed = tensor_compressed 31 | #print("[decompress] before", ctx, torch.sum(tensor_compressed)) 32 | if dtype.is_floating_point: 33 | tensor_decompressed = tensor_compressed.type(dtype) 34 | #print("[decompress] after", ctx, torch.sum(tensor_compressed)) 35 | return tensor_decompressed 36 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolint8.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Compressor 5 | 6 | 7 | class PoolInt8Compressor(Compressor): 8 | """Compress all floating point gradients to 16-bit.""" 9 | def __init__(self): 10 | super().__init__() 11 | self.name = "PoolInt8" 12 | self.quantization = False 13 | 14 | def compress(self, tensor, name): 15 | """Downcasts the tensor to 8-bit.""" 16 | dtype = tensor.dtype 17 | 18 | tensor_compressed = tensor 19 | if dtype.is_floating_point: 20 | # Only allow compression from other floating point types 21 | tensor_compressed = tensor.type(torch.uint8) 22 | ctx = (name, dtype) 23 | return [tensor_compressed], ctx 24 | 25 | 26 | def decompress(self, tensors, ctx): 27 | """Upcasts the tensor to the initialization dtype.""" 28 | tensor_compressed = tensors[0] 29 | _, dtype = ctx 30 | tensor_decompressed = tensor_compressed 31 | #print("[decompress] before", ctx, torch.sum(tensor_compressed)) 32 | if dtype.is_floating_point: 33 | tensor_decompressed = tensor_compressed.type(dtype) 34 | #print("[decompress] after", ctx, torch.sum(tensor_compressed)) 35 | return tensor_decompressed 
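
The FP16 and INT8 compressors above share the two-call interface used by all of the mergeComp pool compressors: `compress()` returns a list of payload tensors plus a context tuple, and `decompress()` uses that context to restore the original dtype. The following is a minimal CPU round-trip sketch, not code from this repository; the `sys.path` setup, tensor size, and tensor name are illustrative assumptions.

```python
import sys
import torch

sys.path.append("byteps/torch")  # assumption: run from the repository root so `mergeComp` is importable
from mergeComp.compressor.poolfp16 import PoolFP16Compressor

compressor = PoolFP16Compressor()
grad = torch.rand(1 << 20, dtype=torch.float32)

# compress() downcasts floating-point tensors to fp16 and returns ([payload], ctx);
# ctx carries the tensor name, element count, and original dtype for decompress().
tensors_compressed, ctx = compressor.compress(grad, "layer0.weight")
restored = compressor.decompress(tensors_compressed, ctx)

assert restored.dtype == torch.float32
print("max round-trip error:", (grad - restored).abs().max().item())
```
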
-------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolnone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Compressor 5 | 6 | class PoolNoneCompressor(Compressor): 7 | """Default no-op compression.""" 8 | def __init__(self): 9 | super().__init__() 10 | self.name = "PoolNone" 11 | self.quantization = False 12 | self.sparsification = False 13 | 14 | def compress(self, tensor, name, signsgd_unit_size=8, alltoall_nodes=1): 15 | ctx = (name, tensor.numel()) 16 | return [tensor, tensor.abs().mean().reshape((1,))], ctx 17 | 18 | def decompress(self, tensors, ctx, alltoall=False): 19 | return tensors[0] 20 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolqsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import sys 4 | sys.path.append("../..") 5 | from mergeComp import Compressor 6 | 7 | 8 | class PoolQSGDCompressor(Compressor): 9 | 10 | def __init__(self, quantum_num): 11 | super().__init__() 12 | self.name = "PoolQSGD" 13 | self.quantization = True 14 | self.quantum_num = quantum_num 15 | 16 | 17 | def compress(self, tensor, name, server=False): 18 | shape = tensor.size() 19 | tensor = tensor.flatten() 20 | norm = tensor.norm().reshape((1,)) 21 | 22 | abs_gradient = tensor.abs() 23 | 24 | level_float = self.quantum_num / norm * abs_gradient 25 | previous_level = level_float.floor() 26 | prob = torch.empty_like(tensor).uniform_() 27 | is_next_level = (prob < (level_float - previous_level)).type(torch.float32) 28 | new_level = (previous_level + is_next_level) 29 | 30 | sign = tensor.sign() 31 | tensor_compressed = (new_level * sign).type(torch.int16) 32 | tensor_compressed = tensor_compressed.type(torch.int8 if self.quantum_num < 128 else torch.half) 33 | tensor_compressed = tensor_compressed, norm 34 | 35 | ctx = (name, shape) 36 | return tensor_compressed, ctx 37 | 38 | 39 | def decompress(self, tensor_compressed, ctx, server=False): 40 | tensor, norm = tensor_compressed 41 | norm = norm[0] 42 | decode_output = tensor.type(torch.float32) 43 | tensor_decompressed = norm / self.quantum_num * decode_output 44 | return tensor_decompressed 45 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolsignum.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import sys 4 | sys.path.append("../..") 5 | from mergeComp import Compressor 6 | from mergeComp.util import packbits, unpackbits 7 | 8 | 9 | class PoolSignumCompressor(Compressor): 10 | def __init__(self, momentum): 11 | super().__init__(average=False) 12 | self.name = "PoolSignNum" 13 | self.quantization = True 14 | self.momentum = momentum 15 | self.momentums = {} 16 | 17 | 18 | def get_scalar(self, tensor): 19 | return None 20 | 21 | 22 | def compress(self, tensor, name, scalar=None): 23 | """Encoding and compressing the signs """ 24 | numel = tensor.numel() 25 | mean = tensor.abs().mean().reshape((1,)) 26 | 27 | # update tensor by momentum 28 | if name in self.momentums: 29 | tensor = (1.0 - self.momentum) * tensor + self.momentum * self.momentums[name] 30 | self.momentums[name] = tensor 31 | sign_encode = tensor >= 0 32 | 33 | int8_tensor = packbits(sign_encode) 34 | tensor_compressed = 
int8_tensor, mean 35 | 36 | ctx = (name, numel) 37 | return tensor_compressed, ctx 38 | 39 | 40 | def decompress(self, tensor_compressed, ctx): 41 | """Decoding the signs to float format """ 42 | int8_tensor, _ = tensor_compressed 43 | name, numel = ctx 44 | 45 | sign_decode = unpackbits(int8_tensor, numel) 46 | return sign_decode.type(torch.float32) * 2 - 1 47 | 48 | 49 | def aggregate(self, tensors): 50 | """Aggregate a list of tensors.""" 51 | agged_tensor = sum(tensors) 52 | agged_tensor = agged_tensor >= 0 53 | agged_tensor = agged_tensor * 2.0 - 1.0 54 | return [agged_tensor] 55 | 56 | 57 | def clean(self): 58 | self.momentums = {} -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolterngrad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import sys 4 | sys.path.append("../..") 5 | from mergeComp import Compressor 6 | from mergeComp.util import pack2bits, unpack2bits 7 | 8 | 9 | class PoolTernGradCompressor(Compressor): 10 | def __init__(self): 11 | super().__init__() 12 | self.name = "PoolTernGrad" 13 | self.quantization = True 14 | 15 | 16 | def compress(self, tensor, name, server=False): 17 | numel = tensor.numel() 18 | abs_gradient = tensor.abs() 19 | scalar = abs_gradient.max() 20 | sign_gradient = tensor.sign() * scalar 21 | 22 | try: 23 | rnd_sample = torch.empty_like(tensor).cuda().uniform_(0, scalar.item()) 24 | except: 25 | rnd_sample = torch.zeros_like(tensor).cuda() 26 | 27 | sign_gradient[rnd_sample >= abs_gradient] = 0 28 | 29 | mask = sign_gradient.sign() > 0 30 | tern_tensor = sign_gradient.sign() + 1 # {-1, 0, 1} + 1 31 | 32 | int8_tensor = pack2bits(mask, tern_tensor) 33 | tensor_compressed = int8_tensor, scalar.flatten() 34 | 35 | ctx = (name, numel) 36 | return tensor_compressed, ctx 37 | 38 | 39 | def decompress(self, tensor_compressed, ctx, server=False): 40 | int8_tensor, scalar = tensor_compressed 41 | name, numel = ctx 42 | 43 | tern_tensor = unpack2bits(int8_tensor, numel) 44 | 45 | sign = tern_tensor.type(torch.float32) - 1 # {0, 1, 2} - 1 46 | return sign * scalar -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/dgc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | import byteps.torch as bps 6 | 7 | 8 | class DgcMemory(Memory): 9 | def __init__(self, momentum=0.9, gradient_clipping=False): 10 | self.gradient_clipping = gradient_clipping 11 | self.momentum = momentum 12 | self.gradients = {} 13 | self.residuals = {} 14 | 15 | 16 | def compensate(self, tensor, name): 17 | """Update the tensor with the residuals.""" 18 | # https://github.com/synxlin/deep-gradient-compression/blob/master/dgc/memory.py 19 | if self.gradient_clipping: 20 | tensor_squ_sum = torch.sum(tensor * tensor) 21 | clipping_val = torch.sqrt(bps.byteps_push_pull(tensor_squ_sum, average=True, name=name)) 22 | tensor = tensor.clamp(-clipping_val, clipping_val) 23 | 24 | if name in self.residuals: 25 | self.residuals[name] = self.momentum * self.residuals[name] + tensor 26 | else: 27 | self.residuals[name] = tensor 28 | 29 | if name in self.gradients: 30 | self.gradients[name] += self.residuals[name] 31 | tensor = self.gradients[name] 32 | else: 33 | self.gradients[name] = tensor 34 | return tensor 35 | 36 | 37 | 38 | def update(self, tensor, name, compressor, 
tensor_compressed, ctx): 39 | """Update the residuals.""" 40 | mask = ctx[1] 41 | not_mask = ~mask 42 | 43 | temp = self.residuals[name] * not_mask 44 | self.residuals[name] = temp 45 | temp = self.gradients[name] * not_mask 46 | self.gradients[name] = temp -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/efsignsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | 6 | 7 | class EFSignSGDMemory(Memory): 8 | def __init__(self, lr=0.5): 9 | self.residuals = {} 10 | # the training is sensitive to lr 11 | # for ResNet50 + CIFAR100 + EFSignSGD/OneBit, lr = 0.5. if lr = 0.6, the loss becomes nan. If lr is smaller, the gradients become zero 12 | self.lr = lr 13 | 14 | 15 | def compensate(self, tensor, name): 16 | """Update the tensor with the residuals.""" 17 | if name in self.residuals: 18 | tensor = self.lr * self.residuals[name] + tensor 19 | return tensor 20 | 21 | 22 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 23 | """Update the residuals.""" 24 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 25 | self.residuals[name] = tensor - tensor_decompressed -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/none.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | 6 | 7 | class NoneMemory(Memory): 8 | def compensate(self, tensor, name): 9 | """Update the tensor with the residuals.""" 10 | return tensor 11 | 12 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 13 | """Update the residuals.""" 14 | pass -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/pooldgc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | #from extensions.cuda.functions import accumulate 4 | from .memory_pool import MemoryPool 5 | 6 | 7 | class PoolDgcMemory(MemoryPool): 8 | def __init__(self, named_parameters, lr=1e-3, momentum=0.2, fusion_num=2, gradient_clipping=False, momentum_masking=True): 9 | self.gradient_clipping = gradient_clipping 10 | self.momentum = momentum 11 | self.momentum_masking = momentum_masking 12 | super().__init__(named_parameters, fusion_num, lr=lr) 13 | self.iterations = -1 14 | 15 | 16 | def compensate(self, tensor, name): 17 | """Update the tensor with the residuals.""" 18 | # https://github.com/synxlin/deep-gradient-compression/blob/master/dgc/memory.py 19 | grad = self.get_grad(name) 20 | #if self.gradient_clipping: 21 | # tensor_squ_sum = torch.sum(grad * grad) 22 | # clipping_val = torch.sqrt(allreduce_(tensor_squ_sum, average=True, name=name)) 23 | # grad = grad.clamp(-clipping_val, clipping_val) 24 | mmt = self.get_momentum(name) 25 | vec = self.get_velocity(name) 26 | 27 | if self.momentum_masking: 28 | mmt.mul_(self.momentum).add_(grad) 29 | vec.add_(mmt) 30 | else: 31 | vec.mul_(self.momentum).add_(grad) 32 | 33 | 34 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 35 | """Update the residuals.""" 36 | mask = ctx[1] 37 | not_mask = ~mask 38 | 39 | values, indices = tensor_compressed 40 | indices_int64 = indices.type(torch.int64) 41 | 42 | if self.momentum_masking: 43 | mmt = self.get_momentum(name) 44 | 
mmt.copy_(mmt * not_mask) 45 | 46 | vec = self.get_velocity(name) 47 | vec.copy_(vec * not_mask) 48 | 49 | 50 | def reduce(self, ctx, name): 51 | reduction = self.get_reduction(name) 52 | reduction.set_(sum(ctx)) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/poolnone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .memory_pool import MemoryPool 3 | 4 | 5 | class PoolNoneMemory(MemoryPool): 6 | def __init__(self, named_parameters, fusion_num=2): 7 | super().__init__(named_parameters, fusion_num) 8 | 9 | def compensate(self, tensor, name): 10 | """Update the tensor with the residuals.""" 11 | grad = self.get_grad(name) 12 | residual = self.get_velocity(name) 13 | residual.copy_(grad) 14 | 15 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 16 | """Update the residuals.""" 17 | pass 18 | 19 | def reduce(self, ctx, name): 20 | reduction = self.get_reduction(name) 21 | reduction.set_(sum(ctx)) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/poolresidual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .memory_pool import MemoryPool 3 | 4 | 5 | class PoolResidualMemory(MemoryPool): 6 | #TODO: tune beta and gamma to increase accurary 7 | def __init__(self, named_parameters, fusion_num=2, beta=0.9, gamma=1.0): 8 | self.beta = beta 9 | self.gamma = gamma 10 | super().__init__(named_parameters, fusion_num) 11 | 12 | 13 | def compensate(self, tensor, name): 14 | """vec stores the residuals""" 15 | grad = self.get_grad(name) 16 | residual = self.get_velocity(name) 17 | residual.add_(grad) 18 | #residual.mul_(self.beta).add_(self.gamma*grad) 19 | 20 | 21 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 22 | """Update the residuals.""" 23 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 24 | residual = self.get_velocity(name) 25 | residual.copy_(tensor.view(-1) - tensor_decompressed) 26 | 27 | 28 | def reduce(self, ctx, name): 29 | reduction = self.get_reduction(name) 30 | reduction.set_(sum(ctx)) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | 6 | 7 | class ResidualMemory(Memory): 8 | def __init__(self, beta=0.9, gamma=1.0): 9 | self.residuals = {} 10 | self.beta = beta 11 | self.gamma = gamma 12 | 13 | 14 | def compensate(self, tensor, name): 15 | """Update the tensor with the residuals.""" 16 | if name in self.residuals: 17 | tensor = self.beta * self.residuals[name] + tensor 18 | return tensor 19 | 20 | 21 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 22 | """Update the residuals.""" 23 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 24 | self.residuals[name] = tensor - tensor_decompressed 25 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/topk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | import byteps.torch as bps 6 | 7 | 8 | class TopKMemory(Memory): 9 | def __init__(self, beta=0.9, 
gamma=1.0): 10 | self.residuals = {} 11 | self.beta = beta 12 | self.gamma = gamma 13 | self.zeros = {} 14 | 15 | 16 | def compensate(self, tensor, name): 17 | if name in self.residuals: 18 | tensor = tensor + self.beta*self.residuals[name] 19 | return tensor 20 | 21 | 22 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 23 | """Update the residuals.""" 24 | values, indices = tensor_compressed 25 | if name not in self.zeros: 26 | self.zeros[name] = torch.zeros_like(values) 27 | self.residuals[name] = tensor.scatter_(0, indices.type(torch.int64), self.zeros[name]) 28 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/scheduler/README.md: -------------------------------------------------------------------------------- 1 | ## How to run 2 | ```bash 3 | bash run_all_models.sh 4 | ``` 5 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/scheduler/lstm/pcie_efsignsgd_two_cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "lstm", 3 | "compressor": "efsignsgd", 4 | "nodes": 4, 5 | "gpus_per_node": 8, 6 | "comp_ratio": 0.03125, 7 | "hierarchical comm": true, 8 | "CPU compression": true, 9 | "PCIe": true, 10 | "tensor names": [ 11 | "decoder.bias", 12 | "rnns.1.weight_ih_l0", 13 | "rnns.1.weight_hh_l0", 14 | "rnns.1.bias_hh_l0", 15 | "rnns.1.bias_ih_l0", 16 | "rnns.0.weight_ih_l0", 17 | "rnns.0.weight_hh_l0", 18 | "rnns.0.bias_hh_l0", 19 | "rnns.0.bias_ih_l0", 20 | "encoder.weight" 21 | ], 22 | "tensor sizes": [ 23 | 33278, 24 | 9000000, 25 | 9000000, 26 | 6000, 27 | 6000, 28 | 9000000, 29 | 9000000, 30 | 6000, 31 | 6000, 32 | 49917000 33 | ], 34 | "options": [ 35 | 0, 36 | 1, 37 | 1, 38 | 0, 39 | 0, 40 | 6, 41 | 6, 42 | 0, 43 | 0, 44 | 6 45 | ] 46 | } -------------------------------------------------------------------------------- /byteps/torch/mergeComp/scheduler/run_all_models.sh: -------------------------------------------------------------------------------- 1 | python3 simulator_espresso.py --model vgg16 --node 4 --compressor randomk --cpu --two-level --profile 2 | python3 simulator_espresso.py --model resnet101 --node 4 --compressor dgc --cpu --two-level --profile 3 | python3 simulator_espresso.py --model ugatit --node 4 --compressor dgc --cpu --profile 4 | python3 simulator_espresso.py --model bert --node 4 --compressor randomk --cpu --profile 5 | python3 simulator_espresso.py --model gpt2 --node 4 --compressor efsignsgd --cpu --profile 6 | python3 simulator_espresso.py --model lstm --node 4 --compressor efsignsgd --cpu --two-level --profile 7 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/scheduler/simulator_logs/lstm/pcie_efsignsgd_two_cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "lstm", 3 | "compressor": "efsignsgd", 4 | "nodes": 4, 5 | "gpus_per_node": 8, 6 | "comp_ratio": 0.03125, 7 | "hierarchical comm": true, 8 | "CPU compression": true, 9 | "PCIe": true, 10 | "tensor names": [ 11 | "decoder.bias", 12 | "rnns.1.weight_ih_l0", 13 | "rnns.1.weight_hh_l0", 14 | "rnns.1.bias_hh_l0", 15 | "rnns.1.bias_ih_l0", 16 | "rnns.0.weight_ih_l0", 17 | "rnns.0.weight_hh_l0", 18 | "rnns.0.bias_hh_l0", 19 | "rnns.0.bias_ih_l0", 20 | "encoder.weight" 21 | ], 22 | "tensor sizes": [ 23 | 33278, 24 | 9000000, 25 | 9000000, 26 | 6000, 27 | 6000, 28 | 9000000, 29 | 9000000, 30 | 6000, 31 | 6000, 32 
| 49917000 33 | ], 34 | "options": [ 35 | 0, 36 | 5, 37 | 6, 38 | 0, 39 | 0, 40 | 6, 41 | 6, 42 | 0, 43 | 0, 44 | 6 45 | ] 46 | } -------------------------------------------------------------------------------- /byteps/torch/mergeComp/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.dlpack import to_dlpack 3 | from torch.utils.dlpack import from_dlpack 4 | import cupy 5 | 6 | zero_paddings = torch.zeros((32), dtype=torch.bool).cuda() 7 | 8 | def torch2cupy(tensor): 9 | return cupy.fromDlpack(to_dlpack(tensor)) 10 | 11 | 12 | def cupy2torch(cupy_tensor): 13 | return from_dlpack(cupy_tensor.toDlpack()) 14 | 15 | 16 | def packbits(array, unit_size=8): 17 | tensor = array 18 | numel = tensor.numel() 19 | if numel % unit_size != 0: 20 | padding_size = unit_size - numel % unit_size 21 | tensor = torch.cat((tensor, zero_paddings[:padding_size]), dim=0) 22 | 23 | if unit_size == 8: 24 | return cupy2torch(cupy.packbits(torch2cupy(array))) 25 | elif unit_size == 16: 26 | cupy_tensor = cupy.packbits(torch2cupy(tensor)) 27 | return cupy2torch(cupy_tensor.view(cupy.float16)) 28 | elif unit_size == 32: 29 | cupy_tensor = cupy.packbits(torch2cupy(tensor)) 30 | return cupy2torch(cupy_tensor.view(cupy.float32)) 31 | else: 32 | raise AttributeError("unsupported data type size") 33 | 34 | 35 | def unpackbits(array, size): 36 | return cupy2torch(cupy.unpackbits(torch2cupy(array).view(cupy.uint8))[:size]) 37 | 38 | 39 | def pack2bits(first, second): 40 | data = torch.cat((first, second.type(torch.bool)), 0) 41 | return cupy2torch(cupy.packbits(torch2cupy(data))) 42 | 43 | 44 | def unpack2bits(array, size): 45 | decode = cupy2torch(cupy.unpackbits(torch2cupy(array))) 46 | first = decode[:size] 47 | second = decode[size:2*size] 48 | second[first > 0] = 2 49 | 50 | return second -------------------------------------------------------------------------------- /byteps/torch/mergeComp/util_cpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.dlpack import to_dlpack 3 | from torch.utils.dlpack import from_dlpack 4 | import numpy as np 5 | 6 | 7 | def packbits(array): 8 | return np.packbits(array) 9 | 10 | 11 | def unpackbits(array, size): 12 | return torch.from_numpy(np.unpackbits(array)[:size]) 13 | -------------------------------------------------------------------------------- /byteps/torch/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedDataParallel 2 | -------------------------------------------------------------------------------- /byteps/torch/ready_event.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_READY_EVENT_H 18 | #define BYTEPS_TORCH_READY_EVENT_H 19 | 20 | #if HAVE_CUDA 21 | #include "cuda_runtime.h" 22 | #endif 23 | 24 | #include <memory> 25 | 26 | #include "../common/common.h" 27 | 28 | namespace byteps { 29 | namespace torch { 30 | 31 | using namespace byteps::common; 32 | 33 | #if HAVE_CUDA 34 | class TorchReadyEvent : public ReadyEvent { 35 | public: 36 | TorchReadyEvent(int device); 37 | ~TorchReadyEvent(); 38 | virtual bool Ready() const override; 39 | 40 | private: 41 | int device_ = CPU_DEVICE_ID; 42 | cudaEvent_t cuda_event_ = nullptr; 43 | }; 44 | #endif 45 | 46 | std::shared_ptr<ReadyEvent> RecordReadyEvent(int device); 47 | 48 | } // namespace torch 49 | } // namespace byteps 50 | 51 | #endif // BYTEPS_TORCH_READY_EVENT_H 52 | -------------------------------------------------------------------------------- /byteps/torch/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboardX 2 | numpy 3 | cupy-cuda111 4 | tqdm 5 | requests 6 | scipy 7 | six 8 | gdown -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-devel-ubuntu18.04 2 | 3 | ARG https_proxy 4 | ARG http_proxy 5 | 6 | ARG BYTEPS_BASE_PATH=/usr/local 7 | ARG BYTEPS_PATH=$BYTEPS_BASE_PATH/byteps 8 | ARG BYTEPS_GIT_LINK=https://github.com/bytedance/byteps 9 | ARG BYTEPS_BRANCH=master 10 | 11 | ARG DEBIAN_FRONTEND=noninteractive 12 | RUN apt-get update 13 | RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 14 | build-essential \ 15 | tzdata \ 16 | ca-certificates \ 17 | git \ 18 | curl \ 19 | wget \ 20 | vim \ 21 | cmake \ 22 | lsb-release \ 23 | libcudnn7=7.6.0.64-1+cuda10.0 \ 24 | libnuma-dev \ 25 | ibverbs-providers \ 26 | librdmacm-dev \ 27 | ibverbs-utils \ 28 | rdmacm-utils \ 29 | libibverbs-dev \ 30 | python3 \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | libnccl2=2.4.7-1+cuda10.0 \ 35 | libnccl-dev=2.4.7-1+cuda10.0 36 | 37 | # install framework 38 | # note: for tf <= 1.14, you need gcc-4.9 39 | ARG FRAMEWORK=tensorflow 40 | RUN if [ "$FRAMEWORK" = "tensorflow" ]; then \ 41 | pip3 install --upgrade pip; \ 42 | pip3 install -U tensorflow-gpu==1.15.0; \ 43 | elif [ "$FRAMEWORK" = "pytorch" ]; then \ 44 | pip3 install -U numpy==1.18.1 torchvision==0.5.0 torch==1.4.0; \ 45 | elif [ "$FRAMEWORK" = "mxnet" ]; then \ 46 | pip3 install -U mxnet-cu100==1.5.0; \ 47 | else \ 48 | echo "unknown framework: $FRAMEWORK"; \ 49 | exit 1; \ 50 | fi 51 | 52 | ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH 53 | 54 | RUN cd $BYTEPS_BASE_PATH &&\ 55 | git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK &&\ 56 | cd $BYTEPS_PATH &&\ 57 | python3 setup.py install 58 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Prebuilt Images 2 | 3 | Below are the prebuilt Docker images and the commands used to build them. These prebuilt images might not be up to date. 4 | You may need to build them manually with the Dockerfile to get the latest functionalities of BytePS. 5 | 6 | | Docker image | How to build | 7 | | --- | --- | 8 | | bytepsimage/tensorflow | docker build -t bytepsimage/tensorflow .
-f Dockerfile --build-arg FRAMEWORK=tensorflow | 9 | | bytepsimage/pytorch | docker build -t bytepsimage/pytorch . -f Dockerfile --build-arg FRAMEWORK=pytorch | 10 | | bytepsimage/mxnet | docker build -t bytepsimage/mxnet . -f Dockerfile --build-arg FRAMEWORK=mxnet | 11 | -------------------------------------------------------------------------------- /docs/DistributedDataParallel.md: -------------------------------------------------------------------------------- 1 | # DistributedDataParallel 2 | 3 | BytePS Distributed Data Parallel module is compatible with PyTorch Distributed 4 | Data Parallel for the most part. Instead of using PyTorch communication 5 | backends, it uses BytePS push-pull for gradients reduction between nodes. 6 | 7 | It currently supports the Single-Process Single-GPU mode. In this mode each 8 | process works with one GPU. Example usage: 9 | 10 | 11 | ```python 12 | # byteps_ddp_example.py 13 | from byteps.torch.parallel import DistributedDataParallel 14 | 15 | model = DistributedDataParallel(model, device_ids=[i]) 16 | output = model(data) 17 | loss = F.nll_loss(output, target) 18 | loss.backward() 19 | optimizer.step() 20 | ``` 21 | 22 | Some models have branches, part of the model is skipped during the forward 23 | pass. In that case it's required to call the 24 | DistributedDataParallel.synchronize() function after loss.backward(), e.g.: 25 | 26 | ```python 27 | # byteps_ddp_example.py 28 | from byteps.torch.parallel import DistributedDataParallel 29 | 30 | # construct a model which skips some layers in the forward pass, then wrap the 31 | # model with DistributedDataParallel() 32 | model = DistributedDataParallel(model, device_ids=[i]) 33 | output = model(data) 34 | loss = F.nll_loss(output, target) 35 | loss.backward() 36 | # the synchronize() call here is required because some layers were skipped in 37 | # the forward pass 38 | model.synchronize() 39 | optimizer.step() 40 | ``` 41 | 42 | To run the program, use `bpslaunch` to launch one process for each device you 43 | wish to use. Refer to the [running](./running.md) document for how to use `bpslaunch`. 44 | -------------------------------------------------------------------------------- /docs/MirroredStrategy.md: -------------------------------------------------------------------------------- 1 | # MirroredStrategy 2 | 3 | The BytePS MirroredStrategy module is compatible with tensorflow 4 | MultiWorkerMirroredStrategy for the most part. Instead of using the builtin 5 | tensorflow collective communication implementation, it uses BytePS push-pull 6 | for gradients reduction between nodes. 7 | 8 | It currently supports the Single-Process Single-GPU mode. In this mode each 9 | process works with one GPU. Example usage: 10 | 11 | 12 | ```python 13 | import byteps.tensorflow as bps 14 | from byteps.tensorflow.distribute import MirroredStrategy 15 | 16 | bps.init() 17 | tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU') 18 | strategy = MirroredStrategy(devices=["/gpu:0"]) 19 | 20 | with strategy.scope(): 21 | # Model building/compiling need to be within `strategy.scope()`. 22 | multi_worker_model = build_and_compile_cnn_model() 23 | 24 | multi_worker_model.fit(multi_worker_dataset, epochs=100, steps_per_epoch=70) 25 | ``` 26 | To run the program, use `bpslaunch` to launch one process for each device you 27 | wish to use. Refer to the [running](./running.md) document for how to use 28 | `bpslaunch`. 
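
The `build_and_compile_cnn_model()` helper referenced above comes from the standard TensorFlow multi-worker tutorials and is not defined by BytePS. A minimal sketch of such a helper is shown below; the architecture, optimizer, and input shape are illustrative assumptions, and any Keras model built and compiled inside `strategy.scope()` works the same way.

```python
import tensorflow as tf

def build_and_compile_cnn_model():
    # A small CNN for 28x28 grayscale inputs (e.g., MNIST); only illustrative.
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10)
    ])
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
        metrics=['accuracy'])
    return model
```
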
29 | -------------------------------------------------------------------------------- /docs/cross-barrier.md: -------------------------------------------------------------------------------- 1 | # Cross Global Barrier 2 | 3 | This eliminates the global barrier between training iterations for distributed training frameworks (e.g., 4 | PyTorch), so that the priority-based communication scheduling in BytePS can be effective. 5 | 6 | ## Why Crossing Barrier? 7 | 8 | Existing distributed training frameworks (PyTorch, TensorFlow, etc) do not fully utilize the potentials of overlapping 9 | computation and communication to speed up neural network training: they only support communication overlapping with 10 | backward propagation. But due to layer-wise dependencies in DNN training, we can actually schedule gradient 11 | synchronization order based on when they are consumed in the next iteration, and hence overlap communication with 12 | forward-propagation of the next iteration! Read the paper https://dl.acm.org/citation.cfm?id=3359642 for more 13 | communication scheduling details. 14 | 15 | To make this idea work, the first step is to remove the global barrier between two iterations to build layer-wise 16 | dependencies, so that the forward computation of next step can start without waiting for parameter synchronization 17 | completion of all parameters. 18 | 19 | Fig.1 shows the dependency graph with global barrier. Machine learning frameworks such as PyTorch and TensorFlow have 20 | similar dependencies when using BytePS for push and pull. 21 | 22 | ![dag_barrier](https://user-images.githubusercontent.com/13852819/69863244-4b5ee400-12d7-11ea-9356-2dd41dff95ab.png) 23 | 24 | *Fig.1: Dependency Graph With Global Barrier* 25 | 26 | Fig. 2 shows the dependency graph after removing global barrier. What we do here is to change the dependency 27 | graph from Fig. 1 to Fig. 2 by removing the barrier, building layer-wise dependencies while guaranteeing computation correctness. 28 | 29 | 30 | ![dag_without_barrier](https://user-images.githubusercontent.com/13852819/69863268-5d408700-12d7-11ea-8b39-5e48e3d94c2b.png) 31 | *Fig.2: Dependency Graph After Removing Global Barrier* 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | # BytePS Performance when training CNN 2 | 3 | ## NVLink + TCP 4 | 5 | We test two models: VGG16 (communication-intensive) and Resnet50 (computation-intensive) on a popular public cloud. Both models are trained using fp32. 6 | 7 | We use Tesla V100 16GB GPUs and set batch size equal to 64 *per GPU*. The machines are VMs on the cloud. Each machine has 8 V100 GPUs with NVLink-enabled. Machines are inter-connected with 20 Gbps TCP/IP network. 8 | 9 | BytePS outperforms Horovod (NCCL) by 44% for Resnet50, and 100% for VGG16. 10 | 11 | ![vgg16_tcp](https://user-images.githubusercontent.com/13852819/69873424-41e37500-12f3-11ea-93b8-705215e3e901.png) 12 | ![resnet50_tcp](https://user-images.githubusercontent.com/13852819/69873419-40b24800-12f3-11ea-9ff3-0f11347c089e.png) 13 | 14 | You can reproduce the results using the Dockerfiles and example scripts we provide. 15 | 16 | ## PCIe + RDMA 17 | 18 | Note: here we present the *worse case scenario* of BytePS, i.e., 100Gbps RDMA + no NVLinks. 
19 | 20 | We get below results on machines that are based on PCIe-switch architecture -- 4 GPUs under one PCIe switch, and each machine contains two PCIe switches. 21 | The machines are inter-connected by 100 Gbps RoCEv2 networks. 22 | In this case, BytePS outperforms Horovod (NCCL) by 7% for Resnet50, and 17% for VGG16. 23 | 24 | ![perf_rdma_pcie_resnet50](https://user-images.githubusercontent.com/13852819/68925125-57b64d80-07bd-11ea-9f72-d108cf4294ad.png) 25 | 26 | ![perf_rdma_pcie_vgg16](https://user-images.githubusercontent.com/13852819/68925175-70befe80-07bd-11ea-98d6-ca7df3670bbd.png) 27 | 28 | 29 | To have BytePS outperform NCCL by so little, you have to have 100Gbps RDMA network *and* no NVLinks. In this case, the communication is actually bottlenecked by internal PCI-e switches, not the network. BytePS has done some optimization so that it still outperforms NCCL. However, the performance gain is not as large as other cases where the network is the bottleneck. 30 | -------------------------------------------------------------------------------- /docs/running.md: -------------------------------------------------------------------------------- 1 | # Running BytePS 2 | 3 | BytePS follows the same running model as MXNet's PS implemenation, and provides a script, launcher/launcher.py, to help you start individual processes. **Below instructions, including those DMLC variables, apply to all frameworks.** 4 | 5 | Let's say you have two worker machines (or docker containers) that have GPUs, one machine or container as a server, and a scheduler. The scheduler binds on 10.0.0.1 and port 9000. The workers and the server can connect to the scheduler via the IP and port using TCP. 6 | 7 | To use launcher/launcher.py, NVIDIA_VISIBLE_DEVICES should exist -- either automatically set by nvidia-docker, or manually set by you. 8 | 9 | On worker 0, run: 10 | 11 | ``` 12 | DMLC_ROLE=worker DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 13 | DMLC_WORKER_ID=0 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \ 14 | bpslaunch YOUR_COMMAND 15 | ``` 16 | 17 | On worker 1, run (only DMLC_WORKER_ID is different from above): 18 | 19 | ``` 20 | DMLC_ROLE=worker DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 21 | DMLC_WORKER_ID=1 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \ 22 | bpslaunch YOUR_COMMAND 23 | ``` 24 | 25 | **For servers and schedulers, we highly recommend you use the docker image we build:** 26 | 27 | ``` 28 | docker pull bytepsimage/byteps_server 29 | ``` 30 | 31 | Start server and scheduler docker instances with this image. In the server, run the following. Compared with the worker command, we remove DMLC_WORKER_ID, and set role to server. 32 | 33 | ``` 34 | DMLC_ROLE=server DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 35 | DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 bpslaunch 36 | ``` 37 | 38 | On the scheduler, run (we also remove DMLC_WORKER_ID, and set role to scheduler): 39 | 40 | ``` 41 | DMLC_ROLE=scheduler DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 42 | DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 bpslaunch 43 | ``` 44 | 45 | In this example, your scheduler must be able to bind to `10.0.0.1:9000`. 46 | 47 | The order of starting workers/servers/scheduler does not matter. 48 | -------------------------------------------------------------------------------- /docs/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | We suggest you read the Horovod troubleshooting, especially for problems during the build process. 
BytePS has almost the same dependencies as Horovod minus MPI. 4 | 5 | https://github.com/horovod/horovod/blob/v0.16.4/docs/troubleshooting.rst 6 | 7 | ## Network connectivity 8 | 9 | When launching distributed jobs, if the job hangs at the beginning, one possible reason is a network connectivity problem. You can use the `ps-lite` benchmark to verify connectivity. 10 | 11 | Install ps-lite: 12 | 13 | ``` 14 | git clone -b byteps https://github.com/bytedance/ps-lite.git 15 | cd ps-lite 16 | make -j 17 | ``` 18 | 19 | 20 | For the scheduler: 21 | ``` 22 | export DMLC_ROLE=scheduler 23 | export DMLC_NUM_WORKER=1 24 | export DMLC_NUM_SERVER=1 25 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP] 26 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT] 27 | export DMLC_INTERFACE=eth0 28 | ./ps-lite/tests/test_benchmark 29 | ``` 30 | 31 | For the server: 32 | ``` 33 | export DMLC_ROLE=server 34 | export DMLC_NUM_WORKER=1 35 | export DMLC_NUM_SERVER=1 36 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP] 37 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT] 38 | export DMLC_INTERFACE=eth0 39 | ./ps-lite/tests/test_benchmark 40 | ``` 41 | 42 | For the worker: 43 | ``` 44 | export DMLC_ROLE=worker 45 | export DMLC_NUM_WORKER=1 46 | export DMLC_NUM_SERVER=1 47 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP] 48 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT] 49 | export DMLC_INTERFACE=eth0 50 | ./ps-lite/tests/test_benchmark 1024000 100 0 51 | ``` 52 | 53 | If it succeeds, you should be able to see something like this on the worker: 54 | ``` 55 | push_byte=4096000, repeat=100, total_time=128.842ms 56 | pull_byte=4096000, repeat=100, total_time=353.38ms 57 | ``` 58 | 59 | (Note: for RDMA networks, use `make -j USE_RDMA=1` to build, and `export DMLC_ENABLE_RDMA=1` when running the scheduler / server / worker.) 60 | 61 | If it still hangs, you may need to check your network connectivity. 62 | -------------------------------------------------------------------------------- /espresso_EuroSys23.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/espresso_EuroSys23.pdf -------------------------------------------------------------------------------- /espresso_EuroSys_AE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/espresso_EuroSys_AE.pdf -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | For more examples, see: https://github.com/byteps/examples -------------------------------------------------------------------------------- /example/mxnet/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/example/mxnet/common/__init__.py -------------------------------------------------------------------------------- /example/mxnet/common/find_mxnet.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership.
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import os, sys 19 | try: 20 | import mxnet as mx 21 | except ImportError: 22 | curr_path = os.path.abspath(os.path.dirname(__file__)) 23 | sys.path.append(os.path.join(curr_path, "../../../python")) 24 | import mxnet as mx 25 | -------------------------------------------------------------------------------- /example/mxnet/data/imagenet1k-val.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | # This file downloads the imagenet-1k validation dataset and converts it into a rec 22 | # file. One needs to provide the URL for ILSVRC2012_img_val.tar, which can be 23 | # found at http://www.image-net.org/download-images 24 | # 25 | # Example usage (replace the URL with the correct one): 26 | # ./imagenet1k-val.sh http://xxxxxx/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar 27 | 28 | if [ ! -e ILSVRC2012_img_val.tar ]; then 29 | wget $1 30 | fi 31 | mkdir -p val 32 | tar -xf ILSVRC2012_img_val.tar -C val 33 | wget http://data.mxnet.io/models/imagenet/resnet/val.lst -O imagenet1k-val.lst 34 | 35 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 36 | MX_DIR=${CUR_DIR}/../../../ 37 | 38 | python ${CUR_DIR}/../../../tools/im2rec.py --resize 256 --quality 90 --num-thread 16 imagenet1k-val val/ 39 | 40 | rm -rf val 41 | -------------------------------------------------------------------------------- /example/mxnet/symbols/README.md: -------------------------------------------------------------------------------- 1 | # Symbol 2 | 3 | This folder contains definitions of various networks. To add a new network, please 4 | use the following format. 5 | 6 | ## Python 7 | 8 | - A file implements one network proposed in a paper, with the network name as the 9 | filename. 10 | - Mention the paper, and any modifications made, at the beginning 11 | of the file.
12 | - Indicate how to reproduce the accuracy numbers in the paper if it is not straightforward. 13 | - Provide a function `get_symbol()` that returns the network. 14 | -------------------------------------------------------------------------------- /example/mxnet/symbols/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/example/mxnet/symbols/__init__.py -------------------------------------------------------------------------------- /example/mxnet/symbols/mlp.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """ 19 | a simple multilayer perceptron 20 | """ 21 | import mxnet as mx 22 | 23 | def get_symbol(num_classes=10, **kwargs): 24 | data = mx.symbol.Variable('data') 25 | data = mx.sym.Flatten(data=data) 26 | fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) 27 | act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") 28 | fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) 29 | act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") 30 | fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) 31 | mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') 32 | return mlp 33 | -------------------------------------------------------------------------------- /example/pytorch/test_bytecomp_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import time 5 | 6 | import torch 7 | import byteps.torch as bps 8 | 9 | 10 | parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark', 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 12 | parser.add_argument('--no-cuda', action='store_true', default=False, 13 | help='disables CUDA training') 14 | 15 | args = parser.parse_args() 16 | args.cuda = not args.no_cuda 17 | 18 | bps.init() 19 | my_rank = bps.rank() 20 | print("xxxx python myrank ", my_rank) 21 | if args.cuda: 22 | # BytePS: pin this process to the GPU indexed by its rank.
23 | torch.cuda.set_device(my_rank) 24 | 25 | 26 | size = 1024000 27 | half_size = size // 2 28 | for ii in range(1): 29 | grad = torch.ones(size, dtype=torch.float32).cuda() 30 | grad[:half_size].mul_(4*my_rank + 4) 31 | 32 | print("before push pull", grad[:half_size].mean(), grad[half_size:].mean()) 33 | #handle = bps.byteps_push_pull(grad, average=True, name="test") 34 | #grad = bps.synchronize(handle) 35 | grad = bps.intra_push(grad, average=False, name="test") 36 | print("after push pull", grad[:size//2].mean(), grad[size//2:].mean()) -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | export BYTEPS_NCCL_LINK=SHARED 2 | python3 setup.py install --user 3 | cd byteps/torch 4 | pip3 install -r requirements.txt 5 | 6 | pip3 install nvidia-pyindex 7 | pip3 install nvidia-dllogger 8 | 9 | git clone https://github.com/NVIDIA/apex 10 | cd apex 11 | git checkout d6b5ae5d04f531ff862f651e67f241fef88fd159 12 | pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 13 | cd .. 14 | 15 | sudo apt-get update && sudo apt-get install libgl1 -y 16 | pip3 install opencv-python 17 | sudo apt install unzip 18 | 19 | echo "download dataset for LSTM" 20 | mkdir ~/data 21 | cd examples/lstm 22 | bash getdata.sh 23 | 24 | echo "download model checkpoint for BERT-base" 25 | cd ../BERT 26 | cd ./dataset/checkpoint 27 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_base_qa_squad11_amp/versions/19.09.0/zip -O bert_pyt_ckpt_base_qa_squad11_amp_19.09.0.zip 28 | unzip bert_pyt_ckpt_base_qa_squad11_amp_19.09.0.zip 29 | cd ../../ && mkdir -p results 30 | 31 | 32 | echo "download dataset for GPT-2" 33 | cd ~/data 34 | wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip 35 | unzip wikitext-2-raw-v1.zip 36 | 37 | 38 | echo "download dataset for VGG16 and ResNet101" 39 | cd ~/data 40 | wget https://s3.amazonaws.com/fast-ai-imageclas/imagewang.tgz 41 | tar xf imagewang.tgz 42 | 43 | 44 | echo "download dataset for UGATIT" 45 | cd ~/data 46 | gdown 1xOWj1UVgp6NKMT3HbPhBbtq2A4EDkghF 47 | mkdir selfie2anime && unzip selfie2anime.zip -d selfie2anime -------------------------------------------------------------------------------- /launcher/README.md: -------------------------------------------------------------------------------- 1 | ### How to use the distributed launcher 2 | 3 | Create two host files, `worker_hosts` and `server_hosts`, and put your lists of hosts inside (one IP:port per line). 4 | 5 | For example, suppose we want `10.0.0.1:12345` to be the scheduler, `10.0.0.2` and `10.0.0.3` to be the workers, and `10.0.0.4` and `10.0.0.5` to be the servers. 6 | 7 | Then `worker_hosts` should be: 8 | ``` 9 | 10.0.0.2 10 | 10.0.0.3 11 | ``` 12 | 13 | And `server_hosts` should be: 14 | ``` 15 | 10.0.0.4 16 | 10.0.0.5 17 | ``` 18 | 19 | Finally, start the distributed SSH launcher with: 20 | 21 | ``` 22 | python dist_launcher.py --worker-hostfile worker_hosts --server-hostfile server_hosts \ 23 | --scheduler-ip 10.0.0.1 --scheduler-port 12345 \ 24 | --username root --env ENV1:1 --env ENV2:2 \ 25 | 'echo this is $DMLC_ROLE; python byteps/launcher/launch.py YOUR_COMMAND' 26 | ``` 27 | 28 | The script will automatically set up the necessary [environment variables](/docs/env.md) for you and launch the BytePS processes.
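29 | 30 | For reference, with the host files above, the environment that `dist_launcher.py` prepares for the command on worker 0 is roughly equivalent to the following. This is only a sketch based on the DMLC variables described in [docs/running.md](/docs/running.md); the exact set of variables the launcher exports may differ. 31 | 32 | ``` 33 | # Roughly what the launcher sets on 10.0.0.2 (worker 0) before running your command. 34 | export DMLC_ROLE=worker 35 | export DMLC_PS_ROOT_URI=10.0.0.1 36 | export DMLC_PS_ROOT_PORT=12345 37 | export DMLC_WORKER_ID=0 38 | export DMLC_NUM_WORKER=2 39 | export DMLC_NUM_SERVER=2 40 | export ENV1=1 41 | export ENV2=2 42 | ```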
-------------------------------------------------------------------------------- /pre_setup.py: -------------------------------------------------------------------------------- 1 | # For internal use. Please do not modify this file. 2 | 3 | def setup(): 4 | return 5 | 6 | def extra_make_option(): 7 | return "" 8 | 9 | ucx_path = "" 10 | -------------------------------------------------------------------------------- /tests/run_byteps_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="$(dirname $0)" 4 | 5 | #export PATH=~/.local/bin:$PATH 6 | export LD_LIBRARY_PATH=$UCX_HOME/lib:$LD_LIBRARY_PATH 7 | export DMLC_NUM_WORKER=${DMLC_NUM_WORKER:-2} 8 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 9 | export DMLC_PS_ROOT_URI=${DMLC_PS_ROOT_URI:-10.188.137.23} 10 | export DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-22210} 11 | export DMLC_NODE_HOST=${DMLC_NODE_HOST:-$DMLC_PS_ROOT_URI} 12 | export BYTEPS_LOCAL_RANK=0 13 | export BYTEPS_LOCAL_SIZE=1 14 | export NVIDIA_VISIBLE_DEVICES=0 15 | export BYTEPS_FORCE_DISTRIBUTED=1 16 | export BYTEPS_COMPRESSOR=signsgd 17 | export BYTEPS_PARTITION_BYTES=4096000 18 | #export BYTEPS_LOG_LEVEL=${BYTEPS_LOG_LEVEL:-TRACE} 19 | export BYTEPS_LOG_LEVEL=${BYTEPS_LOG_LEVEL:-DEBUG} 20 | export PS_VERBOSE=${PS_VERBOSE:-0} 21 | export TEST_TYPE=${TEST_TYPE:=torch} 22 | 23 | function cleanup() { 24 | rm -rf lr.s 25 | } 26 | 27 | trap cleanup EXIT 28 | 29 | pkill bpslaunch 30 | pkill python3 31 | 32 | if [ $1 == "scheduler" ]; then 33 | echo "Launch scheduler" 34 | DMLC_ROLE=scheduler python3 -c 'import byteps.server' 35 | exit 36 | fi 37 | 38 | 39 | export DMLC_WORKER_ID=$2 40 | if [ $1 == "server" ]; then 41 | echo "Launch server" 42 | DMLC_ROLE=server python3 -c 'import byteps.server' 43 | exit 44 | fi 45 | 46 | #export GDB=" gdb -ex run --args " 47 | export GDB=" " 48 | 49 | if [ $1 == "worker" ] || [ $1 == "joint" ]; then 50 | export DMLC_ROLE=$1 51 | if [ "$TEST_TYPE" == "torch" ]; then 52 | echo "TEST TORCH ..." 
53 | $GDB python3 $path/benchmark_byteps.py 54 | else 55 | echo "Error: unsupported $TEST_TYPE" 56 | exit 1 57 | fi 58 | fi -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import mxnet.ndarray as nd 3 | import numpy as np 4 | from numba import jit 5 | 6 | 7 | def fake_data(dtype="float32", batch_size=32, height=224, width=224, depth=3, num_classes=1000): 8 | image_list = [] 9 | label_list = [] 10 | for _ in range(8): 11 | image = mx.ndarray.random.normal(-1, 1, 12 | shape=[1, depth, height, width], 13 | dtype=dtype) 14 | label = mx.ndarray.random.randint(0, num_classes, [1, 1]) 15 | 16 | images = mx.ndarray.repeat(image, 128, axis=0) 17 | labels = mx.ndarray.repeat(label, 128, axis=0) 18 | # print(labels) 19 | image_list.append(images) 20 | label_list.append(labels) 21 | 22 | images = nd.concat(*image_list, dim=0) 23 | labels = nd.concat(*label_list, dim=0) 24 | # print(labels) 25 | fake_dataset = mx.gluon.data.ArrayDataset(images, labels) 26 | 27 | return mx.gluon.data.DataLoader(fake_dataset, batch_size=batch_size, num_workers=4, 28 | shuffle=True, last_batch='discard') 29 | 30 | 31 | @jit(nopython=True) 32 | def xorshift128p(state): 33 | t = state[0] 34 | s = state[1] 35 | state[0] = s 36 | t ^= t << np.uint64(23) 37 | t ^= t >> np.uint64(17) 38 | t ^= s ^ (s >> np.uint64(26)) 39 | state[1] = t 40 | return int(t + s) 41 | 42 | 43 | @jit(nopython=True) 44 | def bernoulli(p, state): 45 | t = p * np.iinfo(np.uint64).max 46 | r = np.array([xorshift128p(state) for _ in range(len(p))], dtype=np.float32) 47 | return r < t 48 | 49 | 50 | @jit(nopython=True) 51 | def randint(low, high, state): 52 | return xorshift128p(state) % (high - low) + low 53 | --------------------------------------------------------------------------------
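A quick usage sketch for the xorshift128+ helpers in `tests/utils.py` above (a hypothetical example, not part of the repo; it assumes mxnet and numba are installed since `utils.py` imports both). The `state` argument is a 2-element non-zero `uint64` array that the generator updates in place:

```python
import numpy as np
from utils import bernoulli, randint  # tests/utils.py

# Two non-zero 64-bit words seed the xorshift128+ state; the values are arbitrary.
state = np.array([12345, 67890], dtype=np.uint64)

p = np.full(8, 0.25, dtype=np.float32)  # per-element "keep" probabilities
mask = bernoulli(p, state)              # boolean mask, True with probability ~p
idx = randint(0, 1024, state)           # single integer drawn from [0, 1024)
print(mask, idx)
```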