├── .gitignore ├── .gitmodules ├── CHANGELOG.rst ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── NOTICE ├── README.md ├── byteps.exp ├── byteps.lds ├── byteps ├── __init__.py ├── __version__.py ├── _keras │ ├── __init__.py │ └── callbacks.py ├── common │ ├── __init__.py │ ├── common.cc │ ├── common.h │ ├── communicator.cc │ ├── communicator.h │ ├── compressor │ │ ├── common.h │ │ ├── compressor.cc │ │ ├── compressor.h │ │ ├── compressor_registry.cc │ │ ├── compressor_registry.h │ │ ├── error_feedback.cc │ │ ├── error_feedback.h │ │ ├── impl │ │ │ ├── dgc.cc │ │ │ ├── dgc.h │ │ │ ├── efsignsgd.cc │ │ │ ├── efsignsgd.h │ │ │ ├── nesterov_momentum.cc │ │ │ ├── nesterov_momentum.h │ │ │ ├── none.cc │ │ │ ├── none.h │ │ │ ├── onebit.cc │ │ │ ├── onebit.h │ │ │ ├── randomk.cc │ │ │ ├── randomk.h │ │ │ ├── test_error_feedback.cc │ │ │ ├── test_error_feedback.h │ │ │ ├── topk.cc │ │ │ ├── topk.h │ │ │ ├── vanilla_error_feedback.cc │ │ │ └── vanilla_error_feedback.h │ │ ├── momentum.cc │ │ ├── momentum.h │ │ └── utils.h │ ├── core_loops.cc │ ├── core_loops.h │ ├── cpu_reducer.cc │ ├── cpu_reducer.h │ ├── global.cc │ ├── global.h │ ├── half.h │ ├── logging.cc │ ├── logging.h │ ├── nccl_manager.cc │ ├── nccl_manager.h │ ├── operations.cc │ ├── operations.h │ ├── ready_table.cc │ ├── ready_table.h │ ├── scheduled_queue.cc │ ├── scheduled_queue.h │ ├── shared_memory.cc │ ├── shared_memory.h │ └── thread_pool.h ├── compressor_microbenchmark │ ├── Makefile │ ├── bandwidthtest.cu │ ├── common.h │ ├── compressor.h │ ├── dgc.cc │ ├── dgc.h │ ├── efsignSGD.cc │ ├── efsignSGD.h │ ├── log │ ├── onebit.cc │ ├── onebit.h │ ├── randomk.cc │ ├── randomk.h │ ├── test.cc │ ├── topk.cc │ └── topk.h ├── keras │ ├── __init__.py │ └── callbacks.py ├── misc │ ├── __init__.py │ └── imagenet18 │ │ └── __init__.py ├── mxnet │ ├── __init__.py │ ├── adapter.cc │ ├── adapter.h │ ├── compression.py │ ├── cuda_util.cc │ ├── cuda_util.h │ ├── ops.cc │ ├── ops.h │ ├── ops.py │ ├── ready_event.cc │ ├── ready_event.h │ ├── tensor_util.cc │ ├── tensor_util.h │ └── util.h ├── server │ ├── __init__.py │ ├── queue.h │ ├── server.cc │ └── server.h ├── sparse_cpu_microbenchmark │ ├── Makefile │ ├── log_145608 │ ├── log_145608_omp16 │ ├── log_439926 │ ├── log_439926_fp16 │ ├── log_439926_omp16 │ ├── log_439926_omp8 │ ├── test.cc │ └── test_fp16.cc ├── tensorflow │ ├── __init__.py │ ├── compression.py │ ├── distribute │ │ ├── __init__.py │ │ ├── cross_device_ops.py │ │ └── mirrored_strategy.py │ ├── keras │ │ ├── __init__.py │ │ └── callbacks.py │ ├── mergeComp │ │ ├── __init__.py │ │ ├── communicator │ │ │ ├── pool_allgather.py │ │ │ ├── pool_allreduce.py │ │ │ ├── pool_byteps.py │ │ │ └── pool_ps.py │ │ ├── compressor │ │ │ ├── pooldgc.py │ │ │ ├── poolefsignsgd.py │ │ │ ├── poolfp16.py │ │ │ ├── poolnone.py │ │ │ ├── poolonebit.py │ │ │ ├── poolqsgd.py │ │ │ ├── poolrandomk.py │ │ │ ├── poolsignsgd.py │ │ │ ├── poolsignum.py │ │ │ ├── poolterngrad.py │ │ │ └── pooltopk.py │ │ ├── helper.py │ │ ├── memory │ │ │ ├── dgc.py │ │ │ ├── memory_layer.py │ │ │ ├── memory_pool.py │ │ │ ├── none.py │ │ │ ├── pooldgc.py │ │ │ ├── poolnone.py │ │ │ ├── poolresidual.py │ │ │ └── residual.py │ │ ├── scheduler │ │ │ └── scheduler.py │ │ └── util.py │ ├── ops.cc │ ├── ops.h │ ├── ops.py │ ├── sparse_optimizer.py │ ├── synthetic_benchmark_tf2.py │ └── util.py └── torch │ ├── __init__.py │ ├── adapter.cc │ ├── adapter.h │ ├── benchmark_byteps.py │ ├── compression.py │ ├── cross_barrier.py │ ├── cuda_util.cc │ ├── cuda_util.h │ ├── examples │ ├── BERT 
│ │ ├── README.md │ │ ├── dataset │ │ │ ├── checkpoint │ │ │ │ ├── bert_base_config.json │ │ │ │ └── bert_config.json │ │ │ ├── squad │ │ │ │ └── v1.1 │ │ │ │ │ ├── dev-v1.1.json │ │ │ │ │ ├── evaluate-v1.1.py │ │ │ │ │ └── train-v1.1.json │ │ │ └── vocab.txt │ │ ├── modeling.py │ │ ├── optimization.py │ │ ├── run_baseline.sh │ │ ├── run_espresso.sh │ │ ├── run_squad.py │ │ ├── schedulers.py │ │ ├── tokenization.py │ │ └── utils.py │ ├── README.md │ ├── compressor_benchmark.py │ ├── extract.py │ ├── gpt-2 │ │ ├── README.md │ │ ├── gpt-2 │ │ │ ├── .circleci │ │ │ │ ├── config.yml │ │ │ │ └── deploy.sh │ │ │ ├── .coveragerc │ │ │ ├── .github │ │ │ │ ├── ISSUE_TEMPLATE │ │ │ │ │ ├── ---new-benchmark.md │ │ │ │ │ ├── --new-model-addition.md │ │ │ │ │ ├── bug-report.md │ │ │ │ │ ├── feature-request.md │ │ │ │ │ ├── migration.md │ │ │ │ │ └── question-help.md │ │ │ │ └── stale.yml │ │ │ ├── .gitignore │ │ │ ├── CONTRIBUTING.md │ │ │ ├── LICENSE │ │ │ ├── MANIFEST.in │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ ├── deploy_multi_version_doc.sh │ │ │ ├── docker │ │ │ │ └── Dockerfile │ │ │ ├── docs │ │ │ │ ├── Makefile │ │ │ │ ├── README.md │ │ │ │ └── source │ │ │ │ │ ├── _static │ │ │ │ │ ├── css │ │ │ │ │ │ ├── Calibre-Light.ttf │ │ │ │ │ │ ├── Calibre-Medium.otf │ │ │ │ │ │ ├── Calibre-Regular.otf │ │ │ │ │ │ ├── Calibre-Thin.otf │ │ │ │ │ │ ├── code-snippets.css │ │ │ │ │ │ └── huggingface.css │ │ │ │ │ └── js │ │ │ │ │ │ ├── custom.js │ │ │ │ │ │ └── huggingface_logo.svg │ │ │ │ │ ├── benchmarks.md │ │ │ │ │ ├── bertology.rst │ │ │ │ │ ├── conf.py │ │ │ │ │ ├── converting_tensorflow_models.rst │ │ │ │ │ ├── examples.md │ │ │ │ │ ├── glossary.rst │ │ │ │ │ ├── imgs │ │ │ │ │ ├── transformers_logo_name.png │ │ │ │ │ ├── warmup_constant_schedule.png │ │ │ │ │ ├── warmup_cosine_hard_restarts_schedule.png │ │ │ │ │ ├── warmup_cosine_schedule.png │ │ │ │ │ ├── warmup_cosine_warm_restarts_schedule.png │ │ │ │ │ └── warmup_linear_schedule.png │ │ │ │ │ ├── index.rst │ │ │ │ │ ├── installation.md │ │ │ │ │ ├── main_classes │ │ │ │ │ ├── configuration.rst │ │ │ │ │ ├── model.rst │ │ │ │ │ ├── optimizer_schedules.rst │ │ │ │ │ ├── processors.rst │ │ │ │ │ └── tokenizer.rst │ │ │ │ │ ├── migration.md │ │ │ │ │ ├── model_doc │ │ │ │ │ ├── albert.rst │ │ │ │ │ ├── auto.rst │ │ │ │ │ ├── bert.rst │ │ │ │ │ ├── camembert.rst │ │ │ │ │ ├── ctrl.rst │ │ │ │ │ ├── distilbert.rst │ │ │ │ │ ├── flaubert.rst │ │ │ │ │ ├── gpt.rst │ │ │ │ │ ├── gpt2.rst │ │ │ │ │ ├── roberta.rst │ │ │ │ │ ├── transformerxl.rst │ │ │ │ │ ├── xlm.rst │ │ │ │ │ ├── xlmroberta.rst │ │ │ │ │ └── xlnet.rst │ │ │ │ │ ├── model_sharing.md │ │ │ │ │ ├── multilingual.rst │ │ │ │ │ ├── notebooks.rst │ │ │ │ │ ├── pretrained_models.rst │ │ │ │ │ ├── quickstart.md │ │ │ │ │ ├── serialization.rst │ │ │ │ │ └── torchscript.rst │ │ │ ├── examples │ │ │ │ ├── README.md │ │ │ │ ├── benchmarks.py │ │ │ │ ├── contrib │ │ │ │ │ ├── README.md │ │ │ │ │ ├── run_camembert.py │ │ │ │ │ ├── run_openai_gpt.py │ │ │ │ │ ├── run_swag.py │ │ │ │ │ └── run_transfo_xl.py │ │ │ │ ├── distillation │ │ │ │ │ ├── README.md │ │ │ │ │ ├── distiller.py │ │ │ │ │ ├── grouped_batch_sampler.py │ │ │ │ │ ├── lm_seqs_dataset.py │ │ │ │ │ ├── requirements.txt │ │ │ │ │ ├── run_squad_w_distillation.py │ │ │ │ │ ├── train.py │ │ │ │ │ ├── training_configs │ │ │ │ │ │ ├── distilbert-base-multilingual-cased.json │ │ │ │ │ │ ├── distilbert-base-uncased.json │ │ │ │ │ │ ├── distilgpt2.json │ │ │ │ │ │ └── distilroberta-base.json │ │ │ │ │ └── utils.py │ │ │ │ ├── hans │ │ │ │ │ ├── 
hans_processors.py │ │ │ │ │ ├── test_hans.py │ │ │ │ │ └── utils_hans.py │ │ │ │ ├── mm-imdb │ │ │ │ │ ├── run_mmimdb.py │ │ │ │ │ └── utils_mmimdb.py │ │ │ │ ├── pplm │ │ │ │ │ ├── README.md │ │ │ │ │ ├── imgs │ │ │ │ │ │ ├── headfigure.png │ │ │ │ │ │ └── wooly.png │ │ │ │ │ ├── pplm_classification_head.py │ │ │ │ │ ├── run_pplm.py │ │ │ │ │ └── run_pplm_discrim_train.py │ │ │ │ ├── requirements.txt │ │ │ │ ├── run_bertology.py │ │ │ │ ├── run_generation.py │ │ │ │ ├── run_glue.py │ │ │ │ ├── run_lm_finetuning.py │ │ │ │ ├── run_lm_finetuning_bps.py │ │ │ │ ├── run_lm_finetuning_bytecomp.py │ │ │ │ ├── run_multiple_choice.py │ │ │ │ ├── run_ner.py │ │ │ │ ├── run_squad.py │ │ │ │ ├── run_tf_glue.py │ │ │ │ ├── run_tf_ner.py │ │ │ │ ├── run_xnli.py │ │ │ │ ├── summarization │ │ │ │ │ ├── README.md │ │ │ │ │ ├── configuration_bertabs.py │ │ │ │ │ ├── convert_bertabs_original_pytorch_checkpoint.py │ │ │ │ │ ├── modeling_bertabs.py │ │ │ │ │ ├── requirements.txt │ │ │ │ │ ├── run_summarization.py │ │ │ │ │ ├── test_utils_summarization.py │ │ │ │ │ └── utils_summarization.py │ │ │ │ ├── test_examples.py │ │ │ │ ├── tests_samples │ │ │ │ │ ├── .gitignore │ │ │ │ │ ├── MRPC │ │ │ │ │ │ ├── dev.tsv │ │ │ │ │ │ └── train.tsv │ │ │ │ │ └── SQUAD │ │ │ │ │ │ ├── dev-v2.0.json │ │ │ │ │ │ └── train-v2.0.json │ │ │ │ ├── utils_multiple_choice.py │ │ │ │ └── utils_ner.py │ │ │ ├── hubconf.py │ │ │ ├── notebooks │ │ │ │ ├── Comparing-PT-and-TF-models.ipynb │ │ │ │ ├── Comparing-TF-and-PT-models-MLM-NSP.ipynb │ │ │ │ ├── Comparing-TF-and-PT-models-SQuAD.ipynb │ │ │ │ └── Comparing-TF-and-PT-models.ipynb │ │ │ ├── setup.cfg │ │ │ ├── setup.py │ │ │ ├── src │ │ │ │ └── transformers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── commands │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── convert.py │ │ │ │ │ ├── download.py │ │ │ │ │ ├── run.py │ │ │ │ │ ├── serving.py │ │ │ │ │ ├── train.py │ │ │ │ │ └── user.py │ │ │ │ │ ├── configuration_albert.py │ │ │ │ │ ├── configuration_auto.py │ │ │ │ │ ├── configuration_bert.py │ │ │ │ │ ├── configuration_camembert.py │ │ │ │ │ ├── configuration_ctrl.py │ │ │ │ │ ├── configuration_distilbert.py │ │ │ │ │ ├── configuration_flaubert.py │ │ │ │ │ ├── configuration_gpt2.py │ │ │ │ │ ├── configuration_mmbt.py │ │ │ │ │ ├── configuration_openai.py │ │ │ │ │ ├── configuration_roberta.py │ │ │ │ │ ├── configuration_t5.py │ │ │ │ │ ├── configuration_transfo_xl.py │ │ │ │ │ ├── configuration_utils.py │ │ │ │ │ ├── configuration_xlm.py │ │ │ │ │ ├── configuration_xlm_roberta.py │ │ │ │ │ ├── configuration_xlnet.py │ │ │ │ │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ │ │ │ │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_pytorch_checkpoint_to_tf2.py │ │ │ │ │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ │ │ │ │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── metrics │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── squad_metrics.py │ │ │ │ │ └── processors │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── glue.py │ │ │ │ │ │ ├── squad.py │ │ │ │ │ │ ├── utils.py │ │ │ │ │ │ └── xnli.py │ │ 
│ │ │ ├── file_utils.py │ │ │ │ │ ├── hf_api.py │ │ │ │ │ ├── modelcard.py │ │ │ │ │ ├── modeling_albert.py │ │ │ │ │ ├── modeling_auto.py │ │ │ │ │ ├── modeling_bert.py │ │ │ │ │ ├── modeling_camembert.py │ │ │ │ │ ├── modeling_ctrl.py │ │ │ │ │ ├── modeling_distilbert.py │ │ │ │ │ ├── modeling_encoder_decoder.py │ │ │ │ │ ├── modeling_flaubert.py │ │ │ │ │ ├── modeling_gpt2.py │ │ │ │ │ ├── modeling_mmbt.py │ │ │ │ │ ├── modeling_openai.py │ │ │ │ │ ├── modeling_roberta.py │ │ │ │ │ ├── modeling_t5.py │ │ │ │ │ ├── modeling_tf_albert.py │ │ │ │ │ ├── modeling_tf_auto.py │ │ │ │ │ ├── modeling_tf_bert.py │ │ │ │ │ ├── modeling_tf_camembert.py │ │ │ │ │ ├── modeling_tf_ctrl.py │ │ │ │ │ ├── modeling_tf_distilbert.py │ │ │ │ │ ├── modeling_tf_gpt2.py │ │ │ │ │ ├── modeling_tf_openai.py │ │ │ │ │ ├── modeling_tf_pytorch_utils.py │ │ │ │ │ ├── modeling_tf_roberta.py │ │ │ │ │ ├── modeling_tf_t5.py │ │ │ │ │ ├── modeling_tf_transfo_xl.py │ │ │ │ │ ├── modeling_tf_transfo_xl_utilities.py │ │ │ │ │ ├── modeling_tf_utils.py │ │ │ │ │ ├── modeling_tf_xlm.py │ │ │ │ │ ├── modeling_tf_xlm_roberta.py │ │ │ │ │ ├── modeling_tf_xlnet.py │ │ │ │ │ ├── modeling_transfo_xl.py │ │ │ │ │ ├── modeling_transfo_xl_utilities.py │ │ │ │ │ ├── modeling_utils.py │ │ │ │ │ ├── modeling_xlm.py │ │ │ │ │ ├── modeling_xlm_roberta.py │ │ │ │ │ ├── modeling_xlnet.py │ │ │ │ │ ├── optimization.py │ │ │ │ │ ├── optimization_tf.py │ │ │ │ │ ├── pipelines.py │ │ │ │ │ ├── tokenization_albert.py │ │ │ │ │ ├── tokenization_auto.py │ │ │ │ │ ├── tokenization_bert.py │ │ │ │ │ ├── tokenization_bert_japanese.py │ │ │ │ │ ├── tokenization_camembert.py │ │ │ │ │ ├── tokenization_ctrl.py │ │ │ │ │ ├── tokenization_distilbert.py │ │ │ │ │ ├── tokenization_flaubert.py │ │ │ │ │ ├── tokenization_gpt2.py │ │ │ │ │ ├── tokenization_openai.py │ │ │ │ │ ├── tokenization_roberta.py │ │ │ │ │ ├── tokenization_t5.py │ │ │ │ │ ├── tokenization_transfo_xl.py │ │ │ │ │ ├── tokenization_utils.py │ │ │ │ │ ├── tokenization_xlm.py │ │ │ │ │ ├── tokenization_xlm_roberta.py │ │ │ │ │ └── tokenization_xlnet.py │ │ │ ├── templates │ │ │ │ ├── adding_a_new_example_script │ │ │ │ │ ├── README.md │ │ │ │ │ ├── run_xxx.py │ │ │ │ │ └── utils_xxx.py │ │ │ │ └── adding_a_new_model │ │ │ │ │ ├── README.md │ │ │ │ │ ├── configuration_xxx.py │ │ │ │ │ ├── convert_xxx_original_tf_checkpoint_to_pytorch.py │ │ │ │ │ ├── modeling_tf_xxx.py │ │ │ │ │ ├── modeling_xxx.py │ │ │ │ │ ├── tests │ │ │ │ │ ├── test_modeling_tf_xxx.py │ │ │ │ │ ├── test_modeling_xxx.py │ │ │ │ │ └── test_tokenization_xxx.py │ │ │ │ │ └── tokenization_xxx.py │ │ │ ├── tests │ │ │ │ ├── __init__.py │ │ │ │ ├── fixtures │ │ │ │ │ ├── dummy-config.json │ │ │ │ │ ├── empty.txt │ │ │ │ │ ├── input.txt │ │ │ │ │ ├── sample_text.txt │ │ │ │ │ └── test_sentencepiece.model │ │ │ │ ├── test_configuration_auto.py │ │ │ │ ├── test_configuration_common.py │ │ │ │ ├── test_doc_samples.py │ │ │ │ ├── test_hf_api.py │ │ │ │ ├── test_model_card.py │ │ │ │ ├── test_modeling_albert.py │ │ │ │ ├── test_modeling_auto.py │ │ │ │ ├── test_modeling_bert.py │ │ │ │ ├── test_modeling_common.py │ │ │ │ ├── test_modeling_ctrl.py │ │ │ │ ├── test_modeling_distilbert.py │ │ │ │ ├── test_modeling_encoder_decoder.py │ │ │ │ ├── test_modeling_gpt2.py │ │ │ │ ├── test_modeling_openai.py │ │ │ │ ├── test_modeling_roberta.py │ │ │ │ ├── test_modeling_t5.py │ │ │ │ ├── test_modeling_tf_albert.py │ │ │ │ ├── test_modeling_tf_auto.py │ │ │ │ ├── test_modeling_tf_bert.py │ │ │ │ ├── test_modeling_tf_common.py │ │ │ │ ├── 
test_modeling_tf_ctrl.py │ │ │ │ ├── test_modeling_tf_distilbert.py │ │ │ │ ├── test_modeling_tf_gpt2.py │ │ │ │ ├── test_modeling_tf_openai_gpt.py │ │ │ │ ├── test_modeling_tf_roberta.py │ │ │ │ ├── test_modeling_tf_t5.py │ │ │ │ ├── test_modeling_tf_transfo_xl.py │ │ │ │ ├── test_modeling_tf_xlm.py │ │ │ │ ├── test_modeling_tf_xlnet.py │ │ │ │ ├── test_modeling_transfo_xl.py │ │ │ │ ├── test_modeling_xlm.py │ │ │ │ ├── test_modeling_xlnet.py │ │ │ │ ├── test_optimization.py │ │ │ │ ├── test_optimization_tf.py │ │ │ │ ├── test_pipelines.py │ │ │ │ ├── test_tokenization_albert.py │ │ │ │ ├── test_tokenization_auto.py │ │ │ │ ├── test_tokenization_bert.py │ │ │ │ ├── test_tokenization_bert_japanese.py │ │ │ │ ├── test_tokenization_common.py │ │ │ │ ├── test_tokenization_ctrl.py │ │ │ │ ├── test_tokenization_distilbert.py │ │ │ │ ├── test_tokenization_gpt2.py │ │ │ │ ├── test_tokenization_openai.py │ │ │ │ ├── test_tokenization_roberta.py │ │ │ │ ├── test_tokenization_t5.py │ │ │ │ ├── test_tokenization_transfo_xl.py │ │ │ │ ├── test_tokenization_utils.py │ │ │ │ ├── test_tokenization_xlm.py │ │ │ │ ├── test_tokenization_xlnet.py │ │ │ │ └── utils.py │ │ │ ├── transformers-cli │ │ │ ├── utils │ │ │ │ ├── download_glue_data.py │ │ │ │ └── link_tester.py │ │ │ └── valohai.yaml │ │ ├── run_baseline.sh │ │ ├── run_espresso.sh │ │ └── run_prepare.sh │ ├── json_parser.py │ ├── lstm │ │ ├── README.md │ │ ├── data.py │ │ ├── getdata.sh │ │ ├── main.py │ │ ├── model.py │ │ ├── run_baseline.sh │ │ ├── run_espresso.sh │ │ ├── splitcross.py │ │ └── utils.py │ ├── resnet101 │ │ ├── README.md │ │ ├── main.py │ │ ├── run_baseline.sh │ │ └── run_espresso.sh │ ├── run_nvlink_models.sh │ ├── run_pcie_models.sh │ ├── test_compressor_cpu.py │ ├── ugatit │ │ ├── LICENSE │ │ ├── README.md │ │ ├── UGATIT.py │ │ ├── assets │ │ │ ├── ablation.png │ │ │ ├── discriminator.png │ │ │ ├── generator.png │ │ │ ├── kid.png │ │ │ ├── teaser.png │ │ │ └── user_study.png │ │ ├── dataset.py │ │ ├── main.py │ │ ├── networks.py │ │ ├── requirements.txt │ │ ├── run_baseline.sh │ │ ├── run_espresso.sh │ │ └── utils.py │ └── vgg16 │ │ ├── README.md │ │ ├── main.py │ │ ├── run_baseline.sh │ │ └── run_espresso.sh │ ├── handle_manager.cc │ ├── handle_manager.h │ ├── json_parser.py │ ├── launch_bps.py │ ├── mergeComp │ ├── __init__.py │ ├── communicator │ │ ├── DDPbackend.py │ │ ├── ddp_allgather.py │ │ ├── ddp_allgather_twolayer.py │ │ ├── ddp_fp16.py │ │ ├── ddp_hipress.py │ │ ├── ddp_hipress_resnet.py │ │ ├── global_comm_comp.py │ │ ├── inter_comm_comp.py │ │ ├── intra_comm_comp.py │ │ ├── intra_comm_comp_byteps.py │ │ ├── pool_allreduce.py │ │ └── pool_bytecomp.py │ ├── compressor │ │ ├── pooldgc.py │ │ ├── poolefsignsgd.py │ │ ├── poolfp16.py │ │ ├── poolint8.py │ │ ├── poolnone.py │ │ ├── poolonebit.py │ │ ├── poolqsgd.py │ │ ├── poolrandomk.py │ │ ├── poolsignsgd.py │ │ ├── poolsignum.py │ │ ├── poolterngrad.py │ │ └── pooltopk.py │ ├── helper.py │ ├── memory │ │ ├── dgc.py │ │ ├── efsignsgd.py │ │ ├── memory_layer.py │ │ ├── memory_pool.py │ │ ├── none.py │ │ ├── pooldgc.py │ │ ├── poolnone.py │ │ ├── poolresidual.py │ │ ├── residual.py │ │ └── topk.py │ ├── scheduler │ │ ├── README.md │ │ ├── bert │ │ │ └── nvlink_randomk_cpu.json │ │ ├── gpt2 │ │ │ └── nvlink_efsignsgd_cpu.json │ │ ├── lstm │ │ │ └── pcie_efsignsgd_two_cpu.json │ │ ├── model_tensor.py │ │ ├── resnet101 │ │ │ └── pcie_dgc_cpu.json │ │ ├── run_all_models.sh │ │ ├── scheduler.py │ │ ├── simulator_espresso.py │ │ ├── simulator_logs │ │ │ ├── bert │ │ │ │ └── 
pcie_randomk_cpu.json │ │ │ ├── gpt2 │ │ │ │ └── pcie_efsignsgd_cpu.json │ │ │ ├── lstm │ │ │ │ └── pcie_efsignsgd_two_cpu.json │ │ │ ├── resnet101 │ │ │ │ └── pcie_dgc_two_cpu.json │ │ │ ├── ugatit │ │ │ │ └── pcie_dgc_cpu.json │ │ │ └── vgg16 │ │ │ │ └── pcie_randomk_two_cpu.json │ │ ├── ugatit │ │ │ ├── nvlink_dgc_cpu.json │ │ │ └── nvlink_randomk_cpu.json │ │ └── vgg16 │ │ │ ├── pcie_efsignsgd_cpu.json │ │ │ └── pcie_randomk_two_cpu.json │ ├── util.py │ └── util_cpu.py │ ├── ops.cc │ ├── ops.h │ ├── ops.py │ ├── parallel │ ├── __init__.py │ └── distributed.py │ ├── ready_event.cc │ ├── ready_event.h │ ├── requirements.txt │ ├── run_byteps_ddp.sh │ ├── run_byteps_test_cpu.sh │ ├── run_byteps_test_gpu.sh │ ├── sparse_optimizer.py │ ├── test_ddp.py │ ├── test_torch.py │ └── utils.py ├── docker ├── Dockerfile └── README.md ├── docs ├── DistributedDataParallel.md ├── MirroredStrategy.md ├── architecture.md ├── best-practice.md ├── cross-barrier.md ├── env.md ├── faq.md ├── gradient-compression.md ├── performance.md ├── rationale.md ├── run-on-k8s.md ├── running.md ├── step-by-step-tutorial.md ├── timeline.md └── troubleshooting.md ├── espresso_EuroSys23.pdf ├── espresso_EuroSys_AE.pdf ├── example ├── README.md ├── keras │ ├── keras_imagenet_resnet50.py │ ├── keras_mnist.py │ ├── keras_mnist_advanced.py │ └── keras_synthetic_benchmark_tf2.py ├── mxnet │ ├── common │ │ ├── __init__.py │ │ ├── data.py │ │ ├── data_byteps.py │ │ ├── find_mxnet.py │ │ ├── fit.py │ │ ├── fit_byteps.py │ │ ├── modelzoo.py │ │ └── util.py │ ├── data │ │ ├── caltech256.sh │ │ └── imagenet1k-val.sh │ ├── symbols │ │ ├── README.md │ │ ├── __init__.py │ │ ├── alexnet.py │ │ ├── googlenet.py │ │ ├── inception-bn.py │ │ ├── inception-resnet-v2.py │ │ ├── inception-v3.py │ │ ├── inception-v4.py │ │ ├── lenet.py │ │ ├── mlp.py │ │ ├── mobilenet.py │ │ ├── mobilenetv2.py │ │ ├── resnet-v1.py │ │ ├── resnet.py │ │ ├── resnetv1.py │ │ ├── resnext.py │ │ └── vgg.py │ ├── train_cifar100_byteps_gc.py │ ├── train_gluon_imagenet_byteps_gc.py │ ├── train_gluon_mnist_byteps.py │ ├── train_gluon_mnist_byteps_gc.py │ └── train_imagenet_byteps.py ├── pytorch │ ├── benchmark_byteps.py │ ├── benchmark_byteps_ddp.py │ ├── benchmark_cross_barrier_byteps.py │ ├── elastic_benchmark_byteps.py │ ├── mnist-distributed.py │ ├── test_bytecomp_pytorch.py │ ├── train_imagenet_resnet50_byteps.py │ ├── train_imagenet_resnet_byteps_ddp.py │ └── train_mnist_byteps.py └── tensorflow │ ├── synthetic_benchmark.py │ ├── synthetic_benchmark_tf2.py │ ├── tensorflow2_keras_mnist.py │ ├── tensorflow2_mnist.py │ ├── tensorflow2_mnist_bps_MirroredStrategy.py │ ├── tensorflow_keras_mnist.py │ └── tensorflow_mnist.py ├── install.sh ├── launcher ├── README.md ├── dist_launcher.py └── launch.py ├── pre_setup.py ├── setup.py └── tests ├── meta_test.py ├── run_byteps_test.sh ├── test_dithering.py ├── test_mxnet.py ├── test_onebit.py ├── test_randomk.py ├── test_tensorflow_keras.py ├── test_topk.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # vscode 2 | .vscode 3 | *.gz 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | bin/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually 
these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | 111 | # pycharm 112 | .idea 113 | 114 | # mac 115 | .DS_Store 116 | 117 | # for development 118 | scripts/ 119 | exps/ 120 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rdparty/ps-lite"] 2 | path = 3rdparty/ps-lite 3 | url = https://github.com/bytedance/ps-lite 4 | branch = byteps 5 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2 | Changelog for BytePS 3 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 4 | 0.2.4 (2020-06) 5 | ------------------ 6 | * Fix compatibility issue with tf2 + standalone keras 7 | * Add support for tensorflow.keras 8 | * Improve robustness of broadcast 9 | 10 | 11 | 0.2.3 (2020-05) 12 | ------------------ 13 | * Add DistributedDataParallel module for PyTorch 14 | * Fix the problem of different CPU tensor using the same name 15 | * Add skip_synchronize api for PyTorch 16 | * Add the option for lazy/non-lazy init 17 | 18 | 19 | 0.2.0 (2020-02) 20 | ------------------ 21 | * Largely improve RDMA performance by enforcing page aligned memory. 22 | * Add IPC support for RDMA. Now support colocating servers and workers without sacrificing much performance. 23 | * Fix a hanging bug in BytePS server. 24 | * Fix RDMA-related segmentation fault problem during fork() (e.g., used by PyTorch data loader). 25 | * New feature: Enable mixing use of colocate and non-colocate servers, along with a smart tensor allocation strategy. 26 | * New feature: Add ``bpslaunch`` as the command to launch tasks. 27 | * Add support for pip install: ``pip3 install byteps`` 28 | 29 | 30 | 0.1.0 (2019-12) 31 | ------------------ 32 | * First official release. 33 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guidelines 2 | 3 | First of all, thanks for taking the time to contribute! 
4 | 5 | Please refer to the following guidelines to contribute new functionality or bug fixes: 6 | 7 | 1. Use [autopep8](https://github.com/hhatto/autopep8) to format the Python code. 8 | 2. Use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to format C++ code. Changes to BytePS C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). 9 | 3. Add unit tests for any new code you write. 10 | 4. Run unit tests in both CI and GPU environments. 11 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include */* LICENSE byteps.lds byteps.exp 2 | prune .git 3 | prune dist 4 | recursive-include * *.cc *.h 5 | prune __pycache__ 6 | prune 3rdparty 7 | graft 3rdparty/ps-lite 8 | prune 3rdparty/ps-lite/build 9 | prune 3rdparty/ps-lite/deps 10 | exclude 3rdparty/ps-lite/tests/test_benchmark 11 | exclude 3rdparty/ps-lite/tests/test_benchmark.d 12 | exclude 3rdparty/ps-lite/tests/test_ipc_benchmark 13 | exclude 3rdparty/ps-lite/tests/test_ipc_benchmark.d 14 | 15 | include pre_setup.py pre_setup_local.py zeromq-4.1.4.tar.gz ucx.zip 16 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | BytePS includes derived work from the following: 2 | 3 | Horovod 4 | Copyright 2018 Uber Technologies, Inc. 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | 18 | kennethreitz/setup.py 19 | Copyright 2019 Kenneth Reitz 20 | 21 | Permission is hereby granted, free of charge, to any person obtaining 22 | a copy of this software and associated documentation files (the 23 | "Software"), to deal in the Software without restriction, including 24 | without limitation the rights to use, copy, modify, merge, publish, 25 | distribute, sublicense, and/or sell copies of the Software, and to 26 | permit persons to whom the Software is furnished to do so, subject to 27 | the following conditions: 28 | 29 | The above copyright notice and this permission notice shall be included 30 | in all copies or substantial portions of the Software. 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 33 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 34 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 35 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 36 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 37 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 38 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reproduce Espresso Experiments 2 | 3 | This repository contains the system code and scripts that help run the Espresso experiments from our EuroSys '23 paper. 4 | 5 | ## Prerequisites 6 | 7 | - CUDA == 11.1 8 | - PyTorch >= 1.8.0 9 | - NCCL >= 2.8.3 10 | 11 | ## What machines? 12 | 13 | All of our experiments require multiple GPU machines. 14 | We expect each GPU machine to have eight V100 GPUs, each with 32GB of GPU memory. 15 | NVLink-based (100Gbps TCP) and PCIe-only (25Gbps TCP) GPU machines are needed for Figure 11 and Figure 12. 16 | If these resources are not available, it is fine to run the experiments with 4 GPU machines and to use 25Gbps TCP for the NVLink-based experiments. 17 | It is also fine to run the PCIe-only experiments on NVLink-based machines with `export NCCL_P2P_DISABLE=1`. 18 | 19 | 20 | ## Installation 21 | 22 | Install Espresso on each GPU machine. Please make sure the machines can successfully install [BytePS](https://github.com/bytedance/byteps). 23 | 24 | ```bash 25 | # In case you need to install PyTorch 26 | pip3 install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html 27 | 28 | git clone https://github.com/zhuangwang93/Espresso.git --recursive 29 | cd Espresso 30 | 31 | # install dependencies and datasets for training 32 | bash install.sh 33 | ``` 34 | 35 | ## AE file 36 | 37 | Follow the instructions in [espresso_EuroSys_AE.pdf](https://github.com/zhuangwang93/Espresso/blob/master/espresso_EuroSys_AE.pdf) to reproduce the experimental results in our paper. 38 | 39 | 40 | ## End-to-end training throughput 41 | 42 | The DNN models used by Espresso are in [examples](https://github.com/zhuangwang93/Espresso/tree/master/byteps/torch/examples). 43 | There are six DNN models, and each of them has a folder under `examples/`. 44 | 45 | Follow the instructions in this directory to reproduce the experimental results in Figure 11 and Figure 12. 46 | -------------------------------------------------------------------------------- /byteps.exp: -------------------------------------------------------------------------------- 1 | *byteps* 2 | # PyTorch binding 3 | *PyInit* 4 | *initc_lib* 5 | -------------------------------------------------------------------------------- /byteps.lds: -------------------------------------------------------------------------------- 1 | { 2 | global: 3 | *byteps*; 4 | # PyTorch binding 5 | *PyInit*; 6 | *initc_lib*; 7 | local: *; 8 | }; 9 | -------------------------------------------------------------------------------- /byteps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/__init__.py -------------------------------------------------------------------------------- /byteps/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 2, 5) 2 | 3 | __version__ = '.'.join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /byteps/common/compressor/compressor_registry.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved.
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H 17 | #define BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H 18 | 19 | #include "compressor.h" 20 | #include "utils.h" 21 | 22 | namespace byteps { 23 | namespace common { 24 | namespace compressor { 25 | 26 | class CompressorRegistry { 27 | public: 28 | // constructor of compressor 29 | using ctor_t = std::function( 30 | const kwargs_t& kwargs, size_t size, DataType dtype)>; 31 | 32 | using map_t = std::unordered_map; 33 | 34 | struct Register { 35 | Register(std::string name, ctor_t ctor); 36 | }; 37 | 38 | static ctor_t Find(const std::string& name); 39 | 40 | static std::unique_ptr Create(const kwargs_t& kwargs, size_t size, 41 | DataType dtype); 42 | 43 | private: 44 | static map_t _ctor_map; 45 | 46 | CompressorRegistry() = delete; 47 | ~CompressorRegistry() = delete; 48 | }; 49 | 50 | } // namespace compressor 51 | } // namespace common 52 | } // namespace byteps 53 | 54 | #endif // BYTEPS_COMPRESSOR_COMPRESSOR_REGISTRY_H -------------------------------------------------------------------------------- /byteps/common/compressor/error_feedback.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "error_feedback.h" 17 | 18 | namespace byteps { 19 | namespace common { 20 | namespace compressor { 21 | 22 | tensor_t ErrorFeedback::Compress(tensor_t grad) { 23 | 24 | // 1. grad <- grad + error 25 | UpdateGradient(grad); 26 | 27 | // 2. c <- Compress(grad) 28 | auto compressed = _cptr->Compress(grad); 29 | 30 | // 3. 
e <- grad - Decompress(c) 31 | // postpone the UpdateError() to avoid the blocking 32 | UpdateError(grad, compressed); 33 | 34 | return compressed; 35 | } 36 | 37 | tensor_t ErrorFeedback::Decompress(tensor_t compressed) { 38 | // directly forward to internal compressor 39 | return _cptr->Decompress(compressed); 40 | } 41 | 42 | 43 | void ErrorFeedback::UpdateError(tensor_t corrected, tensor_t compressed) { 44 | tensor_t error{_error.get(), _size, corrected.dtype}; 45 | _cptr->FastUpdateError(error, corrected, compressed); 46 | } 47 | 48 | } // namespace compressor 49 | } // namespace common 50 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/common/compressor/impl/nesterov_momentum.h: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H 17 | #define BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H 18 | 19 | #include "../momentum.h" 20 | 21 | namespace byteps { 22 | namespace common { 23 | namespace compressor { 24 | 25 | /*! 26 | * \brief Nesterov Momentum Compressor 27 | * 28 | * paper: A method for solving the convex programming problem with convergence 29 | * rate $O (1/k^2)$ 30 | * 31 | * m_t <- \mu m_{t-1} + g_t 32 | * g_t <- \mu m_t + g_t 33 | * 34 | */ 35 | class NesterovMomentumCompressor : public Momentum { 36 | public: 37 | NesterovMomentumCompressor(size_t size, DataType dtype, 38 | std::unique_ptr cptr, float mu) 39 | : Momentum(size, dtype, std::move(cptr), mu){}; 40 | virtual ~NesterovMomentumCompressor() = default; 41 | 42 | protected: 43 | void UpdateMom(tensor_t grad) override; 44 | void UpdateGradient(tensor_t grad) override; 45 | }; 46 | 47 | } // namespace compressor 48 | } // namespace common 49 | } // namespace byteps 50 | 51 | #endif // BYTEPS_COMPRESSOR_IMPL_NESTEROV_MOMENTUM_H -------------------------------------------------------------------------------- /byteps/common/compressor/impl/test_error_feedback.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // ============================================================================= 15 | 16 | #include 17 | 18 | #include "../compressor_registry.h" 19 | #include "test_error_feedback.h" 20 | 21 | namespace byteps { 22 | namespace common { 23 | namespace compressor { 24 | namespace { 25 | CompressorRegistry::Register reg( 26 | "test_ef", 27 | [](const kwargs_t& kwargs, size_t size, 28 | DataType dtype) -> std::unique_ptr { 29 | // register cptr 30 | auto kwargs_clone = kwargs; 31 | kwargs_clone.erase("ef_type"); 32 | auto cptr = CompressorRegistry::Create(kwargs_clone, size, dtype); 33 | BPS_CHECK_NE(cptr, nullptr); 34 | return std::unique_ptr( 35 | new TestErrorFeedbackCompressor(size, dtype, std::move(cptr))); 36 | }); 37 | } 38 | 39 | TestErrorFeedbackCompressor::TestErrorFeedbackCompressor( 40 | size_t size, DataType dtype, std::unique_ptr cptr) 41 | : ErrorFeedback(size, dtype, std::move(cptr)) {} 42 | 43 | void TestErrorFeedbackCompressor::UpdateGradient(tensor_t grad) { 44 | this->_cpu_reducer->sum(grad.data, _error.get(), grad.size, 45 | static_cast(grad.dtype)); 46 | } 47 | 48 | } // namespace compressor 49 | } // namespace common 50 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/common/compressor/momentum.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "momentum.h" 17 | 18 | namespace byteps { 19 | namespace common { 20 | namespace compressor { 21 | 22 | tensor_t Momentum::Compress(tensor_t grad) { 23 | // 1. m_t = \mu * m_{t-1} + g_t 24 | UpdateMom(grad); 25 | 26 | // 2. p_t = \mu m_t + g_t 27 | UpdateGradient(grad); 28 | 29 | // 3. compress 30 | return _cptr->Compress(grad); 31 | } 32 | 33 | tensor_t Momentum::Decompress(tensor_t compressed) { 34 | // directly forward to internal compressor 35 | return _cptr->Decompress(compressed); 36 | } 37 | 38 | } // namespace compressor 39 | } // namespace common 40 | } // namespace byteps -------------------------------------------------------------------------------- /byteps/common/core_loops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_CORE_LOOPS_H 17 | #define BYTEPS_CORE_LOOPS_H 18 | 19 | namespace byteps { 20 | namespace common { 21 | 22 | void CoordinateNoneLoop(); 23 | 24 | void CoordinateReduceLoop(); 25 | void CoordinateIntraReduceLoop(); 26 | void CoordinateIntraGatherLoop(); 27 | void CoordinateIntraBroadcastLoop(); 28 | void CoordinateIntraReducescatterLoop(); 29 | void CoordinateIntraAllgatherLoop(); 30 | void CoordinateIntraAlltoallLoop(); 31 | 32 | void CoordinateBroadcastLoop(); 33 | 34 | void CoordinatePushLoop(); 35 | 36 | void PcieReduceLoop(); 37 | 38 | void RootNcclLoop(); 39 | 40 | void NonRootNcclLoop(); 41 | 42 | void SyncNcclLoop(); 43 | 44 | void CopyDevice2HostLoop(); 45 | void CompressCopyDevice2HostLoop(); 46 | 47 | void CompressLoop(); 48 | 49 | void PushLoop(); 50 | 51 | void PullLoop(); 52 | 53 | void DecompressLoop(); 54 | 55 | void RootCopyHost2DeviceLoop(); 56 | 57 | void NonRootCopyListenLoop(); 58 | 59 | void NonRootCopyHost2DeviceLoop(); 60 | 61 | } // namespace common 62 | } // namespace byteps 63 | 64 | #endif // BYTEPS_CORE_LOOPS_H 65 | -------------------------------------------------------------------------------- /byteps/common/ready_table.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #include "ready_table.h" 17 | 18 | #include "logging.h" 19 | 20 | namespace byteps { 21 | namespace common { 22 | 23 | // below are methods for accessing/modifying the _ready_table 24 | bool ReadyTable::IsKeyReady(uint64_t key) { 25 | std::lock_guard<std::mutex> lock(_table_mutex); 26 | return _ready_table[key] == (_ready_count); 27 | } 28 | 29 | int ReadyTable::AddReadyCount(uint64_t key) { 30 | std::lock_guard<std::mutex> lock(_table_mutex); 31 | BPS_CHECK_LT(_ready_table[key], _ready_count) 32 | << _table_name << ": " << _ready_table[key] << ", " << (_ready_count); 33 | return ++_ready_table[key]; 34 | } 35 | 36 | int ReadyTable::SetReadyCount(uint64_t key, int cnt) { 37 | std::lock_guard<std::mutex> lock(_table_mutex); 38 | return _ready_table[key] = cnt; 39 | } 40 | 41 | void ReadyTable::ClearReadyCount(uint64_t key) { 42 | std::lock_guard<std::mutex> lock(_table_mutex); 43 | _ready_table[key] = 0; 44 | } 45 | 46 | } // namespace common 47 | } // namespace byteps 48 | -------------------------------------------------------------------------------- /byteps/common/ready_table.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved.
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_READY_TABLE_H 17 | #define BYTEPS_READY_TABLE_H 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace byteps { 25 | namespace common { 26 | 27 | class ReadyTable { 28 | public: 29 | ReadyTable(int ready_count, const char* name) { 30 | _ready_count = ready_count; 31 | _table_name = std::string(name); 32 | } 33 | // methods to access or modify the _ready_table 34 | bool IsKeyReady(uint64_t key); 35 | int AddReadyCount(uint64_t key); 36 | int SetReadyCount(uint64_t key, int cnt); 37 | void ClearReadyCount(uint64_t key); 38 | 39 | private: 40 | // (key, ready_signal_count) pair, only valid for root device 41 | std::unordered_map _ready_table; 42 | // use this mutex to access/modify the _ready_table 43 | std::mutex _table_mutex; 44 | int _ready_count; 45 | std::string _table_name; 46 | }; 47 | 48 | } // namespace common 49 | } // namespace byteps 50 | 51 | #endif // BYTEPS_READY_TABLE_H 52 | -------------------------------------------------------------------------------- /byteps/common/scheduled_queue.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_SCHEDULED_QUEUE_H 17 | #define BYTEPS_SCHEDULED_QUEUE_H 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "common.h" 24 | #include "ready_table.h" 25 | 26 | namespace byteps { 27 | namespace common { 28 | 29 | class BytePSScheduledQueue { 30 | public: 31 | BytePSScheduledQueue(QueueType type); 32 | QueueType getQueueType() { return _qt; } 33 | void addTask(std::shared_ptr); 34 | void recorderTs(std::shared_ptr); 35 | std::shared_ptr getTask(); 36 | std::shared_ptr getTask(uint64_t key); 37 | uint32_t pendingSize(); 38 | void reportFinish(int size); 39 | void reset(uint64_t key, int cnt); 40 | 41 | private: 42 | // TODO: use priority queue or heap 43 | std::vector> _sq; 44 | std::mutex _mutex; 45 | uint64_t _credits; 46 | bool _is_scheduled; 47 | QueueType _qt; 48 | ReadyTable *_rt; 49 | }; 50 | 51 | } // namespace common 52 | } // namespace byteps 53 | 54 | #endif // BYTEPS_SCHEDULED_QUEUE_H 55 | -------------------------------------------------------------------------------- /byteps/common/shared_memory.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_SHARED_MEMORY_H 17 | #define BYTEPS_SHARED_MEMORY_H 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include "logging.h" 30 | 31 | namespace byteps { 32 | namespace common { 33 | 34 | class BytePSSharedMemory { 35 | public: 36 | BytePSSharedMemory() {} 37 | 38 | ~BytePSSharedMemory() { 39 | for (auto &it : _key_shm_addr) { 40 | CUDA_CALL(cudaHostUnregister(it.second)); 41 | munmap(it.second, _key_shm_size[it.first]); 42 | shm_unlink(it.first.c_str()); 43 | } 44 | 45 | BPS_LOG(DEBUG) << "Clear shared memory: all BytePS shared memory " 46 | "released/unregistered."; 47 | } 48 | 49 | void *openSharedMemory(const std::string &prefix, uint64_t key, size_t size); 50 | std::vector openPcieSharedMemory(uint64_t key, size_t size); 51 | 52 | private: 53 | std::unordered_map _key_shm_addr; 54 | std::unordered_map _key_shm_size; 55 | 56 | std::mutex _shm_mu; 57 | }; 58 | 59 | } // namespace common 60 | } // namespace byteps 61 | 62 | #endif // BYTEPS_SHARED_MEMORY_H 63 | -------------------------------------------------------------------------------- /byteps/compressor_microbenchmark/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CFLAGS = -g -Wall -O2 3 | LDFLAGS = -fopenmp 4 | 5 | TARGET = test 6 | DEPS = common.h compressor.h 7 | OBJ = efsignSGD.o onebit.o randomk.o dgc.o 8 | 9 | %.o: %.cc $(DEPS) 10 | $(CC) $(LDFLAGS) -c -o $@ $< 11 | 12 | 13 | $(TARGET): $(OBJ) test.o 14 | $(CC) $(LDFLAGS) -o $@ $^ $(CFLAGS) 15 | 16 | all: $(TARGET) 17 | 18 | .PHONY: clean 19 | 20 | clean: 21 | rm -f *.o $(TARGET) -------------------------------------------------------------------------------- /byteps/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/misc/__init__.py -------------------------------------------------------------------------------- /byteps/mxnet/adapter.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #if HAVE_CUDA 18 | #include "cuda.h" 19 | #endif 20 | 21 | #include "adapter.h" 22 | #include "cuda_util.h" 23 | #include "tensor_util.h" 24 | 25 | namespace byteps { 26 | namespace mxnet { 27 | 28 | 29 | template 30 | MXTensor::MXTensor(T* tensor) : tensor_(tensor) {} 31 | 32 | template 33 | const DataType MXTensor::dtype() const { 34 | return TensorUtil::GetDType(tensor_); 35 | } 36 | 37 | template 38 | const TensorShape MXTensor::shape() const { 39 | auto shape = TensorUtil::GetShape(tensor_); 40 | if (shape.dims() == 0) { 41 | // Tensor with empty shape is a Tensor with no values in MXNet, unlike a 42 | // constant in TensorFlow. So, we inject a dummy zero dimension to make sure 43 | // that the number-of-elements calculation is correct. 44 | shape.AddDim(0); 45 | } 46 | return shape; 47 | } 48 | 49 | template 50 | const void* MXTensor::data() const { 51 | return TensorUtil::GetData(tensor_); 52 | } 53 | 54 | template 55 | int64_t MXTensor::size() const { 56 | return TensorUtil::GetSize(tensor_); 57 | } 58 | 59 | template class MXTensor; 60 | 61 | } // namespace mxnet 62 | } // namespace byteps 63 | -------------------------------------------------------------------------------- /byteps/mxnet/adapter.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_ADAPTER_H 18 | #define BYTEPS_MXNET_ADAPTER_H 19 | 20 | #include 21 | #include "../common/common.h" 22 | 23 | namespace byteps { 24 | namespace mxnet { 25 | 26 | using namespace byteps::common; 27 | 28 | template 29 | class MXTensor : public Tensor { 30 | public: 31 | MXTensor(T* tensor); 32 | virtual const DataType dtype() const override; 33 | virtual const TensorShape shape() const override; 34 | virtual const void* data() const override; 35 | virtual int64_t size() const override; 36 | 37 | protected: 38 | T* tensor_; 39 | }; 40 | 41 | inline void ThrowIfError(const Status& status) { 42 | if (!status.ok()) { 43 | throw dmlc::Error(status.reason()); 44 | } 45 | } 46 | 47 | } // namespace mxnet 48 | } // namespace byteps 49 | 50 | #endif // BYTEPS_MXNET_ADAPTER_H 51 | -------------------------------------------------------------------------------- /byteps/mxnet/cuda_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #if HAVE_CUDA 18 | #include 19 | #include "cuda_runtime.h" 20 | #endif 21 | 22 | #include "../common/common.h" 23 | #include "cuda_util.h" 24 | #include "util.h" 25 | 26 | namespace byteps { 27 | namespace mxnet { 28 | 29 | with_device::with_device(int device) { 30 | if (device == CPU_DEVICE_ID) { 31 | restore_device_ = CPU_DEVICE_ID; 32 | } else { 33 | #if HAVE_CUDA 34 | CUDA_CALL(cudaGetDevice(&restore_device_)); 35 | CUDA_CALL(cudaSetDevice(device)); 36 | #else 37 | throw std::logic_error( 38 | "Internal error. Requested device context manager " 39 | "with GPU device but not compiled with CUDA."); 40 | #endif 41 | } 42 | } 43 | 44 | with_device::~with_device() { 45 | #if HAVE_CUDA 46 | if (restore_device_ != CPU_DEVICE_ID) { 47 | CUDA_CALL(cudaSetDevice(restore_device_)); 48 | } 49 | #endif 50 | } 51 | 52 | } // namespace mxnet 53 | } // namespace byteps 54 | -------------------------------------------------------------------------------- /byteps/mxnet/cuda_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_CUDA_UTIL_H 18 | #define BYTEPS_MXNET_CUDA_UTIL_H 19 | 20 | namespace byteps { 21 | namespace mxnet { 22 | 23 | class with_device { 24 | public: 25 | with_device(int device); 26 | ~with_device(); 27 | 28 | private: 29 | int restore_device_; 30 | }; 31 | 32 | } // namespace mxnet 33 | } // namespace byteps 34 | 35 | #endif // BYTEPS_MXNET_CUDA_UTIL_H 36 | -------------------------------------------------------------------------------- /byteps/mxnet/ops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_OPS_H 18 | #define BYTEPS_MXNET_OPS_H 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "../common/common.h" 26 | 27 | namespace byteps { 28 | namespace mxnet { 29 | 30 | using namespace byteps::common; 31 | 32 | typedef ::mxnet::Engine Engine; 33 | typedef ::mxnet::NDArray NDArray; 34 | typedef ::mxnet::Engine::CallbackOnComplete Callback; 35 | 36 | extern "C" int byteps_mxnet_push_pull_async(NDArray* input, char* name, 37 | int version, int priority, 38 | bool is_average); 39 | 40 | extern "C" void byteps_mxnet_declare_tensor(char* name, int num_args, 41 | char** args_keys, 42 | char** args_vals); 43 | 44 | } // namespace mxnet 45 | } // namespace byteps 46 | 47 | #endif // BYTEPS_MXNET_OPS_H 48 | -------------------------------------------------------------------------------- /byteps/mxnet/ready_event.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #include 18 | 19 | #if HAVE_CUDA 20 | #include 21 | 22 | #include "ready_event.h" 23 | 24 | namespace byteps { 25 | namespace mxnet { 26 | 27 | template 28 | MXReadyEvent::MXReadyEvent(NDArray* tensor) : tensor_(tensor) { 29 | assert(tensor->ctx().real_dev_id() != CPU_DEVICE_ID); 30 | } 31 | 32 | template 33 | MXReadyEvent::~MXReadyEvent() {} 34 | 35 | template 36 | bool MXReadyEvent::Ready() const { 37 | return true; 38 | } 39 | 40 | template class MXReadyEvent; 41 | 42 | } // namespace mxnet 43 | } // namespace byteps 44 | #endif 45 | -------------------------------------------------------------------------------- /byteps/mxnet/ready_event.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 
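Aside on the two extern "C" entry points declared in byteps/mxnet/ops.h above: these are what the Python layer ultimately drives (the shipped bindings live in byteps/mxnet/ops.py and ops.cc). Purely as a hedged illustration, byteps_mxnet_declare_tensor could be exercised through ctypes roughly as below; the library name, the key/value pairs, and the wrapper itself are assumptions, not taken from the repo.

```python
import ctypes

lib = ctypes.CDLL("libbyteps.so", ctypes.RTLD_GLOBAL)  # assumed library name/path

def declare_tensor(name, params):
    """Hypothetical helper: params is a dict of string keys/values."""
    keys = (ctypes.c_char_p * len(params))(*[k.encode() for k in params])
    vals = (ctypes.c_char_p * len(params))(*[str(v).encode() for v in params.values()])
    lib.byteps_mxnet_declare_tensor(ctypes.c_char_p(name.encode()),
                                    ctypes.c_int(len(params)), keys, vals)

declare_tensor("byteps.gradient_0", {"compressor": "topk", "k": 1000})  # illustrative values
```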
6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_READY_EVENT_H 18 | #define BYTEPS_MXNET_READY_EVENT_H 19 | 20 | #include 21 | 22 | #if HAVE_CUDA 23 | #include 24 | #include 25 | #include 26 | #include "cuda_runtime.h" 27 | 28 | #include "../common/common.h" 29 | 30 | namespace byteps { 31 | namespace mxnet { 32 | 33 | using namespace byteps::common; 34 | typedef ::mxnet::NDArray NDArray; 35 | 36 | template 37 | class MXReadyEvent : public ReadyEvent { 38 | public: 39 | MXReadyEvent(NDArray* tensor); 40 | ~MXReadyEvent(); 41 | virtual bool Ready() const override; 42 | 43 | private: 44 | NDArray* tensor_; 45 | }; 46 | 47 | } // namespace mxnet 48 | } // namespace byteps 49 | #endif 50 | 51 | #endif // BYTEPS_MXNET_READY_EVENT_H 52 | -------------------------------------------------------------------------------- /byteps/mxnet/util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_MXNET_UTIL_H 18 | #define BYTEPS_MXNET_UTIL_H 19 | 20 | #if HAVE_CUDA 21 | 22 | #include 23 | 24 | /*! 25 | * \brief Protected CUDA call. 26 | * \param func Expression to call. 27 | * 28 | * It checks for CUDA errors after invocation of the expression. 29 | */ 30 | #define CUDA_CALL(func) \ 31 | { \ 32 | cudaError_t e = (func); \ 33 | CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ 34 | << "CUDA: " << cudaGetErrorString(e); \ 35 | } 36 | 37 | #endif // HAVE_CUDA 38 | 39 | #endif // BYTEPS_MXNET_UTIL_H 40 | -------------------------------------------------------------------------------- /byteps/server/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | import ctypes 17 | import os 18 | from byteps.common import get_ext_suffix 19 | 20 | 21 | def run(): 22 | dll_path = os.path.join(os.path.dirname(__file__), 23 | 'c_lib' + get_ext_suffix()) 24 | SERVER_LIB_CTYPES = ctypes.CDLL(dll_path, ctypes.RTLD_GLOBAL) 25 | SERVER_LIB_CTYPES.byteps_server() 26 | 27 | run() 28 | -------------------------------------------------------------------------------- /byteps/sparse_cpu_microbenchmark/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | CFLAGS = -g -Wall -O2 3 | LDFLAGS = -fopenmp 4 | 5 | TARGET = test 6 | 7 | %.o: %.cc $(DEPS) 8 | $(CC) $(LDFLAGS) -c -o $@ $< 9 | 10 | 11 | $(TARGET): $(OBJ) test.o 12 | $(CC) $(LDFLAGS) -o $@ $^ $(CFLAGS) 13 | 14 | all: $(TARGET) 15 | 16 | .PHONY: clean 17 | 18 | clean: 19 | rm -f *.o $(TARGET) -------------------------------------------------------------------------------- /byteps/tensorflow/distribute/__init__.py: -------------------------------------------------------------------------------- 1 | from . mirrored_strategy import MirroredStrategy 2 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/communicator/pool_allgather.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mergeComp_dl.torch import Communicator 3 | from horovod.torch import allgather, allgather_async, synchronize 4 | import time 5 | import horovod.torch as hvd 6 | 7 | 8 | class PoolAllgather(Communicator): 9 | def __init__(self, compressor, memory): 10 | super().__init__(compressor, memory) 11 | self.world_size = hvd.size() 12 | self.name = "PoolAllGather" 13 | 14 | 15 | def async_send(self, tensors_compressed, ctx): 16 | if tensors_compressed is None: 17 | return 18 | 19 | handles = [] 20 | for i, tensor_compressed in enumerate(tensors_compressed): 21 | handle = allgather_async(tensor_compressed, ctx[0] + str(i)) 22 | handles.append(handle) 23 | 24 | return handles 25 | 26 | 27 | def wait_receive(self, handles, ctx): 28 | tensors_compressed = [] 29 | for h in handles: 30 | tensor_compressed = synchronize(h) 31 | tensors_compressed.append(tensor_compressed.chunk(self.world_size)) 32 | 33 | tensors_decompressed = [] 34 | if len(tensors_compressed) == 1: 35 | for tensor in tensors_compressed[0]: 36 | tensors_decompressed.append(self.compressor.decompress([tensor], ctx)) 37 | elif len(tensors_compressed) == 2: 38 | for tensor, meta in zip(tensors_compressed[0], tensors_compressed[1]): 39 | tensors_decompressed.append(self.compressor.decompress((tensor, meta), ctx)) 40 | 41 | tensors_decompressed = self.memory.aggregate(tensors_decompressed) 42 | return tensors_decompressed 43 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/communicator/pool_allreduce.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from mergeComp_dl.torch 
import Communicator 3 | from horovod.torch import allreduce_async, synchronize 4 | from horovod.torch.mpi_ops import Average 5 | 6 | 7 | class PoolAllreduce(Communicator): 8 | def __init__(self, compressor, memory): 9 | super().__init__(compressor, memory) 10 | self.name = "PoolAllReduce" 11 | 12 | 13 | def async_send(self, tensors_compressed, ctx): 14 | # assert only one tensor in tensors_compressed for allreduce 15 | return allreduce_async(tensors_compressed[0], name=ctx[0], op=Average) 16 | 17 | 18 | def wait_receive(self, handle, ctx): 19 | output = [synchronize(handle)] 20 | return [self.compressor.decompress(output, ctx)] 21 | 22 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/communicator/pool_byteps.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from byteps.torch.ops import push_pull_async_inplace as byteps_push_pull 4 | from byteps.torch.ops import synchronize 5 | import sys 6 | sys.path.append("../..") 7 | from mergeComp import Communicator 8 | 9 | class PoolBytePS(Communicator): 10 | def __init__(self, compressor, memory): 11 | super().__init__(compressor, memory) 12 | self.name = "PoolBytePS" 13 | 14 | 15 | def async_send(self, tensors_compressed, ctx): 16 | return byteps_push_pull(tensors_compressed[0], average=False, name=ctx[0]) 17 | 18 | 19 | def wait_receive(self, handle, ctx): 20 | output = [synchronize(handle)] 21 | return [self.compressor.decompress(output, ctx)] -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolefsignsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | from mergeComp_dl.torch.util import packbits, unpackbits 5 | 6 | 7 | class PoolEFSignSGDCompressor(Compressor): 8 | def __init__(self): 9 | super().__init__() 10 | self.name = "EFSignSGD" 11 | self.quantization = True 12 | 13 | 14 | def compress(self, tensor, name, ctx, server=False): 15 | """Encoding and compressing the signs """ 16 | numel = tensor.numel() 17 | 18 | sign_encode = tensor >= 0 19 | mean = tensor.abs().mean().reshape((1,)) 20 | 21 | int8_tensor, size = packbits(sign_encode) 22 | tensor_compressed = int8_tensor, mean 23 | 24 | ctx = (name, numel) 25 | return tensor_compressed, ctx 26 | 27 | 28 | def decompress(self, tensor_compressed, ctx, server=False): 29 | """Decoding the signs to float format """ 30 | int8_tensor, mean = tensor_compressed 31 | mean = mean[0] 32 | name, numel = ctx 33 | 34 | sign_decode = unpackbits(int8_tensor, numel) 35 | sign_decode = sign_decode.type(torch.float32) * 2 - 1 36 | return mean * sign_decode 37 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolfp16.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Compressor 5 | 6 | 7 | class PoolFP16Compressor(Compressor): 8 | """Compress all floating point gradients to 16-bit.""" 9 | def __init__(self): 10 | super().__init__() 11 | self.name = "PoolFP16" 12 | self.quantization = False 13 | 14 | def compress(self, tensor, name, start=0): 15 | """Downcasts the tensor to 16-bit.""" 16 | dtype = tensor.dtype 17 | tensor_compressed = tensor 18 | if dtype.is_floating: 19 | # Only allow compression from other 
floating point types 20 | tensor_compressed = tf.cast(tensor, dtype=tf.float16) 21 | ctx = (name, dtype) 22 | return [tensor_compressed], ctx 23 | 24 | 25 | def decompress(self, tensors, ctx): 26 | """Upcasts the tensor to the initialization dtype.""" 27 | tensor_compressed = tensors[0] 28 | _, dtype = ctx 29 | tensor_decompressed = tensor_compressed 30 | #print("[decompress] before", ctx, torch.sum(tensor_compressed)) 31 | if dtype.is_floating: 32 | tensor_decompressed = tf.cast(tensor_compressed, dtype=dtype) 33 | #print("[decompress] after", ctx, torch.sum(tensor_compressed)) 34 | return tensor_decompressed 35 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolnone.py: -------------------------------------------------------------------------------- 1 | from mergeComp_dl.torch import Compressor 2 | 3 | 4 | class PoolNoneCompressor(Compressor): 5 | """Default no-op compression.""" 6 | def __init__(self): 7 | super().__init__() 8 | self.name = "PoolNone" 9 | self.quantization = False 10 | 11 | def compress(self, tensor, name, start=None, server=False): 12 | ctx = (name, tensor.numel()) 13 | return [tensor], ctx 14 | 15 | def decompress(self, tensors, ctx, server=False): 16 | return tensors[0] 17 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolonebit.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | from mergeComp_dl.torch.util import packbits, unpackbits 5 | 6 | 7 | class PoolOneBitCompressor(Compressor): 8 | def __init__(self): 9 | super().__init__() 10 | self.name = "PoolOneBit" 11 | self.quantization = False 12 | 13 | 14 | def compress(self, tensor, name, ctx, server=False): 15 | numel = tensor.numel() 16 | 17 | mask0 = tensor < 0 18 | sum0 = torch.sum(tensor[mask0]) 19 | num0 = torch.sum(mask0).float() 20 | mean0 = sum0 / num0 if num0 > 0 else sum0 21 | mean0 = mean0.reshape((1,)) 22 | 23 | mask1 = ~mask0 24 | sum1 = torch.sum(tensor[mask1]) 25 | num1 = numel - num0 26 | mean1 = sum1 / num1 if num1 > 0 else sum1 27 | mean1 = mean1.reshape((1,)) 28 | 29 | means = torch.cat((mean0, mean1)) 30 | 31 | int8_tensor, size = packbits(mask0) 32 | tensor_compressed = int8_tensor, means 33 | 34 | ctx = (name, numel) 35 | return tensor_compressed, ctx 36 | 37 | 38 | def decompress(self, tensor_compressed, ctx, server=False): 39 | int8_tensor, means = tensor_compressed 40 | mean0, mean1 = means[0], means[1] 41 | name, numel = ctx 42 | 43 | uint8_tensor = unpackbits(int8_tensor, numel) 44 | 45 | tensor_decompressed = uint8_tensor * mean0 + ~uint8_tensor * mean1 46 | return tensor_decompressed -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolqsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | 5 | 6 | class PoolQSGDCompressor(Compressor): 7 | 8 | def __init__(self, quantum_num): 9 | super().__init__() 10 | self.name = "PoolQSGD" 11 | self.quantization = True 12 | self.quantum_num = quantum_num 13 | 14 | 15 | def compress(self, tensor, name, ctx, server=False): 16 | shape = tensor.size() 17 | tensor = tensor.flatten() 18 | norm = tensor.norm().reshape((1,)) 19 | 20 | abs_gradient = tensor.abs() 21 | 22 | level_float = self.quantum_num / norm * abs_gradient 23 | 
previous_level = level_float.floor() 24 | prob = torch.empty_like(tensor).uniform_() 25 | is_next_level = (prob < (level_float - previous_level)).type(torch.float32) 26 | new_level = (previous_level + is_next_level) 27 | 28 | sign = tensor.sign() 29 | tensor_compressed = (new_level * sign).type(torch.int16) 30 | tensor_compressed = tensor_compressed.type(torch.int8 if self.quantum_num < 128 else torch.half) 31 | tensor_compressed = tensor_compressed, norm 32 | 33 | ctx = (name, shape) 34 | return tensor_compressed, ctx 35 | 36 | 37 | def decompress(self, tensor_compressed, ctx, server=False): 38 | tensor, norm = tensor_compressed 39 | norm = norm[0] 40 | decode_output = tensor.type(torch.float32) 41 | tensor_decompressed = norm / self.quantum_num * decode_output 42 | return tensor_decompressed 43 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolrandomk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | 5 | 6 | def sparsify(tensor, compress_ratio): 7 | numel = tensor.numel() 8 | k = max(1, int(numel * compress_ratio)) 9 | indices = torch.randperm(numel, device=tensor.device)[:k] 10 | values = tensor[indices] 11 | return values, indices.type(torch.int32) 12 | 13 | 14 | class PoolRandomKCompressor(Compressor): 15 | def __init__(self, compress_ratio): 16 | super().__init__() 17 | self.name = "RandomK" 18 | self.quantization = False 19 | self.compress_ratio = compress_ratio 20 | 21 | 22 | def compress(self, tensor, name, start): 23 | tensors = sparsify(tensor, self.compress_ratio) 24 | ctx = name, tensor.numel(), tensor.size() 25 | return tensors, ctx 26 | 27 | 28 | def decompress(self, tensors, ctx): 29 | name, numel, size = ctx 30 | values, indices = tensors 31 | tensor_decompressed = torch.zeros(numel, dtype=values.dtype, device=values.device) 32 | tensor_decompressed.scatter_(0, indices.type(torch.int64), values) 33 | return tensor_decompressed 34 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolsignsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | from mergeComp_dl.torch import Compressor 5 | from mergeComp_dl.torch.util import packbits, unpackbits 6 | 7 | 8 | class PoolSignSGDCompressor(Compressor): 9 | def __init__(self): 10 | super().__init__(average=False) 11 | self.name = "PoolSignSGD" 12 | self.quantization = True 13 | 14 | 15 | def compress(self, tensor, name, ctx, server=False): 16 | numel = tensor.numel() 17 | 18 | sign_encode = tensor >= 0 19 | mean = tensor.abs().mean().reshape((1,)) 20 | 21 | int8_tensor, size = packbits(sign_encode) 22 | tensor_compressed = int8_tensor, mean 23 | 24 | ctx = (name, numel) 25 | return tensor_compressed, ctx 26 | 27 | 28 | def decompress(self, tensor_compressed, ctx, server=False): 29 | """Decoding the signs to float format """ 30 | int8_tensor, mean = tensor_compressed 31 | mean = mean[0] 32 | name, numel = ctx 33 | 34 | sign_decode = unpackbits(int8_tensor, numel) 35 | sign_decode = sign_decode.type(torch.float32) * 2 - 1 36 | 37 | return mean * sign_decode -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolsignum.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | 
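Aside on the sign compressors above (PoolSignSGDCompressor, and PoolSignumCompressor which follows): they keep only the sign of each gradient entry plus its mean magnitude, and the cupy-based packbits call then stores eight signs per byte. Below is a minimal, cupy-free sketch of the same encode/decode round trip in plain PyTorch (illustration only; the bit-packing step is omitted).

```python
import torch

def sign_compress(tensor):
    # keep the sign pattern and one scale: the mean absolute value
    signs = tensor >= 0
    mean = tensor.abs().mean()
    return signs, mean

def sign_decompress(signs, mean):
    # signs back to {-1, +1}, rescaled by the stored magnitude
    return (signs.float() * 2 - 1) * mean

x = torch.randn(1 << 16)
x_hat = sign_decompress(*sign_compress(x))
# every reconstructed entry has the same magnitude: the mean |gradient|
assert x_hat.abs().max() == x_hat.abs().min()
```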
from mergeComp_dl.torch import Compressor 5 | from mergeComp_dl.torch.util import packbits, unpackbits 6 | 7 | 8 | class PoolSignumCompressor(Compressor): 9 | def __init__(self, momentum): 10 | super().__init__(average=False) 11 | self.name = "PoolSignNum" 12 | self.quantization = True 13 | self.momentum = momentum 14 | self.momentums = {} 15 | 16 | 17 | def compress(self, tensor, name, ctx, server=False): 18 | """Encoding and compressing the signs """ 19 | numel = tensor.numel() 20 | mean = tensor.abs().mean().reshape((1,)) 21 | 22 | # update tensor by momentum 23 | if name in self.momentums: 24 | tensor = (1.0 - self.momentum) * tensor + self.momentum * self.momentums[name] 25 | self.momentums[name] = tensor 26 | sign_encode = tensor >= 0 27 | 28 | int8_tensor, size = packbits(sign_encode) 29 | tensor_compressed = int8_tensor, mean 30 | 31 | ctx = (name, numel) 32 | return tensor_compressed, ctx 33 | 34 | 35 | def decompress(self, tensor_compressed, ctx, server=False): 36 | """Decoding the signs to float format """ 37 | int8_tensor, _ = tensor_compressed 38 | name, numel = ctx 39 | 40 | sign_decode = unpackbits(int8_tensor, numel) 41 | return sign_decode.type(torch.float32) * 2 - 1 42 | 43 | 44 | def aggregate(self, tensors): 45 | """Aggregate a list of tensors.""" 46 | agged_tensor = sum(tensors) 47 | agged_tensor = agged_tensor >= 0 48 | agged_tensor = agged_tensor * 2.0 - 1.0 49 | return [agged_tensor] 50 | 51 | 52 | def clean(self): 53 | self.momentums = {} -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/poolterngrad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | from mergeComp_dl.torch import Compressor 5 | from mergeComp_dl.torch.util import pack2bits, unpack2bits 6 | 7 | 8 | class PoolTernGradCompressor(Compressor): 9 | def __init__(self): 10 | super().__init__() 11 | self.name = "PoolTernGrad" 12 | self.quantization = True 13 | 14 | 15 | def compress(self, tensor, name, ctx, server=False): 16 | numel = tensor.numel() 17 | abs_gradient = tensor.abs() 18 | scalar = abs_gradient.max() 19 | sign_gradient = tensor.sign() * scalar 20 | 21 | try: 22 | rnd_sample = torch.empty_like(tensor).cuda().uniform_(0, scalar.item()) 23 | except: 24 | rnd_sample = torch.zeros_like(tensor).cuda() 25 | 26 | sign_gradient[rnd_sample >= abs_gradient] = 0 27 | 28 | mask = sign_gradient.sign() > 0 29 | tern_tensor = sign_gradient.sign() + 1 # {-1, 0, 1} + 1 30 | print(tern_tensor.sum()) 31 | 32 | int8_tensor, size = pack2bits(mask, tern_tensor) 33 | tensor_compressed = int8_tensor, scalar.flatten() 34 | 35 | ctx = (name, numel) 36 | return tensor_compressed, ctx 37 | 38 | 39 | def decompress(self, tensor_compressed, ctx, server=False): 40 | int8_tensor, scalar = tensor_compressed 41 | name, numel = ctx 42 | 43 | tern_tensor = unpack2bits(int8_tensor, numel) 44 | print(tern_tensor.sum()) 45 | 46 | sign = tern_tensor.type(torch.float32) - 1 # {0, 1, 2} - 1 47 | return sign * scalar -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/compressor/pooltopk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from mergeComp_dl.torch import Compressor 4 | 5 | 6 | def sparsify(tensor, compress_ratio): 7 | k = max(1, int(tensor.numel() * compress_ratio)) 8 | _, indices = torch.topk(tensor.abs(), k) 9 | values = tensor[indices] 10 | return values, 
indices.type(torch.int32) 11 | 12 | 13 | def desparsify(tensors, numel): 14 | values, indices = tensors 15 | tensor_decompressed = torch.zeros(numel, dtype=values.dtype, device=values.device) 16 | tensor_decompressed.scatter_(0, indices.type(torch.int64), values) 17 | return tensor_decompressed 18 | 19 | 20 | class PoolTopKCompressor(Compressor): 21 | 22 | def __init__(self, compress_ratio): 23 | super().__init__() 24 | self.name = "PoolTopK" 25 | self.quantization = False 26 | self.compress_ratio = compress_ratio 27 | 28 | 29 | def compress(self, tensor, name, start): 30 | tensors = sparsify(tensor, self.compress_ratio) 31 | ctx = (name, tensor.numel(), tensor.size()) 32 | return tensors, ctx 33 | 34 | 35 | def decompress(self, tensors, ctx): 36 | """Decompress by filling empty slots with zeros and reshape back using the original shape""" 37 | name, numel, size = ctx 38 | tensor_decompressed = desparsify(tensors, numel) 39 | return tensor_decompressed 40 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/memory/none.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import horovod.torch as hvd 3 | 4 | from .memory_layer import MemoryLayer 5 | 6 | 7 | class NoneMemory(MemoryLayer): 8 | def __init__(self, named_parameters): 9 | self.world_size = hvd.size() 10 | super().__init__(named_parameters) 11 | 12 | def compensate(self, tensor, name): 13 | """Update the tensor with the residuals.""" 14 | grad = self.get_grad(name) 15 | residual = self.get_velocity(name) 16 | residual.copy_(grad) 17 | 18 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 19 | """Update the residuals.""" 20 | pass 21 | 22 | def reduce(self, ctx, name): 23 | reduction = self.get_reduction(name) 24 | reduction.zero_() 25 | for c in ctx: 26 | reduction.add_(c/self.world_size) 27 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/memory/poolnone.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import byteps.tensorflow as bps 3 | 4 | from .memory_pool import MemoryPool 5 | 6 | 7 | class PoolNoneMemory(MemoryPool): 8 | def __init__(self, named_parameters, fusion_num=2): 9 | self.world_size = bps.size() 10 | super().__init__(named_parameters, fusion_num) 11 | 12 | def compensate(self, tensor, name): 13 | """Update the tensor with the residuals.""" 14 | grad = self.get_grad(name) 15 | residual = self.get_velocity(name) 16 | residual.assign(grad) 17 | 18 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 19 | """Update the residuals.""" 20 | pass 21 | 22 | def reduce(self, ctx, name): 23 | reduction = self.get_reduction(name) 24 | #reduction -= reduction 25 | # TODO:for compression algorithms with allreduce, the received results have been averaged already. 26 | # Probably there is no need to divide c with self.world_size. 
27 | 28 | #print(len(reduction), len(ctx)) 29 | reduction.assign(ctx) -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/memory/poolresidual.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import byteps.tensorflow as bps 3 | 4 | from .memory_pool import MemoryPool 5 | 6 | 7 | class PoolResidualMemory(MemoryPool): 8 | #TODO: tune beta and gamma to increase accurary 9 | def __init__(self, named_parameters, fusion_num=2, beta=0.9, gamma=1.0): 10 | self.beta = beta 11 | self.gamma = gamma 12 | self.world_size = hvd.size() 13 | super().__init__(named_parameters, fusion_num) 14 | 15 | 16 | def compensate(self, tensor, name): 17 | """vec stores the residuals""" 18 | grad = self.get_grad(name) 19 | residual = self.get_velocity(name) 20 | #residual.add_(grad) 21 | residual.mul_(self.beta).add_(self.gamma*grad) 22 | 23 | 24 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 25 | """Update the residuals.""" 26 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 27 | residual = self.get_velocity(name) 28 | residual.assign(tensor.view(-1) - tensor_decompressed) 29 | 30 | 31 | def reduce(self, ctx, name): 32 | reduction = self.get_reduction(name) 33 | reduction.zero_() 34 | for c in ctx: 35 | #reduction.add_(c) 36 | reduction.add_(c/self.world_size) 37 | -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/memory/residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import horovod.torch as hvd 3 | 4 | from .memory_layer import MemoryLayer 5 | 6 | 7 | class ResidualMemory(MemoryLayer): 8 | def __init__(self, named_parameters, beta=0.9, gamma=1.0): 9 | self.beta = beta 10 | self.gamma = gamma 11 | self.world_size = hvd.size() 12 | super().__init__(named_parameters) 13 | 14 | 15 | def compensate(self, tensor, name): 16 | """vec stores the residuals""" 17 | grad = self.get_grad(name) 18 | residual = self.get_velocity(name) 19 | residual.mul_(self.beta).add_(self.gamma*grad) 20 | 21 | 22 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 23 | """Update the residuals.""" 24 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 25 | residual = self.get_velocity(name) 26 | residual.assign(tensor.view(-1) - tensor_decompressed) 27 | 28 | 29 | def reduce(self, ctx, name): 30 | reduction = self.get_reduction(name) 31 | reduction.zero_() 32 | for c in ctx: 33 | reduction.add_(c/self.world_size) -------------------------------------------------------------------------------- /byteps/tensorflow/mergeComp/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.dlpack import to_dlpack 3 | from torch.utils.dlpack import from_dlpack 4 | import cupy 5 | 6 | 7 | def torch2cupy(tensor): 8 | return cupy.fromDlpack(to_dlpack(tensor)) 9 | 10 | 11 | def cupy2torch(cupy_tensor): 12 | return from_dlpack(cupy_tensor.toDlpack()) 13 | 14 | 15 | def packbits(array): 16 | return cupy2torch(cupy.packbits(torch2cupy(array))), array.numel() 17 | 18 | 19 | def unpackbits(array, size): 20 | return cupy2torch(cupy.unpackbits(torch2cupy(array))[:size]) 21 | 22 | 23 | def pack2bits(first, second): 24 | data = torch.cat((first, second.type(torch.bool)), 0) 25 | return cupy2torch(cupy.packbits(torch2cupy(data))), first.numel() 26 | 27 | 28 | def 
unpack2bits(array, size): 29 | decode = cupy2torch(cupy.unpackbits(torch2cupy(array))) 30 | first = decode[:size] 31 | second = decode[size:2*size] 32 | second[first > 0] = 2 33 | 34 | return second 35 | 36 | 37 | -------------------------------------------------------------------------------- /byteps/tensorflow/ops.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. or its affiliates. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | // ============================================================================= 15 | 16 | #ifndef BYTEPS_TENSORFLOW_OPS_H 17 | #define BYTEPS_TENSORFLOW_OPS_H 18 | 19 | #include 20 | 21 | #include "tensorflow/core/framework/op.h" 22 | #include "tensorflow/core/framework/op_kernel.h" 23 | #include "tensorflow/core/framework/shape_inference.h" 24 | 25 | #define EIGEN_USE_THREADS 26 | #include "tensorflow/stream_executor/stream.h" 27 | 28 | #include "../common/operations.h" 29 | 30 | namespace byteps { 31 | namespace tensorflow { 32 | 33 | class TFReadyEvent : public common::ReadyEvent { 34 | public: 35 | TFReadyEvent(::tensorflow::DeviceContext* device_context); 36 | bool Ready() const override; 37 | 38 | private: 39 | std::shared_ptr event_; 40 | }; 41 | 42 | class TFTensor : public common::Tensor { 43 | public: 44 | TFTensor(::tensorflow::Tensor& tensor); 45 | virtual const common::DataType dtype() const override; 46 | virtual const common::TensorShape shape() const override; 47 | virtual const void* data() const override; 48 | virtual int64_t size() const override; 49 | 50 | protected: 51 | ::tensorflow::Tensor tensor_; 52 | }; 53 | 54 | extern "C" void byteps_tensorflow_declare_tensor(char* name); 55 | 56 | } // namespace tensorflow 57 | } // namespace byteps 58 | 59 | #endif // BYTEPS_TENSORFLOW_OPS_H 60 | -------------------------------------------------------------------------------- /byteps/tensorflow/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
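Aside on the packbits/unpackbits/pack2bits helpers in byteps/tensorflow/mergeComp/util.py above: they move data through DLPack into cupy, so they expect GPU tensors and an installed cupy. A usage sketch follows; the import path is an assumption (other files in this repo import them as mergeComp_dl.torch.util).

```python
import torch
from mergeComp.util import packbits, unpackbits  # import path is an assumption

grad = torch.randn(1 << 20, device="cuda")   # helpers go through cupy, so GPU tensors only
mask = grad >= 0                             # one boolean per element
packed, numel = packbits(mask)               # int8 tensor, eight signs per byte
recovered = unpackbits(packed, numel)        # back to one 0/1 value per element
assert bool((recovered.bool() == mask).all())
```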
14 | # ============================================================================== 15 | from distutils.version import LooseVersion 16 | 17 | import tensorflow as tf 18 | 19 | 20 | if LooseVersion(tf.__version__) >= LooseVersion("1.9.0"): 21 | from tensorflow.python.eager import context 22 | _has_eager = True 23 | else: 24 | _has_eager = False 25 | 26 | 27 | def _executing_eagerly(): 28 | """Returns true if eager execution is supported and enabled.""" 29 | return _has_eager and context.in_eager_mode() 30 | -------------------------------------------------------------------------------- /byteps/torch/adapter.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_ADAPTER_H 18 | #define BYTEPS_TORCH_ADAPTER_H 19 | 20 | #include 21 | #include 22 | 23 | #include "../common/common.h" 24 | 25 | namespace byteps { 26 | namespace torch { 27 | 28 | using namespace byteps::common; 29 | 30 | class TorchTensor : public Tensor { 31 | public: 32 | TorchTensor(::torch::Tensor tensor); 33 | virtual const DataType dtype() const override; 34 | virtual const TensorShape shape() const override; 35 | virtual const void* data() const override; 36 | virtual int64_t size() const override; 37 | 38 | protected: 39 | ::torch::Tensor tensor_; 40 | }; 41 | 42 | void ThrowIfError(Status status); 43 | 44 | } // namespace torch 45 | } // namespace byteps 46 | 47 | #endif // BYTEPS_TORCH_ADAPTER_H 48 | -------------------------------------------------------------------------------- /byteps/torch/cuda_util.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
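Aside on TorchTensor in byteps/torch/adapter.h above: it exposes a gradient to the BytePS core as a dtype, a shape, a data pointer, and a byte count. The same four pieces of information are visible from Python; the mapping below is an illustration, not taken from the implementation.

```python
import torch

t = torch.randn(2, 3)

dtype = t.dtype                          # roughly what dtype() reports as a common::DataType
shape = tuple(t.shape)                   # roughly shape() as a common::TensorShape
data_ptr = t.data_ptr()                  # roughly data()
nbytes = t.numel() * t.element_size()    # roughly size(), presumably in bytes
```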
14 | // ============================================================================= 15 | 16 | #if HAVE_CUDA 17 | #include 18 | #include "cuda_runtime.h" 19 | #endif 20 | 21 | #include "../common/common.h" 22 | #include "cuda_util.h" 23 | 24 | namespace byteps { 25 | namespace torch { 26 | 27 | with_device::with_device(int device) { 28 | if (device == CPU_DEVICE_ID) { 29 | restore_device_ = CPU_DEVICE_ID; 30 | } else { 31 | #if HAVE_CUDA 32 | THCudaCheck(cudaGetDevice(&restore_device_)); 33 | THCudaCheck(cudaSetDevice(device)); 34 | #else 35 | throw std::logic_error( 36 | "Internal error. Requested device context manager " 37 | "with GPU device but not compiled with CUDA."); 38 | #endif 39 | } 40 | } 41 | 42 | with_device::~with_device() { 43 | #if HAVE_CUDA 44 | if (restore_device_ != CPU_DEVICE_ID) { 45 | THCudaCheck(cudaSetDevice(restore_device_)); 46 | } 47 | #endif 48 | } 49 | 50 | } // namespace torch 51 | } // namespace byteps 52 | -------------------------------------------------------------------------------- /byteps/torch/cuda_util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 ByteDance, Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_CUDA_UTIL_H 18 | #define BYTEPS_TORCH_CUDA_UTIL_H 19 | 20 | #include "../common/common.h" 21 | 22 | namespace byteps { 23 | namespace torch { 24 | 25 | class with_device { 26 | public: 27 | with_device(int device); 28 | ~with_device(); 29 | 30 | private: 31 | int restore_device_ = CPU_DEVICE_ID; 32 | }; 33 | 34 | } // namespace torch 35 | } // namespace byteps 36 | 37 | #endif // BYTEPS_TORCH_CUDA_UTIL_H 38 | -------------------------------------------------------------------------------- /byteps/torch/examples/BERT/dataset/checkpoint/bert_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 768, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 3072, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 12, 10 | "num_hidden_layers": 12, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } -------------------------------------------------------------------------------- /byteps/torch/examples/BERT/dataset/checkpoint/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attention_probs_dropout_prob": 0.1, 3 | "hidden_act": "gelu", 4 | "hidden_dropout_prob": 0.1, 5 | "hidden_size": 1024, 6 | "initializer_range": 0.02, 7 | "intermediate_size": 4096, 8 | "max_position_embeddings": 512, 9 | "num_attention_heads": 16, 10 | "num_hidden_layers": 24, 11 | "type_vocab_size": 2, 12 | "vocab_size": 30522 13 | } -------------------------------------------------------------------------------- /byteps/torch/examples/BERT/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | import byteps.torch as bps 15 | 16 | def get_rank(): 17 | return bps.rank() 18 | 19 | def is_main_process(): 20 | return get_rank() == 0 21 | 22 | def format_step(step): 23 | if isinstance(step, str): 24 | return step 25 | s = "" 26 | if len(step) > 0: 27 | s += "Training Epoch: {} ".format(step[0]) 28 | if len(step) > 1: 29 | s += "Training Iteration: {} ".format(step[1]) 30 | if len(step) > 2: 31 | s += "Validation Iteration: {} ".format(step[2]) 32 | return s -------------------------------------------------------------------------------- /byteps/torch/examples/README.md: -------------------------------------------------------------------------------- 1 | # End-to-end training throughput 2 | 3 | ## How to run 4 | 5 | `run_nvlink_models.sh` and `run_pcie_models.sh` are two scripts to run the experiments for E1 (NVLink-based experiments) and E2 (PCIe-only experiments) 6 | 7 | Set ifname and DMLC_PS_ROOT_URI in the two scripts. 
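The two scripts read ifname and DMLC_PS_ROOT_URI (defined just below), and the GPT-2 README later in this tree additionally exports DMLC_NUM_WORKER and DMLC_WORKER_ID. The scripts set these from the shell; purely as a hedged illustration, the same variables can also be set from Python before BytePS is imported. The concrete values and the extra port variable below are assumptions, and any other DMLC_* variables the scripts export are omitted.

```python
import os

os.environ.setdefault("DMLC_PS_ROOT_URI", "10.0.0.1")  # IP of the root machine (ID 0)
os.environ.setdefault("DMLC_PS_ROOT_PORT", "1234")     # assumption: a port is also required
os.environ.setdefault("DMLC_NUM_WORKER", "2")          # number of GPU machines (WORKERS)
os.environ.setdefault("DMLC_WORKER_ID", "0")           # this machine's ID, 0..WORKERS-1

import byteps.torch as bps
bps.init()
print("rank", bps.rank(), "of", bps.size())
```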
8 | 9 | ifname: the network interface card name, e.g., eth0, eth2 10 | 11 | DMLC_PS_ROOT_URI: the IP address of the root machine. Any machine involved in training can serve as the root machine and its ID is 0. 12 | 13 | For E1, run the command on each NVLink-based GPU machines 14 | ```bash 15 | bash run_nvlink_models.sh WORKERS ID 16 | ``` 17 | 18 | For E2, run the command on each PCIe-only GPU machines 19 | ```bash 20 | bash run_pcie_models.sh WORKERS ID 21 | ``` 22 | 23 | WORKERS: the number of GPU machines in the training 24 | 25 | ID: the id of a machine. machines have distinct IDs that start from 0 to WORKERS-1 26 | 27 | ## Results 28 | 29 | The results of model training throughput with different systems are logged in model_log. 30 | The metrics are `images/sec` or `tokens/sec`. 31 | Check the logs after the completion of training. 32 | 33 | 34 | ## For trace of time gaps 35 | 36 | Add the following environment variables in Shell scripts 37 | 38 | ```bash 39 | export BYTEPS_TRACE_ON=1 40 | export BYTEPS_TRACE_START_STEP=10 41 | export BYTEPS_TRACE_END_STEP=20 42 | export BYTEPS_TRACE_DIR=trace 43 | ``` 44 | 45 | Make sure there is a folder named `trace/0/` and then run the training without applying compression algorithms. Extract the time gaps with `byteps/torch/examples/json_parser.py` and remember to change the input file in json_parse.py. 46 | -------------------------------------------------------------------------------- /byteps/torch/examples/extract.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import numpy as np 4 | 5 | 6 | folder = sys.argv[1] 7 | keyword = "Training speed" 8 | 9 | files = os.listdir(folder) 10 | 11 | for filename in files: 12 | path = os.path.join(folder, filename) 13 | print(filename) 14 | speeds = [] 15 | with open(path, 'r') as fp: 16 | lines = fp.readlines() 17 | for line in lines: 18 | if keyword in line: 19 | print(line.strip('\n')) 20 | speed = line.split()[-2] 21 | speeds.append(float(speed)) 22 | 23 | speeds = np.array(speeds) 24 | print("avg: {:.3f}\t std: {:.3f}".format(np.mean(speeds), np.std(speeds))) 25 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/README.md: -------------------------------------------------------------------------------- 1 | # GPT-2 2 | 3 | You can run the model step by step as follows. 4 | The dataset and the dependencies are all set after "bash install.sh". 5 | Go to "How to run" directly. 6 | 7 | ## Download the dataset 8 | ```bash 9 | mkdir ~/data 10 | cd ~/data 11 | wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip 12 | unzip wikitext-2-raw-v1.zip 13 | ``` 14 | **Note**: the default location of the dataset is ~/data, and the dataset for GPT2 is in ~/data/wikitext-2-raw 15 | 16 | 17 | ## Install dependencies 18 | ```bash 19 | bash run_prepare.sh 20 | ``` 21 | 22 | ## How to run 23 | **Note**: Make sure the dataset is in the right location and it runs on NVLink-based GPU machines. 24 | Set ifname in run_espresso.sh and run_baseline.sh. 25 | 26 | ifname: the network interface card name, e.g., eth0, eth2 27 | 28 | ```bash 29 | export DMLC_PS_ROOT_URI="ip" 30 | export DMLC_NUM_WORKER=WORKERS 31 | export DMLC_WORKER_ID=WORKER_ID 32 | ``` 33 | 34 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 35 | 36 | WORKERS: the number of GPU machines in the training 37 | 38 | ID: the id of a machine. 
machines have distinct IDs that start from 0 39 | 40 | 41 | ### Espresso 42 | Run on each machine 43 | ```bash 44 | bash run_espresso.sh 45 | ``` 46 | 47 | ### Baselines 48 | Run on each machine 49 | ```bash 50 | bash run_baseline.sh 51 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.circleci/deploy.sh: -------------------------------------------------------------------------------- 1 | cd docs 2 | 3 | function deploy_doc(){ 4 | echo "Creating doc at commit $1 and pushing to folder $2" 5 | git checkout $1 6 | if [ ! -z "$2" ] 7 | then 8 | if [ -d "$dir/$2" ]; then 9 | echo "Directory" $2 "already exists" 10 | else 11 | echo "Pushing version" $2 12 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2 13 | fi 14 | else 15 | echo "Pushing master" 16 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 17 | fi 18 | } 19 | 20 | deploy_doc "master" 21 | deploy_doc "b33a385" v1.0.0 22 | deploy_doc "fe02e45" v1.1.0 23 | deploy_doc "89fd345" v1.2.0 24 | deploy_doc "fc9faa8" v2.0.0 25 | deploy_doc "3ddce1d" v2.1.1 26 | deploy_doc "3616209" v2.2.0 27 | deploy_doc "d0f8b9a" v2.3.0 28 | deploy_doc "6664ea9" v2.4.0 -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source=transformers 3 | omit = 4 | # skip convertion scripts from testing for now 5 | */convert_* 6 | */__main__.py 7 | [report] 8 | exclude_lines = 9 | pragma: no cover 10 | raise 11 | except 12 | register_parameter -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/---new-benchmark.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F5A5 New benchmark" 3 | about: Benchmark a part of this library and share your results 4 | title: "[Benchmark]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🖥 Benchmarking `transformers` 11 | 12 | ## Benchmark 13 | 14 | Which part of `transformers` did you benchmark? 15 | 16 | ## Set-up 17 | 18 | What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use? 19 | 20 | ## Results 21 | 22 | Put your results here! 
23 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/--new-model-addition.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F31F New model addition" 3 | about: Submit a proposal/request to implement a new Transformer-based model 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🌟 New model addition 11 | 12 | ## Model description 13 | 14 | 15 | 16 | ## Open source status 17 | 18 | * [ ] the model implementation is available: (give details) 19 | * [ ] the model weights are available: (give details) 20 | * [ ] who are the authors: (mention them, if possible by @gh-username) 21 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41B Bug Report" 3 | about: Submit a bug report to help us improve transformers 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🐛 Bug 11 | 12 | ## Information 13 | 14 | Model I am using (Bert, XLNet ...): 15 | 16 | Language I am using the model on (English, Chinese ...): 17 | 18 | The problem arises when using: 19 | * [ ] the official example scripts: (give details below) 20 | * [ ] my own modified scripts: (give details below) 21 | 22 | The tasks I am working on is: 23 | * [ ] an official GLUE/SQUaD task: (give the name) 24 | * [ ] my own task or dataset: (give details below) 25 | 26 | ## To reproduce 27 | 28 | Steps to reproduce the behavior: 29 | 30 | 1. 31 | 2. 32 | 3. 33 | 34 | 37 | 38 | ## Expected behavior 39 | 40 | 41 | 42 | ## Environment 43 | 44 | * OS: 45 | * Python version: 46 | * PyTorch version: 47 | * `transformers` version (or branch): 48 | * Using GPU ? 49 | * Distributed or parallel setup ? 
50 | * Any other relevant information: 51 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680 Feature request" 3 | about: Submit a proposal/request for a new transformers feature 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 🚀 Feature request 11 | 12 | 14 | 15 | ## Motivation 16 | 17 | 20 | 21 | ## Your contribution 22 | 23 | 26 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/migration.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers" 3 | about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # 📚 Migration 11 | 12 | ## Information 13 | 14 | 15 | 16 | Model I am using (Bert, XLNet ...): 17 | 18 | Language I am using the model on (English, Chinese ...): 19 | 20 | The problem arises when using: 21 | * [ ] the official example scripts: (give details below) 22 | * [ ] my own modified scripts: (give details below) 23 | 24 | The tasks I am working on is: 25 | * [ ] an official GLUE/SQUaD task: (give the name) 26 | * [ ] my own task or dataset: (give details below) 27 | 28 | ## Details 29 | 30 | 35 | 36 | ## Environment 37 | 38 | * OS: 39 | * Python version: 40 | * PyTorch version: 41 | * `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch): 42 | * `transformers` version (or branch): 43 | * Using GPU? 44 | * Distributed or parallel setup? 45 | * Any other relevant information: 46 | 47 | ## Checklist 48 | 49 | - [ ] I have read the migration guide in the readme. 50 | ([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers); 51 | [pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers)) 52 | - [ ] I checked if a related official extension example runs on my machine. 53 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/ISSUE_TEMPLATE/question-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓ Questions & Help" 3 | about: Post your general questions on Stack Overflow tagged huggingface-transformers 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | # ❓ Questions & Help 11 | 12 | 23 | 24 | ## Details 25 | 26 | 27 | 29 | **A link to original question on Stack Overflow**: -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: wontfix 11 | # Comment to post when marking an issue as stale. 
Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style test test-examples 2 | 3 | # Check that source code meets quality standards 4 | 5 | quality: 6 | black --check --line-length 119 --target-version py35 examples templates tests src utils 7 | isort --check-only --recursive examples templates tests src utils 8 | flake8 examples templates tests src utils 9 | 10 | # Format source code automatically 11 | 12 | style: 13 | black --line-length 119 --target-version py35 examples templates tests src utils 14 | isort --recursive examples templates tests src utils 15 | 16 | # Run tests for the library 17 | 18 | test: 19 | python -m pytest -n auto --dist=loadfile -s -v ./tests/ 20 | 21 | # Run tests for examples 22 | 23 | test-examples: 24 | python -m pytest -n auto --dist=loadfile -s -v ./examples/ 25 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/deploy_multi_version_doc.sh: -------------------------------------------------------------------------------- 1 | cd docs 2 | 3 | function deploy_doc(){ 4 | echo "Creating doc at commit $1 and pushing to folder $2" 5 | git checkout $1 6 | if [ ! -z "$2" ] 7 | then 8 | echo "Pushing version" $2 9 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2 10 | else 11 | echo "Pushing master" 12 | make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir 13 | fi 14 | } 15 | 16 | deploy_doc "master" 17 | deploy_doc "b33a385" v1.0.0 18 | deploy_doc "fe02e45" v1.1.0 19 | deploy_doc "89fd345" v1.2.0 20 | deploy_doc "fc9faa8" v2.0.0 21 | deploy_doc "3ddce1d" v2.1.1 22 | deploy_doc "f2f3294" v2.2.0 23 | deploy_doc "d0f8b9a" v2.3.0 24 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:latest 2 | 3 | RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext 4 | 5 | RUN pip install transformers 6 | 7 | WORKDIR /workspace -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 
11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Light.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Light.ttf -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Medium.otf -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Regular.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Regular.otf -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Thin.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/Calibre-Thin.otf -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/_static/css/code-snippets.css: -------------------------------------------------------------------------------- 1 | 2 | .highlight .c1, .highlight .sd{ 3 | color: #999 4 | } 5 | 6 | .highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc { 7 | color: #FB8D68; 8 | } 9 | 10 | .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow { 11 | color: #6670FF; 12 | } -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/bertology.rst: -------------------------------------------------------------------------------- 1 | BERTology 2 | --------- 3 | 4 | There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are: 5 | 6 | 7 | * BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950 8 | * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 9 | * What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. 
Manning: https://arxiv.org/abs/1906.04341 10 | 11 | In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650): 12 | 13 | 14 | * accessing all the hidden-states of BERT/GPT/GPT-2, 15 | * accessing all the attention weights for each head of BERT/GPT/GPT-2, 16 | * retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650. 17 | 18 | To help you understand and use these features, we have added a specific example script: `bertology.py `_ while extract information and prune a model pre-trained on GLUE. 19 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/transformers_logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/transformers_logo_name.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_constant_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_constant_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_linear_schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/docs/source/imgs/warmup_linear_schedule.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/main_classes/configuration.rst: 
-------------------------------------------------------------------------------- 1 | Configuration 2 | ---------------------------------------------------- 3 | 4 | The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PretrainedConfig`` 7 | ~~~~~~~~~~~~~~~~~~~~~ 8 | 9 | .. autoclass:: transformers.PretrainedConfig 10 | :members: 11 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/main_classes/model.rst: -------------------------------------------------------------------------------- 1 | Models 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PreTrainedModel`` also implements a few methods which are common among all the models to: 7 | 8 | - resize the input token embeddings when new tokens are added to the vocabulary 9 | - prune the attention heads of the model. 10 | 11 | ``PreTrainedModel`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.PreTrainedModel 15 | :members: 16 | 17 | ``TFPreTrainedModel`` 18 | ~~~~~~~~~~~~~~~~~~~~~ 19 | 20 | .. autoclass:: transformers.TFPreTrainedModel 21 | :members: 22 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/main_classes/optimizer_schedules.rst: -------------------------------------------------------------------------------- 1 | Optimizer 2 | ---------------------------------------------------- 3 | 4 | The ``.optimization`` module provides: 5 | 6 | - an optimizer with weight decay fixed that can be used to fine-tuned models, and 7 | - several schedules in the form of schedule objects that inherit from ``_LRSchedule``: 8 | - a gradient accumulation class to accumulate the gradients of multiple batches 9 | 10 | ``AdamW`` 11 | ~~~~~~~~~~~~~~~~ 12 | 13 | .. autoclass:: transformers.AdamW 14 | :members: 15 | 16 | ``AdamWeightDecay`` 17 | ~~~~~~~~~~~~~~~~~~~ 18 | 19 | .. autoclass:: transformers.AdamWeightDecay 20 | :members: 21 | 22 | .. autofunction:: transformers.create_optimizer 23 | 24 | Schedules 25 | ---------------------------------------------------- 26 | 27 | Learning Rate Schedules 28 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29 | .. autofunction:: transformers.get_constant_schedule 30 | 31 | 32 | .. autofunction:: transformers.get_constant_schedule_with_warmup 33 | 34 | .. image:: /imgs/warmup_constant_schedule.png 35 | :target: /imgs/warmup_constant_schedule.png 36 | :alt: 37 | 38 | 39 | .. autofunction:: transformers.get_cosine_schedule_with_warmup 40 | 41 | .. image:: /imgs/warmup_cosine_schedule.png 42 | :target: /imgs/warmup_cosine_schedule.png 43 | :alt: 44 | 45 | 46 | .. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup 47 | 48 | .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png 49 | :target: /imgs/warmup_cosine_hard_restarts_schedule.png 50 | :alt: 51 | 52 | 53 | 54 | .. autofunction:: transformers.get_linear_schedule_with_warmup 55 | 56 | .. 
image:: /imgs/warmup_linear_schedule.png 57 | :target: /imgs/warmup_linear_schedule.png 58 | :alt: 59 | 60 | ``Warmup`` 61 | ~~~~~~~~~~~~~~~~ 62 | 63 | .. autoclass:: transformers.WarmUp 64 | :members: 65 | 66 | Gradient Strategies 67 | ---------------------------------------------------- 68 | 69 | ``GradientAccumulator`` 70 | ~~~~~~~~~~~~~~~~~~~~~~~ 71 | 72 | .. autoclass:: transformers.GradientAccumulator 73 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/main_classes/tokenizer.rst: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | ---------------------------------------------------- 3 | 4 | The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository). 5 | 6 | ``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers: 7 | 8 | - tokenizing, converting tokens to ids and back and encoding/decoding, 9 | - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...), 10 | - managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization) 11 | 12 | ``PreTrainedTokenizer`` 13 | ~~~~~~~~~~~~~~~~~~~~~~~~ 14 | 15 | .. autoclass:: transformers.PreTrainedTokenizer 16 | :members: 17 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/model_doc/auto.rst: -------------------------------------------------------------------------------- 1 | AutoModels 2 | ----------- 3 | 4 | In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method. 5 | 6 | AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary: 7 | 8 | Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``). 9 | 10 | 11 | ``AutoConfig`` 12 | ~~~~~~~~~~~~~~~~~~~~~ 13 | 14 | .. autoclass:: transformers.AutoConfig 15 | :members: 16 | 17 | 18 | ``AutoTokenizer`` 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | .. autoclass:: transformers.AutoTokenizer 22 | :members: 23 | 24 | 25 | ``AutoModel`` 26 | ~~~~~~~~~~~~~~~~~~~~~ 27 | 28 | .. autoclass:: transformers.AutoModel 29 | :members: 30 | 31 | 32 | ``AutoModelForPreTraining`` 33 | ~~~~~~~~~~~~~~~~~~~~~ 34 | 35 | .. autoclass:: transformers.AutoModelForPreTraining 36 | :members: 37 | 38 | 39 | ``AutoModelWithLMHead`` 40 | ~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | .. autoclass:: transformers.AutoModelWithLMHead 43 | :members: 44 | 45 | 46 | ``AutoModelForSequenceClassification`` 47 | ~~~~~~~~~~~~~~~~~~~~~ 48 | 49 | .. autoclass:: transformers.AutoModelForSequenceClassification 50 | :members: 51 | 52 | 53 | ``AutoModelForQuestionAnswering`` 54 | ~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | .. autoclass:: transformers.AutoModelForQuestionAnswering 57 | :members: 58 | 59 | 60 | ``AutoModelForTokenClassification`` 61 | ~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | .. 
autoclass:: transformers.AutoModelForTokenClassification 64 | :members: 65 | 66 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/model_sharing.md: -------------------------------------------------------------------------------- 1 | # Model upload and sharing 2 | 3 | Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the CLI that's built-in to the library. 4 | 5 | **First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then: 6 | 7 | ```shell 8 | transformers-cli login 9 | # log in using the same credentials as on huggingface.co 10 | ``` 11 | Upload your model: 12 | ```shell 13 | transformers-cli upload ./path/to/pretrained_model/ 14 | 15 | # ^^ Upload folder containing weights/tokenizer/config 16 | # saved via `.save_pretrained()` 17 | 18 | transformers-cli upload ./config.json [--filename folder/foobar.json] 19 | 20 | # ^^ Upload a single file 21 | # (you can optionally override its filename, which can be nested inside a folder) 22 | ``` 23 | 24 | Your model will then be accessible through its identifier, a concatenation of your username and the folder name above: 25 | ```python 26 | "username/pretrained_model" 27 | ``` 28 | 29 | Anyone can load it from code: 30 | ```python 31 | tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model") 32 | model = AutoModel.from_pretrained("username/pretrained_model") 33 | ``` 34 | 35 | Finally, list all your files on S3: 36 | ```shell 37 | transformers-cli s3 ls 38 | # List all your S3 objects. 39 | ``` 40 | 41 | You can also delete files: 42 | 43 | ```shell 44 | transformers-cli s3 rm … 45 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/docs/source/notebooks.rst: -------------------------------------------------------------------------------- 1 | Notebooks 2 | ================================================ 3 | 4 | We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. 5 | 6 | 7 | * 8 | The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models. 9 | 10 | * 11 | The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models. 12 | 13 | * 14 | The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model. 15 | 16 | Please follow the instructions given in the notebooks to run and modify them. 
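A minimal sketch of the hidden-state and attention access that these notebooks and the BERTology features above build on (not a file from the repository; it assumes a `transformers` version whose `from_pretrained` accepts the `output_hidden_states`/`output_attentions` flags and that the `bert-base-uncased` weights can be downloaded):

```python
import torch
from transformers import BertModel, BertTokenizer

# Hypothetical example: inspect per-layer hidden states and attention weights.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained(
    "bert-base-uncased", output_hidden_states=True, output_attentions=True
)
model.eval()

input_ids = torch.tensor([tokenizer.encode("Who was Jim Henson ?", add_special_tokens=True)])
with torch.no_grad():
    outputs = model(input_ids)

# With both flags enabled, the output tuple ends with the hidden states and attentions.
hidden_states, attentions = outputs[-2], outputs[-1]
for layer, (h, a) in enumerate(zip(hidden_states[1:], attentions)):
    print(layer, h.shape, a.shape)  # (batch, seq, hidden) and (batch, heads, seq, seq)
```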
17 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/contrib/README.md: -------------------------------------------------------------------------------- 1 | # Community contributed examples 2 | 3 | This folder contains examples which are not actively maintained (mostly contributed by the community). 4 | 5 | Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working. 6 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/contrib/run_camembert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from transformers.modeling_camembert import CamembertForMaskedLM 4 | from transformers.tokenization_camembert import CamembertTokenizer 5 | 6 | 7 | def fill_mask(masked_input, model, tokenizer, topk=5): 8 | # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py 9 | assert masked_input.count("<mask>") == 1  # the input must contain exactly one <mask> token 10 | input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1 11 | logits = model(input_ids)[0] # The last hidden-state is the first element of the output tuple 12 | masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item() 13 | logits = logits[0, masked_index, :] 14 | prob = logits.softmax(dim=0) 15 | values, indices = prob.topk(k=topk, dim=0) 16 | topk_predicted_token_bpe = " ".join( 17 | [tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))] 18 | ) 19 | masked_token = tokenizer.mask_token 20 | topk_filled_outputs = [] 21 | for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")): 22 | predicted_token = predicted_token_bpe.replace("\u2581", " ") 23 | if " {0}".format(masked_token) in masked_input: 24 | topk_filled_outputs.append( 25 | ( 26 | masked_input.replace(" {0}".format(masked_token), predicted_token), 27 | values[index].item(), 28 | predicted_token, 29 | ) 30 | ) 31 | else: 32 | topk_filled_outputs.append( 33 | (masked_input.replace(masked_token, predicted_token), values[index].item(), predicted_token,) 34 | ) 35 | return topk_filled_outputs 36 | 37 | 38 | tokenizer = CamembertTokenizer.from_pretrained("camembert-base") 39 | model = CamembertForMaskedLM.from_pretrained("camembert-base") 40 | model.eval() 41 | 42 | masked_input = "Le camembert est <mask> :)" 43 | print(fill_mask(masked_input, model, tokenizer, topk=3)) 44 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | gitpython==3.0.2 4 | tensorboard>=1.14.0 5 | tensorboardX==1.8 6 | psutil==5.6.3 7 | scipy==1.3.1 8 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/training_configs/distilbert-base-multilingual-cased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true,
13 | "vocab_size": 119547 14 | } 15 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/training_configs/distilbert-base-uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "activation": "gelu", 3 | "attention_dropout": 0.1, 4 | "dim": 768, 5 | "dropout": 0.1, 6 | "hidden_dim": 3072, 7 | "initializer_range": 0.02, 8 | "max_position_embeddings": 512, 9 | "n_heads": 12, 10 | "n_layers": 6, 11 | "sinusoidal_pos_embds": true, 12 | "tie_weights_": true, 13 | "vocab_size": 30522 14 | } 15 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/training_configs/distilgpt2.json: -------------------------------------------------------------------------------- 1 | { 2 | "initializer_range": 0.02, 3 | "layer_norm_epsilon": 0.00001, 4 | "n_ctx": 1024, 5 | "n_embd": 768, 6 | "n_head": 12, 7 | "n_layer": 6, 8 | "n_positions": 1024, 9 | "vocab_size": 50257 10 | } -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/distillation/training_configs/distilroberta-base.json: -------------------------------------------------------------------------------- 1 | { 2 | "vocab_size": 50265, 3 | "hidden_size": 768, 4 | "num_hidden_layers": 6, 5 | "num_attention_heads": 12, 6 | "intermediate_size": 3072, 7 | "hidden_act": "gelu", 8 | "hidden_dropout_prob": 0.1, 9 | "attention_probs_dropout_prob": 0.1, 10 | "max_position_embeddings": 514, 11 | "type_vocab_size": 1, 12 | "initializer_range": 0.02, 13 | "layer_norm_eps": 0.00001 14 | } -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/pplm/imgs/headfigure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/examples/pplm/imgs/headfigure.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/pplm/imgs/wooly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/examples/pplm/imgs/wooly.png -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/pplm/pplm_classification_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class ClassificationHead(torch.nn.Module): 5 | """Classification Head for transformer encoders""" 6 | 7 | def __init__(self, class_size, embed_size): 8 | super().__init__() 9 | self.class_size = class_size 10 | self.embed_size = embed_size 11 | # self.mlp1 = torch.nn.Linear(embed_size, embed_size) 12 | # self.mlp2 = (torch.nn.Linear(embed_size, class_size)) 13 | self.mlp = torch.nn.Linear(embed_size, class_size) 14 | 15 | def forward(self, hidden_state): 16 | # hidden_state = F.relu(self.mlp1(hidden_state)) 17 | # hidden_state = self.mlp2(hidden_state) 18 | logits = self.mlp(hidden_state) 19 | return logits 20 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/requirements.txt: 
-------------------------------------------------------------------------------- 1 | tensorboardX 2 | tensorboard 3 | scikit-learn 4 | seqeval 5 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/summarization/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | 3 | # For ROUGE 4 | nltk 5 | py-rouge 6 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/tests_samples/.gitignore: -------------------------------------------------------------------------------- 1 | *.* 2 | cache* 3 | temp* 4 | !*.tsv 5 | !*.json 6 | !.gitignore -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/tests_samples/MRPC/dev.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 8 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/examples/tests_samples/MRPC/train.tsv: -------------------------------------------------------------------------------- 1 | Quality #1 ID #2 ID #1 String #2 String 2 | 1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy . 3 | 0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war . 4 | 0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent . 5 | 1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . 
The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries . 6 | 0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty . 7 | 1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status . 8 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | ensure_newline_before_comments = True 3 | force_grid_wrap = 0 4 | include_trailing_comma = True 5 | known_first_party = transformers 6 | known_third_party = 7 | absl 8 | fairseq 9 | fastprogress 10 | git 11 | h5py 12 | MeCab 13 | nltk 14 | numpy 15 | packaging 16 | PIL 17 | psutil 18 | seqeval 19 | sklearn 20 | tensorboardX 21 | tensorflow 22 | tensorflow_datasets 23 | torch 24 | torchtext 25 | torchvision 26 | 27 | line_length = 119 28 | lines_after_imports = 2 29 | multi_line_output = 3 30 | use_parentheses = True 31 | 32 | [flake8] 33 | ignore = E203, E501, W503 34 | max-line-length = 119 35 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/commands/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser 3 | 4 | 5 | class BaseTransformersCLICommand(ABC): 6 | @staticmethod 7 | @abstractmethod 8 | def register_subcommand(parser: ArgumentParser): 9 | raise NotImplementedError() 10 | 11 | @abstractmethod 12 | def run(self): 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/commands/download.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from transformers.commands import BaseTransformersCLICommand 4 | 5 | 6 | def download_command_factory(args): 7 | return DownloadCommand(args.model, args.cache_dir, args.force) 8 | 9 | 10 | class DownloadCommand(BaseTransformersCLICommand): 11 | @staticmethod 12 | def register_subcommand(parser: ArgumentParser): 13 | download_parser = parser.add_parser("download") 14 | download_parser.add_argument( 15 | "--cache-dir", type=str, default=None, help="Path to location to store the models" 16 | ) 17 | download_parser.add_argument( 18 | "--force", action="store_true", help="Force the model to be download even if already in cache-dir" 19 | ) 20 | download_parser.add_argument("model", type=str, help="Name of the model to download") 21 | download_parser.set_defaults(func=download_command_factory) 22 | 23 | def __init__(self, model: str, cache: str, force: bool): 24 | self._model = model 25 | self._cache = cache 26 | self._force = force 27 | 28 | def run(self): 29 | from transformers import AutoModel, AutoTokenizer 30 | 31 | AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 32 | AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 33 | -------------------------------------------------------------------------------- 
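For reference, the `DownloadCommand` defined just above boils down to two `from_pretrained` calls. A hedged Python sketch of what `transformers-cli download --cache-dir /tmp/models bert-base-uncased` would do (the model name and cache path are only examples, not taken from the repository):

```python
from transformers import AutoModel, AutoTokenizer

# Mirrors DownloadCommand.run(): download (or re-download) a model and its tokenizer
# into a chosen cache directory. The values below are illustrative placeholders.
model_name = "bert-base-uncased"   # positional `model` argument
cache_dir = "/tmp/models"          # corresponds to --cache-dir
force = False                      # corresponds to --force

AutoModel.from_pretrained(model_name, cache_dir=cache_dir, force_download=force)
AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, force_download=force)
```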
/byteps/torch/examples/gpt-2/gpt-2/src/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 37 | """ 38 | 39 | pretrained_config_archive_map = CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 40 | model_type = "camembert" 41 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ MMBT configuration """ 17 | 18 | 19 | import logging 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class MMBTConfig(object): 26 | """Configuration class to store the configuration of a `MMBT Model`. 27 | 28 | Args: 29 | config (:obj:`~transformers.PreTrainedConfig`): 30 | Config of the underlying Transformer models. Its values are 31 | copied over to use a single config. 32 | num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`): 33 | Size of final Linear layer for classification. 34 | modal_hidden_size (:obj:`int`, optional, defautls to 2048): 35 | Embedding dimension of the non-text modality encoder. 
36 | """ 37 | 38 | def __init__(self, config, num_labels=None, modal_hidden_size=2048): 39 | self.__dict__ = config.__dict__ 40 | self.modal_hidden_size = modal_hidden_size 41 | if num_labels: 42 | self.num_labels = num_labels 43 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | squad_convert_examples_to_features, 20 | xnli_output_modes, 21 | xnli_processors, 22 | xnli_tasks_num_labels, 23 | ) 24 | 25 | 26 | if is_sklearn_available(): 27 | from .metrics import glue_compute_metrics, xnli_compute_metrics 28 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/src/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 9 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/templates/adding_a_new_example_script/README.md: -------------------------------------------------------------------------------- 1 | # How to add a new example script in 🤗Transformers 2 | 3 | This folder provides a template for adding a new example script implementing a training or inference task with the models in the 🤗Transformers library. 4 | 5 | Currently, only PyTorch examples are provided; they are adaptations of the library's SQuAD examples, which implement single-GPU and distributed training with gradient accumulation and mixed precision (using NVIDIA's apex library) to cover a reasonable range of use cases.
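The GLUE helpers re-exported by `data/processors/__init__.py` above (`glue_processors`, `glue_convert_examples_to_features`) are what the example scripts use to turn TSV files such as the MRPC samples shown earlier into model inputs. A minimal, hedged sketch (the data directory is hypothetical and assumed to contain the `dev.tsv` layout shown above):

```python
from transformers import BertTokenizer
from transformers.data.processors.glue import glue_convert_examples_to_features, glue_processors

# Read MRPC-style sentence pairs from a directory containing dev.tsv (path is illustrative).
processor = glue_processors["mrpc"]()
examples = processor.get_dev_examples("/path/to/MRPC")

# Convert the sentence pairs into token ids, masks and labels for a BERT-style model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")
print(len(features), features[0].input_ids[:10])
```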
6 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/tests/__init__.py -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/dummy-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_type": "roberta" 3 | } -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/empty.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/empty.txt -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/input.txt: -------------------------------------------------------------------------------- 1 | Who was Jim Henson ? ||| Jim Henson was a puppeteer 2 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/test_sentencepiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/gpt-2/gpt-2/tests/fixtures/test_sentencepiece.model -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/test_tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from transformers.tokenization_distilbert import DistilBertTokenizer 18 | 19 | from .test_tokenization_bert import BertTokenizationTest 20 | from .utils import slow 21 | 22 | 23 | class DistilBertTokenizationTest(BertTokenizationTest): 24 | 25 | tokenizer_class = DistilBertTokenizer 26 | 27 | def get_tokenizer(self, **kwargs): 28 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 29 | 30 | @slow 31 | def test_sequence_builders(self): 32 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 33 | 34 | text = tokenizer.encode("sequence builders", add_special_tokens=False) 35 | text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) 36 | 37 | encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) 38 | encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) 39 | 40 | assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] 41 | assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [ 42 | tokenizer.sep_token_id 43 | ] 44 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/tests/test_tokenization_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc.. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import unittest 18 | 19 | from transformers import PreTrainedTokenizer 20 | from transformers.tokenization_gpt2 import GPT2Tokenizer 21 | 22 | from .utils import slow 23 | 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | self.assertIsInstance(special_tok, str) 36 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 37 | self.assertIsInstance(special_tok_id, int) 38 | 39 | @slow 40 | def test_pretrained_tokenizers(self): 41 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 42 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/gpt-2/transformers-cli: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from argparse import ArgumentParser 3 | 4 | from transformers.commands.download import DownloadCommand 5 | from transformers.commands.run import RunCommand 6 | from transformers.commands.user import UserCommands 7 | from transformers.commands.convert import ConvertCommand 8 | from transformers.commands.serving import ServeCommand 9 | 10 | if __name__ == '__main__': 11 | parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli []') 12 | commands_parser = parser.add_subparsers(help='transformers-cli command helpers') 13 | 14 | # Register commands 15 | ConvertCommand.register_subcommand(commands_parser) 16 | DownloadCommand.register_subcommand(commands_parser) 17 | RunCommand.register_subcommand(commands_parser) 18 | ServeCommand.register_subcommand(commands_parser) 19 | UserCommands.register_subcommand(commands_parser) 20 | 21 | # Let's go 22 | args = parser.parse_args() 23 | 24 | if not hasattr(args, 'func'): 25 | parser.print_help() 26 | exit(1) 27 | 28 | # Run 29 | service = args.func(args) 30 | service.run() 31 | -------------------------------------------------------------------------------- /byteps/torch/examples/gpt-2/run_prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # before running this, should make sure the data has been downloaded from: 4 | # https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/ 5 | 6 | export DATA_DIR=${DATA_DIR:-~/data} 7 | export TRAIN_FILE=${TRAIN_FILE:-$DATA_DIR/wikitext-2-raw/wiki.train.raw} 8 | export TEST_FILE=${TRAIN_FILE:-$DATA_DIR/wikitext-2-raw/wiki.test.raw} 9 | export DISTRIBUTED_FRAMEWORK=${DISTRIBUTED_FRAMEWORK:-byteps} 10 | 11 | THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 12 | 13 | cd ~ 14 | cd $THIS_DIR/gpt-2 15 | pip3 install . 
16 | pip3 install -r ./examples/requirements.txt 17 | 18 | # export NCCL_P2P_DISABLE=1 19 | 20 | # for the first run, you can do a dryrun to 21 | # prepare/download necessary configurations and models 22 | # in order to avoid multi-write conflicts 23 | cd $THIS_DIR/gpt-2/examples 24 | python3 -m torch.distributed.launch --nproc_per_node 8 run_lm_finetuning.py \ 25 | --output_dir=output \ 26 | --model_type=gpt2 \ 27 | --model_name_or_path=gpt2 \ 28 | --do_train \ 29 | --save_steps 1000000 \ 30 | --overwrite_output_dir \ 31 | --num_train_epochs 1 \ 32 | --per_gpu_eval_batch_size 4 \ 33 | --train_data_file=$TRAIN_FILE -------------------------------------------------------------------------------- /byteps/torch/examples/json_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Opening JSON file 4 | f = open('trace/0/tensor_ready.json',) 5 | 6 | # returns JSON object as 7 | # a dictionary 8 | data = json.load(f) 9 | 10 | # Iterating through the json 11 | # list 12 | time_points = {} 13 | for item in data['TensorReadyTraceEvents']: 14 | name = item["name"] 15 | ts = item["ts"] 16 | if name not in time_points: 17 | time_points[name] = [ts] 18 | else: 19 | time_points[name].append(ts) 20 | 21 | iters = 0 22 | for key, value in time_points.items(): 23 | if iters == 0: 24 | iters = len(value) 25 | value.sort() 26 | 27 | 28 | timelines = [[] for _ in range(iters)] 29 | time_gaps = [[0] for _ in range(iters)] 30 | 31 | for key, value in time_points.items(): 32 | for i, time in enumerate(value): 33 | timelines[i].append(time) 34 | 35 | for i, timeline in enumerate(timelines): 36 | timeline.sort() 37 | for j in range(len(timeline) - 1): 38 | gap = timeline[j+1] - timeline[j] 39 | time_gaps[i].append(gap) 40 | 41 | 42 | # Closing file 43 | f.close() 44 | for time_gap in time_gaps: 45 | print(sum(time_gap), time_gap) 46 | -------------------------------------------------------------------------------- /byteps/torch/examples/lstm/README.md: -------------------------------------------------------------------------------- 1 | # LSTM 2 | 3 | You can run the model step by step as follows. 4 | The dataset and the dependencies are all set after "bash install.sh". 5 | Go to "How to run" directly. 6 | 7 | 8 | ## Download the dataset 9 | ```bash 10 | bash getdata.sh 11 | ``` 12 | The default location of the dataset is ~/data, and the dataset for LSTM is in ~/data/wikitext-2 13 | 14 | 15 | ## How to run 16 | **Note**: Make sure it runs on NVLink-based GPU machines. 17 | Set ifname in run_espresso.sh and run_baseline.sh. 18 | 19 | ifname: the network interface card name, e.g., eth0, eth2 20 | 21 | ```bash 22 | export DMLC_PS_ROOT_URI="ip" 23 | export DMLC_NUM_WORKER=WORKERS 24 | export DMLC_WORKER_ID=WORKER_ID 25 | ``` 26 | 27 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 28 | 29 | WORKERS: the number of GPU machines in the training 30 | 31 | ID: the id of a machine. 
machines have distinct IDs that start from 0 32 | 33 | 34 | ### Espresso 35 | Run on each machine 36 | ```bash 37 | bash run_espresso.sh 38 | ``` 39 | 40 | ### Baselines 41 | Run on each machine 42 | ```bash 43 | bash run_baseline.sh 44 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/lstm/getdata.sh: -------------------------------------------------------------------------------- 1 | echo "=== Acquiring datasets ===" 2 | echo "---" 3 | mkdir -p save 4 | 5 | cd ~/data 6 | 7 | echo "- Downloading WikiText-2 (WT2)" 8 | wget --quiet --continue https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 9 | unzip -q wikitext-2-v1.zip 10 | cd wikitext-2 11 | mv wiki.train.tokens train.txt 12 | mv wiki.valid.tokens valid.txt 13 | mv wiki.test.tokens test.txt -------------------------------------------------------------------------------- /byteps/torch/examples/lstm/run_espresso.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="`dirname $0`" 4 | set -x 5 | 6 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 7 | ifname=$1 8 | 9 | compress_ratio=0.01 10 | gpus=0,1,2,3,4,5,6,7 11 | export DMLC_ENABLE_RDMA=${DMLC_ENABLE_RDMA:-0} 12 | export DMLC_INTERFACE=${ifname} 13 | export NCCL_IB_DISABLE=1 14 | export NCCL_IB_GID_INDEX=3 15 | export NCCL_IB_HCA=mlx5_0 16 | export NCCL_SOCKET_IFNAME=${ifname} 17 | # export DMLC_NUM_WORKER=$1 18 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 19 | export DMLC_NODE_HOST="$(/sbin/ip -o -4 addr list ${ifname} | awk '{print $4}' | cut -d/ -f1)" 20 | export DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-12213} 21 | export NVIDIA_VISIBLE_DEVICES=${gpus} 22 | export BYTEPS_FORCE_DISTRIBUTED=0 23 | export BYTEPS_COMPRESSOR_k=${compress_ratio} 24 | export OMP_NUM_THREADS=4 25 | export TEST_TYPE=${TEST_TYPE:=torch} 26 | export NCCL_DEBUG=VERSION 27 | # Ensure the NCCL_BUFFSIZE is larger than the message size of the compressed tensors 28 | export NCCL_BUFFSIZE=16777216 29 | # export DMLC_WORKER_ID=$2 30 | 31 | IFS=', ' read -ra a <<< $gpus; 32 | gpus_per_node=${#a[@]} 33 | declare -p a; 34 | 35 | model='LSTM' 36 | DISTRIBUTED_ARGS="--nproc_per_node ${gpus_per_node} --nnodes ${DMLC_NUM_WORKER} --node_rank ${DMLC_WORKER_ID} --master_addr ${DMLC_PS_ROOT_URI} --master_port 12345" 37 | 38 | export BYTEPS_PARTITION_BYTES=4096000 39 | pkill -9 python3 40 | 41 | export NCCL_P2P_DISABLE=1 42 | 43 | for compressor in "efsignsgd" 44 | do 45 | pkill -9 python3 46 | export BYTEPS_INTER_COMPRESSOR=${compressor} 47 | scheduler_file="../../mergeComp/scheduler/lstm/pcie_${compressor}_two_cpu" 48 | BENCHMARK_ARGS="--compress --compressor ${compressor} --memory efsignsgd --comm espresso --compress-ratio ${compress_ratio} --scheduler-file ${scheduler_file} --scheduler-type -1" 49 | python3 -m torch.distributed.launch $DISTRIBUTED_ARGS $path/main.py --model ${model} --epochs 2 $BENCHMARK_ARGS 50 | sleep 5 51 | done -------------------------------------------------------------------------------- /byteps/torch/examples/lstm/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def repackage_hidden(h): 5 | """Wraps hidden states in new Tensors, 6 | to detach them from their history.""" 7 | if isinstance(h, torch.Tensor): 8 | return h.detach() 9 | else: 10 | return tuple(repackage_hidden(v) for v in h) 11 | 12 | 13 | def batchify(data, bsz, args): 14 | # Work out 
how cleanly we can divide the dataset into bsz parts. 15 | nbatch = data.size(0) // bsz 16 | # Trim off any extra elements that wouldn't cleanly fit (remainders). 17 | data = data.narrow(0, 0, nbatch * bsz) 18 | # Evenly divide the data across the bsz batches. 19 | data = data.view(bsz, -1).t().contiguous() 20 | if args.cuda: 21 | data = data.cuda() 22 | return data 23 | 24 | 25 | def get_batch(source, i, args, seq_len=None, evaluation=False): 26 | seq_len = min(seq_len if seq_len else args.bptt, len(source) - 1 - i) 27 | data = source[i:i+seq_len] 28 | target = source[i+1:i+1+seq_len].view(-1) 29 | return data, target -------------------------------------------------------------------------------- /byteps/torch/examples/resnet101/README.md: -------------------------------------------------------------------------------- 1 | # ResNet101 2 | 3 | You can run the model step by step as follows. 4 | The dataset and the dependencies are all set after "bash install.sh". 5 | Go to "How to run" directly. 6 | 7 | ## Download the dataset 8 | ```bash 9 | cd ~/data 10 | # we use a small dataset from ImageNet 11 | wget https://s3.amazonaws.com/fast-ai-imageclas/imagewang.tgz 12 | tar xf imagewang.tgz 13 | ``` 14 | The default location of the dataset is ~/data, and the dataset for VGG16 and ResNet101 is in ~/data/imagewang 15 | 16 | ## How to run 17 | **Note**: Make sure it runs on NVLink-based GPU machines. 18 | Set ifname in run_espresso.sh and run_baseline.sh. 19 | 20 | ifname: the network interface card name, e.g., eth0, eth2 21 | 22 | ```bash 23 | export DMLC_PS_ROOT_URI="ip" 24 | export DMLC_NUM_WORKER=WORKERS 25 | export DMLC_WORKER_ID=WORKER_ID 26 | ``` 27 | 28 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 29 | 30 | WORKERS: the number of GPU machines in the training 31 | 32 | ID: the id of a machine. 
machines have distinct IDs that start from 0 33 | 34 | 35 | ### Espresso 36 | Run on each machine 37 | ```bash 38 | bash run_espresso.sh 39 | ``` 40 | 41 | ### Baselines 42 | Run on each machine 43 | ```bash 44 | bash run_baseline.sh 45 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/resnet101/run_espresso.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="`dirname $0`" 4 | set -x 5 | 6 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 7 | ifname=$1 8 | 9 | compress_ratio=0.01 10 | gpus=0,1,2,3,4,5,6,7 11 | export DMLC_ENABLE_RDMA=${DMLC_ENABLE_RDMA:-0} 12 | export DMLC_INTERFACE=${ifname} 13 | export NCCL_IB_DISABLE=1 14 | export NCCL_IB_GID_INDEX=3 15 | export NCCL_IB_HCA=mlx5_0 16 | export NCCL_SOCKET_IFNAME=${ifname} 17 | # export DMLC_NUM_WORKER=$1 18 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 19 | export DMLC_NODE_HOST="$(/sbin/ip -o -4 addr list ${ifname} | awk '{print $4}' | cut -d/ -f1)" 20 | export DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-12213} 21 | export NVIDIA_VISIBLE_DEVICES=${gpus} 22 | export BYTEPS_FORCE_DISTRIBUTED=0 23 | export OMP_NUM_THREADS=4 24 | export TEST_TYPE=${TEST_TYPE:=torch} 25 | export NCCL_DEBUG=VERSION 26 | # Ensure the NCCL_BUFFSIZE is larger than the message size of the compressed tensors 27 | export NCCL_BUFFSIZE=16777216 28 | # export DMLC_WORKER_ID=$2 29 | 30 | IFS=', ' read -ra a <<< $gpus; 31 | gpus_per_node=${#a[@]} 32 | declare -p a; 33 | 34 | DISTRIBUTED_ARGS="--nproc_per_node ${gpus_per_node} --nnodes ${DMLC_NUM_WORKER} --node_rank ${DMLC_WORKER_ID} --master_addr ${DMLC_PS_ROOT_URI} --master_port 12345" 35 | 36 | export BYTEPS_PARTITION_BYTES=4096000 37 | pkill -9 python3 38 | export NCCL_P2P_DISABLE=1 39 | 40 | for model in "resnet101" 41 | do 42 | for compressor in "dgc" 43 | do 44 | pkill -9 python3 45 | scheduler_file="../../mergeComp/scheduler/${model}/pcie_dgc_cpu" 46 | export BYTEPS_INTER_COMPRESSOR=${compressor} 47 | BENCHMARK_ARGS="--compress --compressor ${compressor} --memory topk --comm espresso --compress-ratio ${compress_ratio} --scheduler-file ${scheduler_file} --scheduler-type -1" 48 | $GDB python3 -m torch.distributed.launch $DISTRIBUTED_ARGS $path/main.py --model ${model} --epochs 5 --batch-size 32 --speed_test $BENCHMARK_ARGS 49 | sleep 5 50 | done 51 | done -------------------------------------------------------------------------------- /byteps/torch/examples/run_nvlink_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 4 | ifname="eth0" 5 | export DMLC_PS_ROOT_URI="10.188.181.156" 6 | export DMLC_NUM_WORKER=$1 7 | export DMLC_WORKER_ID=$2 8 | 9 | # BERT-base 10 | cd BERT/ 11 | bash run_baseline.sh ${ifname} | tee -a ../bert_log 12 | bash run_espresso.sh ${ifname} | tee -a ../bert_log 13 | 14 | # GPT-2 15 | cd ../gpt-2/ 16 | bash run_prepare.sh 17 | bash run_baseline.sh ${ifname} | tee -a ../gpt2_log 18 | bash run_espresso.sh ${ifname} | tee -a ../gpt2_log 19 | 20 | 21 | # UGATIT 22 | cd ../ugatit/ 23 | bash run_baseline.sh ${ifname} | tee -a ../ugatit_log 24 | bash run_espresso.sh ${ifname} | tee -a ../ugatit_log -------------------------------------------------------------------------------- /byteps/torch/examples/run_pcie_models.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 4 | ifname="eth0" 5 | export DMLC_PS_ROOT_URI="10.188.181.156" 6 | export DMLC_NUM_WORKER=$1 7 | export DMLC_WORKER_ID=$2 8 | 9 | # VGG16 10 | cd vgg16/ 11 | bash run_baseline.sh ${ifname} | tee -a ../vgg16_log 12 | bash run_espresso.sh ${ifname} | tee -a ../vgg16_log 13 | 14 | # LSTM 15 | cd ../lstm/ 16 | bash run_baseline.sh ${ifname} | tee -a ../lstm_log 17 | bash run_espresso.sh ${ifname} | tee -a ../lstm_log 18 | 19 | 20 | # ResNet101 21 | cd ../resnet101/ 22 | bash run_baseline.sh ${ifname} | tee -a ../resnet101_log 23 | bash run_espresso.sh ${ifname} | tee -a ../resnet101_log -------------------------------------------------------------------------------- /byteps/torch/examples/test_compressor_cpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | from mergeComp_dl.torch.compressor.poolsignsgd import PoolSignSGDCompressor 4 | from mergeComp_dl.torch.compressor.pooldgc import PoolDgcCompressor 5 | from mergeComp_dl.torch.compressor.pooltopk import PoolTopKCompressor 6 | from mergeComp_dl.torch.compressor.poolfp16 import PoolFP16Compressor 7 | 8 | #compressor = PoolSignSGDCompressor() 9 | #compressor = PoolDgcCompressor(0.01) 10 | #compressor = PoolTopKCompressor(0.01) 11 | compressor = PoolFP16Compressor() 12 | 13 | base_size = 2 ** 10 14 | device = torch.device("cpu") 15 | 16 | kwargs = {'dtype': torch.float32, 17 | 'device': device, 18 | 'requires_grad': False} 19 | 20 | name = "test" 21 | size_list = [] 22 | compress_latency = [] 23 | decompress_latency = [] 24 | 25 | runs = 20 26 | 27 | for i in range(0, 18, 1): 28 | ctx = None 29 | size = base_size * 2 ** i 30 | size_list.append(10 + i) 31 | compress_time, decompress_time = 0, 0 32 | 33 | for _ in range(0, runs): 34 | tensor = torch.rand(size, **kwargs) 35 | torch.cuda.synchronize() 36 | start_time = time.time() 37 | compressed_tensor, ctx = compressor.compress(tensor, name, ctx) 38 | torch.cuda.synchronize() 39 | end_time = time.time() 40 | 41 | compress_time += end_time-start_time 42 | 43 | #print("Compress, size:", size, "time:", end_time-start_time) 44 | 45 | torch.cuda.synchronize() 46 | start_time = time.time() 47 | decompressed = compressor.decompress(compressed_tensor, ctx) 48 | torch.cuda.synchronize() 49 | end_time = time.time() 50 | decompress_time += end_time-start_time 51 | 52 | #print("Decompress, size:", size, "time:", end_time-start_time) 53 | 54 | compress_latency.append(round(compress_time*1000/runs, 2)) 55 | decompress_latency.append(round(decompress_time*1000/runs, 2)) 56 | 57 | print(size_list) 58 | print(compress_latency) 59 | print(decompress_latency) -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Hyeonwoo Kang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/README.md: -------------------------------------------------------------------------------- 1 | # UGATIT 2 | 3 | Our script is based on [Official PyTorch Implementation of UGATIT](https://github.com/znxlwm/UGATIT-pytorch) and you can find the dataset [here](https://drive.google.com/file/d/1xOWj1UVgp6NKMT3HbPhBbtq2A4EDkghF/view). 4 | 5 | 6 | You can run the model step by step as follows. 7 | The dataset and the dependencies are all set after "bash install.sh". 8 | Go to "How to run" directly. 9 | 10 | ## Download the dataset 11 | 12 | ```bash 13 | cd ~/data 14 | gdown 1xOWj1UVgp6NKMT3HbPhBbtq2A4EDkghF 15 | mkdir selfie2anime && unzip selfie2anime.zip -d selfie2anime 16 | ``` 17 | The default location of the dataset is ~/data, and the dataset for UGATIT is in ~/data/selfie2anime 18 | 19 | 20 | ## Install dependencies 21 | ```bash 22 | sudo apt-get update && sudo apt-get install libgl1 -y 23 | pip3 install opencv-python 24 | ``` 25 | 26 | ## How to run 27 | **Note**: Make sure the dataset is in the right location and it runs on NVLink-based GPU machines. 28 | Set ifname in run_espresso.sh and run_baseline.sh. 29 | 30 | ifname: the network interface card name, e.g., eth0, eth2 31 | 32 | ```bash 33 | export DMLC_PS_ROOT_URI="ip" 34 | export DMLC_NUM_WORKER=WORKERS 35 | export DMLC_WORKER_ID=WORKER_ID 36 | ``` 37 | 38 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 39 | 40 | WORKERS: the number of GPU machines in the training 41 | 42 | ID: the id of a machine. 
machines have distinct IDs that start from 0 43 | 44 | 45 | ### Espresso 46 | Run on each machine 47 | ```bash 48 | bash run_espresso.sh 49 | ``` 50 | 51 | ### Baselines 52 | Run on each machine 53 | ```bash 54 | bash run_baseline.sh 55 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/ablation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/ablation.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/discriminator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/discriminator.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/generator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/generator.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/kid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/kid.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/teaser.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/assets/user_study.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/byteps/torch/examples/ugatit/assets/user_study.png -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python==3.3.1.11 2 | torchvision==0.3.0 3 | -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/run_espresso.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="`dirname $0`" 4 | set -x 5 | 6 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 7 | ifname=$1 8 | 9 | compress_ratio=0.01 10 | gpus=0,1,2,3,4,5,6,7 11 | export DMLC_ENABLE_RDMA=${DMLC_ENABLE_RDMA:-0} 12 | export DMLC_INTERFACE=${ifname} 13 | export NCCL_IB_DISABLE=1 14 | export NCCL_IB_GID_INDEX=3 15 | export NCCL_IB_HCA=mlx5_0 16 | export NCCL_SOCKET_IFNAME=${ifname} 17 | # export DMLC_NUM_WORKER=$1 18 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 19 | export DMLC_NODE_HOST="$(/sbin/ip -o -4 addr list ${ifname} | awk '{print $4}' | cut -d/ -f1)" 20 | export 
DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-12213} 21 | export NVIDIA_VISIBLE_DEVICES=${gpus} 22 | export BYTEPS_FORCE_DISTRIBUTED=0 23 | export OMP_NUM_THREADS=4 24 | export NCCL_DEBUG=VERSION 25 | # Ensure the NCCL_BUFFSIZE is larger than the message size of the compressed tensors 26 | export NCCL_BUFFSIZE=16777216 27 | # export DMLC_WORKER_ID=$2 28 | 29 | IFS=', ' read -ra a <<< $gpus; 30 | gpus_per_node=${#a[@]} 31 | declare -p a; 32 | 33 | model='ugatit' 34 | DISTRIBUTED_ARGS="--nproc_per_node ${gpus_per_node} --nnodes ${DMLC_NUM_WORKER} --node_rank ${DMLC_WORKER_ID} --master_addr ${DMLC_PS_ROOT_URI} --master_port 12345" 35 | 36 | export NCCL_P2P_DISABLE=0 37 | export BYTEPS_PARTITION_BYTES=4096000 38 | 39 | # echo "Espresso" 40 | for compressor in "dgc" 41 | do 42 | pkill -9 python3 43 | export BYTEPS_INTER_COMPRESSOR=${compressor} 44 | scheduler_file="../../mergeComp/scheduler/${model}/nvlink_dgc_cpu" 45 | BENCHMARK_ARGS="--compress --compressor ${compressor} --memory topk --comm espresso --compress-ratio ${compress_ratio} --scheduler-file ${scheduler_file}" 46 | python3 -m torch.distributed.launch $DISTRIBUTED_ARGS $path/main.py --dataset selfie2anime --dataset_dir "$HOME/data" $BENCHMARK_ARGS 47 | sleep 5 48 | done -------------------------------------------------------------------------------- /byteps/torch/examples/ugatit/utils.py: -------------------------------------------------------------------------------- 1 | from scipy import misc 2 | import os, cv2, torch 3 | import numpy as np 4 | 5 | def load_test_data(image_path, size=256): 6 | img = misc.imread(image_path, mode='RGB') 7 | img = misc.imresize(img, [size, size]) 8 | img = np.expand_dims(img, axis=0) 9 | img = preprocessing(img) 10 | 11 | return img 12 | 13 | def preprocessing(x): 14 | x = x/127.5 - 1 # -1 ~ 1 15 | return x 16 | 17 | def save_images(images, size, image_path): 18 | return imsave(inverse_transform(images), size, image_path) 19 | 20 | def inverse_transform(images): 21 | return (images+1.) 
/ 2 22 | 23 | def imsave(images, size, path): 24 | return misc.imsave(path, merge(images, size)) 25 | 26 | def merge(images, size): 27 | h, w = images.shape[1], images.shape[2] 28 | img = np.zeros((h * size[0], w * size[1], 3)) 29 | for idx, image in enumerate(images): 30 | i = idx % size[1] 31 | j = idx // size[1] 32 | img[h*j:h*(j+1), w*i:w*(i+1), :] = image 33 | 34 | return img 35 | 36 | def check_folder(log_dir): 37 | if not os.path.exists(log_dir): 38 | os.makedirs(log_dir) 39 | return log_dir 40 | 41 | def str2bool(x): 42 | return x.lower() in ('true') 43 | 44 | def cam(x, size = 256): 45 | x = x - np.min(x) 46 | cam_img = x / np.max(x) 47 | cam_img = np.uint8(255 * cam_img) 48 | cam_img = cv2.resize(cam_img, (size, size)) 49 | cam_img = cv2.applyColorMap(cam_img, cv2.COLORMAP_JET) 50 | return cam_img / 255.0 51 | 52 | def imagenet_norm(x): 53 | mean = [0.485, 0.456, 0.406] 54 | std = [0.299, 0.224, 0.225] 55 | mean = torch.FloatTensor(mean).unsqueeze(0).unsqueeze(2).unsqueeze(3).to(x.device) 56 | std = torch.FloatTensor(std).unsqueeze(0).unsqueeze(2).unsqueeze(3).to(x.device) 57 | return (x - mean) / std 58 | 59 | def denorm(x): 60 | return x * 0.5 + 0.5 61 | 62 | def tensor2numpy(x): 63 | return x.detach().cpu().numpy().transpose(1,2,0) 64 | 65 | def RGB2BGR(x): 66 | return cv2.cvtColor(x, cv2.COLOR_RGB2BGR) -------------------------------------------------------------------------------- /byteps/torch/examples/vgg16/README.md: -------------------------------------------------------------------------------- 1 | # VGG16 2 | 3 | 4 | You can run the model step by step as follows. 5 | The dataset and the dependencies are all set after "bash install.sh". 6 | Go to "How to run" directly. 7 | 8 | ## Download the dataset 9 | ```bash 10 | cd ~/data 11 | # we use a small dataset from ImageNet 12 | wget https://s3.amazonaws.com/fast-ai-imageclas/imagewang.tgz 13 | tar xf imagewang.tgz 14 | ``` 15 | The default location of the dataset is ~/data, and the dataset for VGG16 and ResNet101 is in ~/data/imagewang 16 | 17 | **Note**: Make sure it runs on NVLink-based GPU machines. 18 | Set ifname in run_espresso.sh and run_baseline.sh. 19 | 20 | ifname: the network interface card name, e.g., eth0, eth2 21 | 22 | ```bash 23 | export DMLC_PS_ROOT_URI="ip" 24 | export DMLC_NUM_WORKER=WORKERS 25 | export DMLC_WORKER_ID=WORKER_ID 26 | ``` 27 | 28 | DMLC_PS_ROOT_URI: the IP address of the root GPU machine 29 | 30 | WORKERS: the number of GPU machines in the training 31 | 32 | ID: the id of a machine. 
machines have distinct IDs that start from 0 33 | 34 | 35 | ### Espresso 36 | Run on each machine 37 | ```bash 38 | bash run_espresso.sh 39 | ``` 40 | 41 | ### Baselines 42 | Run on each machine 43 | ```bash 44 | bash run_baseline.sh 45 | ``` -------------------------------------------------------------------------------- /byteps/torch/examples/vgg16/run_espresso.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="`dirname $0`" 4 | set -x 5 | 6 | # set DMLC_PS_ROOT_URI with the IP address of the root GPU machine and ifname with the NIC name 7 | ifname="eth0" 8 | 9 | compress_ratio=0.01 10 | gpus=0,1,2,3,4,5,6,7 11 | export DMLC_ENABLE_RDMA=${DMLC_ENABLE_RDMA:-0} 12 | export DMLC_INTERFACE=${ifname} 13 | export NCCL_IB_DISABLE=1 14 | export NCCL_IB_GID_INDEX=3 15 | export NCCL_IB_HCA=mlx5_0 16 | export NCCL_SOCKET_IFNAME=${ifname} 17 | # export DMLC_NUM_WORKER=$1 18 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 19 | export DMLC_NODE_HOST="$(/sbin/ip -o -4 addr list ${ifname} | awk '{print $4}' | cut -d/ -f1)" 20 | export DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-12213} 21 | export NVIDIA_VISIBLE_DEVICES=${gpus} 22 | export BYTEPS_FORCE_DISTRIBUTED=0 23 | export OMP_NUM_THREADS=4 24 | export TEST_TYPE=${TEST_TYPE:=torch} 25 | export NCCL_DEBUG=VERSION 26 | # Ensure the NCCL_BUFFSIZE is larger than the message size of the compressed tensors 27 | export NCCL_BUFFSIZE=16777216 28 | # export DMLC_WORKER_ID=$2 29 | 30 | IFS=', ' read -ra a <<< $gpus; 31 | gpus_per_node=${#a[@]} 32 | declare -p a; 33 | 34 | DISTRIBUTED_ARGS="--nproc_per_node ${gpus_per_node} --nnodes ${DMLC_NUM_WORKER} --node_rank ${DMLC_WORKER_ID} --master_addr ${DMLC_PS_ROOT_URI} --master_port 12345" 35 | 36 | export BYTEPS_PARTITION_BYTES=4096000 37 | pkill -9 python3 38 | export NCCL_P2P_DISABLE=1 39 | 40 | for model in "vgg16" 41 | do 42 | for compressor in "randomk" 43 | do 44 | pkill -9 python3 45 | scheduler_file="../../mergeComp/scheduler/${model}/pcie_randomk_two_cpu" 46 | export BYTEPS_INTER_COMPRESSOR=${compressor} 47 | BENCHMARK_ARGS="--compress --compressor ${compressor} --memory topk --comm espresso --compress-ratio ${compress_ratio} --scheduler-file ${scheduler_file} --scheduler-type -1" 48 | $GDB python3 -m torch.distributed.launch $DISTRIBUTED_ARGS $path/main.py --model ${model} --epochs 5 --batch-size 32 --speed_test $BENCHMARK_ARGS 49 | sleep 5 50 | done 51 | done -------------------------------------------------------------------------------- /byteps/torch/handle_manager.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_HANDLE_MANAGER_H 18 | #define BYTEPS_TORCH_HANDLE_MANAGER_H 19 | 20 | #include <atomic> 21 | #include <memory> 22 | #include <mutex> 23 | #include <unordered_map> 24 | 25 | #include "../common/common.h" 26 | 27 | namespace byteps { 28 | namespace torch { 29 | 30 | using namespace byteps::common; 31 | 32 | class HandleManager { 33 | public: 34 | int AllocateHandle(); 35 | void MarkDone(int handle, const Status& status); 36 | bool PollHandle(int handle); 37 | std::shared_ptr<Status> ReleaseHandle(int handle); 38 | 39 | private: 40 | std::atomic_int last_handle_; 41 | std::unordered_map<int, std::shared_ptr<Status>> results_; 42 | std::mutex mutex_; 43 | }; 44 | 45 | } // namespace torch 46 | } // namespace byteps 47 | 48 | #endif // BYTEPS_TORCH_HANDLE_MANAGER_H 49 | -------------------------------------------------------------------------------- /byteps/torch/json_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Opening JSON file 4 | f = open('traces/0/tensor_ready.json',) 5 | 6 | # returns JSON object as 7 | # a dictionary 8 | data = json.load(f) 9 | 10 | # Iterating through the json 11 | # list 12 | time_points = {} 13 | for item in data['TensorReadyTraceEvents']: 14 | name = item["name"] 15 | ts = item["ts"] 16 | if name not in time_points: 17 | time_points[name] = [ts] 18 | else: 19 | time_points[name].append(ts) 20 | 21 | iters = 0 22 | for key, value in time_points.items(): 23 | if iters == 0: 24 | iters = len(value) 25 | value.sort() 26 | 27 | 28 | timelines = [[] for _ in range(iters)] 29 | time_gaps = [[0] for _ in range(iters)] 30 | 31 | for key, value in time_points.items(): 32 | for i, time in enumerate(value): 33 | timelines[i].append(time) 34 | 35 | for i, timeline in enumerate(timelines): 36 | timeline.sort() 37 | for j in range(len(timeline) - 1): 38 | gap = timeline[j+1] - timeline[j] 39 | time_gaps[i].append(gap) 40 | 41 | 42 | # Closing file 43 | f.close() 44 | for time_gap in time_gaps: 45 | print(sum(time_gap), time_gap) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/communicator/ddp_fp16.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import byteps.torch as bps 3 | import sys 4 | from time import time 5 | sys.path.append("../..") 6 | from mergeComp import Communicator 7 | 8 | 9 | class DDPFP16(Communicator): 10 | def __init__(self, fp16_compressor, memory, DDPbackend, profile=False): 11 | super().__init__(fp16_compressor, fp16_compressor, memory) 12 | self.allreduce = DDPbackend.global_allreduce 13 | self.name = "DDPFP16" 14 | self.world_size = bps.size() 15 | self.comm_stream = torch.cuda.Stream(priority=-1) 16 | self.handles = {} 17 | self.shapes = {} 18 | self.profile = profile 19 | self.compress_overhead = 0 20 | self.decompress_overhead = 0 21 | self.iteration = -1 22 | 23 | 24 | def async_send(self, tensor, name): 25 | with torch.cuda.stream(self.comm_stream): 26 | self.handles[name] = self.allreduce(tensor.type(torch.float16)) 27 | return [-1], (name,) 28 | 29 | 30 | def wait_receive(self, handle, ctx): 31 | name = ctx[0] 32 | torch.cuda.current_stream().wait_stream(self.comm_stream) 33 | return self.handles[name].type(torch.float32) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/communicator/pool_allreduce.py:
-------------------------------------------------------------------------------- 1 | import torch 2 | from mergeComp.torch import Communicator 3 | from horovod.torch import allreduce_async, synchronize 4 | from horovod.torch.mpi_ops import Average 5 | 6 | 7 | class PoolAllreduce(Communicator): 8 | def __init__(self, compressor, memory): 9 | super().__init__(compressor, memory) 10 | self.name = "PoolAllReduce" 11 | 12 | 13 | def async_send(self, tensors_compressed, ctx): 14 | # assert only one tensor in tensors_compressed for allreduce 15 | return allreduce_async(tensors_compressed[0], name=ctx[0], op=Average) 16 | 17 | 18 | def wait_receive(self, handle, ctx): 19 | output = [synchronize(handle)] 20 | return [self.compressor.decompress(output, ctx)] 21 | 22 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolfp16.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Compressor 5 | 6 | 7 | class PoolFP16Compressor(Compressor): 8 | """Compress all floating point gradients to 16-bit.""" 9 | def __init__(self): 10 | super().__init__() 11 | self.name = "PoolFP16" 12 | self.quantization = False 13 | 14 | def compress(self, tensor, name): 15 | """Downcasts the tensor to 16-bit.""" 16 | dtype = tensor.dtype 17 | numel = tensor.numel() 18 | tensor_compressed = tensor 19 | if dtype.is_floating_point: 20 | # Only allow compression from other floating point types 21 | tensor_compressed = tensor.type(torch.float16) 22 | ctx = (name, numel, dtype) 23 | return [tensor_compressed], ctx 24 | 25 | 26 | def decompress(self, tensors, ctx): 27 | """Upcasts the tensor to the initialization dtype.""" 28 | tensor_compressed = tensors[0] 29 | name, numel, dtype = ctx 30 | tensor_decompressed = tensor_compressed 31 | #print("[decompress] before", ctx, torch.sum(tensor_compressed)) 32 | if dtype.is_floating_point: 33 | tensor_decompressed = tensor_compressed.type(dtype) 34 | #print("[decompress] after", ctx, torch.sum(tensor_compressed)) 35 | return tensor_decompressed 36 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolint8.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Compressor 5 | 6 | 7 | class PoolInt8Compressor(Compressor): 8 | """Compress all floating point gradients to 16-bit.""" 9 | def __init__(self): 10 | super().__init__() 11 | self.name = "PoolInt8" 12 | self.quantization = False 13 | 14 | def compress(self, tensor, name): 15 | """Downcasts the tensor to 8-bit.""" 16 | dtype = tensor.dtype 17 | 18 | tensor_compressed = tensor 19 | if dtype.is_floating_point: 20 | # Only allow compression from other floating point types 21 | tensor_compressed = tensor.type(torch.uint8) 22 | ctx = (name, dtype) 23 | return [tensor_compressed], ctx 24 | 25 | 26 | def decompress(self, tensors, ctx): 27 | """Upcasts the tensor to the initialization dtype.""" 28 | tensor_compressed = tensors[0] 29 | _, dtype = ctx 30 | tensor_decompressed = tensor_compressed 31 | #print("[decompress] before", ctx, torch.sum(tensor_compressed)) 32 | if dtype.is_floating_point: 33 | tensor_decompressed = tensor_compressed.type(dtype) 34 | #print("[decompress] after", ctx, torch.sum(tensor_compressed)) 35 | return tensor_decompressed 
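
The FP16 and INT8 compressors above share the two-call interface used by all of the mergeComp pool compressors: `compress()` returns a list of payload tensors plus a context tuple, and `decompress()` uses that context to restore the original dtype. The following is a minimal CPU round-trip sketch, not code from this repository; the `sys.path` setup, tensor size, and tensor name are illustrative assumptions.

```python
import sys
import torch

sys.path.append("byteps/torch")  # assumption: run from the repository root so `mergeComp` is importable
from mergeComp.compressor.poolfp16 import PoolFP16Compressor

compressor = PoolFP16Compressor()
grad = torch.rand(1 << 20, dtype=torch.float32)

# compress() downcasts floating-point tensors to fp16 and returns ([payload], ctx);
# ctx carries the tensor name, element count, and original dtype for decompress().
tensors_compressed, ctx = compressor.compress(grad, "layer0.weight")
restored = compressor.decompress(tensors_compressed, ctx)

assert restored.dtype == torch.float32
print("max round-trip error:", (grad - restored).abs().max().item())
```
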
-------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolnone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Compressor 5 | 6 | class PoolNoneCompressor(Compressor): 7 | """Default no-op compression.""" 8 | def __init__(self): 9 | super().__init__() 10 | self.name = "PoolNone" 11 | self.quantization = False 12 | self.sparsification = False 13 | 14 | def compress(self, tensor, name, signsgd_unit_size=8, alltoall_nodes=1): 15 | ctx = (name, tensor.numel()) 16 | return [tensor, tensor.abs().mean().reshape((1,))], ctx 17 | 18 | def decompress(self, tensors, ctx, alltoall=False): 19 | return tensors[0] 20 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolqsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import sys 4 | sys.path.append("../..") 5 | from mergeComp import Compressor 6 | 7 | 8 | class PoolQSGDCompressor(Compressor): 9 | 10 | def __init__(self, quantum_num): 11 | super().__init__() 12 | self.name = "PoolQSGD" 13 | self.quantization = True 14 | self.quantum_num = quantum_num 15 | 16 | 17 | def compress(self, tensor, name, server=False): 18 | shape = tensor.size() 19 | tensor = tensor.flatten() 20 | norm = tensor.norm().reshape((1,)) 21 | 22 | abs_gradient = tensor.abs() 23 | 24 | level_float = self.quantum_num / norm * abs_gradient 25 | previous_level = level_float.floor() 26 | prob = torch.empty_like(tensor).uniform_() 27 | is_next_level = (prob < (level_float - previous_level)).type(torch.float32) 28 | new_level = (previous_level + is_next_level) 29 | 30 | sign = tensor.sign() 31 | tensor_compressed = (new_level * sign).type(torch.int16) 32 | tensor_compressed = tensor_compressed.type(torch.int8 if self.quantum_num < 128 else torch.half) 33 | tensor_compressed = tensor_compressed, norm 34 | 35 | ctx = (name, shape) 36 | return tensor_compressed, ctx 37 | 38 | 39 | def decompress(self, tensor_compressed, ctx, server=False): 40 | tensor, norm = tensor_compressed 41 | norm = norm[0] 42 | decode_output = tensor.type(torch.float32) 43 | tensor_decompressed = norm / self.quantum_num * decode_output 44 | return tensor_decompressed 45 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolsignum.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import sys 4 | sys.path.append("../..") 5 | from mergeComp import Compressor 6 | from mergeComp.util import packbits, unpackbits 7 | 8 | 9 | class PoolSignumCompressor(Compressor): 10 | def __init__(self, momentum): 11 | super().__init__(average=False) 12 | self.name = "PoolSignNum" 13 | self.quantization = True 14 | self.momentum = momentum 15 | self.momentums = {} 16 | 17 | 18 | def get_scalar(self, tensor): 19 | return None 20 | 21 | 22 | def compress(self, tensor, name, scalar=None): 23 | """Encoding and compressing the signs """ 24 | numel = tensor.numel() 25 | mean = tensor.abs().mean().reshape((1,)) 26 | 27 | # update tensor by momentum 28 | if name in self.momentums: 29 | tensor = (1.0 - self.momentum) * tensor + self.momentum * self.momentums[name] 30 | self.momentums[name] = tensor 31 | sign_encode = tensor >= 0 32 | 33 | int8_tensor = packbits(sign_encode) 34 | tensor_compressed = 
int8_tensor, mean 35 | 36 | ctx = (name, numel) 37 | return tensor_compressed, ctx 38 | 39 | 40 | def decompress(self, tensor_compressed, ctx): 41 | """Decoding the signs to float format """ 42 | int8_tensor, _ = tensor_compressed 43 | name, numel = ctx 44 | 45 | sign_decode = unpackbits(int8_tensor, numel) 46 | return sign_decode.type(torch.float32) * 2 - 1 47 | 48 | 49 | def aggregate(self, tensors): 50 | """Aggregate a list of tensors.""" 51 | agged_tensor = sum(tensors) 52 | agged_tensor = agged_tensor >= 0 53 | agged_tensor = agged_tensor * 2.0 - 1.0 54 | return [agged_tensor] 55 | 56 | 57 | def clean(self): 58 | self.momentums = {} -------------------------------------------------------------------------------- /byteps/torch/mergeComp/compressor/poolterngrad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import sys 4 | sys.path.append("../..") 5 | from mergeComp import Compressor 6 | from mergeComp.util import pack2bits, unpack2bits 7 | 8 | 9 | class PoolTernGradCompressor(Compressor): 10 | def __init__(self): 11 | super().__init__() 12 | self.name = "PoolTernGrad" 13 | self.quantization = True 14 | 15 | 16 | def compress(self, tensor, name, server=False): 17 | numel = tensor.numel() 18 | abs_gradient = tensor.abs() 19 | scalar = abs_gradient.max() 20 | sign_gradient = tensor.sign() * scalar 21 | 22 | try: 23 | rnd_sample = torch.empty_like(tensor).cuda().uniform_(0, scalar.item()) 24 | except: 25 | rnd_sample = torch.zeros_like(tensor).cuda() 26 | 27 | sign_gradient[rnd_sample >= abs_gradient] = 0 28 | 29 | mask = sign_gradient.sign() > 0 30 | tern_tensor = sign_gradient.sign() + 1 # {-1, 0, 1} + 1 31 | 32 | int8_tensor = pack2bits(mask, tern_tensor) 33 | tensor_compressed = int8_tensor, scalar.flatten() 34 | 35 | ctx = (name, numel) 36 | return tensor_compressed, ctx 37 | 38 | 39 | def decompress(self, tensor_compressed, ctx, server=False): 40 | int8_tensor, scalar = tensor_compressed 41 | name, numel = ctx 42 | 43 | tern_tensor = unpack2bits(int8_tensor, numel) 44 | 45 | sign = tern_tensor.type(torch.float32) - 1 # {0, 1, 2} - 1 46 | return sign * scalar -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/dgc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | import byteps.torch as bps 6 | 7 | 8 | class DgcMemory(Memory): 9 | def __init__(self, momentum=0.9, gradient_clipping=False): 10 | self.gradient_clipping = gradient_clipping 11 | self.momentum = momentum 12 | self.gradients = {} 13 | self.residuals = {} 14 | 15 | 16 | def compensate(self, tensor, name): 17 | """Update the tensor with the residuals.""" 18 | # https://github.com/synxlin/deep-gradient-compression/blob/master/dgc/memory.py 19 | if self.gradient_clipping: 20 | tensor_squ_sum = torch.sum(tensor * tensor) 21 | clipping_val = torch.sqrt(bps.byteps_push_pull(tensor_squ_sum, average=True, name=name)) 22 | tensor = tensor.clamp(-clipping_val, clipping_val) 23 | 24 | if name in self.residuals: 25 | self.residuals[name] = self.momentum * self.residuals[name] + tensor 26 | else: 27 | self.residuals[name] = tensor 28 | 29 | if name in self.gradients: 30 | self.gradients[name] += self.residuals[name] 31 | tensor = self.gradients[name] 32 | else: 33 | self.gradients[name] = tensor 34 | return tensor 35 | 36 | 37 | 38 | def update(self, tensor, name, compressor, 
tensor_compressed, ctx): 39 | """Update the residuals.""" 40 | mask = ctx[1] 41 | not_mask = ~mask 42 | 43 | temp = self.residuals[name] * not_mask 44 | self.residuals[name] = temp 45 | temp = self.gradients[name] * not_mask 46 | self.gradients[name] = temp -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/efsignsgd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | 6 | 7 | class EFSignSGDMemory(Memory): 8 | def __init__(self, lr=0.5): 9 | self.residuals = {} 10 | # the training is sensitive to lr 11 | # for ResNet50 + CIFAR100 + EFSignSGD/OneBit, lr = 0.5. if lr = 0.6, the loss becomes nan. If lr is smaller, the gradients become zero 12 | self.lr = lr 13 | 14 | 15 | def compensate(self, tensor, name): 16 | """Update the tensor with the residuals.""" 17 | if name in self.residuals: 18 | tensor = self.lr * self.residuals[name] + tensor 19 | return tensor 20 | 21 | 22 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 23 | """Update the residuals.""" 24 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 25 | self.residuals[name] = tensor - tensor_decompressed -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/none.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | 6 | 7 | class NoneMemory(Memory): 8 | def compensate(self, tensor, name): 9 | """Update the tensor with the residuals.""" 10 | return tensor 11 | 12 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 13 | """Update the residuals.""" 14 | pass -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/pooldgc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | #from extensions.cuda.functions import accumulate 4 | from .memory_pool import MemoryPool 5 | 6 | 7 | class PoolDgcMemory(MemoryPool): 8 | def __init__(self, named_parameters, lr=1e-3, momentum=0.2, fusion_num=2, gradient_clipping=False, momentum_masking=True): 9 | self.gradient_clipping = gradient_clipping 10 | self.momentum = momentum 11 | self.momentum_masking = momentum_masking 12 | super().__init__(named_parameters, fusion_num, lr=lr) 13 | self.iterations = -1 14 | 15 | 16 | def compensate(self, tensor, name): 17 | """Update the tensor with the residuals.""" 18 | # https://github.com/synxlin/deep-gradient-compression/blob/master/dgc/memory.py 19 | grad = self.get_grad(name) 20 | #if self.gradient_clipping: 21 | # tensor_squ_sum = torch.sum(grad * grad) 22 | # clipping_val = torch.sqrt(allreduce_(tensor_squ_sum, average=True, name=name)) 23 | # grad = grad.clamp(-clipping_val, clipping_val) 24 | mmt = self.get_momentum(name) 25 | vec = self.get_velocity(name) 26 | 27 | if self.momentum_masking: 28 | mmt.mul_(self.momentum).add_(grad) 29 | vec.add_(mmt) 30 | else: 31 | vec.mul_(self.momentum).add_(grad) 32 | 33 | 34 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 35 | """Update the residuals.""" 36 | mask = ctx[1] 37 | not_mask = ~mask 38 | 39 | values, indices = tensor_compressed 40 | indices_int64 = indices.type(torch.int64) 41 | 42 | if self.momentum_masking: 43 | mmt = self.get_momentum(name) 44 | 
mmt.copy_(mmt * not_mask) 45 | 46 | vec = self.get_velocity(name) 47 | vec.copy_(vec * not_mask) 48 | 49 | 50 | def reduce(self, ctx, name): 51 | reduction = self.get_reduction(name) 52 | reduction.set_(sum(ctx)) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/poolnone.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .memory_pool import MemoryPool 3 | 4 | 5 | class PoolNoneMemory(MemoryPool): 6 | def __init__(self, named_parameters, fusion_num=2): 7 | super().__init__(named_parameters, fusion_num) 8 | 9 | def compensate(self, tensor, name): 10 | """Update the tensor with the residuals.""" 11 | grad = self.get_grad(name) 12 | residual = self.get_velocity(name) 13 | residual.copy_(grad) 14 | 15 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 16 | """Update the residuals.""" 17 | pass 18 | 19 | def reduce(self, ctx, name): 20 | reduction = self.get_reduction(name) 21 | reduction.set_(sum(ctx)) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/poolresidual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .memory_pool import MemoryPool 3 | 4 | 5 | class PoolResidualMemory(MemoryPool): 6 | #TODO: tune beta and gamma to increase accurary 7 | def __init__(self, named_parameters, fusion_num=2, beta=0.9, gamma=1.0): 8 | self.beta = beta 9 | self.gamma = gamma 10 | super().__init__(named_parameters, fusion_num) 11 | 12 | 13 | def compensate(self, tensor, name): 14 | """vec stores the residuals""" 15 | grad = self.get_grad(name) 16 | residual = self.get_velocity(name) 17 | residual.add_(grad) 18 | #residual.mul_(self.beta).add_(self.gamma*grad) 19 | 20 | 21 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 22 | """Update the residuals.""" 23 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 24 | residual = self.get_velocity(name) 25 | residual.copy_(tensor.view(-1) - tensor_decompressed) 26 | 27 | 28 | def reduce(self, ctx, name): 29 | reduction = self.get_reduction(name) 30 | reduction.set_(sum(ctx)) -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | 6 | 7 | class ResidualMemory(Memory): 8 | def __init__(self, beta=0.9, gamma=1.0): 9 | self.residuals = {} 10 | self.beta = beta 11 | self.gamma = gamma 12 | 13 | 14 | def compensate(self, tensor, name): 15 | """Update the tensor with the residuals.""" 16 | if name in self.residuals: 17 | tensor = self.beta * self.residuals[name] + tensor 18 | return tensor 19 | 20 | 21 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 22 | """Update the residuals.""" 23 | tensor_decompressed = compressor.decompress(tensor_compressed, ctx) 24 | self.residuals[name] = tensor - tensor_decompressed 25 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/memory/topk.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import sys 3 | sys.path.append("../..") 4 | from mergeComp import Memory 5 | import byteps.torch as bps 6 | 7 | 8 | class TopKMemory(Memory): 9 | def __init__(self, beta=0.9, 
gamma=1.0): 10 | self.residuals = {} 11 | self.beta = beta 12 | self.gamma = gamma 13 | self.zeros = {} 14 | 15 | 16 | def compensate(self, tensor, name): 17 | if name in self.residuals: 18 | tensor = tensor + self.beta*self.residuals[name] 19 | return tensor 20 | 21 | 22 | def update(self, tensor, name, compressor, tensor_compressed, ctx): 23 | """Update the residuals.""" 24 | values, indices = tensor_compressed 25 | if name not in self.zeros: 26 | self.zeros[name] = torch.zeros_like(values) 27 | self.residuals[name] = tensor.scatter_(0, indices.type(torch.int64), self.zeros[name]) 28 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/scheduler/README.md: -------------------------------------------------------------------------------- 1 | ## How to run 2 | ```bash 3 | bash run_all_models.sh 4 | ``` 5 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/scheduler/lstm/pcie_efsignsgd_two_cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "lstm", 3 | "compressor": "efsignsgd", 4 | "nodes": 4, 5 | "gpus_per_node": 8, 6 | "comp_ratio": 0.03125, 7 | "hierarchical comm": true, 8 | "CPU compression": true, 9 | "PCIe": true, 10 | "tensor names": [ 11 | "decoder.bias", 12 | "rnns.1.weight_ih_l0", 13 | "rnns.1.weight_hh_l0", 14 | "rnns.1.bias_hh_l0", 15 | "rnns.1.bias_ih_l0", 16 | "rnns.0.weight_ih_l0", 17 | "rnns.0.weight_hh_l0", 18 | "rnns.0.bias_hh_l0", 19 | "rnns.0.bias_ih_l0", 20 | "encoder.weight" 21 | ], 22 | "tensor sizes": [ 23 | 33278, 24 | 9000000, 25 | 9000000, 26 | 6000, 27 | 6000, 28 | 9000000, 29 | 9000000, 30 | 6000, 31 | 6000, 32 | 49917000 33 | ], 34 | "options": [ 35 | 0, 36 | 1, 37 | 1, 38 | 0, 39 | 0, 40 | 6, 41 | 6, 42 | 0, 43 | 0, 44 | 6 45 | ] 46 | } -------------------------------------------------------------------------------- /byteps/torch/mergeComp/scheduler/run_all_models.sh: -------------------------------------------------------------------------------- 1 | python3 simulator_espresso.py --model vgg16 --node 4 --compressor randomk --cpu --two-level --profile 2 | python3 simulator_espresso.py --model resnet101 --node 4 --compressor dgc --cpu --two-level --profile 3 | python3 simulator_espresso.py --model ugatit --node 4 --compressor dgc --cpu --profile 4 | python3 simulator_espresso.py --model bert --node 4 --compressor randomk --cpu --profile 5 | python3 simulator_espresso.py --model gpt2 --node 4 --compressor efsignsgd --cpu --profile 6 | python3 simulator_espresso.py --model lstm --node 4 --compressor efsignsgd --cpu --two-level --profile 7 | -------------------------------------------------------------------------------- /byteps/torch/mergeComp/scheduler/simulator_logs/lstm/pcie_efsignsgd_two_cpu.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": "lstm", 3 | "compressor": "efsignsgd", 4 | "nodes": 4, 5 | "gpus_per_node": 8, 6 | "comp_ratio": 0.03125, 7 | "hierarchical comm": true, 8 | "CPU compression": true, 9 | "PCIe": true, 10 | "tensor names": [ 11 | "decoder.bias", 12 | "rnns.1.weight_ih_l0", 13 | "rnns.1.weight_hh_l0", 14 | "rnns.1.bias_hh_l0", 15 | "rnns.1.bias_ih_l0", 16 | "rnns.0.weight_ih_l0", 17 | "rnns.0.weight_hh_l0", 18 | "rnns.0.bias_hh_l0", 19 | "rnns.0.bias_ih_l0", 20 | "encoder.weight" 21 | ], 22 | "tensor sizes": [ 23 | 33278, 24 | 9000000, 25 | 9000000, 26 | 6000, 27 | 6000, 28 | 9000000, 29 | 9000000, 30 | 6000, 31 | 6000, 32 
| 49917000 33 | ], 34 | "options": [ 35 | 0, 36 | 5, 37 | 6, 38 | 0, 39 | 0, 40 | 6, 41 | 6, 42 | 0, 43 | 0, 44 | 6 45 | ] 46 | } -------------------------------------------------------------------------------- /byteps/torch/mergeComp/util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.dlpack import to_dlpack 3 | from torch.utils.dlpack import from_dlpack 4 | import cupy 5 | 6 | zero_paddings = torch.zeros((32), dtype=torch.bool).cuda() 7 | 8 | def torch2cupy(tensor): 9 | return cupy.fromDlpack(to_dlpack(tensor)) 10 | 11 | 12 | def cupy2torch(cupy_tensor): 13 | return from_dlpack(cupy_tensor.toDlpack()) 14 | 15 | 16 | def packbits(array, unit_size=8): 17 | tensor = array 18 | numel = tensor.numel() 19 | if numel % unit_size != 0: 20 | padding_size = unit_size - numel % unit_size 21 | tensor = torch.cat((tensor, zero_paddings[:padding_size]), dim=0) 22 | 23 | if unit_size == 8: 24 | return cupy2torch(cupy.packbits(torch2cupy(array))) 25 | elif unit_size == 16: 26 | cupy_tensor = cupy.packbits(torch2cupy(tensor)) 27 | return cupy2torch(cupy_tensor.view(cupy.float16)) 28 | elif unit_size == 32: 29 | cupy_tensor = cupy.packbits(torch2cupy(tensor)) 30 | return cupy2torch(cupy_tensor.view(cupy.float32)) 31 | else: 32 | raise AttributeError("unsupported data type size") 33 | 34 | 35 | def unpackbits(array, size): 36 | return cupy2torch(cupy.unpackbits(torch2cupy(array).view(cupy.uint8))[:size]) 37 | 38 | 39 | def pack2bits(first, second): 40 | data = torch.cat((first, second.type(torch.bool)), 0) 41 | return cupy2torch(cupy.packbits(torch2cupy(data))) 42 | 43 | 44 | def unpack2bits(array, size): 45 | decode = cupy2torch(cupy.unpackbits(torch2cupy(array))) 46 | first = decode[:size] 47 | second = decode[size:2*size] 48 | second[first > 0] = 2 49 | 50 | return second -------------------------------------------------------------------------------- /byteps/torch/mergeComp/util_cpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.dlpack import to_dlpack 3 | from torch.utils.dlpack import from_dlpack 4 | import numpy as np 5 | 6 | 7 | def packbits(array): 8 | return np.packbits(array) 9 | 10 | 11 | def unpackbits(array, size): 12 | return torch.from_numpy(np.unpackbits(array)[:size]) 13 | -------------------------------------------------------------------------------- /byteps/torch/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedDataParallel 2 | -------------------------------------------------------------------------------- /byteps/torch/ready_event.h: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Bytedance Inc. All Rights Reserved. 2 | // Copyright 2018 Uber Technologies, Inc. All Rights Reserved. 3 | // 4 | // Licensed under the Apache License, Version 2.0 (the "License"); 5 | // you may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | // 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | // 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | // ============================================================================= 16 | 17 | #ifndef BYTEPS_TORCH_READY_EVENT_H 18 | #define BYTEPS_TORCH_READY_EVENT_H 19 | 20 | #if HAVE_CUDA 21 | #include "cuda_runtime.h" 22 | #endif 23 | 24 | #include <memory> 25 | 26 | #include "../common/common.h" 27 | 28 | namespace byteps { 29 | namespace torch { 30 | 31 | using namespace byteps::common; 32 | 33 | #if HAVE_CUDA 34 | class TorchReadyEvent : public ReadyEvent { 35 | public: 36 | TorchReadyEvent(int device); 37 | ~TorchReadyEvent(); 38 | virtual bool Ready() const override; 39 | 40 | private: 41 | int device_ = CPU_DEVICE_ID; 42 | cudaEvent_t cuda_event_ = nullptr; 43 | }; 44 | #endif 45 | 46 | std::shared_ptr<ReadyEvent> RecordReadyEvent(int device); 47 | 48 | } // namespace torch 49 | } // namespace byteps 50 | 51 | #endif // BYTEPS_TORCH_READY_EVENT_H 52 | -------------------------------------------------------------------------------- /byteps/torch/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboardX 2 | numpy 3 | cupy-cuda111 4 | tqdm 5 | requests 6 | scipy 7 | six 8 | gdown -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.0-devel-ubuntu18.04 2 | 3 | ARG https_proxy 4 | ARG http_proxy 5 | 6 | ARG BYTEPS_BASE_PATH=/usr/local 7 | ARG BYTEPS_PATH=$BYTEPS_BASE_PATH/byteps 8 | ARG BYTEPS_GIT_LINK=https://github.com/bytedance/byteps 9 | ARG BYTEPS_BRANCH=master 10 | 11 | ARG DEBIAN_FRONTEND=noninteractive 12 | RUN apt-get update 13 | RUN apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \ 14 | build-essential \ 15 | tzdata \ 16 | ca-certificates \ 17 | git \ 18 | curl \ 19 | wget \ 20 | vim \ 21 | cmake \ 22 | lsb-release \ 23 | libcudnn7=7.6.0.64-1+cuda10.0 \ 24 | libnuma-dev \ 25 | ibverbs-providers \ 26 | librdmacm-dev \ 27 | ibverbs-utils \ 28 | rdmacm-utils \ 29 | libibverbs-dev \ 30 | python3 \ 31 | python3-dev \ 32 | python3-pip \ 33 | python3-setuptools \ 34 | libnccl2=2.4.7-1+cuda10.0 \ 35 | libnccl-dev=2.4.7-1+cuda10.0 36 | 37 | # install framework 38 | # note: for tf <= 1.14, you need gcc-4.9 39 | ARG FRAMEWORK=tensorflow 40 | RUN if [ "$FRAMEWORK" = "tensorflow" ]; then \ 41 | pip3 install --upgrade pip; \ 42 | pip3 install -U tensorflow-gpu==1.15.0; \ 43 | elif [ "$FRAMEWORK" = "pytorch" ]; then \ 44 | pip3 install -U numpy==1.18.1 torchvision==0.5.0 torch==1.4.0; \ 45 | elif [ "$FRAMEWORK" = "mxnet" ]; then \ 46 | pip3 install -U mxnet-cu100==1.5.0; \ 47 | else \ 48 | echo "unknown framework: $FRAMEWORK"; \ 49 | exit 1; \ 50 | fi 51 | 52 | ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH 53 | 54 | RUN cd $BYTEPS_BASE_PATH &&\ 55 | git clone --recursive -b $BYTEPS_BRANCH $BYTEPS_GIT_LINK &&\ 56 | cd $BYTEPS_PATH &&\ 57 | python3 setup.py install 58 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Prebuilt Images 2 | 3 | Below are the prebuilt Docker images and the commands used to build them. These prebuilt images might not be up to date. 4 | You may need to build them manually with the Dockerfile to get the latest functionalities of BytePS. 5 | 6 | | Docker image | How to build | 7 | | --- | --- | 8 | | bytepsimage/tensorflow | docker build -t bytepsimage/tensorflow .
-f Dockerfile --build-arg FRAMEWORK=tensorflow | 9 | | bytepsimage/pytorch | docker build -t bytepsimage/pytorch . -f Dockerfile --build-arg FRAMEWORK=pytorch | 10 | | bytepsimage/mxnet | docker build -t bytepsimage/mxnet . -f Dockerfile --build-arg FRAMEWORK=mxnet | 11 | -------------------------------------------------------------------------------- /docs/DistributedDataParallel.md: -------------------------------------------------------------------------------- 1 | # DistributedDataParallel 2 | 3 | BytePS Distributed Data Parallel module is compatible with PyTorch Distributed 4 | Data Parallel for the most part. Instead of using PyTorch communication 5 | backends, it uses BytePS push-pull for gradients reduction between nodes. 6 | 7 | It currently supports the Single-Process Single-GPU mode. In this mode each 8 | process works with one GPU. Example usage: 9 | 10 | 11 | ```python 12 | # byteps_ddp_example.py 13 | from byteps.torch.parallel import DistributedDataParallel 14 | 15 | model = DistributedDataParallel(model, device_ids=[i]) 16 | output = model(data) 17 | loss = F.nll_loss(output, target) 18 | loss.backward() 19 | optimizer.step() 20 | ``` 21 | 22 | Some models have branches, part of the model is skipped during the forward 23 | pass. In that case it's required to call the 24 | DistributedDataParallel.synchronize() function after loss.backward(), e.g.: 25 | 26 | ```python 27 | # byteps_ddp_example.py 28 | from byteps.torch.parallel import DistributedDataParallel 29 | 30 | # construct a model which skips some layers in the forward pass, then wrap the 31 | # model with DistributedDataParallel() 32 | model = DistributedDataParallel(model, device_ids=[i]) 33 | output = model(data) 34 | loss = F.nll_loss(output, target) 35 | loss.backward() 36 | # the synchronize() call here is required because some layers were skipped in 37 | # the forward pass 38 | model.synchronize() 39 | optimizer.step() 40 | ``` 41 | 42 | To run the program, use `bpslaunch` to launch one process for each device you 43 | wish to use. Refer to the [running](./running.md) document for how to use `bpslaunch`. 44 | -------------------------------------------------------------------------------- /docs/MirroredStrategy.md: -------------------------------------------------------------------------------- 1 | # MirroredStrategy 2 | 3 | The BytePS MirroredStrategy module is compatible with tensorflow 4 | MultiWorkerMirroredStrategy for the most part. Instead of using the builtin 5 | tensorflow collective communication implementation, it uses BytePS push-pull 6 | for gradients reduction between nodes. 7 | 8 | It currently supports the Single-Process Single-GPU mode. In this mode each 9 | process works with one GPU. Example usage: 10 | 11 | 12 | ```python 13 | import byteps.tensorflow as bps 14 | from byteps.tensorflow.distribute import MirroredStrategy 15 | 16 | bps.init() 17 | tf.config.experimental.set_visible_devices(gpus[bps.local_rank()], 'GPU') 18 | strategy = MirroredStrategy(devices=["/gpu:0"]) 19 | 20 | with strategy.scope(): 21 | # Model building/compiling need to be within `strategy.scope()`. 22 | multi_worker_model = build_and_compile_cnn_model() 23 | 24 | multi_worker_model.fit(multi_worker_dataset, epochs=100, steps_per_epoch=70) 25 | ``` 26 | To run the program, use `bpslaunch` to launch one process for each device you 27 | wish to use. Refer to the [running](./running.md) document for how to use 28 | `bpslaunch`. 
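
The `build_and_compile_cnn_model()` helper referenced above comes from the standard TensorFlow multi-worker tutorials and is not defined by BytePS. A minimal sketch of such a helper is shown below; the architecture, optimizer, and input shape are illustrative assumptions, and any Keras model built and compiled inside `strategy.scope()` works the same way.

```python
import tensorflow as tf

def build_and_compile_cnn_model():
    # A small CNN for 28x28 grayscale inputs (e.g., MNIST); only illustrative.
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10)
    ])
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
        metrics=['accuracy'])
    return model
```
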
29 | -------------------------------------------------------------------------------- /docs/cross-barrier.md: -------------------------------------------------------------------------------- 1 | # Cross Global Barrier 2 | 3 | This eliminates the global barrier between training iterations for distributed training frameworks (e.g., 4 | PyTorch), so that the priority-based communication scheduling in BytePS can be effective. 5 | 6 | ## Why Crossing Barrier? 7 | 8 | Existing distributed training frameworks (PyTorch, TensorFlow, etc) do not fully utilize the potentials of overlapping 9 | computation and communication to speed up neural network training: they only support communication overlapping with 10 | backward propagation. But due to layer-wise dependencies in DNN training, we can actually schedule gradient 11 | synchronization order based on when they are consumed in the next iteration, and hence overlap communication with 12 | forward-propagation of the next iteration! Read the paper https://dl.acm.org/citation.cfm?id=3359642 for more 13 | communication scheduling details. 14 | 15 | To make this idea work, the first step is to remove the global barrier between two iterations to build layer-wise 16 | dependencies, so that the forward computation of next step can start without waiting for parameter synchronization 17 | completion of all parameters. 18 | 19 | Fig.1 shows the dependency graph with global barrier. Machine learning frameworks such as PyTorch and TensorFlow have 20 | similar dependencies when using BytePS for push and pull. 21 | 22 | ![dag_barrier](https://user-images.githubusercontent.com/13852819/69863244-4b5ee400-12d7-11ea-9356-2dd41dff95ab.png) 23 | 24 | *Fig.1: Dependency Graph With Global Barrier* 25 | 26 | Fig. 2 shows the dependency graph after removing global barrier. What we do here is to change the dependency 27 | graph from Fig. 1 to Fig. 2 by removing the barrier, building layer-wise dependencies while guaranteeing computation correctness. 28 | 29 | 30 | ![dag_without_barrier](https://user-images.githubusercontent.com/13852819/69863268-5d408700-12d7-11ea-8b39-5e48e3d94c2b.png) 31 | *Fig.2: Dependency Graph After Removing Global Barrier* 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | # BytePS Performance when training CNN 2 | 3 | ## NVLink + TCP 4 | 5 | We test two models: VGG16 (communication-intensive) and Resnet50 (computation-intensive) on a popular public cloud. Both models are trained using fp32. 6 | 7 | We use Tesla V100 16GB GPUs and set batch size equal to 64 *per GPU*. The machines are VMs on the cloud. Each machine has 8 V100 GPUs with NVLink-enabled. Machines are inter-connected with 20 Gbps TCP/IP network. 8 | 9 | BytePS outperforms Horovod (NCCL) by 44% for Resnet50, and 100% for VGG16. 10 | 11 | ![vgg16_tcp](https://user-images.githubusercontent.com/13852819/69873424-41e37500-12f3-11ea-93b8-705215e3e901.png) 12 | ![resnet50_tcp](https://user-images.githubusercontent.com/13852819/69873419-40b24800-12f3-11ea-9ff3-0f11347c089e.png) 13 | 14 | You can reproduce the results using the Dockerfiles and example scripts we provide. 15 | 16 | ## PCIe + RDMA 17 | 18 | Note: here we present the *worse case scenario* of BytePS, i.e., 100Gbps RDMA + no NVLinks. 
19 | 20 | We get below results on machines that are based on PCIe-switch architecture -- 4 GPUs under one PCIe switch, and each machine contains two PCIe switches. 21 | The machines are inter-connected by 100 Gbps RoCEv2 networks. 22 | In this case, BytePS outperforms Horovod (NCCL) by 7% for Resnet50, and 17% for VGG16. 23 | 24 | ![perf_rdma_pcie_resnet50](https://user-images.githubusercontent.com/13852819/68925125-57b64d80-07bd-11ea-9f72-d108cf4294ad.png) 25 | 26 | ![perf_rdma_pcie_vgg16](https://user-images.githubusercontent.com/13852819/68925175-70befe80-07bd-11ea-98d6-ca7df3670bbd.png) 27 | 28 | 29 | To have BytePS outperform NCCL by so little, you have to have 100Gbps RDMA network *and* no NVLinks. In this case, the communication is actually bottlenecked by internal PCI-e switches, not the network. BytePS has done some optimization so that it still outperforms NCCL. However, the performance gain is not as large as other cases where the network is the bottleneck. 30 | -------------------------------------------------------------------------------- /docs/running.md: -------------------------------------------------------------------------------- 1 | # Running BytePS 2 | 3 | BytePS follows the same running model as MXNet's PS implemenation, and provides a script, launcher/launcher.py, to help you start individual processes. **Below instructions, including those DMLC variables, apply to all frameworks.** 4 | 5 | Let's say you have two worker machines (or docker containers) that have GPUs, one machine or container as a server, and a scheduler. The scheduler binds on 10.0.0.1 and port 9000. The workers and the server can connect to the scheduler via the IP and port using TCP. 6 | 7 | To use launcher/launcher.py, NVIDIA_VISIBLE_DEVICES should exist -- either automatically set by nvidia-docker, or manually set by you. 8 | 9 | On worker 0, run: 10 | 11 | ``` 12 | DMLC_ROLE=worker DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 13 | DMLC_WORKER_ID=0 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \ 14 | bpslaunch YOUR_COMMAND 15 | ``` 16 | 17 | On worker 1, run (only DMLC_WORKER_ID is different from above): 18 | 19 | ``` 20 | DMLC_ROLE=worker DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 21 | DMLC_WORKER_ID=1 DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 \ 22 | bpslaunch YOUR_COMMAND 23 | ``` 24 | 25 | **For servers and schedulers, we highly recommend you use the docker image we build:** 26 | 27 | ``` 28 | docker pull bytepsimage/byteps_server 29 | ``` 30 | 31 | Start server and scheduler docker instances with this image. In the server, run the following. Compared with the worker command, we remove DMLC_WORKER_ID, and set role to server. 32 | 33 | ``` 34 | DMLC_ROLE=server DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 35 | DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 bpslaunch 36 | ``` 37 | 38 | On the scheduler, run (we also remove DMLC_WORKER_ID, and set role to scheduler): 39 | 40 | ``` 41 | DMLC_ROLE=scheduler DMLC_PS_ROOT_URI=10.0.0.1 DMLC_PS_ROOT_PORT=9000 \ 42 | DMLC_NUM_WORKER=2 DMLC_NUM_SERVER=1 bpslaunch 43 | ``` 44 | 45 | In this example, your scheduler must be able to bind to `10.0.0.1:9000`. 46 | 47 | The order of starting workers/servers/scheduler does not matter. 48 | -------------------------------------------------------------------------------- /docs/troubleshooting.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | We suggest you read the Horovod troubleshooting, especially for problems during the build process. 
BytePS has almost the same dependencies as Horovod minus MPI. 4 | 5 | https://github.com/horovod/horovod/blob/v0.16.4/docs/troubleshooting.rst 6 | 7 | ## Network connectivity 8 | 9 | When launching distributed jobs, if the job hangs at the beginning, one possible reason is a network connectivity problem. You can use the `ps-lite` benchmark to verify connectivity. 10 | 11 | Install ps-lite: 12 | 13 | ``` 14 | git clone -b byteps https://github.com/bytedance/ps-lite.git 15 | cd ps-lite 16 | make -j 17 | ``` 18 | 19 | 20 | For the scheduler: 21 | ``` 22 | export DMLC_ROLE=scheduler 23 | export DMLC_NUM_WORKER=1 24 | export DMLC_NUM_SERVER=1 25 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP] 26 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT] 27 | export DMLC_INTERFACE=eth0 28 | ./ps-lite/tests/test_benchmark 29 | ``` 30 | 31 | For the server: 32 | ``` 33 | export DMLC_ROLE=server 34 | export DMLC_NUM_WORKER=1 35 | export DMLC_NUM_SERVER=1 36 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP] 37 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT] 38 | export DMLC_INTERFACE=eth0 39 | ./ps-lite/tests/test_benchmark 40 | ``` 41 | 42 | For the worker: 43 | ``` 44 | export DMLC_ROLE=worker 45 | export DMLC_NUM_WORKER=1 46 | export DMLC_NUM_SERVER=1 47 | export DMLC_PS_ROOT_URI=[YOUR_SCHEDULER_IP] 48 | export DMLC_PS_ROOT_PORT=[YOUR_SCHEDULER_PORT] 49 | export DMLC_INTERFACE=eth0 50 | ./ps-lite/tests/test_benchmark 1024000 100 0 51 | ``` 52 | 53 | If it succeeds, you should be able to see something like this on the worker: 54 | ``` 55 | push_byte=4096000, repeat=100, total_time=128.842ms 56 | pull_byte=4096000, repeat=100, total_time=353.38ms 57 | ``` 58 | 59 | (Note: for RDMA networks, use `make -j USE_RDMA=1` to build, and `export DMLC_ENABLE_RDMA=1` when running the scheduler / server / worker.) 60 | 61 | If it still hangs, you may need to check your network connectivity. 62 | -------------------------------------------------------------------------------- /espresso_EuroSys23.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/espresso_EuroSys23.pdf -------------------------------------------------------------------------------- /espresso_EuroSys_AE.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/espresso_EuroSys_AE.pdf -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | For more examples, see: https://github.com/byteps/examples -------------------------------------------------------------------------------- /example/mxnet/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/example/mxnet/common/__init__.py -------------------------------------------------------------------------------- /example/mxnet/common/find_mxnet.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership.
The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | import os, sys 19 | try: 20 | import mxnet as mx 21 | except ImportError: 22 | curr_path = os.path.abspath(os.path.dirname(__file__)) 23 | sys.path.append(os.path.join(curr_path, "../../../python")) 24 | import mxnet as mx 25 | -------------------------------------------------------------------------------- /example/mxnet/data/imagenet1k-val.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | 20 | 21 | # This file downloads the imagenet-1k validation dataset and converts it into a rec 22 | # file. One needs to provide the URL for ILSVRC2012_img_val.tar, which can be 23 | # found at http://www.image-net.org/download-images 24 | # 25 | # Example usage (replace the URL with the correct one): 26 | # ./imagenet1k-val.sh http://xxxxxx/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar 27 | 28 | if [ ! -e ILSVRC2012_img_val.tar ]; then 29 | wget $1 30 | fi 31 | mkdir -p val 32 | tar -xf ILSVRC2012_img_val.tar -C val 33 | wget http://data.mxnet.io/models/imagenet/resnet/val.lst -O imagenet1k-val.lst 34 | 35 | CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 36 | MX_DIR=${CUR_DIR}/../../../ 37 | 38 | python ${CUR_DIR}/../../../tools/im2rec.py --resize 256 --quality 90 --num-thread 16 imagenet1k-val val/ 39 | 40 | rm -rf val 41 | -------------------------------------------------------------------------------- /example/mxnet/symbols/README.md: -------------------------------------------------------------------------------- 1 | # Symbol 2 | 3 | This folder contains definitions of various networks. To add a new network, please 4 | use the following format. 5 | 6 | ## Python 7 | 8 | - A file implements one network proposed in a paper, with the network name as the 9 | filename. 10 | - Mention the paper, and any modifications made, at the beginning 11 | of the file.
12 | - Indicate how to reproduce the accuracy numbers in the paper if it is not straightforward. 13 | - Provide a function `get_symbol()` that returns the network. 14 | -------------------------------------------------------------------------------- /example/mxnet/symbols/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhuangwang93/Espresso/a0294e3a31590d66b17608dacce5818814f832d8/example/mxnet/symbols/__init__.py -------------------------------------------------------------------------------- /example/mxnet/symbols/mlp.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | 18 | """ 19 | a simple multilayer perceptron 20 | """ 21 | import mxnet as mx 22 | 23 | def get_symbol(num_classes=10, **kwargs): 24 | data = mx.symbol.Variable('data') 25 | data = mx.sym.Flatten(data=data) 26 | fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) 27 | act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") 28 | fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) 29 | act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") 30 | fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) 31 | mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') 32 | return mlp 33 | -------------------------------------------------------------------------------- /example/pytorch/test_bytecomp_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import time 5 | 6 | import torch 7 | import byteps.torch as bps 8 | 9 | 10 | parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark', 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 12 | parser.add_argument('--no-cuda', action='store_true', default=False, 13 | help='disables CUDA training') 14 | 15 | args = parser.parse_args() 16 | args.cuda = not args.no_cuda 17 | 18 | bps.init() 19 | my_rank = bps.rank() 20 | print("xxxx python myrank ", my_rank) 21 | if args.cuda: 22 | # BytePS: pin this process to the GPU indexed by its rank.
23 | torch.cuda.set_device(my_rank) 24 | 25 | 26 | size = 1024000 27 | half_size = size // 2 28 | for ii in range(1): 29 | grad = torch.ones(size, dtype=torch.float32).cuda() 30 | grad[:half_size].mul_(4*my_rank + 4) 31 | 32 | print("before push pull", grad[:half_size].mean(), grad[half_size:].mean()) 33 | #handle = bps.byteps_push_pull(grad, average=True, name="test") 34 | #grad = bps.synchronize(handle) 35 | grad = bps.intra_push(grad, average=False, name="test") 36 | print("after push pull", grad[:size//2].mean(), grad[size//2:].mean()) -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | export BYTEPS_NCCL_LINK=SHARED 2 | python3 setup.py install --user 3 | cd byteps/torch 4 | pip3 install -r requirements.txt 5 | 6 | pip3 install nvidia-pyindex 7 | pip3 install nvidia-dllogger 8 | 9 | git clone https://github.com/NVIDIA/apex 10 | cd apex 11 | git checkout d6b5ae5d04f531ff862f651e67f241fef88fd159 12 | pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 13 | cd .. 14 | 15 | sudo apt-get update && sudo apt-get install libgl1 -y 16 | pip3 install opencv-python 17 | sudo apt install unzip 18 | 19 | echo "download dataset for LSTM" 20 | mkdir ~/data 21 | cd examples/lstm 22 | bash getdata.sh 23 | 24 | echo "download model checkpoint for BERT-base" 25 | cd ../BERT 26 | cd ./dataset/checkpoint 27 | wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/bert_pyt_ckpt_base_qa_squad11_amp/versions/19.09.0/zip -O bert_pyt_ckpt_base_qa_squad11_amp_19.09.0.zip 28 | unzip bert_pyt_ckpt_base_qa_squad11_amp_19.09.0.zip 29 | cd ../../ && mkdir -p results 30 | 31 | 32 | echo "download dataset for GPT-2" 33 | cd ~/data 34 | wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip 35 | unzip wikitext-2-raw-v1.zip 36 | 37 | 38 | echo "download dataset for VGG16 and ResNet101" 39 | cd ~/data 40 | wget https://s3.amazonaws.com/fast-ai-imageclas/imagewang.tgz 41 | tar xf imagewang.tgz 42 | 43 | 44 | echo "download dataset for UGATIT" 45 | cd ~/data 46 | gdown 1xOWj1UVgp6NKMT3HbPhBbtq2A4EDkghF 47 | mkdir selfie2anime && unzip selfie2anime.zip -d selfie2anime -------------------------------------------------------------------------------- /launcher/README.md: -------------------------------------------------------------------------------- 1 | ### How to use the distributed launcher 2 | 3 | Create two host files, `worker_hosts` and `server_hosts`, and put your lists of hosts inside (one IP:port per line). 4 | 5 | For example, suppose we want `10.0.0.1:12345` to be the scheduler, `10.0.0.2` and `10.0.0.3` to be the workers, and `10.0.0.4` and `10.0.0.5` to be the servers. 6 | 7 | Then `worker_hosts` should be: 8 | ``` 9 | 10.0.0.2 10 | 10.0.0.3 11 | ``` 12 | 13 | And `server_hosts` should be: 14 | ``` 15 | 10.0.0.4 16 | 10.0.0.5 17 | ``` 18 | 19 | Finally, start the distributed SSH launcher with: 20 | 21 | ``` 22 | python dist_launcher.py --worker-hostfile worker_hosts --server-hostfile server_hosts \ 23 | --scheduler-ip 10.0.0.1 --scheduler-port 12345 \ 24 | --username root --env ENV1:1 --env ENV2:2 \ 25 | 'echo this is $DMLC_ROLE; python byteps/launcher/launch.py YOUR_COMMAND' 26 | ``` 27 | 28 | The script will automatically set up the necessary [environment variables](/docs/env.md) for you and launch the BytePS processes.
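29 | 30 | For reference, with the host files above, the environment that `dist_launcher.py` prepares for the command on worker 0 is roughly equivalent to the following. This is only a sketch based on the DMLC variables described in [docs/running.md](/docs/running.md); the exact set of variables the launcher exports may differ. 31 | 32 | ``` 33 | # Roughly what the launcher sets on 10.0.0.2 (worker 0) before running your command. 34 | export DMLC_ROLE=worker 35 | export DMLC_PS_ROOT_URI=10.0.0.1 36 | export DMLC_PS_ROOT_PORT=12345 37 | export DMLC_WORKER_ID=0 38 | export DMLC_NUM_WORKER=2 39 | export DMLC_NUM_SERVER=2 40 | export ENV1=1 41 | export ENV2=2 42 | ```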
-------------------------------------------------------------------------------- /pre_setup.py: -------------------------------------------------------------------------------- 1 | # For internal use. Please do not modify this file. 2 | 3 | def setup(): 4 | return 5 | 6 | def extra_make_option(): 7 | return "" 8 | 9 | ucx_path = "" 10 | -------------------------------------------------------------------------------- /tests/run_byteps_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | path="$(dirname $0)" 4 | 5 | #export PATH=~/.local/bin:$PATH 6 | export LD_LIBRARY_PATH=$UCX_HOME/lib:$LD_LIBRARY_PATH 7 | export DMLC_NUM_WORKER=${DMLC_NUM_WORKER:-2} 8 | export DMLC_NUM_SERVER=$DMLC_NUM_WORKER 9 | export DMLC_PS_ROOT_URI=${DMLC_PS_ROOT_URI:-10.188.137.23} 10 | export DMLC_PS_ROOT_PORT=${DMLC_PS_ROOT_PORT:-22210} 11 | export DMLC_NODE_HOST=${DMLC_NODE_HOST:-$DMLC_PS_ROOT_URI} 12 | export BYTEPS_LOCAL_RANK=0 13 | export BYTEPS_LOCAL_SIZE=1 14 | export NVIDIA_VISIBLE_DEVICES=0 15 | export BYTEPS_FORCE_DISTRIBUTED=1 16 | export BYTEPS_COMPRESSOR=signsgd 17 | export BYTEPS_PARTITION_BYTES=4096000 18 | #export BYTEPS_LOG_LEVEL=${BYTEPS_LOG_LEVEL:-TRACE} 19 | export BYTEPS_LOG_LEVEL=${BYTEPS_LOG_LEVEL:-DEBUG} 20 | export PS_VERBOSE=${PS_VERBOSE:-0} 21 | export TEST_TYPE=${TEST_TYPE:=torch} 22 | 23 | function cleanup() { 24 | rm -rf lr.s 25 | } 26 | 27 | trap cleanup EXIT 28 | 29 | pkill bpslaunch 30 | pkill python3 31 | 32 | if [ $1 == "scheduler" ]; then 33 | echo "Launch scheduler" 34 | DMLC_ROLE=scheduler python3 -c 'import byteps.server' 35 | exit 36 | fi 37 | 38 | 39 | export DMLC_WORKER_ID=$2 40 | if [ $1 == "server" ]; then 41 | echo "Launch server" 42 | DMLC_ROLE=server python3 -c 'import byteps.server' 43 | exit 44 | fi 45 | 46 | #export GDB=" gdb -ex run --args " 47 | export GDB=" " 48 | 49 | if [ $1 == "worker" ] || [ $1 == "joint" ]; then 50 | export DMLC_ROLE=$1 51 | if [ "$TEST_TYPE" == "torch" ]; then 52 | echo "TEST TORCH ..." 
53 | $GDB python3 $path/benchmark_byteps.py 54 | else 55 | echo "Error: unsupported $TEST_TYPE" 56 | exit 1 57 | fi 58 | fi -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import mxnet.ndarray as nd 3 | import numpy as np 4 | from numba import jit 5 | 6 | 7 | def fake_data(dtype="float32", batch_size=32, height=224, width=224, depth=3, num_classes=1000): 8 | image_list = [] 9 | label_list = [] 10 | for _ in range(8): 11 | image = mx.ndarray.random.normal(-1, 1, 12 | shape=[1, depth, height, width], 13 | dtype=dtype) 14 | label = mx.ndarray.random.randint(0, num_classes, [1, 1]) 15 | 16 | images = mx.ndarray.repeat(image, 128, axis=0) 17 | labels = mx.ndarray.repeat(label, 128, axis=0) 18 | # print(labels) 19 | image_list.append(images) 20 | label_list.append(labels) 21 | 22 | images = nd.concat(*image_list, dim=0) 23 | labels = nd.concat(*label_list, dim=0) 24 | # print(labels) 25 | fake_dataset = mx.gluon.data.ArrayDataset(images, labels) 26 | 27 | return mx.gluon.data.DataLoader(fake_dataset, batch_size=batch_size, num_workers=4, 28 | shuffle=True, last_batch='discard') 29 | 30 | 31 | @jit(nopython=True) 32 | def xorshift128p(state): 33 | t = state[0] 34 | s = state[1] 35 | state[0] = s 36 | t ^= t << np.uint64(23) 37 | t ^= t >> np.uint64(17) 38 | t ^= s ^ (s >> np.uint64(26)) 39 | state[1] = t 40 | return int(t + s) 41 | 42 | 43 | @jit(nopython=True) 44 | def bernoulli(p, state): 45 | t = p * np.iinfo(np.uint64).max 46 | r = np.array([xorshift128p(state) for _ in range(len(p))], dtype=np.float32) 47 | return r < t 48 | 49 | 50 | @jit(nopython=True) 51 | def randint(low, high, state): 52 | return xorshift128p(state) % (high - low) + low 53 | --------------------------------------------------------------------------------
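A quick usage sketch for the xorshift128+ helpers in `tests/utils.py` above (a hypothetical example, not part of the repo; it assumes mxnet and numba are installed since `utils.py` imports both). The `state` argument is a 2-element non-zero `uint64` array that the generator updates in place:

```python
import numpy as np
from utils import bernoulli, randint  # tests/utils.py

# Two non-zero 64-bit words seed the xorshift128+ state; the values are arbitrary.
state = np.array([12345, 67890], dtype=np.uint64)

p = np.full(8, 0.25, dtype=np.float32)  # per-element "keep" probabilities
mask = bernoulli(p, state)              # boolean mask, True with probability ~p
idx = randint(0, 1024, state)           # single integer drawn from [0, 1024)
print(mask, idx)
```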