├── .github └── workflows │ ├── ci.md │ └── main.yml ├── .gitignore ├── .gitlab-ci.yml ├── CODEOWNERS ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── examples ├── create_embeddings.sh ├── curriculum_learning │ ├── README.md │ ├── ds_config_cl.json │ └── pretrain_gpt_cl.sh ├── evaluate_ict_zeroshot_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── generate_text.sh ├── merge_mp_bert.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_gpt_multilingual.sh ├── pretrain_gpt_single_node.sh ├── pretrain_gpt_tiny.sh ├── pretrain_ict.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh ├── run_evalharness.sh ├── run_evalharness_deepspeed.md ├── run_evalharness_deepspeed.slurm └── run_evalharness_tr11-176b-ml.slurm ├── finetune_t0_non_causal_decoder.py ├── images └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── decoder_packed_mtf_dataset.py │ ├── distdata.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ ├── ict_dataset.py │ ├── indexed_dataset.py │ ├── mlm_dataset.py │ ├── mtf_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── enums.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── layer_norm_cuda.cpp │ ├── layer_norm_cuda_kernel.cu │ ├── scaled_masked_softmax.cpp │ ├── scaled_masked_softmax.h │ ├── scaled_masked_softmax_cuda.cu │ ├── scaled_upper_triang_masked_softmax.cpp │ ├── scaled_upper_triang_masked_softmax.h │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── learning_rates.py ├── logging.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── glu_activations.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── positional_embeddings.py │ ├── realm_model.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vit_model.py ├── mpu │ ├── __init__.py │ ├── cross_entropy.py │ ├── data.py │ ├── initialize.py │ ├── layers.py │ ├── mappings.py │ ├── random.py │ ├── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py │ └── utils.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── grad_scaler.py │ └── optimizer.py ├── p2p_communication.py ├── package_info.py ├── schedules.py ├── testing_utils.py ├── text_generation_utils.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_ict.py ├── pretrain_prefix_lm.py ├── pretrain_t5.py ├── pretrain_vit.py ├── pyproject.toml ├── requirements.txt 
├── run.sh ├── run_bf16.sh ├── run_fp16.sh ├── run_universal_bf16.sh ├── scripts ├── README.md ├── bloom-inference-scripts │ └── README.md ├── bloom-inference-server │ └── README.md └── test_multiple_dataset_sampling │ ├── create_dummy_dataset.py │ ├── preprocess_data.py │ ├── test_sampling.py │ └── test_sampling.sh ├── setup.cfg ├── setup.py ├── start_fast.md ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_harness │ ├── download.py │ ├── evaluate.py │ └── report-to-csv.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── orqa │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ └── natural_questions │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification.py │ ├── eval_utils.py │ ├── finetune_utils.py │ └── main.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── README.md ├── conftest.py ├── data │ └── gpt2 │ │ ├── README.md │ │ ├── ag_news_prompt_inputs_document.bin │ │ ├── ag_news_prompt_inputs_document.idx │ │ ├── ag_news_prompt_targets_document.bin │ │ ├── ag_news_prompt_targets_document.idx │ │ ├── generate_ag_news_mtf_dataset.sh │ │ ├── gpt2-tiny-merges.txt │ │ ├── gpt2-tiny-vocab.json │ │ ├── meg-gpt2-openwebtext_text_document.bin │ │ ├── meg-gpt2-openwebtext_text_document.idx │ │ └── openwebtext-1000.jsonl ├── ds_config.json ├── ds_config_bf16.json ├── ds_config_cl.json ├── ds_config_inference.json ├── test_activations.py ├── test_basic.py ├── test_checkpoints.py ├── test_dataloaders.py ├── test_model.py ├── test_preprocessing.py ├── test_tensor_parallel.py ├── test_training.py └── tools │ ├── README.md │ ├── openwebtext-to-jsonl.py │ └── shrink-tokenizer.py └── tools ├── README.md ├── convert_checkpoint ├── README.md ├── deepspeed_to_deepspeed.py ├── deepspeed_to_megatron.py ├── deepspeed_to_transformers.py ├── ds_to_universal.py ├── inspect_checkpoint.py └── inspect_deepspeed_checkpoint.py ├── create_doc_index.py ├── generate_samples_gpt.py ├── linter.py ├── logs ├── rescale-logs.py ├── tb-rename-events.py └── tb-rescale-scalars.py ├── merge_mp_partitions.py ├── merge_preprocessed_data.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_dist.py ├── preprocess_data_many_cores.py ├── sample_idxs_to_text.py └── tb ├── tb-remove-events-by-group.py ├── tb-remove-events-by-tag.py └── tb-rename-events.py /.gitignore: -------------------------------------------------------------------------------- 1 | # tests 2 | # megatron autogenerated indices 3 | tests/data/*/*npy 4 | tests/tools/openwebtext-1000.jsonl 5 | tmp/ 6 | 7 | # macOS 8 | .DS_Store 9 | 10 | # Byte-compiled / optimized / DLL files 11 | */__pycache__/ 12 | *.py[cod] 13 | *.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | Pipfile 96 | Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Intellij project settings 115 | .idea/ 116 | .iml 117 | 118 | # VSCode 119 | .vscode/ 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # static files generated from Django application 142 | media 143 | staticfiles 144 | /tags 145 | 146 | # tmp files 147 | *.swp 148 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: gitlab-master.nvidia.com/dl/dgx/pytorch:20.12-py3-devel 2 | 3 | test: 4 | script: 5 | - pytest --junitxml=report.xml tests 6 | artifacts: 7 | when: always 8 | reports: 9 | junit: report.xml 10 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @bigscience-workshop/megatron-deepspeed-codeowners 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include megatron/data/Makefile 2 | include megatron/data/helpers.cpp 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test style 2 | 3 | check_dirs := tests tools/convert_checkpoint 4 | 5 | help: ## this help 6 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-22s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 7 | 8 | test: ## run tests 9 | pytest tests 10 | 11 | style: ## checks for code style and
applies formatting 12 | black $(check_dirs) 13 | isort $(check_dirs) 14 | -------------------------------------------------------------------------------- /examples/create_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Compute embeddings for each entry of a given dataset (e.g. Wikipedia) 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | # Wikipedia data can be downloaded from the following link: 9 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 10 | EVIDENCE_DATA_DIR= 11 | EMBEDDING_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | python tools/create_doc_index.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 128 \ 20 | --checkpoint-activations \ 21 | --seq-length 512 \ 22 | --retriever-seq-length 256 \ 23 | --max-position-embeddings 512 \ 24 | --load ${CHECKPOINT_PATH} \ 25 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 26 | --embedding-path ${EMBEDDING_PATH} \ 27 | --indexer-log-interval 1000 \ 28 | --indexer-batch-size 128 \ 29 | --vocab-file bert-vocab.txt \ 30 | --num-workers 2 \ 31 | --fp16 32 | 33 | -------------------------------------------------------------------------------- /examples/curriculum_learning/ds_config_cl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 512, 3 | "gradient_accumulation_steps": 1, 4 | "steps_per_print": 1, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "optimizer": { 9 | "type": "Adam", 10 | "params": { 11 | "lr": 0.00015, 12 | "max_grad_norm": 1.0, 13 | "betas": [0.9, 0.95] 14 | } 15 | }, 16 | "gradient_clipping": 1.0, 17 | "fp16": { 18 | "enabled": true, 19 | "loss_scale": 0, 20 | "loss_scale_window": 1000, 21 | "hysteresis": 2, 22 | "min_loss_scale": 1 23 | }, 24 | "wall_clock_breakdown": false, 25 | "zero_allow_untested_optimizer": false, 26 | "curriculum_learning": { 27 | "enabled": true, 28 | "curriculum_type": "seqlen", 29 | "min_difficulty": 8, 30 | "max_difficulty": 1024, 31 | "schedule_type": "fixed_linear", 32 | "schedule_config": { 33 | "total_curriculum_step": 60000, 34 | "difficulty_step": 8 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /examples/curriculum_learning/pretrain_gpt_cl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This is a dummy train script to show how to use curriculum 4 | # learning; some parameters are not for actual GPT pretraining.
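# (Editor's note) With the fixed_linear schedule configured in ds_config_cl.json,
# the effective sequence length should grow roughly linearly from min_difficulty (8)
# to max_difficulty (1024) over total_curriculum_step (60000) steps, rounded to a
# multiple of difficulty_step (8); see the DeepSpeed curriculum-learning docs for
# the exact rounding rule.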
5 | 6 | TARGET_GLOBAL_BATCH_SIZE=512 7 | TRAIN_SAMPLES=146_484_375 8 | LR=1.0e-4 9 | MIN_LR=1.0e-5 10 | LR_DECAY_SAMPLES=126_953_125 11 | LR_WARMUP_SAMPLES=183_105 12 | SEQLEN=1024 13 | 14 | ############################################################ 15 | # New configs for curriculum learning, see README.md 16 | TRAIN_TOKENS=10_000_000_000 17 | LR_DECAY_TOKENS=$(( ${LR_DECAY_SAMPLES//_/} * $SEQLEN )) # strip '_' digit separators: bash arithmetic rejects them 18 | ############################################################ 19 | 20 | LOG_INTERVAL=100 21 | EVAL_ITERS=10 22 | EVAL_INTERVAL=100 23 | SAVE_INTERVAL=1000 24 | 25 | VOCAB_PATH=/data/Megatron-LM/data/gpt2-vocab.json 26 | MERGE_PATH=/data/Megatron-LM/data/gpt2-merges.txt 27 | DATA_PATH=/data/Megatron-LM/data/indexed_datasets/megatron 28 | 29 | MICRO_BATCH_SIZE=1 30 | MP_SIZE=1 31 | PP_SIZE=1 32 | 33 | NUM_GPUS=128 34 | echo ${NUM_GPUS} 35 | if [[ $PP_SIZE -gt 0 ]]; then 36 | DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) )) 37 | else 38 | DP_SIZE=$(( ${NUM_GPUS} / ${MP_SIZE} )) 39 | fi 40 | GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${MICRO_BATCH_SIZE} * ${DP_SIZE}) )) 41 | 42 | NAME="gpt-117M-pp${PP_SIZE}-mp${MP_SIZE}-bsz${TARGET_GLOBAL_BATCH_SIZE}-mbsz${MICRO_BATCH_SIZE}-cl" 43 | current_time=$(date "+%Y.%m.%d-%H.%M.%S") 44 | host="${HOSTNAME}" 45 | TENSORBOARD_DIR="tensorboard/${NAME}_${host}_${current_time}" 46 | mkdir -p ${TENSORBOARD_DIR} 47 | CHECKPOINT_PATH="checkpoints/${NAME}" 48 | 49 | megatron_options=" \ 50 | --data-path ${DATA_PATH} \ 51 | --vocab-file ${VOCAB_PATH} \ 52 | --merge-file ${MERGE_PATH} \ 53 | --data-impl mmap \ 54 | --override-lr-scheduler \ 55 | --adam-beta1 0.9 \ 56 | --adam-beta2 0.95 \ 57 | --tensor-model-parallel-size ${MP_SIZE} \ 58 | --init-method-std 0.014 \ 59 | --lr-decay-tokens ${LR_DECAY_TOKENS} \ 60 | --lr-warmup-samples ${LR_WARMUP_SAMPLES} \ 61 | --micro-batch-size ${MICRO_BATCH_SIZE} \ 62 | --global-batch-size ${TARGET_GLOBAL_BATCH_SIZE} \ 63 | --num-layers 12 \ 64 | --hidden-size 768 \ 65 | --num-attention-heads 16 \ 66 | --seq-length ${SEQLEN} \ 67 | --max-position-embeddings ${SEQLEN} \ 68 | --train-samples ${TRAIN_SAMPLES} \ 69 | --train-tokens ${TRAIN_TOKENS} \ 70 | --lr ${LR} \ 71 | --min-lr ${MIN_LR} \ 72 | --lr-decay-style cosine \ 73 | --split 98,2,0 \ 74 | --log-interval ${LOG_INTERVAL} \ 75 | --eval-interval ${EVAL_INTERVAL} \ 76 | --eval-iters ${EVAL_ITERS} \ 77 | --save-interval ${SAVE_INTERVAL} \ 78 | --weight-decay 0.1 \ 79 | --clip-grad 1.0 \ 80 | --hysteresis 2 \ 81 | --num-workers 0 \ 82 | --checkpoint-activations \ 83 | --fp16 \ 84 | --load ${CHECKPOINT_PATH} \ 85 | --save ${CHECKPOINT_PATH} \ 86 | --tensorboard-queue-size 1 \ 87 | --log-timers-to-tensorboard \ 88 | --log-batch-size-to-tensorboard \ 89 | --log-validation-ppl-to-tensorboard \ 90 | --tensorboard-dir ${TENSORBOARD_DIR}" 91 | 92 | config_json="ds_config_cl.json" 93 | 94 | deepspeed_options=" \ 95 | --deepspeed \ 96 | --deepspeed_config ${config_json} \ 97 | --pipeline-model-parallel-size ${PP_SIZE} \ 98 | --partition-activations" 99 | 100 | run_cmd="deepspeed ../../pretrain_gpt.py ${megatron_options} ${deepspeed_options} &>> ${NAME}.log" 101 | echo ${run_cmd} 102 | eval ${run_cmd} 103 | set +x 104 | -------------------------------------------------------------------------------- /examples/evaluate_ict_zeroshot_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model 5 | 6 | # Datasets can be downloaded from the
following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task ICT-ZEROSHOT-NQ \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --checkpoint-activations \ 24 | --seq-length 512 \ 25 | --max-position-embeddings 512 \ 26 | --load ${CHECKPOINT_PATH} \ 27 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 28 | --embedding-path ${EMBEDDING_PATH} \ 29 | --retriever-seq-length 256 \ 30 | --vocab-file bert-vocab.txt\ 31 | --qa-data-test ${QA_FILE} \ 32 | --num-workers 2 \ 33 | --faiss-use-gpu \ 34 | --retriever-report-topk-accuracies 1 5 20 100 \ 35 | --fp16 36 | 37 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --checkpoint-activations \ 33 | --seq-length 1024 \ 34 | --max-position-embeddings 1024 \ 35 | --log-interval 10 \ 36 | --fp16 \ 37 | --no-load-optim \ 38 | --no-load-rng 39 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --checkpoint-activations \ 33 | --lr 5.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.065 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 500000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --fp16 45 | 
-------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --checkpoint-activations \ 33 | --lr 1.0e-5 \ 34 | --lr-decay-style linear \ 35 | --lr-warmup-fraction 0.06 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --save-interval 100000 \ 39 | --save $CHECKPOINT_PATH \ 40 | --log-interval 10 \ 41 | --eval-interval 100 \ 42 | --eval-iters 50 \ 43 | --weight-decay 1.0e-1 \ 44 | --clip-grad 1.0 \ 45 | --hidden-dropout 0.1 \ 46 | --attention-dropout 0.1 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/generate_text.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHECKPOINT_PATH=checkpoints/gpt2_345m 4 | VOCAB_FILE=gpt2-vocab.json 5 | MERGE_FILE=gpt2-merges.txt 6 | 7 | python tools/generate_samples_gpt.py \ 8 | --tensor-model-parallel-size 1 \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 11 | --load $CHECKPOINT_PATH \ 12 | --num-attention-heads 16 \ 13 | --max-position-embeddings 1024 \ 14 | --tokenizer-type GPT2BPETokenizer \ 15 | --fp16 \ 16 | --batch-size 2 \ 17 | --seq-length 1024 \ 18 | --out-seq-length 1024 \ 19 | --temperature 1.0 \ 20 | --vocab-file $VOCAB_FILE \ 21 | --merge-file $MERGE_FILE \ 22 | --genfile unconditional_samples.json \ 23 | --num-samples 2 \ 24 | --top_p 0.9 \ 25 | --recompute 26 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH=<Specify path and file prefix>_text_sentence 6 | CHECKPOINT_PATH= 7 | 8 | python pretrain_bert.py \ 9 | --num-layers 24 \ 10 | --hidden-size 1024 \ 
--num-attention-heads 16 \ 12 | --micro-batch-size 4 \ 13 | --global-batch-size 8 \ 14 | --seq-length 512 \ 15 | --max-position-embeddings 512 \ 16 | --train-iters 2000000 \ 17 | --lr-decay-iters 990000 \ 18 | --save $CHECKPOINT_PATH \ 19 | --load $CHECKPOINT_PATH \ 20 | --data-path $DATA_PATH \ 21 | --vocab-file bert-vocab.txt \ 22 | --data-impl mmap \ 23 | --split 949,50,1 \ 24 | --lr 0.0001 \ 25 | --min-lr 0.00001 \ 26 | --lr-decay-style linear \ 27 | --lr-warmup-fraction .01 \ 28 | --weight-decay 1e-2 \ 29 | --clip-grad 1.0 \ 30 | --log-interval 100 \ 31 | --save-interval 10000 \ 32 | --eval-interval 1000 \ 33 | --eval-iters 10 \ 34 | --fp16 35 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=<Specify path and file prefix>_text_sentence 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_bert.py \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 32 \ 23 | --seq-length 512 \ 24 | --max-position-embeddings 512 \ 25 | --train-iters 1000000 \ 26 | --save $CHECKPOINT_PATH \ 27 | --load $CHECKPOINT_PATH \ 28 | --data-path $DATA_PATH \ 29 | --vocab-file bert-vocab.txt \ 30 | --data-impl mmap \ 31 | --split 949,50,1 \ 32 | --distributed-backend nccl \ 33 | --lr 0.0001 \ 34 | --lr-decay-style linear \ 35 | --min-lr 1.0e-5 \ 36 | --lr-decay-iters 990000 \ 37 | --weight-decay 1e-2 \ 38 | --clip-grad 1.0 \ 39 | --lr-warmup-fraction .01 \ 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH=<Specify path and file prefix>_text_sentence 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_bert.py \ 19 | --tensor-model-parallel-size 2 \ 20 | --pipeline-model-parallel-size 2 \ 21 | --num-layers 24 \ 22 | --hidden-size 1024 \ 23 | --num-attention-heads 16 \ 24 | --micro-batch-size 2 \ 25 | --global-batch-size 16 \ 26 | --max-position-embeddings 512 \ 27 | --train-iters 1000000 \ 28 | --save $CHECKPOINT_PATH \ 29 | --load $CHECKPOINT_PATH \ 30 | --data-path $DATA_PATH \ 31 | --vocab-file $VOCAB_FILE \ 32 | --data-impl mmap \ 33 | --split 949,50,1 \ 34 | --distributed-backend nccl \ 35 | --lr 0.0001 \ 36 | --lr-decay-style linear \ 37 | --min-lr 1.0e-5 \ 38 | --lr-decay-iters 990000 \ 39 | --weight-decay 1e-2 \ 40 | --clip-grad 1.0 \ 41 | --lr-warmup-fraction .01 \ 42 | --log-interval 100 \ 43 | --save-interval 10000 \ 
44 | --eval-interval 1000 \ 45 | --eval-iters 10 \ 46 | --fp16 47 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=<Specify path and file prefix>_text_document 9 | CHECKPOINT_PATH= 10 | 11 | 12 | deepspeed --num_gpus 1 pretrain_gpt.py \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 1024 \ 19 | --max-position-embeddings 1024 \ 20 | --train-iters 500000 \ 21 | --lr-decay-iters 320000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file gpt2-vocab.json \ 26 | --merge-file gpt2-merges.txt \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --checkpoint-activations \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 42 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="<PATH TO THE FIRST DATASET>" 13 | DATASET_2="<PATH TO THE SECOND DATASET>" 14 | DATASET_3="<PATH TO THE THIRD DATASET>" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file <path to gpt2-vocab.json> \ 40 | --merge-file <path to gpt2-merges.txt> \ 41 | --save-interval 1000 \ 42 | --save <path to checkpoints directory> \ 43 | --load <path to checkpoints directory> \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir <path to tensorboard directory> \ 51 | --fp16 \ 52 | --checkpoint-activations " 53 | 54 | 55 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 56 | 57 | 58 | srun -l \ 59 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 60 | --container-mounts "<directories to mount>" \ 61 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 62 | 63 | 64 | set +x 65 | 66 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=<Specify path and file prefix>_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --micro-batch-size 8 \ 24 | --global-batch-size 64 \ 25 | --seq-length 1024 \ 26 | --max-position-embeddings 1024 \ 27 | --train-iters 500000 \ 28 | --lr-decay-iters 320000 \ 29 | --save $CHECKPOINT_PATH \ 30 | --load $CHECKPOINT_PATH \ 31 | --data-path $DATA_PATH \ 32 | --vocab-file gpt2-vocab.json \ 33 | --merge-file gpt2-merges.txt \ 34 | --data-impl mmap \ 35 | --split 949,50,1 \ 36 | --distributed-backend nccl \ 37 | --lr 0.00015 \ 38 | --lr-decay-style cosine \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --clip-grad 1.0 \ 42 | --lr-warmup-fraction .01 \ 43 | --checkpoint-activations \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | DATA_PATH=<Specify path and file prefix>_text_document 14 | CHECKPOINT_PATH= 15 | 16 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 19 | pretrain_gpt.py \ 20 | --tensor-model-parallel-size 2 \ 21 | --pipeline-model-parallel-size 2 \ 22 | --num-layers 24 \ 23 | --hidden-size 1024 \ 24 | --num-attention-heads 16 \ 25 | --micro-batch-size 4 \ 26 | --global-batch-size 16 \ 27 | --seq-length 1024 \ 28 | --max-position-embeddings 1024 \ 29 | --train-iters 500000 \ 30 | --lr-decay-iters 320000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file gpt2-vocab.json \ 35 | --merge-file gpt2-merges.txt \ 36 | --data-impl mmap \ 37 | --split 949,50,1 \ 38 | --distributed-backend nccl \ 39 | --lr 0.00015 \ 40 | --lr-decay-style cosine \ 41 | --min-lr 1.0e-5 \ 42 | --weight-decay 1e-2 \ 43 | --clip-grad 1.0 \ 44 | --lr-warmup-fraction .01 \ 45 | --checkpoint-activations \ 46 | --log-interval 100 \ 47 | --save-interval 10000 \ 48 | --eval-interval 1000 \ 49 | --eval-iters 10 \ 50 | --fp16 51 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_multilingual.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | # paths to multilingual preprocessed datasets 9 | DATA_EN=<Specify path and file prefix>_text_document 10 | DATA_AR=<Specify path and file prefix>_text_document 11 | DATA_KR=<Specify path and file prefix>_text_document 12 | DATA_JP=<Specify path and file prefix>_text_document 13 | 14 | CHECKPOINT_PATH= 15 | 16 | 17 | deepspeed --num_gpus 1 pretrain_gpt.py \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --num-attention-heads 16 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 8 \ 23 | --seq-length 1024 \ 24 | --max-position-embeddings 1024 \ 25 | --train-iters 500000 \ 26 | --lr-decay-iters 320000 \ 27 | --save $CHECKPOINT_PATH \ 28 | --load $CHECKPOINT_PATH \ 29 | --train-weighted-split-paths "TRAIN: 0.3 0:0.6 $DATA_EN 1 0:0.6 $DATA_AR 1 0:0.6 $DATA_KR 1 0:0.6 $DATA_JP" \ 30 | --valid-weighted-split-paths \ 31 | "VALID_EN: 1 0.6:0.8 $DATA_EN" \ 32 | "VALID_AR: 1 0.6:0.8 $DATA_AR" \ 33 | "VALID_JP: 1 0.6:0.8 $DATA_JP" \ 34 | "VALID_KR: 1 0.6:0.8 $DATA_KR" \ 35 | "VALID_EN-AR-JP-KR_BALANCED: 1 0.6:0.8 $DATA_EN, 1 0.6:0.8 $DATA_AR, 1 0.6:0.8 $DATA_JP, 1 0.6:0.8 $DATA_KR" \ 36 | --test-weighted-split-paths \ 37 | "TEST_EN: 1 0.8:1 $DATA_EN" \ 38 | "TEST_AR: 1 0.8:1 $DATA_AR" \ 39 | "TEST_JP: 1 0.8:1 $DATA_JP" \ 40 | "TEST_KR: 1 0.8:1 $DATA_KR" \ 41 | "TEST_EN-AR-JP-KR_BALANCED: 1 0.8:1 $DATA_EN, 1 0.8:1 $DATA_AR, 1 0.8:1 $DATA_JP, 1 0.8:1 $DATA_KR" \ 42 | --vocab-file gpt2-vocab.json \ 43 | --merge-file gpt2-merges.txt \ 44 | --data-impl mmap \ 45 | --split 949,50,1 \ 46 | --distributed-backend nccl \ 47 | --lr 0.00015 \ 48 | --min-lr 1.0e-5 \ 49 | --lr-decay-style cosine \ 50 | --weight-decay 1e-2 \ 51 | --clip-grad 1.0 \ 52 | --lr-warmup-fraction .01 \ 53 | --checkpoint-activations \ 54 | --log-interval 100 \ 55 | --save-interval 10000 \ 56 | --eval-interval 1000 \ 57 | --eval-iters 10 \ 58 | --fp16 59 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_single_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Adapted to use deepspeed on a single node 4 | # 5 | # Multi-node will require either a `hostfile` or switching to `torch.distributed.launch` 6 | 7 | # adjust to the number of GPUs to use 8 | N_GPUS=1 9 | 10 | CHECKPOINT_PATH=checkpoints/gpt2 11 | VOCAB_FILE=data/gpt2-vocab.json 12 | MERGE_FILE=data/gpt2-merges.txt 13 | DATA_PATH=data/meg-gpt2_text_document 14 | 15 | GPT_ARGS=" \ 16 | --num-layers 24 \ 17 | --hidden-size 1024 \ 18 | --num-attention-heads 16 \ 19 | --seq-length 1024 \ 20 | --max-position-embeddings 1024 \ 21 | --micro-batch-size 4 \ 22 | --global-batch-size 8 \ 23 | --lr-decay-iters 320000 \ 24 | --lr 0.00015 \ 25 | --min-lr 1.0e-5 \ 26 | --lr-decay-style cosine \ 27 | --train-iters 5000 \ 28 | --vocab-file $VOCAB_FILE \ 29 | --merge-file $MERGE_FILE \ 30 | --data-impl mmap \ 31 | --split 949,50,1 \ 32 | --distributed-backend nccl \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --fp16 \ 37 | " 38 | 39 | OUTPUT_ARGS=" \ 40 | --log-interval 10 \ 41 | --save-interval 500 \ 42 | --eval-interval 100 \ 43 | --eval-iters 10 \ 44 | --checkpoint-activations \ 45 | " 46 | 47 | DATA_ARGS=" \ 48 | --save $CHECKPOINT_PATH \ 49 | --load $CHECKPOINT_PATH \ 50 | --data-path $DATA_PATH \ 51 | " 52 | 53 | ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS" 54 | 55 | LAUNCHER="deepspeed --num_gpus $N_GPUS" 56 | 57 | CMD="$LAUNCHER pretrain_gpt.py $ALL_ARGS" 58 | 59 | echo $CMD 60 | 61 | $CMD 62 | 
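(Editor's note: the --*-weighted-split-paths arguments in pretrain_gpt_multilingual.sh above pack one split per quoted string, as "NAME: weight start:end path" groups separated by commas for the valid/test splits. The sketch below shows one way such a string can be decoded; it is illustrative only, under the assumption of that grammar, and parse_weighted_split is not a function from this repo.)

def parse_weighted_split(spec: str):
    """Decode 'NAME: w s:e path, w s:e path, ...' into (name, groups)."""
    name, _, groups = spec.partition(":")  # the first colon ends the split name
    datasets = []
    for group in groups.split(","):
        weight, span, path = group.split()  # e.g. '1 0.6:0.8 /data/en_text_document'
        start, end = (float(x) for x in span.split(":"))
        datasets.append((float(weight), (start, end), path))
    return name.strip(), datasets

# Example: the VALID_EN group above selects the 60%-80% slice of one dataset.
# parse_weighted_split("VALID_EN: 1 0.6:0.8 /data/en_text_document")
# -> ('VALID_EN', [(1.0, (0.6, 0.8), '/data/en_text_document')])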
-------------------------------------------------------------------------------- /examples/pretrain_gpt_tiny.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | DATA_PATH=GPT2/c4_en_partial_gpt2_text_document 9 | CHECKPOINT_PATH=GPT2 10 | 11 | 12 | deepspeed --num_gpus 1 pretrain_gpt.py \ 13 | --num-layers 2 \ 14 | --hidden-size 128 \ 15 | --num-attention-heads 4 \ 16 | --micro-batch-size 4 \ 17 | --global-batch-size 8 \ 18 | --seq-length 256 \ 19 | --max-position-embeddings 256 \ 20 | --train-iters 10000 \ 21 | --lr-decay-iters 5000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --tokenizer-type PretrainedFromHF \ 26 | --tokenizer-name-or-path t5-small \ 27 | --data-impl mmap \ 28 | --split 949,50,1 \ 29 | --distributed-backend nccl \ 30 | --lr 0.00015 \ 31 | --min-lr 1.0e-5 \ 32 | --lr-decay-style cosine \ 33 | --weight-decay 1e-2 \ 34 | --clip-grad 1.0 \ 35 | --lr-warmup-fraction .01 \ 36 | --checkpoint-activations \ 37 | --log-interval 100 \ 38 | --save-interval 10000 \ 39 | --eval-interval 1000 \ 40 | --eval-iters 10 \ 41 | --fp16 \ 42 | --tensorboard-dir GPT2 43 | 44 | # --vocab-file GPT2/gpt2-vocab.json \ 45 | # --merge-file GPT2/gpt2-merges.txt \ 46 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RANK=0 4 | WORLD_SIZE=1 5 | DATA_PATH= 6 | VOCAB_FILE= 7 | CHECKPOINT_PATH= 8 | 9 | python pretrain_t5.py \ 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --micro-batch-size 16 \ 18 | --global-batch-size 2048 \ 19 | --max-position-embeddings 512 \ 20 | --train-iters 1000000 \ 21 | --lr-decay-iters 1000000 \ 22 | --save $CHECKPOINT_PATH \ 23 | --load $CHECKPOINT_PATH \ 24 | --data-path $DATA_PATH \ 25 | --vocab-file $VOCAB_FILE \ 26 | 
--data-impl mmap \ 27 | --split 949,50,1 \ 28 | --lr 0.0001 \ 29 | --min-lr 0.00001 \ 30 | --lr-decay-style linear \ 31 | --lr-warmup-fraction .01 \ 32 | --weight-decay 1e-2 \ 33 | --clip-grad 1.0 \ 34 | --log-interval 100 \ 35 | --save-interval 10000 \ 36 | --eval-interval 1000 \ 37 | --eval-iters 10 \ 38 | --fp16 39 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | VOCAB_FILE= 13 | CHECKPOINT_PATH= 14 | 15 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 16 | 17 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 18 | pretrain_t5.py \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 2048 \ 28 | --max-position-embeddings 512 \ 29 | --train-iters 1000000 \ 30 | --lr-decay-iters 1000000 \ 31 | --save $CHECKPOINT_PATH \ 32 | --load $CHECKPOINT_PATH \ 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 \ 37 | --lr 0.0001 \ 38 | --min-lr 0.00001 \ 39 | --lr-decay-style linear \ 40 | --lr-warmup-fraction .01 \ 41 | --weight-decay 1e-2 \ 42 | --clip-grad 1.0 \ 43 | --log-interval 100 \ 44 | --save-interval 10000 \ 45 | --eval-interval 1000 \ 46 | --eval-iters 10 \ 47 | --fp16 48 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | DATA_PATH= 12 | CHECKPOINT_PATH= 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | pretrain_t5.py \ 18 | --tensor-model-parallel-size 2 \ 19 | --num-layers 12 \ 20 | --hidden-size 768 \ 21 | --num-attention-heads 12 \ 22 | --kv-channels 64 \ 23 | --ffn-hidden-size 3072 \ 24 | --encoder-seq-length 512 \ 25 | --decoder-seq-length 128 \ 26 | --micro-batch-size 16 \ 27 | --global-batch-size 2048 \ 28 | --seq-length 512 \ 29 | --max-position-embeddings 512 \ 30 | --train-iters 1000000 \ 31 | --lr-decay-iters 1000000 \ 32 | --save $CHECKPOINT_PATH \ 33 | --load $CHECKPOINT_PATH \ 34 | --data-path $DATA_PATH \ 35 | --vocab-file t5-vocab.txt \ 36 | --data-impl mmap \ 37 | --split 949,50,1 \ 38 | --lr 0.0001 \ 39 | --min-lr 0.00001 \ 40 | --lr-decay-style linear \ 41 | --lr-warmup-fraction .01 \ 42 | --weight-decay 1e-2 \ 43 | --clip-grad 1.0 \ 44 | --log-interval 100 \ 45 | --save-interval 10000 \ 46 | --eval-interval 1000 \ 47 | --eval-iters 10 \ 48 | --fp16 49 | -------------------------------------------------------------------------------- /examples/run_evalharness.sh: 
-------------------------------------------------------------------------------- 1 | CHECKPOINT_PATH=/gpfsscratch/rech/bbv/utw68ny/checkpoints/tr3m-1B3-pile/global_step296023/ 2 | 3 | PP_SIZE=1 4 | TP_SIZE=1 5 | VOCAB_FILE=gpt2-vocab.json 6 | MERGE_FILE=gpt2-merges.txt 7 | 8 | export HF_DATASETS_OFFLINE=1 9 | 10 | #dummy arguments to make megatron happy. 11 | MEGATRON_REQUIRED_ARGS="\ 12 | --num-layers -1\ 13 | --hidden-size -1\ 14 | --num-attention-heads -1\ 15 | --seq-length -1 \ 16 | --max-position-embeddings -1 17 | " 18 | 19 | CMD="./tasks/eval_harness/evaluate.py \ 20 | --load $CHECKPOINT_PATH\ 21 | --tensor-model-parallel-size $TP_SIZE \ 22 | --pipeline-model-parallel-size $PP_SIZE\ 23 | --vocab-file $VOCAB_FILE\ 24 | --merge-file $MERGE_FILE\ 25 | --micro-batch-size 64\ 26 | --adaptive_seq_len\ 27 | --eval_fp32\ 28 | --task_list hellaswag,mrpc,piqa\ 29 | $MEGATRON_REQUIRED_ARGS\ 30 | " 31 | 32 | N_GPUS=1 33 | LAUNCHER="deepspeed --num_gpus $N_GPUS" 34 | $LAUNCHER $CMD -------------------------------------------------------------------------------- /examples/run_evalharness_deepspeed.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval-harness-deepspeed 3 | #SBATCH --constraint=v100-16g 4 | #SBATCH --nodes=1 5 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 6 | #SBATCH --cpus-per-task=40 # number of cores per tasks 7 | #SBATCH --hint=nomultithread # we get physical cores not logical 8 | #SBATCH --gres=gpu:1 # number of gpus 9 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 10 | #SBATCH --output=%x-%j.out # output file name 11 | #SBATCH --account=six@gpu 12 | 13 | 14 | set -x -e 15 | 16 | source $six_ALL_CCFRWORK/start-prod 17 | 18 | echo "START TIME: $(date)" 19 | 20 | # a unique identifier for the current eval so that multiple evals could run in parallel and not all log into the same "results.json" file. 21 | VARIANT="tr9c-1B3-swiglu" 22 | 23 | CHECKPOINT_PATH=/gpfsdsstore/projects/rech/six/commun/checkpoints/tr3m-1B3-emb-norm-pile/global_step296023 24 | MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed 25 | 26 | # you want these 2 on JZ, and pre-download/cache any datasets/tokenizers/models 27 | # but comment these out if you're running on a node with Internet access 28 | export HF_DATASETS_OFFLINE=1 29 | export TRANSFORMERS_OFFLINE=1 30 | 31 | cd $MEGATRON_DEEPSPEED_REPO 32 | 33 | # eval topology 34 | PP_SIZE=1 35 | TP_SIZE=1 36 | 37 | VOCAB_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-vocab.json 38 | MERGE_FILE=$MEGATRON_DEEPSPEED_REPO/data/gpt2-merges.txt 39 | SEQ_LEN=2048 40 | 41 | # different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS 42 | # make as big as it can fit into gpu w/o OOM, but not too close to 100% 43 | 44 | EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model 45 | #EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model 46 | 47 | 48 | #dummy arguments to make megatron happy. 
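# (Editor's note, assumption: the eval-harness entry point rebuilds the real model
# geometry from the hyperparameters stored in the checkpoint, so the -1 values
# below are placeholders that only exist to satisfy megatron's required CLI arguments.)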
49 | MEGATRON_REQUIRED_ARGS=" \ 50 | --num-layers -1 \ 51 | --hidden-size -1 \ 52 | --num-attention-heads -1 \ 53 | --seq-length -1 \ 54 | --max-position-embeddings -1 55 | " 56 | 57 | 58 | ZERO_STAGE=0 59 | 60 | config_json="./ds_config.json" 61 | cat <<EOT > $config_json 62 | { 63 | "train_micro_batch_size_per_gpu": 1, 64 | "train_batch_size": 1, 65 | "zero_optimization": { "stage": $ZERO_STAGE }, 66 | "fp16": { "enabled": true }, 67 | "steps_per_print": 2000, 68 | "wall_clock_breakdown": false 69 | } 70 | EOT 71 | 72 | CMD="./tasks/eval_harness/evaluate.py \ 73 | --load $CHECKPOINT_PATH \ 74 | --results_path $VARIANT-results.json \ 75 | --tensor-model-parallel-size $TP_SIZE \ 76 | --pipeline-model-parallel-size $PP_SIZE \ 77 | --vocab-file $VOCAB_FILE \ 78 | --merge-file $MERGE_FILE \ 79 | --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ 80 | --no-load-optim \ 81 | --no-load-rng \ 82 | --inference \ 83 | --deepspeed \ 84 | --deepspeed_config ds_config.json \ 85 | --seq-length $SEQ_LEN \ 86 | --adaptive_seq_len \ 87 | --eval_fp32 \ 88 | --task_list arc_challenge,arc_easy,boolq,copa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sst,webqs,wic,winogrande,wnli,wsc,triviaqa,sciq \ 89 | $MEGATRON_REQUIRED_ARGS \ 90 | " 91 | 92 | N_GPUS=1 93 | LAUNCHER="deepspeed --num_gpus $N_GPUS" 94 | echo $LAUNCHER $CMD 95 | 96 | export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO 97 | 98 | $LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log 99 | -------------------------------------------------------------------------------- /examples/run_evalharness_tr11-176b-ml.slurm: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=run_evalharness-tr11-176b-ml 3 | #SBATCH --partition=gpu_p5 4 | #SBATCH --constraint=a100 5 | #SBATCH --nodes=1 6 | #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! 7 | #SBATCH --cpus-per-task=64 # number of cores per tasks 8 | #SBATCH --hint=nomultithread # we get physical cores not logical 9 | #SBATCH --gres=gpu:8 # number of gpus 10 | #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) 11 | #SBATCH --output=%x-%j.out # output file name 12 | #SBATCH --account=six@a100 13 | 14 | 15 | set -x -e 16 | 17 | source $six_ALL_CCFRWORK/start-py38-pt111 18 | 19 | echo "START TIME: $(date)" 20 | 21 | # a unique identifier for the current eval, ideally corresponding to the model name 22 | VARIANT="tr11-176b-ml" 23 | 24 | 25 | CHECKPOINT_PATH=$six_ALL_CCFRSCRATCH/checkpoints/tr11-176B-ml/checkpoints/main/global_step50000 26 | MEGATRON_DEEPSPEED_REPO=/gpfsssd/worksf/projects/rech/six/commun/code/eval/Megatron-DeepSpeed 27 | export HF_DATASETS_OFFLINE=1 28 | export TRANSFORMERS_OFFLINE=1 29 | 30 | export TRANSFORMERS_CACHE=$six_ALL_CCFRWORK/models 31 | export HF_DATASETS_CACHE=$six_ALL_CCFRWORK/datasets 32 | export HF_MODULES_CACHE=$six_ALL_CCFRWORK/modules 33 | export HF_METRICS_CACHE=$six_ALL_CCFRWORK/metrics 34 | 35 | cd $MEGATRON_DEEPSPEED_REPO 36 | 37 | TOKENIZER_NAME_OR_PATH=bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles 38 | 39 | PP_SIZE=8 40 | TP_SIZE=1 41 | SEQ_LEN=2048 42 | 43 | # different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS 44 | # make as big as it can fit into gpu w/o OOM, but not too close to 100% 45 | EVAL_MICRO_BATCH_SIZE=1 46 | 47 | #dummy arguments to make megatron happy. 
48 | MEGATRON_REQUIRED_ARGS=" \ 49 | --num-layers -1 \ 50 | --hidden-size -1 \ 51 | --num-attention-heads -1 \ 52 | --seq-length -1 \ 53 | --max-position-embeddings -1 \ 54 | " 55 | 56 | 57 | ZERO_STAGE=0 58 | 59 | config_json="./ds_config.json" 60 | 61 | # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() 62 | cat <<EOT > $config_json 63 | { 64 | "train_micro_batch_size_per_gpu": 1, 65 | "train_batch_size": 1, 66 | "gradient_clipping": 1.0, 67 | "zero_optimization": { 68 | "stage": $ZERO_STAGE 69 | }, 70 | "bf16": { 71 | "enabled": true 72 | }, 73 | "steps_per_print": 2000, 74 | "wall_clock_breakdown": false 75 | } 76 | EOT 77 | 78 | 79 | CMD="./tasks/eval_harness/evaluate.py \ 80 | --load $CHECKPOINT_PATH \ 81 | --results_path $VARIANT-results.json \ 82 | --tensor-model-parallel-size $TP_SIZE \ 83 | --pipeline-model-parallel-size $PP_SIZE \ 84 | --tokenizer-type PretrainedFromHF \ 85 | --tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ 86 | --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ 87 | --no-load-optim \ 88 | --no-load-rng \ 89 | --bf16 \ 90 | --inference \ 91 | --seq-length $SEQ_LEN \ 92 | --task_list arc_challenge,arc_easy,boolq,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,rte,sciq,sst,triviaqa,webqs,wic,winogrande,wnli,wsc \ 93 | --deepspeed \ 94 | --deepspeed_config ds_config.json \ 95 | --bootstrap_iters 2 \ 96 | --intermed_results \ 97 | --adaptive_seq_len \ 98 | --micro_bs_multiplier 4 \ 99 | $MEGATRON_REQUIRED_ARGS \ 100 | " 101 | 102 | GPUS_PER_NODE=8 103 | NNODES=$SLURM_NNODES 104 | MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) 105 | MASTER_PORT=6000 106 | export LAUNCHER="python -u -m torch.distributed.run \ 107 | --nproc_per_node $GPUS_PER_NODE \ 108 | --nnodes $NNODES \ 109 | --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ 110 | --rdzv_backend c10d \ 111 | --max_restarts 0 \ 112 | --tee 3 \ 113 | " 114 | 115 | export CUDA_LAUNCH_BLOCKING=1 116 | 117 | echo $LAUNCHER $CMD 118 | 119 | export PYTHONPATH=$MEGATRON_DEEPSPEED_REPO 120 | 121 | $LAUNCHER $CMD 2>&1 | tee $VARIANT-eval-harness.log 122 | -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | import torch 16 | 17 | from .package_info import ( 18 | __description__, 19 | __contact_names__, 20 | __url__, 21 | __download_url__, 22 | __keywords__, 23 | __license__, 24 | __package_name__, 25 | __version__, 26 | ) 27 | 28 | from .global_vars import get_args 29 | from .global_vars import get_current_global_batch_size 30 | from .global_vars import get_num_microbatches 31 | from .global_vars import update_num_microbatches 32 | from .global_vars import get_tokenizer 33 | from .global_vars import get_tensorboard_writer 34 | from .global_vars import get_adlr_autoresume 35 | from .global_vars import get_timers 36 | from .initialize import initialize_megatron 37 | 38 | def print_rank_0(message): 39 | """If distributed is initialized, print only on rank 0.""" 40 | if torch.distributed.is_initialized(): 41 | if torch.distributed.get_rank() == 0: 42 | print(message, flush=True) 43 | else: 44 | print(message, flush=True) 45 | 46 | def is_last_rank(): 47 | return torch.distributed.get_rank() == ( 48 | torch.distributed.get_world_size() - 1) 49 | 50 | def print_rank_last(message): 51 | """If distributed is initialized, print only on last rank.""" 52 | if torch.distributed.is_initialized(): 53 | if is_last_rank(): 54 | print(message, flush=True) 55 | else: 56 | print(message, flush=True) 57 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | 2 | PYTHON3CONFIG := $(shell command -v python3-config 2> /dev/null) 3 | 4 | ifndef PYTHON3CONFIG 5 | $(error "python3-config is not available. Please install it. It may be in a python-dev or another package") 6 | endif 7 | 8 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 9 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 10 | LIBNAME = helpers 11 | LIBEXT = $(shell python3-config --extension-suffix) 12 | 13 | default: $(LIBNAME)$(LIBEXT) 14 | 15 | %$(LIBEXT): %.cpp 16 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 17 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Blendable dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | from megatron import print_rank_0 24 | from megatron import mpu 25 | 26 | 27 | class BlendableDataset(torch.utils.data.Dataset): 28 | 29 | 30 | def __init__(self, datasets, weights): 31 | 32 | self.datasets = datasets 33 | num_datasets = len(datasets) 34 | assert num_datasets == len(weights) 35 | 36 | self.size = 0 37 | for dataset in self.datasets: 38 | self.size += len(dataset) 39 | 40 | # Normalize weights. 41 | weights = np.array(weights, dtype=np.float64) 42 | sum_weights = np.sum(weights) 43 | assert sum_weights > 0.0 44 | weights /= sum_weights 45 | 46 | # Build indecies. 47 | start_time = time.time() 48 | assert num_datasets < 255 49 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 50 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 51 | 52 | from megatron.data import helpers 53 | helpers.build_blending_indices(self.dataset_index, 54 | self.dataset_sample_index, 55 | weights, num_datasets, self.size, 56 | torch.distributed.get_rank() == 0) 57 | print_rank_0('> elapsed time for building blendable dataset indices: ' 58 | '{:.2f} (sec)'.format(time.time() - start_time)) 59 | 60 | 61 | def __len__(self): 62 | return self.size 63 | 64 | 65 | def __getitem__(self, idx): 66 | dataset_idx = self.dataset_index[idx] 67 | sample_idx = self.dataset_sample_index[idx] 68 | return self.datasets[dataset_idx][sample_idx] 69 | -------------------------------------------------------------------------------- /megatron/data/mtf_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Multitask Finetune style dataset.""" 17 | 18 | import time 19 | 20 | import numpy as np 21 | import torch 22 | 23 | from megatron import print_rank_0 24 | from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset 25 | 26 | class MTFDataset(torch.utils.data.Dataset): 27 | 28 | def __init__( 29 | self, 30 | name, 31 | data_prefix, 32 | data_impl, 33 | skip_warmup, 34 | documents, 35 | ): 36 | # Params to store. 37 | self.name = name 38 | 39 | # Dataset. 
40 | self.input_indexed_dataset = get_indexed_dataset(data_prefix, is_input=True, data_impl=data_impl, skip_warmup=skip_warmup) 41 | self.target_indexed_dataset = get_indexed_dataset(data_prefix, is_input=False, data_impl=data_impl, skip_warmup=skip_warmup) 42 | 43 | # Checks 44 | assert np.min(documents) >= 0 45 | assert np.max(documents) < self.input_indexed_dataset.sizes.shape[0] 46 | assert np.max(documents) < self.target_indexed_dataset.sizes.shape[0] 47 | assert self.input_indexed_dataset.sizes.shape[0] == self.target_indexed_dataset.sizes.shape[0] 48 | 49 | def __len__(self): 50 | return len(self.input_indexed_dataset) 51 | 52 | def __getitem__(self, idx): 53 | input_tokens = self.input_indexed_dataset.get(idx) 54 | target_tokens = self.target_indexed_dataset.get(idx) 55 | 56 | assert len(input_tokens) > 0 57 | assert len(target_tokens) > 0 58 | 59 | return { 60 | 'input_tokens': input_tokens, 61 | 'target_tokens': target_tokens, 62 | } 63 | 64 | def size(self, index): 65 | return { 66 | 'input_tokens': self.input_indexed_dataset.size(index), 67 | 'target_tokens': self.target_indexed_dataset.size(index), 68 | } 69 | 70 | def get_indexed_dataset(data_prefix: str, is_input: bool, data_impl: str, skip_warmup: bool): 71 | if is_input: 72 | field = "inputs" 73 | else: 74 | field = "targets" 75 | 76 | return get_indexed_dataset_(f"{data_prefix}_{field}_document", data_impl, skip_warmup) 77 | 78 | def get_indexed_dataset_(path, data_impl, skip_warmup): 79 | """Build indexed dataset.""" 80 | print_rank_0(' > building dataset index ...') 81 | start_time = time.time() 82 | indexed_dataset = make_indexed_dataset(path, 83 | data_impl, 84 | skip_warmup) 85 | print_rank_0(' > finished creating indexed dataset in {:4f} ' 86 | 'seconds'.format(time.time() - start_time)) 87 | print_rank_0(' number of documents: {}'.format( 88 | indexed_dataset.sizes.shape[0])) 89 | 90 | return indexed_dataset 91 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/data/vit_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
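# Note: build_train_valid_datasets() below relies on torchvision's ImageFolder,
# so data_path[0] must contain "train/" and "val/" directories with one
# subdirectory per class, e.g. (illustrative layout):
#
#     <data_path>/train/class_a/img0.jpeg
#     <data_path>/val/class_a/img1.jpeg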
15 | import os 16 | import torch 17 | from torchvision import datasets, transforms 18 | from megatron.data.autoaugment import ImageNetPolicy 19 | 20 | 21 | def build_train_valid_datasets(data_path, crop_size=224, color_jitter=True): 22 | 23 | # training dataset 24 | train_data_path = os.path.join(data_path[0], "train") 25 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 26 | process = [ 27 | transforms.RandomResizedCrop(crop_size), 28 | transforms.RandomHorizontalFlip(), 29 | ] 30 | if color_jitter: 31 | process += [ 32 | transforms.ColorJitter( 33 | brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1 34 | ) 35 | ] 36 | fp16_t = transforms.ConvertImageDtype(torch.half) 37 | process += [ImageNetPolicy(), transforms.ToTensor(), normalize, fp16_t] 38 | transform_train = transforms.Compose(process) 39 | train_data = datasets.ImageFolder( 40 | root=train_data_path, transform=transform_train 41 | ) 42 | 43 | # validation dataset 44 | val_data_path = os.path.join(data_path[0], "val") 45 | transform_val = transforms.Compose( 46 | [ 47 | transforms.Resize(crop_size), 48 | transforms.CenterCrop(crop_size), 49 | transforms.ToTensor(), 50 | normalize, 51 | fp16_t 52 | ] 53 | ) 54 | val_data = datasets.ImageFolder( 55 | root=val_data_path, transform=transform_val 56 | ) 57 | 58 | return train_data, val_data 59 | -------------------------------------------------------------------------------- /megatron/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import enum 17 | 18 | class LayerType(enum.Enum): 19 | encoder = 1 20 | decoder = 2 21 | 22 | class AttnType(enum.Enum): 23 | self_attn = 1 24 | cross_attn = 2 25 | 26 | class AttnMaskType(enum.Enum): 27 | padding = 1 28 | causal = 2 # Overrides `attention_mask` to be a lower triangular matrix 29 | prefix = 3 30 | custom = 4 # Forces one to pass an `attention_mask` that's 1 if we need to mask. Tensor that can be broadcast to [micro_batch_size, n_head, seq_length, seq_length] 31 | 32 | class PositionEmbeddingType(enum.Enum): 33 | rotary = 1 34 | absolute = 2 35 | alibi = 3 36 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """For backward compatibility, we need the class definitions to deserialize."""
17 | 
18 | class LossScaler:
19 |     def __init__(self, scale=1):
20 |         self.cur_scale = scale
21 | 
22 | class DynamicLossScaler:
23 |     def __init__(self,
24 |                  init_scale=2**32,
25 |                  scale_factor=2.,
26 |                  scale_window=1000,
27 |                  min_scale=1,
28 |                  delayed_shift=1,
29 |                  consecutive_hysteresis=False):
30 |         self.cur_scale = init_scale
31 |         self.cur_iter = 0
32 |         self.last_overflow_iter = -1
33 |         self.scale_factor = scale_factor
34 |         self.scale_window = scale_window
35 |         self.min_scale = min_scale
36 |         self.delayed_shift = delayed_shift
37 |         self.cur_hysteresis = delayed_shift
38 |         self.consecutive_hysteresis = consecutive_hysteresis
39 | 
40 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/compat.h:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | /* This code is copied from NVIDIA apex:
18 |  *     https://github.com/NVIDIA/apex
19 |  *     with minor changes. */
20 | 
21 | 
22 | 
23 | #ifndef TORCH_CHECK
24 | #define TORCH_CHECK AT_CHECK
25 | #endif
26 | 
27 | #ifdef VERSION_GE_1_3
28 | #define DATA_PTR data_ptr
29 | #else
30 | #define DATA_PTR data
31 | #endif
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_masked_softmax.cpp:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input,
27 |     torch::Tensor const& mask,
28 |     float scale_factor);
29 | 
30 | torch::Tensor bwd_cuda(
31 |     torch::Tensor const& output_grads,
32 |     torch::Tensor const& softmax_results,
33 |     float scale_factor);
34 | 
35 | int get_batch_per_block_cuda(
36 |     int query_seq_len,
37 |     int key_seq_len,
38 |     int batches,
39 |     int attn_heads);
40 | 
41 | torch::Tensor fwd(
42 |     torch::Tensor const& input,
43 |     torch::Tensor const& mask,
44 |     float scale_factor) {
45 |   AT_ASSERTM(input.dim() == 4, "expected 4D tensor");
46 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
47 |              (input.scalar_type() == at::ScalarType::BFloat16),
48 |              "Only fp16 and bf16 are supported");
49 |   AT_ASSERTM(mask.dim() == 4, "expected 4D tensor");
50 | 
51 |   return fwd_cuda(input, mask, scale_factor);
52 | }
53 | 
54 | torch::Tensor bwd(
55 |     torch::Tensor const& output_grads,
56 |     torch::Tensor const& softmax_results,
57 |     float scale_factor) {
58 | 
59 |   AT_ASSERTM(output_grads.dim() == 4, "expected 4D tensor");
60 |   AT_ASSERTM(softmax_results.dim() == 4, "expected 4D tensor");
61 | 
62 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
63 |              (output_grads.scalar_type() == at::ScalarType::BFloat16),
64 |              "Only fp16 and bf16 are supported");
65 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
66 |              (softmax_results.scalar_type() == at::ScalarType::BFloat16),
67 |              "Only fp16 and bf16 are supported");
68 | 
69 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
70 | }
71 | 
72 | int get_batch_per_block(
73 |     int query_seq_len,
74 |     int key_seq_len,
75 |     int batches,
76 |     int attn_heads) {
77 |   return get_batch_per_block_cuda(query_seq_len, key_seq_len, batches, attn_heads);
78 | }
79 | 
80 | } // end namespace scaled_masked_softmax
81 | } // end namespace fused_softmax
82 | } // end namespace multihead_attn
83 | 
84 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
85 |   m.def("forward",
86 |         &multihead_attn::fused_softmax::scaled_masked_softmax::fwd,
87 |         "Self Multihead Attention scaled, time masked softmax -- Forward.");
88 | 
89 |   m.def("backward",
90 |         &multihead_attn::fused_softmax::scaled_masked_softmax::bwd,
91 |         "Self Multihead Attention scaled, time masked softmax -- Backward.");
92 | 
93 |   m.def("get_batch_per_block",
94 |         &multihead_attn::fused_softmax::scaled_masked_softmax::get_batch_per_block,
95 |         "Return Batch per block size."
96 |   );
97 | }
98 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <cuda_fp16.h>
18 | #include <torch/extension.h>
19 | #include <vector>
20 | 
21 | namespace multihead_attn {
22 | namespace fused_softmax {
23 | namespace scaled_upper_triang_masked_softmax {
24 | 
25 | torch::Tensor fwd_cuda(
26 |     torch::Tensor const& input,
27 |     float scale_factor);
28 | 
29 | torch::Tensor bwd_cuda(
30 |     torch::Tensor const& output_grads,
31 |     torch::Tensor const& softmax_results,
32 |     float scale_factor);
33 | 
34 | torch::Tensor fwd(torch::Tensor const& input, float scale_factor) {
35 |   AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
36 |   AT_ASSERTM((input.scalar_type() == at::ScalarType::Half) ||
37 |              (input.scalar_type() == at::ScalarType::BFloat16),
38 |              "Only fp16 and bf16 are supported");
39 | 
40 |   return fwd_cuda(input, scale_factor);
41 | }
42 | 
43 | torch::Tensor bwd(
44 |     torch::Tensor const& output_grads,
45 |     torch::Tensor const& softmax_results,
46 |     float scale_factor) {
47 | 
48 |   AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
49 |   AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
50 | 
51 |   AT_ASSERTM((output_grads.scalar_type() == at::ScalarType::Half) ||
52 |              (output_grads.scalar_type() == at::ScalarType::BFloat16),
53 |              "Only fp16 and bf16 are supported");
54 |   AT_ASSERTM((softmax_results.scalar_type() == at::ScalarType::Half) ||
55 |              (softmax_results.scalar_type() == at::ScalarType::BFloat16),
56 |              "Only fp16 and bf16 are supported");
57 | 
58 |   return bwd_cuda(output_grads, softmax_results, scale_factor);
59 | }
60 | 
61 | } // end namespace scaled_upper_triang_masked_softmax
62 | } // end namespace fused_softmax
63 | } // end namespace multihead_attn
64 | 
65 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
66 |   m.def("forward",
67 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::fwd,
68 |         "Self Multihead Attention scaled, time masked softmax -- Forward.");
69 |   m.def("backward",
70 |         &multihead_attn::fused_softmax::scaled_upper_triang_masked_softmax::bwd,
71 |         "Self Multihead Attention scaled, time masked softmax -- Backward.");
72 | }
73 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | #include <ATen/ATen.h>
18 | #include <cuda.h>
19 | #include <cuda_runtime.h>
20 | #include <cuda_fp16.h>
21 | #include <cuda_profiler_api.h>
22 | #include <ATen/cuda/CUDAContext.h>
23 | #include <torch/extension.h>
24 | #include "scaled_upper_triang_masked_softmax.h"
25 | #include "type_shim.h"
26 | 
27 | namespace multihead_attn {
28 | namespace fused_softmax {
29 | namespace scaled_upper_triang_masked_softmax {
30 | 
31 | torch::Tensor fwd_cuda(
32 |     torch::Tensor const& input,
33 |     float scale_factor)
34 | {
35 |   // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
36 |   const int attn_batches = input.size(0);
37 |   const int seq_len = input.size(1);
38 |   TORCH_INTERNAL_ASSERT(seq_len <= 2048);
39 | 
40 |   // Output
41 |   auto act_options = input.options().requires_grad(false);
42 |   torch::Tensor softmax_results =
43 |       torch::empty({attn_batches, seq_len, seq_len}, act_options);
44 | 
45 |   // Softmax Intermediate Result Ptr
46 |   void* input_ptr = static_cast<void*>(input.data_ptr());
47 |   void* softmax_results_ptr = static_cast<void*>(softmax_results.data_ptr());
48 | 
49 |   DISPATCH_HALF_AND_BFLOAT(
50 |       input.scalar_type(),
51 |       "dispatch_scaled_upper_triang_masked_softmax_forward",
52 |       dispatch_scaled_upper_triang_masked_softmax_forward<scalar_t, scalar_t, float>(
53 |           reinterpret_cast<scalar_t*>(softmax_results_ptr),
54 |           reinterpret_cast<const scalar_t*>(input_ptr),
55 |           scale_factor,
56 |           seq_len,
57 |           seq_len,
58 |           attn_batches);
59 |   );
60 |   return softmax_results;
61 | }
62 | 
63 | 
64 | torch::Tensor bwd_cuda(
65 |     torch::Tensor const& output_grads_,
66 |     torch::Tensor const& softmax_results_,
67 |     float scale_factor) {
68 | 
69 |   auto output_grads = output_grads_.contiguous();
70 |   auto softmax_results = softmax_results_.contiguous();
71 | 
72 |   // output grads is a 3d tensor with dimensions [attn_batches, seq_len, seq_len]
73 |   const int attn_batches = output_grads.size(0);
74 |   const int seq_len = output_grads.size(1);
75 |   TORCH_INTERNAL_ASSERT(output_grads.size(1) == output_grads.size(2));
76 | 
77 |   void* output_grads_ptr = static_cast<void*>(output_grads.data_ptr());
78 | 
79 |   // Softmax Grad
80 |   DISPATCH_HALF_AND_BFLOAT(
81 |       output_grads_.scalar_type(),
82 |       "dispatch_scaled_upper_triang_masked_softmax_backward",
83 |       dispatch_scaled_upper_triang_masked_softmax_backward<scalar_t, scalar_t, float>(
84 |           reinterpret_cast<scalar_t*>(output_grads_ptr),
85 |           reinterpret_cast<scalar_t*>(output_grads_ptr),
86 |           reinterpret_cast<scalar_t const*>(softmax_results.data_ptr()),
87 |           scale_factor,
88 |           seq_len,
89 |           seq_len,
90 |           attn_batches);
91 |   );
92 | 
93 |   // backward pass is completely in-place
94 |   return output_grads;
95 | }
96 | } // end namespace scaled_upper_triang_masked_softmax
97 | } // end namespace fused_softmax
98 | } // end namespace multihead_attn
99 | 
--------------------------------------------------------------------------------
/megatron/fused_kernels/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/megatron/fused_kernels/tests/__init__.py
--------------------------------------------------------------------------------
/megatron/fused_kernels/type_shim.h:
--------------------------------------------------------------------------------
1 | /* coding=utf-8
2 |  * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | 
18 | #include <ATen/ATen.h>
19 | #include "compat.h"
20 | 
21 | 
22 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \
23 |     switch(TYPE) \
24 |     { \
25 |         case at::ScalarType::Half: \
26 |         { \
27 |             using scalar_t = at::Half; \
28 |             __VA_ARGS__; \
29 |             break; \
30 |         } \
31 |         case at::ScalarType::BFloat16: \
32 |         { \
33 |             using scalar_t = at::BFloat16; \
34 |             __VA_ARGS__; \
35 |             break; \
36 |         } \
37 |         default: \
38 |             AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
39 |     }
40 | 
41 | 
42 | 
43 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
44 |     switch(TYPEIN) \
45 |     { \
46 |         case at::ScalarType::Float: \
47 |         { \
48 |             using scalar_t_in = float; \
49 |             switch(TYPEOUT) \
50 |             { \
51 |                 case at::ScalarType::Float: \
52 |                 { \
53 |                     using scalar_t_out = float; \
54 |                     __VA_ARGS__; \
55 |                     break; \
56 |                 } \
57 |                 case at::ScalarType::Half: \
58 |                 { \
59 |                     using scalar_t_out = at::Half; \
60 |                     __VA_ARGS__; \
61 |                     break; \
62 |                 } \
63 |                 case at::ScalarType::BFloat16: \
64 |                 { \
65 |                     using scalar_t_out = at::BFloat16; \
66 |                     __VA_ARGS__; \
67 |                     break; \
68 |                 } \
69 |                 default: \
70 |                     AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
71 |             } \
72 |             break; \
73 |         } \
74 |         case at::ScalarType::Half: \
75 |         { \
76 |             using scalar_t_in = at::Half; \
77 |             using scalar_t_out = at::Half; \
78 |             __VA_ARGS__; \
79 |             break; \
80 |         } \
81 |         case at::ScalarType::BFloat16: \
82 |         { \
83 |             using scalar_t_in = at::BFloat16; \
84 |             using scalar_t_out = at::BFloat16; \
85 |             __VA_ARGS__; \
86 |             break; \
87 |         } \
88 |         default: \
89 |             AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
90 |     }
91 | 
92 | 
--------------------------------------------------------------------------------
/megatron/model/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
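# This module re-exports the model classes so call sites can simply write,
# for example (illustrative):
#
#     from megatron.model import GPTModel, LayerNorm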
15 | 16 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 17 | 18 | from .distributed import DistributedDataParallel 19 | from .bert_model import BertModel 20 | from .gpt_model import GPTModel, GPTModelPipe 21 | from .t5_model import T5Model 22 | from .language_model import get_language_model 23 | from .module import Float16Module 24 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | 18 | torch._C._jit_set_profiling_mode(False) 19 | torch._C._jit_set_profiling_executor(False) 20 | torch._C._jit_override_can_fuse_on_cpu(True) 21 | torch._C._jit_override_can_fuse_on_gpu(True) 22 | 23 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 24 | # 1/sqrt(2*pi)-> 0.3989423 25 | # 1/sqrt(2) -> 0.70710678 26 | # sqrt(2/pi) -> 0.79788456 27 | # this function is tanh approximation of gelu 28 | # actual gelu is: 29 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 30 | 31 | @torch.jit.script 32 | def bias_gelu(bias, y): 33 | x = bias + y 34 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 35 | 36 | # gradient of tanh approximation of gelu 37 | # gradient of actual gelu is: 38 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 39 | @torch.jit.script 40 | def bias_gelu_back(g, bias, y): 41 | x = bias + y 42 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 43 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 44 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 45 | return ff*g 46 | 47 | class GeLUFunction(torch.autograd.Function): 48 | @staticmethod 49 | # bias is an optional argument 50 | def forward(ctx, input, bias): 51 | ctx.save_for_backward(input, bias) 52 | return bias_gelu(bias, input) 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, bias = ctx.saved_tensors 57 | tmp = bias_gelu_back(grad_output, bias, input) 58 | return tmp, tmp 59 | 60 | bias_gelu_impl = GeLUFunction.apply 61 | -------------------------------------------------------------------------------- /megatron/model/fused_layer_norm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """This code is copied from NVIDIA apex:
17 |       https://github.com/NVIDIA/apex
18 |    with some changes. """
19 | 
20 | import numbers
21 | 
22 | 
23 | from megatron import get_args
24 | from megatron import mpu
25 | from packaging import version
26 | from torch import nn
27 | from torch.nn import init
28 | from torch.nn.parameter import Parameter
29 | import importlib
30 | import torch
31 | import torch.nn.functional as F
32 | 
33 | global fused_mix_prec_layer_norm_cuda
34 | fused_mix_prec_layer_norm_cuda = None
35 | 
36 | 
37 | class FusedLayerNormAffineFunction(torch.autograd.Function):
38 | 
39 |     @staticmethod
40 |     def forward(ctx, input, weight, bias, normalized_shape, eps):
41 | 
42 |         ctx.normalized_shape = normalized_shape
43 |         ctx.eps = eps
44 |         input_ = input.contiguous()
45 |         weight_ = weight.contiguous()
46 |         bias_ = bias.contiguous()
47 |         output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
48 |             input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
49 |         ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
50 | 
51 |         return output
52 | 
53 | 
54 |     @staticmethod
55 |     def backward(ctx, grad_output):
56 | 
57 |         input_, weight_, bias_, mean, invvar = ctx.saved_tensors
58 |         grad_input = grad_weight = grad_bias = None
59 |         grad_input, grad_weight, grad_bias \
60 |             = fused_mix_prec_layer_norm_cuda.backward_affine(
61 |                 grad_output.contiguous(), mean, invvar,
62 |                 input_, ctx.normalized_shape,
63 |                 weight_, bias_, ctx.eps)
64 | 
65 |         return grad_input, grad_weight, grad_bias, None, None
66 | 
67 | 
68 | 
69 | class MixedFusedLayerNorm(torch.nn.Module):
70 | 
71 |     def __init__(self, normalized_shape, eps=1e-5):
72 |         super(MixedFusedLayerNorm, self).__init__()
73 | 
74 |         global fused_mix_prec_layer_norm_cuda
75 |         fused_mix_prec_layer_norm_cuda = importlib.import_module(
76 |             "fused_mix_prec_layer_norm_cuda")
77 | 
78 |         if isinstance(normalized_shape, numbers.Integral):
79 |             normalized_shape = (normalized_shape,)
80 |         self.normalized_shape = torch.Size(normalized_shape)
81 |         self.eps = eps
82 |         self.weight = Parameter(torch.Tensor(*normalized_shape))
83 |         self.bias = Parameter(torch.Tensor(*normalized_shape))
84 |         self.reset_parameters()
85 | 
86 |         args = get_args()
87 |         self.layernorm_tp_auto_sync = args.sync_tp_duplicated_parameters
88 | 
89 |         self.use_meg_ds_fused_layer_norm = (
90 |             args.bf16  # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm
91 |             or version.parse(torch.__version__) >= version.parse("1.11.0")  # https://github.com/pytorch/pytorch/pull/66920
92 |         )
93 | 
94 | 
95 |     def reset_parameters(self):
96 | 
97 |         init.ones_(self.weight)
98 |         init.zeros_(self.bias)
99 | 
100 | 
101 |     def forward(self, input):
102 | 
103 |         if self.layernorm_tp_auto_sync:
104 |             torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
105 |             torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group())
106 | 
107 |         if self.use_meg_ds_fused_layer_norm:
108 |             return FusedLayerNormAffineFunction.apply(
109 |                 input, self.weight, self.bias, self.normalized_shape, self.eps)
110 |         else:
111 |             return F.layer_norm(input, self.normalized_shape, self.weight, self.bias)
--------------------------------------------------------------------------------
/megatron/model/glu_activations.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 | 
5 | from megatron import logging
6 | from megatron.model.utils import log_debug_usage
7 | 
8 | logger = logging.get_logger(__name__)
9 | 
10 | class _GLUBaseModule(nn.Module):
11 |     def __init__(self, activation_fn):
12 |         super().__init__()
13 |         self.activation_fn = activation_fn
14 | 
15 |     def forward(self, x):
16 |         # dim=-1 breaks in jit for pt<1.10
17 |         x1, x2 = x.chunk(2, dim=(x.ndim - 1))
18 |         return x1 * self.activation_fn(x2)
19 | 
20 | 
21 | class LiGLU(_GLUBaseModule):
22 |     def __init__(self):
23 |         super().__init__(nn.Identity())
24 | 
25 | 
26 | class GEGLU(_GLUBaseModule):
27 |     def __init__(self):
28 |         super().__init__(F.gelu)
29 | 
30 | 
31 | class ReGLU(_GLUBaseModule):
32 |     def __init__(self):
33 |         super().__init__(F.relu)
34 | 
35 | 
36 | class SwiGLU(_GLUBaseModule):
37 |     def __init__(self):
38 |         super().__init__(F.silu)
39 | 
40 | 
41 | liglu = log_debug_usage(logger, "Using GLU activation: LiGLU.")(torch.jit.script(LiGLU()))
42 | geglu = log_debug_usage(logger, "Using GLU activation: GEGLU.")(torch.jit.script(GEGLU()))
43 | reglu = log_debug_usage(logger, "Using GLU activation: ReGLU.")(torch.jit.script(ReGLU()))
44 | swiglu = log_debug_usage(logger, "Using GLU activation: SwiGLU.")(torch.jit.script(SwiGLU()))
45 | 
46 | 
47 | GLU_ACTIVATIONS = {
48 |     "geglu": geglu,
49 |     "liglu": liglu,
50 |     "reglu": reglu,
51 |     "swiglu": swiglu,
52 | }
53 | 
--------------------------------------------------------------------------------
/megatron/model/positional_embeddings.py:
--------------------------------------------------------------------------------
1 | # Extracted from: https://github.com/EleutherAI/gpt-neox
2 | import torch
3 | 
4 | 
5 | class RotaryEmbedding(torch.nn.Module):
6 | 
7 |     def __init__(self, dim, base=10000, precision=torch.half):
8 |         super().__init__()
9 |         inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))
10 |         self.register_buffer('inv_freq', inv_freq)
11 |         self.max_seq_len_cached = None
12 |         self.cos_cached = None
13 |         self.sin_cached = None
14 |         self.precision = precision
15 | 
16 |     def forward(self, x, seq_dim=1, seq_len=None):
17 |         if seq_len is None:
18 |             seq_len = x.shape[seq_dim]
19 |         if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
20 |             self.max_seq_len_cached = seq_len
21 |             t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
22 |             freqs = torch.einsum('i,j->ij', t, self.inv_freq)
23 |             # Different from paper, but it uses a different permutation in order to obtain the same calculation
24 |             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
25 |             if self.precision == torch.bfloat16:
26 |                 emb = emb.float()
27 |             # [sx, 1 (b * np), hn]
28 |             self.cos_cached = emb.cos()[:, None, :]
29 |             self.sin_cached = emb.sin()[:, None, :]
30 |             if self.precision == torch.bfloat16:
31 |                 self.cos_cached = self.cos_cached.bfloat16()
32 |                 self.sin_cached = self.sin_cached.bfloat16()
33 |         return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
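# Illustrative usage sketch (assumed shapes: [seq_len, batch*heads, head_dim],
# matching the [sx, 1 (b * np), hn] cache layout noted above; the helper
# apply_rotary_pos_emb is defined below):
#
#     rotary = RotaryEmbedding(dim=64)
#     q = torch.randn(128, 16, 64)
#     k = torch.randn(128, 16, 64)
#     cos, sin = rotary(q, seq_len=128)                  # each [128, 1, 64]
#     q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)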
34 | 35 | 36 | # rotary pos emb helpers: 37 | 38 | def rotate_half(x): 39 | x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:] 40 | return torch.cat((-x2, x1), dim=x1.ndim - 1) # dim=-1 triggers a bug in earlier torch versions 41 | 42 | 43 | @torch.jit.script 44 | def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0): 45 | cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...] 46 | return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) 47 | 48 | 49 | def apply_rotary_pos_emb_torch(q, k, cos, sin, offset: int = 0): # jitting fails with bf16 50 | cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...] 51 | return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Utilities for models.""" 17 | 18 | import math 19 | from functools import wraps 20 | 21 | import torch 22 | 23 | from megatron import get_args 24 | 25 | def init_method_normal(sigma): 26 | """Init method based on N(0, sigma).""" 27 | def init_(tensor): 28 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 29 | 30 | return init_ 31 | 32 | 33 | def scaled_init_method_normal(sigma, num_layers): 34 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 35 | std = sigma / math.sqrt(2.0 * num_layers) 36 | 37 | def init_(tensor): 38 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 39 | 40 | return init_ 41 | 42 | 43 | def attention_mask_func(attention_scores, attention_mask): 44 | args = get_args() 45 | if args.curriculum_learning: 46 | attention_mask_ = attention_mask 47 | actual_seqlen = attention_scores.size()[2] 48 | if actual_seqlen != attention_mask_.size()[2]: 49 | # attention_mask has size [1, 1, seqlen, seqlen] 50 | attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous() 51 | attention_scores.masked_fill_(attention_mask_, torch.finfo(attention_scores.dtype).min) 52 | else: 53 | attention_scores.masked_fill_(attention_mask, torch.finfo(attention_scores.dtype).min) 54 | return attention_scores 55 | 56 | 57 | def get_linear_layer(rows, columns, init_method): 58 | """Simple linear layer with weight initialization.""" 59 | layer = torch.nn.Linear(rows, columns) 60 | init_method(layer.weight) 61 | with torch.no_grad(): 62 | layer.bias.zero_() 63 | return layer 64 | 65 | @torch.jit.script 66 | def gelu_impl(x): 67 | """OpenAI's gelu implementation.""" 68 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 69 | (1.0 + 0.044715 * x * x))) 70 | def openai_gelu(x): 71 | return gelu_impl(x) 72 | 73 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX 
exporter 74 | @torch.jit.script 75 | def erf_gelu(x): 76 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 77 | 78 | def log_debug_usage(logger, msg: str): 79 | def log_debug_usage_(func): 80 | """Helper function in order to log a message when using a function for the first time""" 81 | func.__logged_message__ = False 82 | 83 | @wraps(func) 84 | def wrapped(*args, **kwargs): 85 | if func.__logged_message__ is False: 86 | logger.debug(msg) 87 | func.__logged_message__ = True 88 | return func(*args, **kwargs) 89 | 90 | return wrapped 91 | return log_debug_usage_ 92 | -------------------------------------------------------------------------------- /megatron/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Model parallel utility interface.""" 17 | 18 | from .cross_entropy import vocab_parallel_cross_entropy 19 | 20 | from .data import broadcast_data 21 | 22 | from .initialize import is_unitialized 23 | from .initialize import destroy_model_parallel 24 | from .initialize import get_data_parallel_group 25 | from .initialize import get_data_parallel_rank 26 | from .initialize import get_data_parallel_world_size 27 | from .initialize import get_embedding_group 28 | from .initialize import get_model_parallel_group 29 | from .initialize import get_tensor_model_parallel_group 30 | from .initialize import get_pipeline_model_parallel_group 31 | from .initialize import get_tensor_model_parallel_rank, set_tensor_model_parallel_rank 32 | from .initialize import get_pipeline_model_parallel_rank, set_pipeline_model_parallel_rank 33 | from .initialize import is_pipeline_first_stage, is_pipeline_last_stage 34 | from .initialize import get_tensor_model_parallel_src_rank 35 | from .initialize import get_pipeline_model_parallel_first_rank 36 | from .initialize import get_pipeline_model_parallel_last_rank 37 | from .initialize import get_pipeline_model_parallel_next_rank 38 | from .initialize import get_pipeline_model_parallel_prev_rank 39 | from .initialize import get_tensor_model_parallel_world_size, set_tensor_model_parallel_world_size 40 | from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_model_parallel_world_size 41 | from .initialize import get_virtual_pipeline_model_parallel_rank, set_virtual_pipeline_model_parallel_rank 42 | from .initialize import initialize_model_parallel 43 | from .initialize import model_parallel_is_initialized 44 | from .initialize import get_model_parallel_world_size, get_model_parallel_rank 45 | 46 | from .layers import ColumnParallelLinear 47 | from .layers import RowParallelLinear 48 | from .layers import VocabParallelEmbedding 49 | from .layers import (set_tensor_model_parallel_attributes, 50 | set_defaults_if_not_set_tensor_model_parallel_attributes, 51 | 
                     copy_tensor_model_parallel_attributes)
52 | 
53 | from .mappings import copy_to_tensor_model_parallel_region
54 | from .mappings import gather_from_tensor_model_parallel_region
55 | from .mappings import reduce_from_tensor_model_parallel_region
56 | from .mappings import scatter_to_tensor_model_parallel_region
57 | 
58 | from .random import checkpoint
59 | from .random import get_cuda_rng_tracker
60 | from .random import init_checkpointed_activations_memory_buffer
61 | from .random import model_parallel_cuda_manual_seed
62 | from .random import reset_checkpointed_activations_memory_buffer
63 | from .random import gather_split_1d_tensor
64 | from .random import split_tensor_into_1d_equal_chunks
65 | 
66 | from .utils import divide
67 | from .utils import split_tensor_along_last_dim
68 | 
--------------------------------------------------------------------------------
/megatron/mpu/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/megatron/mpu/tests/__init__.py
--------------------------------------------------------------------------------
/megatron/mpu/tests/commons.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import argparse
17 | import os
18 | import random
19 | import numpy
20 | import torch
21 | 
22 | import mpu
23 | 
24 | 
25 | class IdentityLayer(torch.nn.Module):
26 |     def __init__(self, size, scale=1.0):
27 |         super(IdentityLayer, self).__init__()
28 |         self.weight = torch.nn.Parameter(scale * torch.randn(size))
29 | 
30 |     def forward(self):
31 |         return self.weight
32 | 
33 | 
34 | def set_random_seed(seed):
35 |     """Set random seed for reproducibility."""
36 |     random.seed(seed)
37 |     numpy.random.seed(seed)
38 |     torch.manual_seed(seed)
39 |     mpu.model_parallel_cuda_manual_seed(seed)
40 | 
41 | 
42 | def initialize_distributed(backend='nccl'):
43 |     """Initialize torch.distributed."""
44 |     # Get local rank in case it is provided.
45 |     parser = argparse.ArgumentParser()
46 |     parser.add_argument('--local_rank', type=int, default=None,
47 |                         help='local rank passed from distributed launcher')
48 |     args = parser.parse_args()
49 |     local_rank = args.local_rank
50 | 
51 |     # Get rank and world size.
52 |     rank = int(os.getenv('RANK', '0'))
53 |     world_size = int(os.getenv("WORLD_SIZE", '1'))
54 | 
55 |     print('> initializing torch.distributed with local rank: {}, '
56 |           'rank: {}, world size: {}'.format(local_rank, rank, world_size))
57 | 
58 |     # Set the device id.
59 |     device = rank % torch.cuda.device_count()
60 |     if local_rank is not None:
61 |         device = local_rank
62 |     torch.cuda.set_device(device)
63 | 
64 |     # Call the init process.
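    # With the defaults below this resolves to init_method='tcp://localhost:6000'.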
65 | init_method = 'tcp://' 66 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 67 | master_port = os.getenv('MASTER_PORT', '6000') 68 | init_method += master_ip + ':' + master_port 69 | torch.distributed.init_process_group( 70 | backend=backend, 71 | world_size=world_size, 72 | rank=rank, 73 | init_method=init_method) 74 | 75 | 76 | def print_separator(message): 77 | torch.distributed.barrier() 78 | filler_len = (78 - len(message)) // 2 79 | filler = '-' * filler_len 80 | string = '\n' + filler + ' {} '.format(message) + filler 81 | if torch.distributed.get_rank() == 0: 82 | print(string, flush=True) 83 | torch.distributed.barrier() 84 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from commons import print_separator 17 | from commons import initialize_distributed 18 | from mpu import data as data_utils 19 | import mpu 20 | import torch 21 | import functools 22 | import operator 23 | import sys 24 | sys.path.append("../..") 25 | 26 | 27 | def test_broadcast_data(tensor_model_parallel_size): 28 | 29 | if torch.distributed.get_rank() == 0: 30 | print('> testing broadcast_data with model parallel size {} ...'. 
31 |           format(tensor_model_parallel_size))
32 | 
33 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
34 |     torch.manual_seed(1234 + mpu.get_data_parallel_rank())
35 |     tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
36 | 
37 |     key_size_t = {'key1': [7, 11],
38 |                   'key2': [8, 2, 1],
39 |                   'key3': [13],
40 |                   'key4': [5, 1, 2],
41 |                   'key5': [5, 12]}
42 |     keys = list(key_size_t.keys())
43 | 
44 |     data = {}
45 |     data_t = {}
46 |     for key in key_size_t:
47 |         data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
48 |         data_t[key] = data[key].clone()
49 |     data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
50 |     data_t['keyX'] = data['keyX'].clone()
51 |     if mpu.get_tensor_model_parallel_rank() != 0:
52 |         data = None
53 | 
54 |     data_utils._check_data_types(keys, data_t, torch.int64)
55 |     key_size, key_numel, \
56 |         total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
57 |     for key in keys:
58 |         assert key_size[key] == key_size_t[key]
59 |     total_numel_t = 0
60 |     for key in keys:
61 |         target_size = functools.reduce(operator.mul, key_size_t[key], 1)
62 |         assert key_numel[key] == target_size
63 |         total_numel_t += target_size
64 |     assert total_numel == total_numel_t
65 | 
66 |     data_b = data_utils.broadcast_data(keys, data, torch.int64)
67 |     for key in keys:
68 |         tensor = data_t[key].cuda()
69 |         assert data_b[key].sub(tensor).abs().max() == 0
70 | 
71 |     # Reset groups
72 |     mpu.destroy_model_parallel()
73 | 
74 |     torch.distributed.barrier()
75 |     if torch.distributed.get_rank() == 0:
76 |         print('>> passed the test :-)')
77 | 
78 | 
79 | if __name__ == '__main__':
80 | 
81 |     initialize_distributed()
82 |     world_size = torch.distributed.get_world_size()
83 | 
84 |     tensor_model_parallel_size = 1
85 |     while tensor_model_parallel_size <= world_size:
86 |         print_separator('test broadcast data')
87 |         test_broadcast_data(tensor_model_parallel_size)
88 |         tensor_model_parallel_size *= 2
89 | 
--------------------------------------------------------------------------------
/megatron/mpu/tests/test_initialize.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | from commons import print_separator
17 | from commons import initialize_distributed
18 | import mpu
19 | import torch
20 | import sys
21 | sys.path.append("../..")
22 | 
23 | 
24 | def test_initialize_model_parallel(tensor_model_parallel_size):
25 | 
26 |     if torch.distributed.get_rank() == 0:
27 |         print('> testing initialize_model_parallel with size {} ...'.format(
28 |             tensor_model_parallel_size))
29 |     tensor_model_parallel_size_ = min(tensor_model_parallel_size,
30 |                                       torch.distributed.get_world_size())
31 |     assert not mpu.model_parallel_is_initialized()
32 |     mpu.initialize_model_parallel(tensor_model_parallel_size_)
33 |     assert mpu.model_parallel_is_initialized()
34 | 
35 |     # Checks.
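    # Megatron places each tensor-model-parallel group on consecutive global
    # ranks, so e.g. with world_size=8 and size T=2: TP rank = rank % 2 and
    # data-parallel rank = rank // 2 (verified below).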
36 |     def check(group, world_size, rank):
37 |         assert world_size == torch.distributed.get_world_size(group=group)
38 |         assert rank == torch.distributed.get_rank(group=group)
39 | 
40 |     # Model parallel.
41 |     world_size = tensor_model_parallel_size_
42 |     rank = torch.distributed.get_rank() % tensor_model_parallel_size_
43 |     assert world_size == mpu.get_tensor_model_parallel_world_size()
44 |     assert rank == mpu.get_tensor_model_parallel_rank()
45 |     check(mpu.get_tensor_model_parallel_group(), world_size, rank)
46 | 
47 |     # Data parallel.
48 |     world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
49 |     rank = torch.distributed.get_rank() // tensor_model_parallel_size_
50 |     assert world_size == mpu.get_data_parallel_world_size()
51 |     assert rank == mpu.get_data_parallel_rank()
52 |     check(mpu.get_data_parallel_group(), world_size, rank)
53 | 
54 |     # Reset groups
55 |     mpu.destroy_model_parallel()
56 | 
57 |     torch.distributed.barrier()
58 |     if torch.distributed.get_rank() == 0:
59 |         print('>> passed the test :-)')
60 | 
61 | 
62 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):
63 | 
64 |     if torch.distributed.get_rank() == 0:
65 |         print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
66 |             tensor_model_parallel_size_))
67 |     tensor_model_parallel_size = min(tensor_model_parallel_size_,
68 |                                      torch.distributed.get_world_size())
69 |     assert not mpu.model_parallel_is_initialized()
70 |     mpu.initialize_model_parallel(tensor_model_parallel_size)
71 |     assert mpu.model_parallel_is_initialized()
72 | 
73 |     # Checks
74 |     src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
75 |     assert mpu.get_tensor_model_parallel_src_rank() == src_rank
76 | 
77 |     # Reset groups
78 |     mpu.destroy_model_parallel()
79 | 
80 |     torch.distributed.barrier()
81 |     if torch.distributed.get_rank() == 0:
82 |         print('>> passed the test :-)')
83 | 
84 | 
85 | if __name__ == '__main__':
86 | 
87 |     initialize_distributed()
88 |     world_size = torch.distributed.get_world_size()
89 |     tensor_model_parallel_size = 1
90 |     while tensor_model_parallel_size <= world_size:
91 |         print_separator('test initialize model parallel')
92 |         test_initialize_model_parallel(tensor_model_parallel_size)
93 |         print_separator('test model parallel source rank')
94 |         test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
95 |         tensor_model_parallel_size *= 2
96 | 
--------------------------------------------------------------------------------
/megatron/mpu/utils.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
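# Quick reference for the helpers below (illustrative values):
#
#     divide(10, 2)                                       -> 5
#     divide(10, 3)                                       -> AssertionError
#     split_tensor_along_last_dim(torch.empty(4, 6), 3)   -> three [4, 2] views
#     VocabUtility.vocab_range_from_global_vocab_size(
#         50304, rank=1, world_size=4)                    -> (12576, 25152)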
15 | 
16 | 
17 | import torch
18 | 
19 | 
20 | def ensure_divisibility(numerator, denominator):
21 |     """Ensure that numerator is divisible by the denominator."""
22 |     assert numerator % denominator == 0, '{} is not divisible by {}'.format(
23 |         numerator, denominator)
24 | 
25 | 
26 | def divide(numerator, denominator):
27 |     """Ensure that numerator is divisible by the denominator and return
28 |     the division value."""
29 |     ensure_divisibility(numerator, denominator)
30 |     return numerator // denominator
31 | 
32 | 
33 | def split_tensor_along_last_dim(tensor, num_partitions,
34 |                                 contiguous_split_chunks=False):
35 |     """Split a tensor along its last dimension.
36 |     Arguments:
37 |         tensor: input tensor.
38 |         num_partitions: number of partitions to split the tensor
39 |         contiguous_split_chunks: If True, make each chunk contiguous
40 |                                  in memory.
41 |     """
42 |     # Get the size and dimension.
43 |     last_dim = tensor.dim() - 1
44 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
45 |     # Split.
46 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
47 |     # Note: torch.split does not create contiguous tensors by default.
48 |     if contiguous_split_chunks:
49 |         return tuple(chunk.contiguous() for chunk in tensor_list)
50 | 
51 |     return tensor_list
52 | 
53 | 
54 | class VocabUtility:
55 |     """Split the vocabulary into `world_size` chunks and return the
56 |     first and last index of the vocabulary belonging to the `rank`
57 |     partition. Note that indices are in [first, last)."""
58 | 
59 |     @staticmethod
60 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
61 |                                                   rank, world_size):
62 |         index_f = rank * per_partition_vocab_size
63 |         index_l = index_f + per_partition_vocab_size
64 |         return index_f, index_l
65 | 
66 |     @staticmethod
67 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
68 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
69 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
70 |             per_partition_vocab_size, rank, world_size)
71 | 
--------------------------------------------------------------------------------
/megatron/package_info.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | MAJOR = 1
17 | MINOR = 1.5
18 | 
19 | # Use the following formatting: (major, minor)
20 | VERSION = (MAJOR, MINOR)
21 | 
22 | __version__ = '.'.join(map(str, VERSION)) + '.bs'  # e.g. '1.1.5.bs'
23 | __package_name__ = 'megatron-lm'
24 | __contact_names__ = 'NVIDIA INC'
25 | __url__ = 'https://github.com/NVIDIA/Megatron-LM'
26 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
27 | __description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.'
28 | __license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE' 29 | __keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language' 30 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /pretrain_vit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Pretrain VIT""" 17 | 18 | import torch 19 | import torch.nn.functional as F 20 | from megatron import get_args, get_timers, mpu, print_rank_0 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from megatron.model.vit_model import VitModel 23 | from megatron.training import pretrain 24 | from megatron.utils import average_losses_across_data_parallel_group 25 | 26 | def model_provider(): 27 | """Build the model.""" 28 | 29 | print_rank_0("building VIT model ...") 30 | args = get_args() 31 | 32 | model = VitModel(num_classes=args.num_classes) 33 | return model 34 | 35 | def get_batch(data_iterator): 36 | """Build the batch.""" 37 | data = next(data_iterator) 38 | 39 | # only data parallelism; no need for broadcast 40 | images = data[0].cuda() 41 | labels = data[1].cuda() 42 | 43 | return images, labels 44 | 45 | def forward_step(data_iterator, model, input_tensor): 46 | """Forward step.""" 47 | timers = get_timers() 48 | assert input_tensor is None 49 | 50 | # Get the batch. 51 | timers("batch-generator").start() 52 | ( 53 | images, 54 | labels, 55 | ) = get_batch(data_iterator) 56 | timers("batch-generator").stop() 57 | 58 | # Forward model. 
59 | logits = model(images).contiguous().float() 60 | loss = F.cross_entropy(logits, labels) 61 | 62 | outputs = torch.argmax(logits, -1) 63 | correct = (outputs == labels).float() 64 | accuracy = torch.mean(correct) 65 | 66 | averaged_loss = average_losses_across_data_parallel_group([loss, accuracy]) 67 | 68 | return loss, {"loss": averaged_loss[0], "accuracy": averaged_loss[1]} 69 | 70 | 71 | def train_valid_test_datasets_provider(train_val_test_num_samples): 72 | """Build train, valid, and test datasets.""" 73 | args = get_args() 74 | 75 | print_rank_0( 76 | "> building train, validation, and test datasets " "for VIT ..." 77 | ) 78 | train_ds, valid_ds = build_train_valid_datasets(data_path=args.data_path) 79 | print_rank_0("> finished creating VIT datasets ...") 80 | 81 | return train_ds, valid_ds, None 82 | 83 | 84 | if __name__ == "__main__": 85 | 86 | pretrain( 87 | train_valid_test_datasets_provider, 88 | model_provider, 89 | forward_step, 90 | args_defaults={'dataloader_type': 'cyclic'} 91 | ) 92 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 119 3 | target-version = ['py35'] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | nltk 3 | numpy 4 | parameterized 5 | pybind11 6 | regex 7 | six 8 | tensorboard 9 | torch>=1.7 10 | transformers 11 | DeepSpeed @ git+https://github.com/microsoft/DeepSpeed.git 12 | # versions from HF transformers 13 | black==21.4b0 14 | isort>=5.5.4 15 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | DIR=`pwd` 5 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 6 | #mkdir -p $DIR/logs 7 | #mkdir -p /tmp/logs 8 | 9 | 10 | #DATASET_1="" 11 | #DATASET_2="" 12 | #DATASET_3="" 13 | #DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 14 | 15 | BASE_DATA_PATH=/data/Megatron-LM/data 16 | DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron 17 | VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json 18 | MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt 19 | 20 | 21 | script_path=$(realpath $0) 22 | script_dir=$(dirname $script_path) 23 | #CONFIG_JSON="$script_dir/ds_config.json" 24 | CONFIG_JSON="/tmp/ds_config.json" 25 | 26 | USE_DEEPSPEED=1 27 | ZERO_STAGE=0 28 | 29 | 30 | # Debug 31 | #TP=4 32 | #PP=4 33 | #LAYERS=8 34 | #HIDDEN=512 35 | #SEQ=1024 36 | #GLOBAL_BATCH=128 37 | #WORKER_STR="-i worker-0" 38 | 39 | 40 | TP=1 41 | PP=2 42 | HIDDEN=1024 43 | LAYERS=24 44 | SEQ=1024 45 | GLOBAL_BATCH=2 46 | WORKER_STR="" 47 | 48 | MICRO_BATCH=1 49 | 50 | DTYPE="bf16" 51 | 52 | LOG_DIR="/tmp/tensorboard/tp${TP}_pp${PP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_${DTYPE}_fix3" 53 | mkdir -p $LOG_DIR 54 | 55 | while [[ $# -gt 0 ]] 56 | do 57 | key="$1" 58 | case $key in 59 | --no-deepspeed) 60 | USE_DEEPSPEED=0; 61 | shift 62 | ;; 63 | -z|--zero-stage) 64 | ZERO_STAGE=$2; 65 | shift 66 | ;; 67 | *) 68 | echo "Unknown argument(s)" 69 | usage 70 | exit 1 71 | shift 72 | ;; 73 | esac 74 | done 75 | 76 | 77 | options=" \ 78 | --tensor-model-parallel-size $TP \ 79 | --pipeline-model-parallel-size $PP \ 80 | --num-layers $LAYERS \ 81 | --hidden-size $HIDDEN \ 82 |
--num-attention-heads 32 \ 83 | --seq-length $SEQ \ 84 | --loss-scale 12 \ 85 | --max-position-embeddings $SEQ \ 86 | --micro-batch-size $MICRO_BATCH \ 87 | --global-batch-size $GLOBAL_BATCH \ 88 | --train-iters 1000 \ 89 | --lr 6.0e-5 \ 90 | --min-lr 6.0e-6 \ 91 | --lr-decay-style cosine \ 92 | --log-interval 1 \ 93 | --eval-iters 40 \ 94 | --eval-interval 1000 \ 95 | --data-path ${DATASET} \ 96 | --vocab-file ${VOCAB_PATH} \ 97 | --merge-file ${MERGE_PATH} \ 98 | --save-interval 10000 \ 99 | --split 98,2,0 \ 100 | --clip-grad 1.0 \ 101 | --weight-decay 0.1 \ 102 | --adam-beta1 0.9 \ 103 | --adam-beta2 0.95 \ 104 | --init-method-std 0.006 \ 105 | --${DTYPE} \ 106 | --checkpoint-activations \ 107 | --exit-interval 10000 \ 108 | --tensorboard-dir $LOG_DIR 109 | " 110 | 111 | 112 | if [[ ${USE_DEEPSPEED} -eq 1 ]]; then 113 | echo "Using DeepSpeed" 114 | options="${options} \ 115 | --deepspeed \ 116 | --deepspeed_config=${CONFIG_JSON} \ 117 | --zero-stage=${ZERO_STAGE} \ 118 | --deepspeed-activation-checkpointing \ 119 | " 120 | fi 121 | 122 | 123 | cat <<EOT > $CONFIG_JSON 124 | { 125 | "train_batch_size" : $GLOBAL_BATCH, 126 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 127 | "steps_per_print": 1, 128 | 129 | "zero_optimization": { 130 | "stage": $ZERO_STAGE 131 | }, 132 | 133 | "bf16": { 134 | "enabled": true 135 | }, 136 | 137 | "fp16": { 138 | "enabled": false, 139 | "loss_scale": 0, 140 | "loss_scale_window": 500, 141 | "hysteresis": 2, 142 | "min_loss_scale": 1, 143 | "initial_scale_power": 12 144 | }, 145 | 146 | "wall_clock_breakdown" : true 147 | } 148 | EOT 149 | 150 | WORKER_STR="-i worker-0:0,1" 151 | #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" 152 | #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" 153 | run_cmd="deepspeed $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" 154 | 155 | 156 | echo ${run_cmd} 157 | eval ${run_cmd} 158 | 159 | set +x 160 | -------------------------------------------------------------------------------- /run_fp16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | DIR=`pwd` 5 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 6 | #mkdir -p $DIR/logs 7 | #mkdir -p /tmp/logs 8 | 9 | 10 | #DATASET_1="" 11 | #DATASET_2="" 12 | #DATASET_3="" 13 | #DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 14 | 15 | BASE_DATA_PATH=/data/Megatron-LM/data 16 | DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron 17 | VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json 18 | MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt 19 | 20 | 21 | script_path=$(realpath $0) 22 | script_dir=$(dirname $script_path) 23 | #CONFIG_JSON="$script_dir/ds_config.json" 24 | CONFIG_JSON="/tmp/ds_config.json" 25 | 26 | USE_DEEPSPEED=1 27 | ZERO_STAGE=0 28 | 29 | 30 | # Debug 31 | #TP=4 32 | #PP=4 33 | #LAYERS=8 34 | #HIDDEN=512 35 | #SEQ=1024 36 | #GLOBAL_BATCH=128 37 | #WORKER_STR="-i worker-0" 38 | 39 | 40 | TP=1 41 | PP=1 42 | DP=2 43 | WORLD_SIZE=$((TP*PP*DP)) 44 | HIDDEN=1024 45 | LAYERS=24 46 | SEQ=1024 47 | GLOBAL_BATCH=1 48 | WORKER_STR="" 49 | 50 | MICRO_BATCH=1 51 | LR=6.0e-4 52 | MIN_LR=6.0e-5 53 | DTYPE="fp16" 54 | EXP_DIR=${HOME}/experiments/results/bf16 55 | LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3" 56 | mkdir -p $LOG_DIR 57 | 58 | while [[ $# -gt 0 ]] 59 | do 60 | key="$1" 61 | case $key in 62 | --no-deepspeed) 63 | USE_DEEPSPEED=0; 64 | shift
65 | ;; 66 | -z|--zero-stage) 67 | ZERO_STAGE=$2; 68 | shift 69 | ;; 70 | *) 71 | echo "Unknown argument(s)" 72 | usage 73 | exit 1 74 | shift 75 | ;; 76 | esac 77 | done 78 | 79 | 80 | options=" \ 81 | --tensor-model-parallel-size $TP \ 82 | --pipeline-model-parallel-size $PP \ 83 | --num-layers $LAYERS \ 84 | --hidden-size $HIDDEN \ 85 | --num-attention-heads 32 \ 86 | --seq-length $SEQ \ 87 | --loss-scale 12 \ 88 | --max-position-embeddings $SEQ \ 89 | --micro-batch-size $MICRO_BATCH \ 90 | --global-batch-size $GLOBAL_BATCH \ 91 | --train-iters 1000 \ 92 | --lr $LR \ 93 | --min-lr $MIN_LR \ 94 | --lr-decay-style cosine \ 95 | --log-interval 1 \ 96 | --eval-iters 40 \ 97 | --eval-interval 10 \ 98 | --data-path ${DATASET} \ 99 | --vocab-file ${VOCAB_PATH} \ 100 | --merge-file ${MERGE_PATH} \ 101 | --save-interval 10000 \ 102 | --split 98,2,0 \ 103 | --clip-grad 1.0 \ 104 | --weight-decay 0.1 \ 105 | --adam-beta1 0.9 \ 106 | --adam-beta2 0.95 \ 107 | --init-method-std 0.006 \ 108 | --${DTYPE} \ 109 | --checkpoint-activations \ 110 | --exit-interval 10000 \ 111 | --tensorboard-dir $LOG_DIR 112 | " 113 | 114 | 115 | if [[ ${USE_DEEPSPEED} -eq 1 ]]; then 116 | echo "Using DeepSpeed" 117 | options="${options} \ 118 | --deepspeed \ 119 | --deepspeed_config=${CONFIG_JSON} \ 120 | --zero-stage=${ZERO_STAGE} \ 121 | --deepspeed-activation-checkpointing \ 122 | " 123 | fi 124 | 125 | 126 | cat <<EOT > $CONFIG_JSON 127 | { 128 | "train_batch_size" : $GLOBAL_BATCH, 129 | "train_micro_batch_size_per_gpu": $MICRO_BATCH, 130 | "steps_per_print": 1, 131 | 132 | "zero_optimization": { 133 | "stage": $ZERO_STAGE 134 | }, 135 | 136 | "bf16": { 137 | "enabled": false 138 | }, 139 | 140 | "fp16": { 141 | "enabled": true, 142 | "loss_scale": 0, 143 | "loss_scale_window": 500, 144 | "hysteresis": 2, 145 | "min_loss_scale": 1, 146 | "initial_scale_power": 8 147 | }, 148 | 149 | "wall_clock_breakdown" : true 150 | } 151 | EOT 152 | 153 | WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" 154 | #WORKER_STR="-i worker-0:0,1,2,3" 155 | #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" 156 | #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" 157 | run_cmd="deepspeed --master_port 29600 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" 158 | 159 | 160 | echo ${run_cmd} 161 | eval ${run_cmd} 162 | 163 | set +x 164 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Scripts 2 | 3 | This directory should be organized into sub-folders, one per kind of script.
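The two launcher scripts above both write their DeepSpeed config through a shell heredoc. A minimal Python equivalent of what `run_fp16.sh` emits, for anyone who prefers to generate the file programmatically (keys mirror the heredoc; values are the script's defaults):

```python
# Sketch: generate the same /tmp/ds_config.json that run_fp16.sh writes.
import json

ds_config = {
    "train_batch_size": 1,                # $GLOBAL_BATCH
    "train_micro_batch_size_per_gpu": 1,  # $MICRO_BATCH
    "steps_per_print": 1,
    "zero_optimization": {"stage": 0},    # $ZERO_STAGE
    "bf16": {"enabled": False},
    "fp16": {
        "enabled": True,
        "loss_scale": 0,                  # 0 selects dynamic loss scaling
        "loss_scale_window": 500,
        "hysteresis": 2,
        "min_loss_scale": 1,
        "initial_scale_power": 8,
    },
    "wall_clock_breakdown": True,
}

with open("/tmp/ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)
```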
4 | -------------------------------------------------------------------------------- /scripts/bloom-inference-scripts/README.md: -------------------------------------------------------------------------------- 1 | # Inference scripts for BLOOM 2 | 3 | Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-scripts 4 | -------------------------------------------------------------------------------- /scripts/bloom-inference-server/README.md: -------------------------------------------------------------------------------- 1 | ## Inference solutions for BLOOM 176B 2 | 3 | Moved to https://github.com/huggingface/transformers-bloom-inference/tree/main/bloom-inference-server 4 | -------------------------------------------------------------------------------- /scripts/test_multiple_dataset_sampling/create_dummy_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | 4 | def get_args(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--dir', 7 | type=str, 8 | required=True, 9 | help='directory to save data' 10 | ) 11 | args = parser.parse_args() 12 | return args 13 | 14 | def main(): 15 | args = get_args() 16 | 17 | for i in range(10): 18 | 19 | row_limit = 1000 20 | rows_to_save = [{'text': ''.join([str(i)+'-*']*128)}] 21 | 22 | with open('{}/dataset_{}.json'.format(args.dir, i), 'w') as f: 23 | f.write( 24 | '\n'.join(json.dumps(_i) for _i in rows_to_save*row_limit) 25 | ) 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | default_section = FIRSTPARTY 3 | ensure_newline_before_comments = True 4 | force_grid_wrap = 0 5 | include_trailing_comma = True 6 | known_first_party = megatron 7 | known_third_party = 8 | apex 9 | codecarbon 10 | datasets 11 | deepspeed 12 | git 13 | nltk 14 | numpy 15 | pytest 16 | tensorboard 17 | torch 18 | tqdm 19 | transformers 20 | 21 | line_length = 119 22 | lines_after_imports = 2 23 | multi_line_output = 3 24 | use_parentheses = True -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
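As a quick check of what `create_dummy_dataset.py` above produces: each of the ten output files holds 1000 identical JSON lines whose `text` field repeats `<i>-*` 128 times. A tiny reader sketch (the directory path is illustrative, use whatever `--dir` you passed):

```python
# Sketch: verify the shape of the dummy data written above.
# The directory path is illustrative.
import json

with open('/tmp/dummy/dataset_3.json') as f:
    first_row = json.loads(f.readline())

assert first_row['text'] == '3-*' * 128
```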
15 | 16 | """Setup for pip package.""" 17 | 18 | import os 19 | import sys 20 | import setuptools 21 | 22 | if sys.version_info < (3,): 23 | raise Exception("Python 2 is not supported by Megatron.") 24 | 25 | from megatron.package_info import ( 26 | __description__, 27 | __contact_names__, 28 | __url__, 29 | __download_url__, 30 | __keywords__, 31 | __license__, 32 | __package_name__, 33 | __version__, 34 | ) 35 | 36 | with open("README.md", "r") as fh: 37 | long_description = fh.read() 38 | 39 | ############################################################################### 40 | # Dependency Loading # 41 | # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 42 | 43 | 44 | def req_file(filename): 45 | with open(filename) as f: 46 | content = f.readlines() 47 | return [x.strip() for x in content] 48 | 49 | 50 | install_requires = req_file("requirements.txt") 51 | 52 | setuptools.setup( 53 | name=__package_name__, 54 | # Versions should comply with PEP440. For a discussion on single-sourcing 55 | # the version across setup.py and the project code, see 56 | # https://packaging.python.org/en/latest/single_source_version.html 57 | version=__version__, 58 | description=__description__, 59 | long_description=long_description, 60 | long_description_content_type="text/markdown", 61 | # The project's main homepage. 62 | url=__url__, 63 | author=__contact_names__, 64 | maintainer=__contact_names__, 65 | # The licence under which the project is released 66 | license=__license__, 67 | classifiers=[ 68 | 'Intended Audience :: Developers', 69 | 'Intended Audience :: Science/Research', 70 | 'Intended Audience :: Information Technology', 71 | # Indicate what your project relates to 72 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 73 | 'Topic :: Software Development :: Libraries :: Python Modules', 74 | # Supported python versions 75 | 'Programming Language :: Python :: 3.6', 76 | 'Programming Language :: Python :: 3.7', 77 | 'Programming Language :: Python :: 3.8', 78 | # Additional Setting 79 | 'Environment :: Console', 80 | 'Natural Language :: English', 81 | 'Operating System :: OS Independent', 82 | ], 83 | python_requires='>=3.6', 84 | packages=setuptools.find_packages(), 85 | install_requires=install_requires, 86 | # Add in any packaged data. 87 | include_package_data=True, 88 | zip_safe=False, 89 | # PyPI package information. 90 | keywords=__keywords__ 91 | ) 92 | -------------------------------------------------------------------------------- /tasks/data_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Tasks data utility.""" 17 | 18 | import re 19 | import numpy as np 20 | 21 | 22 | def clean_text(text): 23 | """Remove new lines and multiple spaces and adjust end of sentence dot.""" 24 | 25 | text = text.replace("\n", " ") 26 | text = re.sub(r'\s+', ' ', text) 27 | for _ in range(3): 28 | text = text.replace(' . ', '. ') 29 | 30 | return text 31 | 32 | 33 | def build_sample(ids, types, paddings, label, unique_id): 34 | """Convert to numpy and return a sample consumed by the batch producer.""" 35 | 36 | ids_np = np.array(ids, dtype=np.int64) 37 | types_np = np.array(types, dtype=np.int64) 38 | paddings_np = np.array(paddings, dtype=np.int64) 39 | sample = ({'text': ids_np, 40 | 'types': types_np, 41 | 'padding_mask': paddings_np, 42 | 'label': int(label), 43 | 'uid': int(unique_id)}) 44 | 45 | return sample 46 | 47 | 48 | def build_tokens_types_paddings_from_text(text_a, text_b, 49 | tokenizer, max_seq_length): 50 | """Build token types and paddings, trim if needed, and pad if needed.""" 51 | 52 | text_a_ids = tokenizer.tokenize(text_a) 53 | text_b_ids = None 54 | if text_b is not None: 55 | text_b_ids = tokenizer.tokenize(text_b) 56 | 57 | return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 58 | max_seq_length, tokenizer.cls, 59 | tokenizer.sep, tokenizer.pad) 60 | 61 | 62 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length, 63 | cls_id, sep_id, pad_id): 64 | """Build token types and paddings, trim if needed, and pad if needed.""" 65 | 66 | ids = [] 67 | types = [] 68 | paddings = [] 69 | 70 | # [CLS]. 71 | ids.append(cls_id) 72 | types.append(0) 73 | paddings.append(1) 74 | 75 | # A. 76 | len_text_a = len(text_a_ids) 77 | ids.extend(text_a_ids) 78 | types.extend([0] * len_text_a) 79 | paddings.extend([1] * len_text_a) 80 | 81 | # [SEP]. 82 | ids.append(sep_id) 83 | types.append(0) 84 | paddings.append(1) 85 | 86 | # B. 87 | if text_b_ids is not None: 88 | len_text_b = len(text_b_ids) 89 | ids.extend(text_b_ids) 90 | types.extend([1] * len_text_b) 91 | paddings.extend([1] * len_text_b) 92 | 93 | # Cap the size. 94 | trimmed = False 95 | if len(ids) >= max_seq_length: 96 | max_seq_length_m1 = max_seq_length - 1 97 | ids = ids[0:max_seq_length_m1] 98 | types = types[0:max_seq_length_m1] 99 | paddings = paddings[0:max_seq_length_m1] 100 | trimmed = True 101 | 102 | # [SEP]. 103 | if (text_b_ids is not None) or trimmed: 104 | ids.append(sep_id) 105 | if text_b_ids is None: 106 | types.append(0) 107 | else: 108 | types.append(1) 109 | paddings.append(1) 110 | 111 | # Padding. 112 | padding_length = max_seq_length - len(ids) 113 | if padding_length > 0: 114 | ids.extend([pad_id] * padding_length) 115 | types.extend([pad_id] * padding_length) 116 | paddings.extend([0] * padding_length) 117 | 118 | return ids, types, paddings 119 | -------------------------------------------------------------------------------- /tasks/eval_harness/download.py: -------------------------------------------------------------------------------- 1 | # Downloads the specified tasks in the evaluation harness 2 | # This is particularly useful when running in environments where the GPU nodes 3 | # do not have internet access. This way we can pre-download them and use the cached dataset during evaluation.
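To make the packing logic of `build_tokens_types_paddings_from_ids` above concrete, here is a small worked example. The `cls`/`sep`/`pad` ids (101/102/0) are illustrative values, not taken from any particular tokenizer:

```python
# Worked example for build_tokens_types_paddings_from_ids above.
from tasks.data_utils import build_tokens_types_paddings_from_ids

ids, types, paddings = build_tokens_types_paddings_from_ids(
    text_a_ids=[5, 6, 7], text_b_ids=[8, 9],
    max_seq_length=10, cls_id=101, sep_id=102, pad_id=0)

# [CLS] A A A [SEP] B B [SEP] pad pad
assert ids == [101, 5, 6, 7, 102, 8, 9, 102, 0, 0]
assert types == [0, 0, 0, 0, 0, 1, 1, 1, 0, 0]  # note: padded with pad_id
assert paddings == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
```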
4 | 5 | from lm_eval import tasks 6 | from lm_eval.tasks import ALL_TASKS 7 | import argparse 8 | import os 9 | 10 | 11 | parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False) 12 | parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.') 13 | args = parser.parse_args() 14 | 15 | def main(): 16 | task_list = ALL_TASKS if args.task_list == 'all' else args.task_list.split(',') 17 | tasks.get_task_dict(task_list) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | 23 | -------------------------------------------------------------------------------- /tasks/eval_harness/report-to-csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # this script converts results.json: 4 | # 5 | # "results": { 6 | # "arc_challenge": { 7 | # "acc": 0.24232081911262798, 8 | # "acc_stderr": 0.01252159329580012, 9 | # "acc_norm": 0.2764505119453925, 10 | # "acc_norm_stderr": 0.013069662474252425 11 | # }, 12 | # 13 | # into a format expected by a spreadsheet, which is: 14 | # 15 | # task metric value err 16 | # arc_challenge acc xxx yyy 17 | # arc_challenge acc_norm xxx yyy 18 | # arc_challenge f1 xxx yyy 19 | # 20 | # usage: 21 | # report-to-csv.py results.json 22 | 23 | 24 | import sys 25 | import json 26 | import io 27 | import csv 28 | 29 | results_file = sys.argv[1] 30 | 31 | csv_file = results_file.replace("json", "csv") 32 | 33 | print(f"Converting {results_file} to {csv_file}") 34 | 35 | with io.open(results_file, 'r', encoding='utf-8') as f: 36 | results = json.load(f) 37 | 38 | with io.open(csv_file, 'w', encoding='utf-8') as f: 39 | 40 | writer = csv.writer(f) 41 | writer.writerow(["task", "metric", "value", "err", "version"]) 42 | 43 | versions = results["versions"] 44 | 45 | for k,v in sorted(results["results"].items()): 46 | if k not in versions: 47 | versions[k] = -1 48 | 49 | if "acc" in v: 50 | writer.writerow([k, "acc", v["acc"], v["acc_stderr"], versions[k]]) 51 | if "acc_norm" in v: 52 | writer.writerow([k, "acc_norm", v["acc_norm"], v["acc_norm_stderr"], versions[k]]) 53 | if "f1" in v: 54 | writer.writerow([k, "f1", v["f1"], v["f1_stderr"] if "f1_stderr" in v else "", versions[k]]) 55 | # if "ppl" in v: 56 | # writer.writerow([k, "ppl", v["ppl"], v["ppl_stderr"], versions[k]]) 57 | # if "em" in v: 58 | # writer.writerow([k, "em", v["em"], v["em_stderr"] if "em_stderr" in v else "", versions[k]]) 59 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """GLUE dataset.""" 17 | 18 | from abc import ABC 19 | from abc import abstractmethod 20 | 21 | from torch.utils.data import Dataset 22 | 23 | from megatron import print_rank_0 24 | from tasks.data_utils import build_sample 25 | from tasks.data_utils import build_tokens_types_paddings_from_text 26 | 27 | 28 | class GLUEAbstractDataset(ABC, Dataset): 29 | """GLUE base dataset class.""" 30 | 31 | def __init__(self, task_name, dataset_name, datapaths, 32 | tokenizer, max_seq_length): 33 | # Store inputs. 34 | self.task_name = task_name 35 | self.dataset_name = dataset_name 36 | self.tokenizer = tokenizer 37 | self.max_seq_length = max_seq_length 38 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 39 | self.dataset_name)) 40 | # Process the files. 41 | string = ' > paths:' 42 | for path in datapaths: 43 | string += ' ' + path 44 | print_rank_0(string) 45 | self.samples = [] 46 | for datapath in datapaths: 47 | self.samples.extend(self.process_samples_from_single_path(datapath)) 48 | print_rank_0(' >> total number of samples: {}'.format( 49 | len(self.samples))) 50 | 51 | def __len__(self): 52 | return len(self.samples) 53 | 54 | def __getitem__(self, idx): 55 | raw_sample = self.samples[idx] 56 | ids, types, paddings = build_tokens_types_paddings_from_text( 57 | raw_sample['text_a'], raw_sample['text_b'], 58 | self.tokenizer, self.max_seq_length) 59 | sample = build_sample(ids, types, paddings, 60 | raw_sample['label'], raw_sample['uid']) 61 | return sample 62 | 63 | @abstractmethod 64 | def process_samples_from_single_path(self, datapath): 65 | """Abstract method that takes a single path / filename and 66 | returns a list of dataset samples, each sample being a dict of 67 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 68 | """ 69 | pass 70 | -------------------------------------------------------------------------------- /tasks/glue/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
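As a sketch of what a concrete task supplies on top of `GLUEAbstractDataset` above, here is a hypothetical TSV-backed subclass; the column layout and label map are invented for illustration (see `tasks/glue/mnli.py` below for a real implementation):

```python
# Hypothetical minimal GLUEAbstractDataset subclass (illustrative only).
from tasks.glue.data import GLUEAbstractDataset

LABELS = {'negative': 0, 'positive': 1}


class ToyPairDataset(GLUEAbstractDataset):

    def __init__(self, name, datapaths, tokenizer, max_seq_length):
        super().__init__('TOY', name, datapaths, tokenizer, max_seq_length)

    def process_samples_from_single_path(self, datapath):
        # Assumes each line is: <text_a> <TAB> <text_b> <TAB> <label>
        samples = []
        with open(datapath, 'r') as f:
            for uid, line in enumerate(f):
                text_a, text_b, label = line.strip().split('\t')
                samples.append({'text_a': text_a, 'text_b': text_b,
                                'label': LABELS[label], 'uid': uid})
        return samples
```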
15 | 16 | """GLUE finetuning/evaluation.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron import mpu 22 | from megatron.model.classification import Classification 23 | from tasks.eval_utils import accuracy_func_provider 24 | from tasks.finetune_utils import finetune 25 | 26 | 27 | def glue_classification(num_classes, Dataset, 28 | name_from_datapath_func): 29 | 30 | def train_valid_datasets_provider(): 31 | """Build train and validation dataset.""" 32 | args = get_args() 33 | tokenizer = get_tokenizer() 34 | 35 | train_dataset = Dataset('training', args.train_data, 36 | tokenizer, args.seq_length) 37 | valid_dataset = Dataset('validation', args.valid_data, 38 | tokenizer, args.seq_length) 39 | 40 | return train_dataset, valid_dataset 41 | 42 | def model_provider(pre_process=True, post_process=True): 43 | """Build the model.""" 44 | args = get_args() 45 | 46 | print_rank_0('building classification model for {} ...'.format( 47 | args.task)) 48 | model = Classification(num_classes=num_classes, num_tokentypes=2, 49 | pre_process=pre_process, post_process=post_process) 50 | 51 | return model 52 | 53 | def metrics_func_provider(): 54 | """Provide metrics callback function.""" 55 | def single_dataset_provider(datapath): 56 | args = get_args() 57 | tokenizer = get_tokenizer() 58 | 59 | name = name_from_datapath_func(datapath) 60 | return Dataset(name, [datapath], tokenizer, args.seq_length) 61 | return accuracy_func_provider(single_dataset_provider) 62 | 63 | """Finetune/evaluate.""" 64 | finetune(train_valid_datasets_provider, model_provider, 65 | end_of_epoch_callback_provider=metrics_func_provider) 66 | 67 | 68 | def main(): 69 | args = get_args() 70 | 71 | if args.task == 'MNLI': 72 | 73 | num_classes = 3 74 | from tasks.glue.mnli import MNLIDataset as Dataset 75 | 76 | def name_from_datapath(datapath): 77 | return datapath.split('MNLI')[-1].strip( 78 | '.tsv').strip('/').replace('_', '-') 79 | 80 | elif args.task == 'QQP': 81 | 82 | num_classes = 2 83 | from tasks.glue.qqp import QQPDataset as Dataset 84 | 85 | def name_from_datapath(datapath): 86 | return datapath.split('QQP')[-1].strip( 87 | '.tsv').strip('/').replace('_', '-') 88 | 89 | else: 90 | raise NotImplementedError('GLUE task {} is not implemented.'.format( 91 | args.task)) 92 | 93 | glue_classification(num_classes, Dataset, name_from_datapath) 94 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | 16 | """MNLI dataset.""" 17 | 18 | from megatron import print_rank_0 19 | from tasks.data_utils import clean_text 20 | from .data import GLUEAbstractDataset 21 | 22 | 23 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 24 | 25 | 26 | class MNLIDataset(GLUEAbstractDataset): 27 | 28 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 29 | test_label='contradiction'): 30 | self.test_label = test_label 31 | super().__init__('MNLI', name, datapaths, 32 | tokenizer, max_seq_length) 33 | 34 | def process_samples_from_single_path(self, filename): 35 | """Implement abstract method.""" 36 | print_rank_0(' > Processing {} ...'.format(filename)) 37 | 38 | samples = [] 39 | total = 0 40 | first = True 41 | is_test = False 42 | with open(filename, 'r') as f: 43 | for line in f: 44 | row = line.strip().split('\t') 45 | if first: 46 | first = False 47 | if len(row) == 10: 48 | is_test = True 49 | print_rank_0( 50 | ' reading {}, {} and {} columns and setting ' 51 | 'labels to {}'.format( 52 | row[0].strip(), row[8].strip(), 53 | row[9].strip(), self.test_label)) 54 | else: 55 | print_rank_0(' reading {}, {}, {}, and {} columns ' 56 | '...'.format( 57 | row[0].strip(), row[8].strip(), 58 | row[9].strip(), row[-1].strip())) 59 | continue 60 | 61 | text_a = clean_text(row[8].strip()) 62 | text_b = clean_text(row[9].strip()) 63 | unique_id = int(row[0].strip()) 64 | label = row[-1].strip() 65 | if is_test: 66 | label = self.test_label 67 | 68 | assert len(text_a) > 0 69 | assert len(text_b) > 0 70 | assert label in LABELS 71 | assert unique_id >= 0 72 | 73 | sample = {'text_a': text_a, 74 | 'text_b': text_b, 75 | 'label': LABELS[label], 76 | 'uid': unique_id} 77 | total += 1 78 | samples.append(sample) 79 | 80 | if total % 50000 == 0: 81 | print_rank_0(' > processed {} so far ...'.format(total)) 82 | 83 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 84 | return samples 85 | -------------------------------------------------------------------------------- /tasks/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 21 | os.path.pardir))) 22 | 23 | from megatron import get_args 24 | from megatron.initialize import initialize_megatron 25 | 26 | 27 | def get_tasks_args(parser): 28 | """Provide extra arguments required for tasks.""" 29 | group = parser.add_argument_group(title='tasks') 30 | 31 | group.add_argument('--task', type=str, required=True, 32 | help='Task name.') 33 | group.add_argument('--epochs', type=int, default=None, 34 | help='Number of finetuning epochs. 
Zero results in ' 35 | 'evaluation only.') 36 | group.add_argument('--pretrained-checkpoint', type=str, default=None, 37 | help='Pretrained checkpoint used for finetuning.') 38 | group.add_argument('--keep-last', action='store_true', 39 | help='Keep the last batch (maybe incomplete) in ' 40 | 'the data loader') 41 | group.add_argument('--train-data', nargs='+', default=None, 42 | help='Whitespace separated paths or corpora names ' 43 | 'for training.') 44 | group.add_argument('--valid-data', nargs='*', default=None, 45 | help='path(s) to the validation data.') 46 | group.add_argument('--overlapping-eval', type=int, default=32, 47 | help='Sliding window for overlapping evaluation.') 48 | group.add_argument('--strict-lambada', action='store_true', 49 | help='Use more difficult formulation of lambada.') 50 | # Retriever args 51 | group.add_argument('--qa-data-dev', type=str, default=None, 52 | help='Path to the QA dataset dev file.') 53 | group.add_argument('--qa-data-test', type=str, default=None, 54 | help='Path to the QA dataset test file.') 55 | 56 | # Faiss arguments for retriever 57 | group.add_argument('--faiss-use-gpu', action='store_true', 58 | help='Whether to create the FaissMIPSIndex on GPU') 59 | group.add_argument('--faiss-match', type=str, default='string', 60 | choices=['regex', 'string'], help='Answer matching ' 61 | 'logic type') 62 | group.add_argument('--faiss-topk-retrievals', type=int, default=100, 63 | help='Number of blocks to use as top-k during retrieval') 64 | 65 | return parser 66 | 67 | 68 | if __name__ == '__main__': 69 | 70 | initialize_megatron(extra_args_provider=get_tasks_args) 71 | 72 | args = get_args() 73 | 74 | if args.num_layers_per_virtual_pipeline_stage is not None: 75 | print("Interleaved pipeline schedule is not yet supported for downstream tasks.") 76 | exit() 77 | 78 | if args.task == 'RACE': 79 | from race.finetune import main 80 | elif args.task in ['MNLI', 'QQP']: 81 | from glue.finetune import main 82 | elif args.task in ['LAMBADA', 'WIKITEXT103']: 83 | from zeroshot_gpt.evaluate import main 84 | elif args.task in ['ICT-ZEROSHOT-NQ']: 85 | from orqa.evaluate_orqa import main 86 | else: 87 | raise NotImplementedError('Task {} is not implemented.'.format( 88 | args.task)) 89 | 90 | main() 91 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
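Both `tasks/main.py` above and `tasks/vision/main.py` below hook task-specific flags into Megatron through `initialize_megatron(extra_args_provider=...)`. A minimal sketch of that pattern (the flag itself is invented for illustration):

```python
# Sketch of the extra_args_provider pattern used by the task entry points.
# --my-task-flag is a made-up argument, only here to show the shape.
import argparse


def my_extra_args(parser):
    group = parser.add_argument_group(title='my tasks')
    group.add_argument('--my-task-flag', action='store_true',
                       help='Hypothetical task-specific switch.')
    return parser

# initialize_megatron(extra_args_provider=my_extra_args) then merges these
# options into Megatron's own parser before arguments are parsed.
```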
15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | from megatron import get_args 22 | from tasks.orqa.evaluate_utils import ORQAEvaluator 23 | 24 | def main(): 25 | """ 26 | Main program 27 | """ 28 | 29 | args = get_args() 30 | 31 | # Set up the model and evaluator 32 | evaluator = ORQAEvaluator() 33 | 34 | # Run evaluation 35 | if args.qa_data_dev is not None: 36 | evaluator.evaluate(args.qa_data_dev, "DEV") 37 | 38 | if args.qa_data_test is not None: 39 | evaluator.evaluate(args.qa_data_test, "TEST") 40 | 41 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Race.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron import get_tokenizer 21 | from megatron import mpu 22 | from megatron.model.multiple_choice import MultipleChoice 23 | from tasks.eval_utils import accuracy_func_provider 24 | from tasks.finetune_utils import finetune 25 | from tasks.race.data import RaceDataset 26 | 27 | 28 | def train_valid_datasets_provider(): 29 | """Provide train and validation datasets.""" 30 | args = get_args() 31 | tokenizer = get_tokenizer() 32 | 33 | train_dataset = RaceDataset('training', args.train_data, 34 | tokenizer, args.seq_length) 35 | valid_dataset = RaceDataset('validation', args.valid_data, 36 | tokenizer, args.seq_length) 37 | 38 | return train_dataset, valid_dataset 39 | 40 | 41 | def model_provider(pre_process=True, post_process=True): 42 | """Build the model.""" 43 | 44 | print_rank_0('building multichoice model for RACE ...') 45 | model = MultipleChoice(num_tokentypes=2, 46 | pre_process=pre_process, 47 | post_process=post_process) 48 | 49 | return model 50 | 51 | 52 | def metrics_func_provider(): 53 | """Provide metrics callback function.""" 54 | args = get_args() 55 | tokenizer = get_tokenizer() 56 | 57 | def single_dataset_provider(datapath): 58 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 59 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 60 | 61 | return accuracy_func_provider(single_dataset_provider) 62 | 63 | 64 | def main(): 65 | 66 | finetune(train_valid_datasets_provider, model_provider, 67 | end_of_epoch_callback_provider=metrics_func_provider) 68 | -------------------------------------------------------------------------------- /tasks/vision/classification.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Vision-classification finetuning/evaluation.""" 17 | 18 | from megatron import get_args 19 | from megatron import print_rank_0 20 | from megatron.model.vit_model import VitModel 21 | from megatron.data.vit_dataset import build_train_valid_datasets 22 | from tasks.vision.eval_utils import accuracy_func_provider 23 | from tasks.vision.finetune_utils import finetune 24 | 25 | 26 | def classification(): 27 | def train_valid_datasets_provider(): 28 | """Build train and validation dataset.""" 29 | args = get_args() 30 | 31 | train_ds, valid_ds = build_train_valid_datasets( 32 | data_path=args.data_path, 33 | crop_size=args.img_dim, 34 | ) 35 | return train_ds, valid_ds 36 | 37 | def model_provider(): 38 | """Build the model.""" 39 | args = get_args() 40 | 41 | print_rank_0("building classification model for ImageNet ...") 42 | 43 | return VitModel(num_classes=args.num_classes, finetune=True) 44 | 45 | """Finetune/evaluate.""" 46 | finetune( 47 | train_valid_datasets_provider, 48 | model_provider, 49 | end_of_epoch_callback_provider=accuracy_func_provider, 50 | ) 51 | 52 | 53 | def main(): 54 | classification() 55 | -------------------------------------------------------------------------------- /tasks/vision/eval_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Evaluation utilities.""" 17 | 18 | import os 19 | import torch 20 | from megatron import get_args 21 | from megatron import print_rank_0 22 | from megatron import mpu 23 | from tasks.vision.finetune_utils import build_data_loader 24 | from tasks.vision.finetune_utils import process_batch 25 | from torchvision import datasets, transforms 26 | 27 | 28 | def accuracy_func_provider(): 29 | """Provide function that calculates accuracies.""" 30 | args = get_args() 31 | data_path = args.data_path 32 | crop_size = args.img_dim 33 | 34 | # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] 35 | # Build dataloaders. 
36 | val_data_path = os.path.join(data_path[0], "val") 37 | normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) 38 | transform_val = transforms.Compose( 39 | [ 40 | transforms.Resize(crop_size), 41 | transforms.CenterCrop(crop_size), 42 | transforms.ToTensor(), 43 | normalize, 44 | ] 45 | ) 46 | dataset = datasets.ImageFolder(root=val_data_path, transform=transform_val) 47 | 48 | dataloader = build_data_loader( 49 | dataset, 50 | args.micro_batch_size, 51 | num_workers=args.num_workers, 52 | drop_last=(mpu.get_data_parallel_world_size() > 1), 53 | ) 54 | 55 | def metrics_func(model, epoch): 56 | print_rank_0("calculating metrics ...") 57 | correct, total = calculate_correct_answers(model, dataloader, epoch) 58 | percent = float(correct) * 100.0 / float(total) 59 | print_rank_0( 60 | " >> |epoch: {}| overall: correct / total = {} / {} = " 61 | "{:.4f} %".format(epoch, correct, total, percent) 62 | ) 63 | 64 | return metrics_func 65 | 66 | 67 | def calculate_correct_answers(model, dataloader, epoch): 68 | """Calculate correct over total answers""" 69 | 70 | model.eval() 71 | with torch.no_grad(): 72 | # For all the batches in the dataset. 73 | total = 0 74 | correct = 0 75 | for _, batch in enumerate(dataloader): 76 | # Run the model forward. 77 | images, labels = process_batch(batch) 78 | logits = model(images).contiguous().float() 79 | # Add output predictions. 80 | # Compute the correct answers. 81 | predicted = torch.argmax(logits, dim=-1) 82 | corrects = (predicted == labels).float() 83 | # Add to the counters. 84 | total += labels.size(0) 85 | correct += corrects.sum().item() 86 | model.train() 87 | 88 | # Reduce. 89 | unreduced = torch.cuda.LongTensor([correct, total]) 90 | torch.distributed.all_reduce(unreduced, group=mpu.get_data_parallel_group()) 91 | 92 | # Print on screen. 93 | correct_ans = unreduced[0].item() 94 | total_count = unreduced[1].item() 95 | return correct_ans, total_count 96 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Main tasks functionality.""" 17 | 18 | import os 19 | import sys 20 | 21 | sys.path.append( 22 | os.path.abspath( 23 | os.path.join( 24 | os.path.join(os.path.dirname(__file__), os.path.pardir), 25 | os.path.pardir, 26 | ) 27 | ) 28 | ) 29 | from megatron import get_args 30 | from megatron.initialize import initialize_megatron 31 | from classification import main 32 | 33 | 34 | def get_tasks_args(parser): 35 | """Provide extra arguments required for tasks.""" 36 | group = parser.add_argument_group(title="tasks") 37 | 38 | group.add_argument( 39 | "--epochs", 40 | type=int, 41 | default=None, 42 | help="Number of finetuning epochs. 
Zero results in " 43 | "evaluation only.", 44 | ) 45 | group.add_argument( 46 | "--pretrained-checkpoint", 47 | type=str, 48 | default=None, 49 | help="Pretrained checkpoint used for finetuning.", 50 | ) 51 | group.add_argument( 52 | "--keep-last", 53 | action="store_true", 54 | help="Keep the last batch (maybe incomplete) in " "the data loader", 55 | ) 56 | 57 | return parser 58 | 59 | 60 | if __name__ == "__main__": 61 | 62 | initialize_megatron(extra_args_provider=get_tasks_args) 63 | args = get_args() 64 | main() 65 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Detokenization.""" 17 | 18 | import re 19 | 20 | 21 | def ptb_detokenizer(string): 22 | string = string.replace(" '", "'") 23 | string = string.replace(" \n", "\n") 24 | string = string.replace("\n ", "\n") 25 | string = string.replace(" n't", "n't") 26 | string = string.replace(" N ", "1 ") 27 | string = string.replace("$ 1", "$1") 28 | string = string.replace("# 1", "#1") 29 | return string 30 | 31 | 32 | def wikitext_detokenizer(string): 33 | # contractions 34 | string = string.replace("s '", "s'") 35 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 36 | # number separators 37 | string = string.replace(" @-@ ", "-") 38 | string = string.replace(" @,@ ", ",") 39 | string = string.replace(" @.@ ", ".") 40 | # punctuation 41 | string = string.replace(" : ", ": ") 42 | string = string.replace(" ; ", "; ") 43 | string = string.replace(" . ", ". ") 44 | string = string.replace(" ! ", "! ") 45 | string = string.replace(" ? ", "? ") 46 | string = string.replace(" , ", ", ") 47 | # double brackets 48 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 49 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 50 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 51 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 52 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 53 | # miscellaneous 54 | string = string.replace("= = = =", "====") 55 | string = string.replace("= = =", "===") 56 | string = string.replace("= =", "==") 57 | string = string.replace(" " + chr(176) + " ", chr(176)) 58 | string = string.replace(" \n", "\n") 59 | string = string.replace("\n ", "\n") 60 | string = string.replace(" N ", " 1 ") 61 | string = string.replace(" 's", "'s") 62 | 63 | return string 64 | 65 | 66 | def lambada_detokenizer(string): 67 | return string 68 | 69 | 70 | _DETOKENIZERS = { 71 | 'ptb': ptb_detokenizer, 72 | 'wiki': wikitext_detokenizer, 73 | 'lambada': lambada_detokenizer, 74 | } 75 | 76 | 77 | def get_detokenizer(path): 78 | for key in _DETOKENIZERS.keys(): 79 | if key in path: 80 | return _DETOKENIZERS[key] 81 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Testing 2 | 3 | This test suite heavily borrows from [HF Transformers](https://github.com/huggingface/transformers/), therefore you can refer to its [testing docs](https://huggingface.co/transformers/testing.html) for in-depth details, in particular for writing new tests: we have access to a lot of helper classes and functions, so you can write tests very quickly without needing to reinvent the wheel. 4 | 5 | The foundation is `pytest`, which allows you to write normal `pytest` tests, but we also use a lot of unit tests in particular via `TestCasePlus` which extends `unittest` and provides additional rich functionality. 6 | 7 | ## Running the tests 8 | 9 | ``` 10 | make test 11 | ``` 12 | or: 13 | 14 | ``` 15 | pytest tests 16 | ``` 17 | 18 | Important: the first time you run this, it can take several minutes to build all the Megatron CUDA kernels, as well as the DeepSpeed kernels if you haven't pre-built the latter. 19 | 20 | For various other options please see the doc mentioned at the very top. 21 | 22 | You will want at least 1 GPU available, and ideally 2, to run the tests. 23 | 24 | ## CI 25 | 26 | The CI setup is documented [here](../.github/workflows/ci.md). 27 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | # tests directory-specific settings - this file is run automatically 16 | # by pytest before any tests are run 17 | 18 | import sys 19 | import warnings 20 | from os.path import abspath, dirname, join 21 | 22 | 23 | # allow having multiple repository checkouts and not needing to remember to rerun 24 | # 'pip install -e .[dev]' when switching between checkouts and running tests. 25 | git_repo_path = abspath(join(dirname(dirname(__file__)))) 26 | sys.path.insert(1, git_repo_path) 27 | 28 | # silence FutureWarning warnings in tests since often we can't act on them until 29 | # they become normal warnings - i.e. the tests still need to test the current functionality 30 | warnings.simplefilter(action="ignore", category=FutureWarning) 31 | 32 | 33 | def pytest_sessionfinish(session, exitstatus): 34 | # If no tests are collected, pytest exits with code 5, which makes the CI fail. 35 | if exitstatus == 5: 36 | session.exitstatus = 0 37 | -------------------------------------------------------------------------------- /tests/data/gpt2/README.md: -------------------------------------------------------------------------------- 1 | Dataset used for testing. 2 | 3 | `ag_news_prompt*`: manually generated from dataset available at https://huggingface.co/datasets/TimeRobber/ag_news_classify_question_first_100 -------------------------------------------------------------------------------- /tests/data/gpt2/ag_news_prompt_inputs_document.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/ag_news_prompt_inputs_document.bin -------------------------------------------------------------------------------- /tests/data/gpt2/ag_news_prompt_inputs_document.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/ag_news_prompt_inputs_document.idx -------------------------------------------------------------------------------- /tests/data/gpt2/ag_news_prompt_targets_document.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/ag_news_prompt_targets_document.bin -------------------------------------------------------------------------------- /tests/data/gpt2/ag_news_prompt_targets_document.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/ag_news_prompt_targets_document.idx -------------------------------------------------------------------------------- /tests/data/gpt2/generate_ag_news_mtf_dataset.sh: -------------------------------------------------------------------------------- 1 | python -c "from datasets import load_dataset; load_dataset('TimeRobber/ag_news_classify_question_first_100', split='train').to_json('ag_news_classify_question_first_100.jsonl')" 2 | 3 | python tools/preprocess_data.py \ 4 | --input ag_news_classify_question_first_100.jsonl \ 5 | --output-prefix tests/data/gpt2/ag_news_prompt \ 6 | --dataset-impl mmap \ 7 | --json-key targets \ 8 | --tokenizer-type PretrainedFromHF \ 9 | --tokenizer-name-or-path bigscience/tokenizer \ 10 | 
--append-eod \ 11 | --workers 8 12 | 13 | python tools/preprocess_data.py \ 14 | --input ag_news_classify_question_first_100.jsonl \ 15 | --output-prefix tests/data/gpt2/ag_news_prompt \ 16 | --dataset-impl mmap \ 17 | --json-key inputs \ 18 | --tokenizer-type PretrainedFromHF \ 19 | --tokenizer-name-or-path bigscience/tokenizer \ 20 | --workers 8 21 | 22 | rm ag_news_classify_question_first_100.jsonl 23 | -------------------------------------------------------------------------------- /tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin -------------------------------------------------------------------------------- /tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigscience-workshop/Megatron-DeepSpeed/8387ae17c4704f6579f88a84500b535d19d7fbbf/tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx -------------------------------------------------------------------------------- /tests/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "gradient_clipping": 1.0, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "fp16": { 9 | "enabled": true, 10 | "loss_scale": 0, 11 | "loss_scale_window": 500, 12 | "hysteresis": 2, 13 | "min_loss_scale": 1, 14 | "initial_scale_power": 12 15 | }, 16 | "zero_allow_untested_optimizer": true, 17 | "steps_per_print": 2000, 18 | "wall_clock_breakdown": false 19 | } 20 | -------------------------------------------------------------------------------- /tests/ds_config_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "gradient_clipping": 1.0, 5 | "zero_optimization": { 6 | "stage": 0 7 | }, 8 | "bf16": { 9 | "enabled": true 10 | }, 11 | "zero_allow_untested_optimizer": true, 12 | "steps_per_print": 2000, 13 | "wall_clock_breakdown": false 14 | } 15 | -------------------------------------------------------------------------------- /tests/ds_config_cl.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "gradient_clipping": 1.0, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "fp16": { 9 | "enabled": true, 10 | "loss_scale": 0, 11 | "loss_scale_window": 500, 12 | "hysteresis": 2, 13 | "min_loss_scale": 1, 14 | "initial_scale_power": 12 15 | }, 16 | "curriculum_learning": { 17 | "enabled": true, 18 | "curriculum_type": "seqlen", 19 | "min_difficulty": 8, 20 | "max_difficulty": 128, 21 | "schedule_type": "fixed_linear", 22 | "schedule_config": { 23 | "total_curriculum_step": 30, 24 | "difficulty_step": 4 25 | } 26 | }, 27 | "steps_per_print": 2000, 28 | "wall_clock_breakdown": false 29 | } 30 | -------------------------------------------------------------------------------- /tests/ds_config_inference.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_micro_batch_size_per_gpu": 1, 3 | "train_batch_size": 16, 4 | "fp16": { 5 | "enabled": true, 6 | "loss_scale": 0, 7 | "loss_scale_window": 
500, 8 | "hysteresis": 2, 9 | "min_loss_scale": 1, 10 | "initial_scale_power": 12 11 | }, 12 | "zero_allow_untested_optimizer": false, 13 | "steps_per_print": 2000, 14 | "wall_clock_breakdown": false 15 | } 16 | -------------------------------------------------------------------------------- /tests/test_activations.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import torch 5 | from torch.nn import functional as F 6 | 7 | from megatron.model.glu_activations import GLU_ACTIVATIONS, geglu, liglu, reglu, swiglu 8 | from megatron.testing_utils import set_seed, torch_assert_equal 9 | 10 | 11 | class TestActivations(unittest.TestCase): 12 | def setUp(self): 13 | """setup an input of reasonable size""" 14 | set_seed() 15 | self.batch_size = random.randint(2, 64) 16 | self.seq_len = random.randint(256, 1025) 17 | self.num_channels = random.randint(1, 384) * 2 18 | self.x = torch.randn(self.batch_size, self.seq_len, self.num_channels) 19 | self.x1, self.x2 = self.x.chunk(2, dim=-1) 20 | # glu should halve the last dimension 21 | self.output_shape = [self.batch_size, self.seq_len, self.num_channels // 2] 22 | 23 | def test_shapes(self): 24 | for activation_fn in GLU_ACTIVATIONS.values(): 25 | output = activation_fn(self.x) 26 | self.assertEqual(list(output.shape), self.output_shape) 27 | 28 | def test_liglu(self): 29 | expected = self.x1 * self.x2 30 | torch_assert_equal(liglu(self.x), expected) 31 | 32 | def test_geglu(self): 33 | expected = self.x1 * F.gelu(self.x2) 34 | torch_assert_equal(geglu(self.x), expected) 35 | 36 | def test_reglu(self): 37 | expected = self.x1 * F.relu(self.x2) 38 | torch_assert_equal(reglu(self.x), expected) 39 | 40 | def test_swiglu(self): 41 | expected = self.x1 * F.silu(self.x2) 42 | torch_assert_equal(swiglu(self.x), expected) 43 | 44 | # from megatron.testing_utils import require_torch_bf16 45 | # @require_torch_bf16 46 | # def test_bf16_jit(self): 47 | # x_bf16 = self.x.to(torch.bfloat16) 48 | # for activation_fn in GLU_ACTIVATIONS.values(): 49 | # output = activation_fn(x_bf16) 50 | # self.assertEqual(list(output.shape), self.output_shape) 51 | -------------------------------------------------------------------------------- /tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | -------------------------------------------------------------------------------- /tests/tools/README.md: -------------------------------------------------------------------------------- 1 | # Test suite tools 2 | 3 | # Make tiny tokenizer files 4 | 5 | currently for gpt2 run: 6 | ``` 7 | ./shrink-tokenizer.py 8 | ``` 9 | 10 | and then we have tiny vocab and merge files under the generated dir `tiny` to add to repo under `data/gpt2`. 11 | 12 | ``` 13 | cp tiny/merges.txt ../data/gpt2/gpt2-tiny-merges.txt 14 | cp tiny/vocab.json ../data/gpt2/gpt2-tiny-vocab.json 15 | ``` 16 | 17 | Note, the tiny vocab was set to 5000 items after experimenting with the resulting index files size. Using a tiny vocab of 500 (and adjusted merge entries) proved to generate very large index files, so it actually ends up costing more in final file size. 5000 proved to generate an almost identical index files as with the original 50k vocab size. 
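To sanity-check the shrunken tokenizer before committing the files, a quick load test can be run (a sketch, assuming the generated `tiny` dir from the step above):

```
python -c "
from transformers import GPT2Tokenizer
tok = GPT2Tokenizer(vocab_file='tiny/vocab.json', merges_file='tiny/merges.txt')
print(len(tok))  # expect 5000
print(tok.tokenize('hello world'))  # should still split into plausible sub-words
"
```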
18 | 19 | 20 | ## Make tiny pre-processed index 21 | 22 | To be used in test training. 23 | 24 | ``` 25 | ./openwebtext-to-jsonl.py 26 | ``` 27 | 28 | This generates: 29 | 30 | ``` 31 | openwebtext-1000.jsonl 32 | ``` 33 | 34 | We don't want to store the jsonl in the repo, to keep its size small, so it's a temp file. 35 | 36 | Now we pre-process it: 37 | 38 | ``` 39 | cd ../.. 40 | input=tests/tools/openwebtext-1000.jsonl 41 | python tools/preprocess_data.py \ 42 | --input $input \ 43 | --output-prefix tests/data/gpt2/meg-gpt2-openwebtext \ 44 | --dataset-impl mmap \ 45 | --tokenizer-type GPT2BPETokenizer \ 46 | --merge-file tests/data/gpt2/gpt2-tiny-merges.txt \ 47 | --vocab tests/data/gpt2/gpt2-tiny-vocab.json \ 48 | --append-eod \ 49 | --workers 6 50 | ``` 51 | 52 | and voilà, we now have: 53 | ``` 54 | ls -sh1 tests/data/gpt2/meg-gpt2-openwebtext* 55 | 2.6M tests/data/gpt2/meg-gpt2-openwebtext_text_document.bin 56 | 20K tests/data/gpt2/meg-gpt2-openwebtext_text_document.idx 57 | ``` 58 | which we can now commit and use in tests. 59 | -------------------------------------------------------------------------------- /tests/tools/openwebtext-to-jsonl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # generate a jsonl version of a small slice of a dataset that can be fed to megatron-lm preprocessor 4 | 5 | import sys 6 | from datasets import load_dataset 7 | 8 | dataset_name = "stas/openwebtext-10k" 9 | 10 | # subset to jsonlines 11 | n_samples = 1000 12 | ds = load_dataset(dataset_name, split='train') 13 | ds_small = ds.select(range(n_samples)) 14 | path = f"openwebtext-{n_samples}.jsonl" 15 | ds_small.to_json(path, orient="records", lines=True) 16 | -------------------------------------------------------------------------------- /tests/tools/shrink-tokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # produce a tiny tokenizer which we can use in testing (so that it won't take much space in the repo) 4 | 5 | import json 6 | from transformers import AutoTokenizer 7 | from tokenizers import Tokenizer 8 | 9 | mname = "gpt2" 10 | 11 | vocab_keep_items = 5000 12 | 13 | tokenizer = AutoTokenizer.from_pretrained(mname, use_fast=True) 14 | assert tokenizer.is_fast, "This only works for fast tokenizers."
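# note: the block below serializes the fast tokenizer's backing Rust tokenizer to JSON, shrinks its vocab (and, for BPE, its merges) in place, and reloads it at the end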
15 | tokenizer_json = json.loads(tokenizer._tokenizer.to_str()) 16 | vocab = tokenizer_json["model"]["vocab"] 17 | if tokenizer_json["model"]["type"] == "BPE": 18 | if "gpt2" in mname: 19 | new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items-1 } 20 | new_vocab["<|endoftext|>"] = vocab_keep_items-1 21 | else: 22 | new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items } 23 | merges = tokenizer_json["model"]["merges"] 24 | new_merges = [] 25 | for i in range(len(merges)): 26 | a, b = merges[i].split() 27 | new_token = "".join((a, b)) 28 | if a in new_vocab and b in new_vocab and new_token in new_vocab: 29 | new_merges.append(merges[i]) 30 | tokenizer_json["model"]["merges"] = new_merges 31 | elif tokenizer_json["model"]["type"] == "Unigram": 32 | new_vocab = vocab[:vocab_keep_items] 33 | elif tokenizer_json["model"]["type"] == "WordPiece" or tokenizer_json["model"]["type"] == "WordLevel": 34 | new_vocab = { token: i for token, i in vocab.items() if i < vocab_keep_items } 35 | else: 36 | raise ValueError(f"don't know how to handle {tokenizer_json['model']['type']}") 37 | tokenizer_json["model"]["vocab"] = new_vocab 38 | tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json)) 39 | tokenizer.save_pretrained("tiny") 40 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Tools 3 | 4 | - [sample_idxs_to_text.py](./sample_idxs_to_text.py) - want to see which text was fed at specific iterations? For example, to understand why the training went astray? Then use this script. The preamble of the script contains the documentation and usage examples. 5 | 6 | 7 | ## A few notes on how we created the datasets: 8 | 9 | ### Creating the Json Lines text file 10 | 11 | First you need to create a jsonl file containing your dataset. For this we exported from the HF-datasets format. For example for C4: 12 | 13 | ``` 14 | from datasets import load_dataset 15 | c4 = load_dataset("c4", "en") 16 | c4["train"].to_json("c4_en_train.jsonl") 17 | c4["validation"].to_json("c4_en_valid.jsonl") 18 | ``` 19 | 20 | This creates quite a large file compared to the size of the HF dataset on disk (810GB vs 305GB for C4, for example). 21 | 22 | ### Megatron pre-processing 23 | 24 | Then you need to pass that text file to the `preprocess_data.py` script for tokenization and memory-mapping, creating two files, one to store the token indices and one to store the document starts and ends. The result will be slightly bigger than the text dataset (360GB vs 305GB for C4, for example). You can choose one of the default Megatron tokenizers (but then you have to pass merges and vocab files) or one from HF-tokenizers. For example, in our GPT-like models reusing a T5 sentencepiece-bpe tokenizer: 25 | 26 | `python tools/preprocess_data.py --input ~/c4_en_train.jsonl --output-prefix c4_en_train --dataset-impl mmap --tokenizer-type PretrainedFromHF --tokenizer-name-or-path t5-small --workers 30 --append-eod` 27 | 28 | Do note that adding too many workers can be counterproductive for very large datasets: as the bottleneck becomes disk writing, the intermediate per-process results pool up and can flood the RAM. In our experiments on GCP machines, running with 60 workers on C4 inevitably led the program to fail. A sharded export, as sketched below, is one way around both the giant jsonl and the RAM pressure.
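For such cases, one workaround is to export the jsonl in shards and pre-process each shard separately with a modest `--workers` count. A minimal sketch (the shard count and file naming are arbitrary choices, not project conventions):

```
from datasets import load_dataset

c4 = load_dataset("c4", "en", split="train")
n_shards = 8  # arbitrary; pick based on available RAM and disk throughput
for i in range(n_shards):
    # Dataset.shard() returns the i-th of n_shards slices of the dataset
    shard = c4.shard(num_shards=n_shards, index=i)
    shard.to_json(f"c4_en_train_{i:02d}.jsonl", orient="records", lines=True)
```

Each resulting `c4_en_train_*.jsonl` can then be passed to `tools/preprocess_data.py` as above.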
29 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/deepspeed_to_transformers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import torch 5 | import json 6 | import sys 7 | from pathlib import Path 8 | 9 | # insert megatron's root dir into sys.path 10 | root_repo_path = str(Path(__file__).resolve().parents[2]) 11 | if root_repo_path not in sys.path: 12 | sys.path.insert(0, root_repo_path) 13 | 14 | from deepspeed.checkpoint import DeepSpeedCheckpoint 15 | from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments 16 | 17 | # the import was tested to work with this version 18 | # https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider 19 | # copying that version here instead 20 | from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint 21 | from transformers import GPT2Config 22 | 23 | 24 | def main(): 25 | 26 | # this first part comes mainly from deepspeed_to_megatron.main 27 | args = parse_arguments() 28 | print( 29 | f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}' 30 | ) 31 | 32 | ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, 33 | args.target_pp) 34 | iteration = ds_checkpoint.get_iteration() 35 | input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, 36 | args.for_release) 37 | 38 | # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main 39 | # Spell out all parameters in case the defaults change. 40 | config = GPT2Config( 41 | vocab_size=50257, 42 | n_positions=1024, 43 | n_ctx=1024, 44 | n_embd=1024, 45 | n_layer=24, 46 | n_head=16, 47 | n_inner=4096, 48 | activation_function="gelu", # used to be "gelu_new" in earlier versions 49 | resid_pdrop=0.1, 50 | embd_pdrop=0.1, 51 | attn_pdrop=0.1, 52 | layer_norm_epsilon=1e-5, 53 | initializer_range=0.02, 54 | summary_type="cls_index", 55 | summary_use_proj=True, 56 | summary_activation=None, 57 | summary_proj_to_labels=True, 58 | summary_first_dropout=0.1, 59 | scale_attn_weights=True, 60 | gradient_checkpointing=False, 61 | use_cache=True, 62 | bos_token_id=50256, 63 | eos_token_id=50256, 64 | ) 65 | 66 | # Convert. 67 | print("Converting to HF Checkpoint") 68 | output_state_dict = convert_megatron_checkpoint(args, input_state_dict, 69 | config) 70 | 71 | basename = args.output_folder 72 | os.makedirs(basename, exist_ok=True) 73 | 74 | # Print the structure of converted state dict. 75 | #if args.print_checkpoint_structure: 76 | # recursive_print(None, output_state_dict) 77 | 78 | # Store the config to file. 79 | output_config_file = os.path.join(basename, "config.json") 80 | output_config = config.to_dict() 81 | output_config["architectures"] = ["GPT2LMHeadModel"] 82 | output_config["model_type"] = "gpt2" 83 | print(f'Saving config to "{output_config_file}"') 84 | with open(output_config_file, "w") as f: 85 | json.dump(output_config, f) 86 | 87 | # Store the state_dict to file. 
88 | output_checkpoint_file = os.path.join(basename, "pytorch_model.bin") 89 | print(f'Saving checkpoint to "{output_checkpoint_file}"') 90 | torch.save(output_state_dict, output_checkpoint_file) 91 | 92 | print("Now add tokenizer files and upload to the hub") 93 | 94 | 95 | if __name__ == "__main__": 96 | main() 97 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_checkpoint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch 3 | import os 4 | from collections import OrderedDict 5 | from pathlib import Path 6 | 7 | # insert megatron's root dir into sys.path 8 | root_repo_path = str(Path(__file__).resolve().parents[2]) 9 | if root_repo_path not in sys.path: 10 | sys.path.insert(0, root_repo_path) 11 | 12 | 13 | def dump_data(datum, name_list=[]): 14 | if type(datum) in (dict, OrderedDict): 15 | for k, v in datum.items(): 16 | dump_data(v, name_list + [str(k)]) 17 | elif type(datum) in (list, tuple): 18 | for v in datum: 19 | dump_data(v, name_list) 20 | elif torch.is_tensor(datum): 21 | prefix = '.'.join(name_list) 22 | print(f'[tensor] {prefix} = {datum.shape}') 23 | else: 24 | #pass 25 | prefix = '.'.join(name_list) 26 | print(f'[other] {prefix} = {datum}') 27 | 28 | 29 | def main(): 30 | if len(sys.argv) < 2: 31 | print(f'Usage: {sys.argv[0]} <checkpoint file>') 32 | exit(1) 33 | 34 | ckpt_file = sys.argv[1] 35 | if not os.path.isfile(ckpt_file): 36 | print(f'{ckpt_file} is not a valid file') 37 | exit(1) 38 | 39 | print(f'loading checkpoint file: {ckpt_file}') 40 | sd = torch.load(ckpt_file, map_location=torch.device('cpu')) 41 | dump_data(sd) 42 | 43 | quit() 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /tools/convert_checkpoint/inspect_deepspeed_checkpoint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | # insert megatron's root dir into sys.path 5 | root_repo_path = str(Path(__file__).resolve().parents[2]) 6 | if root_repo_path not in sys.path: 7 | sys.path.insert(0, root_repo_path) 8 | 9 | import argparse 10 | 11 | from deepspeed.checkpoint import DeepSpeedCheckpoint 12 | 13 | 14 | def list_files(file_list, tag): 15 | print(f'Listing files: {tag}') 16 | for i, file in enumerate(file_list): 17 | print(f'{i+1}: {file}') 18 | 19 | 20 | def parse_arguments(): 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--folder', 23 | default=None, 24 | type=str, 25 | help='DeepSpeed Checkpoint folder') 26 | parser.add_argument('--target_tp', 27 | default=None, 28 | type=int, 29 | help='Target TP degree') 30 | parser.add_argument('--target_pp', 31 | default=None, 32 | type=int, 33 | help='Target PP degree') 34 | args = parser.parse_args() 35 | print(f'args = {args}') 36 | return args 37 | 38 | 39 | def show_input_files(ds_checkpoint): 40 | list_files(ds_checkpoint.file_list, 'all') 41 | list_files(ds_checkpoint.zero_files, 'zero') 42 | list_files(ds_checkpoint.layer_files, 'layer') 43 | list_files(ds_checkpoint.mp_rank_files, 'mp rank') 44 | 45 | 46 | def show_simple_state(ds_checkpoint): 47 | print(f'layer keys = {ds_checkpoint.layer_keys}') 48 | print(f'layer count = {ds_checkpoint.layer_count}') 49 | 50 | print( 51 | f'tp_degree_count = {ds_checkpoint.original_tp_degree} ------> {ds_checkpoint.tp_degree}' 52 | ) 53 | print( 54 | f'pp_degree_count = {ds_checkpoint.original_pp_degree} 
------> {ds_checkpoint.pp_degree}' 55 | ) 56 | print(f'dp_degree_count = {ds_checkpoint.dp_degree}') 57 | ds_checkpoint.old_2d_map.print_data('old 2d map ==>') 58 | ds_checkpoint.new_2d_map.print_data('new 2d map ==>') 59 | 60 | 61 | def show_mappings(ds_checkpoint): 62 | ds_checkpoint.show_pp_tranformer_map() 63 | ds_checkpoint.show_transformer_file_map() 64 | ds_checkpoint.show_tp_embedding_map() 65 | ds_checkpoint.show_tp_final_norm_map() 66 | ds_checkpoint.show_2d_mapping() 67 | 68 | 69 | def show_state_summary(tag, sd): 70 | summary = {k: v.shape for k, v in sd.items()} 71 | print(f'{tag} = {summary}') 72 | 73 | 74 | def show_embedding_states(ds_checkpoint): 75 | for i in range(0, ds_checkpoint.tp_degree): 76 | sd = ds_checkpoint.get_embedding_state(i) 77 | show_state_summary(f'embedding[{i}]', sd) 78 | 79 | 80 | def show_final_norm_states(ds_checkpoint): 81 | for i in range(0, ds_checkpoint.tp_degree): 82 | sd = ds_checkpoint.get_final_norm_state(i) 83 | show_state_summary(f'final_norm[{i}]', sd) 84 | 85 | 86 | def show_transformer_states(ds_checkpoint): 87 | for i in range(0, ds_checkpoint.tp_degree): 88 | for j in range(0, ds_checkpoint.pp_degree): 89 | state_list = ds_checkpoint.get_transformer_state(tp_index=i, 90 | pp_index=j) 91 | print(f'tp_pp_rank[{i},{j}] = ') 92 | for k, sd in enumerate(state_list): 93 | show_state_summary(f' block[{k}]', sd) 94 | print("") 95 | 96 | 97 | def main(): 98 | print(f'Inspecting DeepSpeed Checkpoint') 99 | args = parse_arguments() 100 | 101 | ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, 102 | args.target_pp) 103 | ds_checkpoint.validate_files() 104 | 105 | show_simple_state(ds_checkpoint) 106 | show_input_files(ds_checkpoint) 107 | show_simple_state(ds_checkpoint) 108 | show_mappings(ds_checkpoint) 109 | show_embedding_states(ds_checkpoint) 110 | show_final_norm_states(ds_checkpoint) 111 | show_transformer_states(ds_checkpoint) 112 | checkpoint_args = ds_checkpoint.get_args() 113 | print(f'checkpoint args = {checkpoint_args}') 114 | 115 | 116 | if __name__ == "__main__": 117 | main() 118 | -------------------------------------------------------------------------------- /tools/create_doc_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 4 | os.path.pardir))) 5 | 6 | from megatron import print_rank_0 7 | from megatron.indexer import IndexBuilder 8 | from megatron.initialize import initialize_megatron 9 | 10 | 11 | def main(): 12 | """Create a BlockData data structure by running an IndexBuilder over an ICT Dataset 13 | - Include all args needed for initial model specification 14 | 15 | Other key args: 16 | --block-data-path: path to write to 17 | --ict-load or --realm-load: path to checkpoint with which to embed 18 | --data-path and --titles-data-path: paths for dataset 19 | --indexer-log-interval: reporting interval 20 | --indexer-batch-size: size specific for indexer jobs 21 | 22 | Check README.md for example script 23 | """ 24 | 25 | initialize_megatron(extra_args_provider=None, 26 | args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) 27 | index_builder = IndexBuilder() 28 | index_builder.build_and_save_index() 29 | print_rank_0("Build and save indices: done!") 30 | 31 | if __name__ == "__main__": 32 | main() 33 | 34 | -------------------------------------------------------------------------------- /tools/linter.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if fname.endswith(".py")) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if fname.endswith(".py"))) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command.split()) # check_call expects an argument list, not a shell string 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/logs/rescale-logs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script fixes up BigScience log files by adjusting and fixing 5 | # units of logged values to be seconds instead of milliseconds. 6 | # It does the modification in-place (so make back ups!). 7 | # 8 | # Example: 9 | # 10 | # find . -name "*.out*" -print0 | xargs -0 -P 8 rescale-logs.py 11 | # 12 | # See also the discussion in 13 | # https://github.com/bigscience-workshop/Megatron-DeepSpeed/issues/236. 14 | # 15 | # This script is derived from https://stackoverflow.com/a/60080531/9201239 16 | # and https://gist.github.com/stas00/4cd1651d1c8f01196ea322c733bde46c. 17 | 18 | import os 19 | import re 20 | import sys 21 | 22 | LINE_START_RE = re.compile(' ?iteration') 23 | ELAPSED_TIME_RE = re.compile(r'elapsed time per iteration \(ms\): ([0-9.]+)') 24 | SAMPLES_PER_SEC_RE = re.compile('samples per second: ([0-9.]+)') 25 | 26 | 27 | def rescale_logs(log_file_path): 28 | new_log_file_path = log_file_path + '.new' 29 | with open(log_file_path, 'r') as log_file: 30 | with open(new_log_file_path, 'w') as new_log_file: 31 | for line in log_file.readlines(): 32 | if LINE_START_RE.match(line): 33 | match = ELAPSED_TIME_RE.search(line) 34 | if match: 35 | # Logged time is in ms, so convert the match. 36 | time_in_sec = float(match[1]) / 1000 37 | replacement = ( 38 | f'elapsed time per iteration (s): ' 39 | f'{time_in_sec:.2f}' 40 | ) 41 | 42 | # We only need to replace once per line. 43 | line = ELAPSED_TIME_RE.sub(replacement, line, count=1) 44 | 45 | match = SAMPLES_PER_SEC_RE.search(line) 46 | if match: 47 | # The rate was computed against ms-scaled time, so multiply the match by 1000. 48 | time_in_sec = float(match[1]) * 1000 49 | # As the values are already logged up to 3 50 | # numbers after the decimal point and we scale 51 | # by exactly that amount, we log them without 52 | # decimal point here in order to not seem more 53 | # exact than we are.
54 | replacement = f'samples per second: {time_in_sec:.0f}' 55 | 56 | # We only need to replace once per line. 57 | line = SAMPLES_PER_SEC_RE.sub( 58 | replacement, 59 | line, 60 | count=1, 61 | ) 62 | 63 | new_log_file.write(line) 64 | 65 | os.rename(new_log_file_path, log_file_path) 66 | 67 | 68 | if __name__ == '__main__': 69 | if len(sys.argv) < 2: 70 | print(f'{sys.argv[0]} <log file>', 71 | file=sys.stderr) 72 | sys.exit(1) 73 | 74 | input_file = sys.argv[1] 75 | rescale_logs(input_file) 76 | print('Done') 77 | -------------------------------------------------------------------------------- /tools/logs/tb-rename-events.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script renames event names in TensorBoard log files. 5 | # It does the renaming in-place (so make back ups!). 6 | # 7 | # Example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-rename-events.py {} "iteration-time" "iteration-time/iteration-time" \; 10 | # 11 | # More than one old tag can be remapped to one new tag – use ";" as a separator: 12 | # 13 | # tb-rename-events.py events.out.tfevents.1 "training loss;validation loss" "loss" 14 | # 15 | # This script is derived from https://stackoverflow.com/a/60080531/9201239 16 | # and https://gist.github.com/stas00/4cd1651d1c8f01196ea322c733bde46c. 17 | 18 | import os 19 | import sys 20 | 21 | # Use this if you want to avoid using the GPU 22 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 23 | import tensorflow as tf 24 | from tensorflow.core.util.event_pb2 import Event 25 | 26 | 27 | def rename_events(input_file, old_tags, new_tag): 28 | new_file = input_file + '.new' 29 | # Make a record writer 30 | with tf.io.TFRecordWriter(new_file) as writer: 31 | # Iterate event records 32 | for rec in tf.data.TFRecordDataset([input_file]): 33 | # Read event 34 | ev = Event() 35 | ev.MergeFromString(rec.numpy()) 36 | # Check if it is a summary 37 | if ev.summary: 38 | # Iterate summary values 39 | for v in ev.summary.value: 40 | # Check if the tag should be renamed 41 | if v.tag in old_tags: 42 | # Rename with new tag name 43 | v.tag = new_tag 44 | writer.write(ev.SerializeToString()) 45 | os.rename(new_file, input_file) 46 | 47 | 48 | if __name__ == '__main__': 49 | if len(sys.argv) != 4: 50 | print(f'{sys.argv[0]} <input file> <old tags> <new tag>', 51 | file=sys.stderr) 52 | sys.exit(1) 53 | input_file, old_tags, new_tag = sys.argv[1:] 54 | old_tags = old_tags.split(';') 55 | rename_events(input_file, old_tags, new_tag) 56 | print('Done') 57 | -------------------------------------------------------------------------------- /tools/logs/tb-rescale-scalars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # This script rescales scalar values in TensorBoard log files. 5 | # It does the modification in-place (so make back ups!). 6 | # 7 | # Example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-rescale-scalars.py {} "iteration-time/samples per second" 1000 \; 10 | # 11 | # More than one tag can be rescaled – use ";" as a separator: 12 | # 13 | # tb-rescale-scalars.py events.out.tfevents.1 "training loss;validation loss" 1e-2 14 | # 15 | # By default, BigScience GPT throughput values will be fixed up according to 16 | # https://github.com/bigscience-workshop/Megatron-DeepSpeed/issues/236, 17 | # i.e. the rescaling fixes values wrongly logged as "seconds" when they are 18 | # actually milliseconds.
19 | # 20 | # This script is derived from https://stackoverflow.com/a/60080531/9201239 21 | # and https://gist.github.com/stas00/4cd1651d1c8f01196ea322c733bde46c. 22 | 23 | import os 24 | import sys 25 | 26 | # Use this if you want to avoid using the GPU 27 | os.environ['CUDA_VISIBLE_DEVICES'] = '-1' 28 | import tensorflow as tf 29 | from tensorflow.core.util.event_pb2 import Event 30 | 31 | 32 | def rescale_scalars(input_file, tags, rescale_factor): 33 | new_file = input_file + '.new' 34 | # Make a record writer 35 | with tf.io.TFRecordWriter(new_file) as writer: 36 | # Iterate event records 37 | for rec in tf.data.TFRecordDataset([input_file]): 38 | # Read event 39 | ev = Event() 40 | ev.MergeFromString(rec.numpy()) 41 | # Check if it is a summary 42 | if ev.summary: 43 | # Iterate summary values 44 | for v in ev.summary.value: 45 | # Check if the tag should be rescaled 46 | if v.tag in tags: 47 | v.simple_value *= rescale_factor 48 | writer.write(ev.SerializeToString()) 49 | os.rename(new_file, input_file) 50 | 51 | 52 | if __name__ == '__main__': 53 | if len(sys.argv) < 2: 54 | print(f'{sys.argv[0]} <input file> [<tags> [<rescale factor>]]', 55 | file=sys.stderr) 56 | sys.exit(1) 57 | 58 | if len(sys.argv) < 3: 59 | sys.argv.append(';'.join([ 60 | 'iteration-time/samples per second', 61 | 'iteration-time/samples per second per replica', 62 | 'iteration-time/tokens per second', 63 | 'iteration-time/tokens per second per replica', 64 | ])) 65 | if len(sys.argv) < 4: 66 | sys.argv.append('1000') 67 | 68 | input_file, tags, rescale_factor = sys.argv[1:] 69 | tags = tags.split(';') 70 | rescale_factor = float(rescale_factor) 71 | rescale_scalars(input_file, tags, rescale_factor) 72 | print('Done') 73 | -------------------------------------------------------------------------------- /tools/openwebtext/README.md: -------------------------------------------------------------------------------- 1 | The following steps show how to prepare the training dataset used to train the model. 2 | 3 | # Libraries to install 4 | 5 | ``` 6 | pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract 7 | git clone https://github.com/mattilyra/LSH 8 | cd LSH 9 | python setup.py install 10 | ``` 11 | 12 | # Download the dataset 13 | 14 | 1. Download the deduplicated URLs from [jcpeterson](https://mega.nz/#F!EZZD0YwJ!9_PlEQzdMVLaNdKv_ICNVQ!cc4RgQQZ) 15 | 2. Remove blacklisted URLs. 16 | ``` 17 | python blacklist_urls.py <path to the downloaded deduplicated URLs> <filename for clean urls, e.g. clean_urls.txt> 18 | ``` 19 | 3. Download the content from the clean urls with [openwebtext's utilities](https://github.com/eukaryote31/openwebtext/blob/master/download.py). 20 | 21 | 4. Merge the contents into one loose json file with one json per line of the format `{'text': text, 'url': unique_url}`. It is important for the url to be unique. 22 | 23 | # Prepare the data for GPT-2 training: 24 | 25 | 1. Perform ftfy, English detection and remove documents with fewer than 128 tokens. This step can be sharded and run on shards. 26 | ``` 27 | python cleanup_dataset.py <input data file> <output cleaned data filename> 28 | ``` 29 | Additional cleanup (e.g. remove documents less than 512 characters or dataset specific cleaning like stories, realnews datasets) can be done using `cleanup_fix_dataset.py`. More details can be found by running `python cleanup_fix_dataset.py --help`. 30 | 2. Using LSH, find possible duplicates and store them in a file for later processing. The code supports saving and loading fingerprints for recurrent deduplications, and is also multithreaded for faster processing.
More details can be found by running `python find_duplicates.py --help`. 31 | ``` 32 | python find_duplicates.py --inputs <pairs of input data files and keys> --output <output file> 33 | ``` 34 | 3. Based on the similarity measure defined inside the function `is_similar` (default: 0.9), group urls that are similar. Basically, for each group, we should keep only one url and remove the rest. 35 | ``` 36 | python group_duplicate_url.py <input file> <output file> 37 | ``` 38 | 4. Remove similar documents that were detected in the last step. 39 | ``` 40 | python remove_group_duplicates.py <group urls file> <data file> <output file> 41 | ``` 42 | 43 | 5. Shuffle the dataset. 44 | ``` 45 | shuf <cleaned deduped data file> -o train_data.json 46 | ``` 47 | 48 | # Deduplicating ngrams 49 | 50 | To deduplicate the downstream tasks (e.g. lambada, squad) from the training dataset, we run the following command. 51 | 52 | ``` 53 | python filter_ngrams.py --tasks <tasks to filter, e.g. lambada, squad> --dedup-dataset <training dataset to deduplicate> --output <output deduped dataset> 54 | ``` 55 | We use 13-grams by default for the deduplication. When we find a 13-gram match in a training document, we split the document into two pieces and remove the 13-gram along with 200 characters from both sides of the 13-gram. We also remove any split document with fewer than 200 characters, or any document that was split more than 10 times. These parameters can be changed using the corresponding arguments. 56 | 57 | Only for the lambada task, we need to provide the path, `--lambada-path <path of the lambada test data>`. 58 | 59 | Several other features (e.g. save and load dictionary) have been added; look at `python filter_ngrams.py --help` for details. 60 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import argparse 17 | import json 18 | import os 19 | import time 20 | 21 | """ 22 | This code adds an id to each json object in a json file. The user can add a prefix 23 | to the ids.
24 | """ 25 | 26 | if __name__ == '__main__': 27 | 28 | print('parsing the arguments ...') 29 | 30 | parser = argparse.ArgumentParser() 31 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 32 | ' json file where id needs to be added') 33 | parser.add_argument('--output-file', type=str, default=None, help=\ 34 | 'Output file name with id') 35 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 36 | 'Id prefix') 37 | parser.add_argument('--log-interval', type=int, default=100, 38 | help='Log interval') 39 | args = parser.parse_args() 40 | 41 | print('Adding ids to dataset ...') 42 | 43 | f_input = open(args.input_file, 'r', encoding='utf-8') 44 | f_output = open(args.output_file, 'wb') 45 | 46 | unique_ids = 1 47 | start_time = time.time() 48 | for row in f_input: 49 | each_row = json.loads(row) 50 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 51 | each_row['adlr_id'] = adlr_id_string 52 | myjson = json.dumps(each_row, ensure_ascii=False) 53 | 54 | f_output.write(myjson.encode('utf-8')) 55 | f_output.write('\n'.encode('utf-8')) 56 | 57 | if unique_ids % args.log_interval == 0: 58 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 59 | unique_ids, time.time() - start_time), flush=True) 60 | 61 | unique_ids += 1 62 | 63 | # Close the file. 64 | f_input.close() 65 | f_output.close() 66 | 67 | print('done :-)', flush=True) 68 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import json 17 | import time 18 | import sys 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | 24 | print('grouping duplicate urls ...') 25 | 26 | input = sys.argv[1] 27 | output = sys.argv[2] 28 | if len(sys.argv) > 3: 29 | jaccard_similarity_threshold = float(sys.argv[3]) 30 | else: 31 | jaccard_similarity_threshold = 0.7 32 | 33 | url_to_index = {} 34 | index_to_urls = [] 35 | counter = 0 36 | start_time = time.time() 37 | with open(input, 'r') as f: 38 | for line in f: 39 | counter += 1 40 | myjson = json.loads(line) 41 | urls = [] 42 | for main_url in myjson.keys(): 43 | urls.append(main_url) 44 | for value in myjson[main_url]: 45 | for other_url, js in value.items(): 46 | if js >= jaccard_similarity_threshold: 47 | urls.append(other_url) 48 | current_index = -1 49 | other_indices = set() 50 | for url in urls: 51 | if url in url_to_index: 52 | if current_index == -1: 53 | current_index = url_to_index[url] 54 | elif current_index != url_to_index[url]: 55 | other_indices.add(url_to_index[url]) 56 | if current_index == -1: 57 | current_index = len(index_to_urls) 58 | index_to_urls.append(set()) 59 | for url in urls: 60 | url_to_index[url] = current_index 61 | index_to_urls[current_index].add(url) 62 | for index in other_indices: 63 | for url in index_to_urls[index]: 64 | index_to_urls[current_index].add(url) 65 | url_to_index[url] = current_index 66 | index_to_urls[index] = None 67 | 68 | if counter % 100000 == 0: 69 | print(' > processed {} lines in {} seconds ...'.format( 70 | counter, time.time() - start_time)) 71 | 72 | 73 | total_remove = 0 74 | total_remain = 0 75 | for urls in index_to_urls: 76 | if urls is not None: 77 | if len(urls) > 1: 78 | total_remove += (len(urls) - 1) 79 | total_remain += 1 80 | print('out of {} urls, only {} are unique and {} should be removed'.format( 81 | total_remove+total_remain, total_remain, total_remove)) 82 | 83 | with open(output, 'wb') as f: 84 | for i, urls in enumerate(index_to_urls): 85 | if urls is not None: 86 | if len(urls) > 1: 87 | myjson = json.dumps({str(i): list(urls)}, 88 | ensure_ascii=False) 89 | f.write(myjson.encode('utf-8')) 90 | f.write('\n'.encode('utf-8')) 91 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import glob 18 | import sys 19 | import json 20 | import argparse 21 | 22 | if __name__ == '__main__': 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--json_path", type=str, default=".", 26 | help="path where all the json files are located") 27 | 28 | parser.add_argument("--output_file", type=str, default="merged_output.json", 29 | help="filename where the merged json should go") 30 | 31 | args = parser.parse_args() 32 | 33 | json_path = args.json_path 34 | out_file = args.output_file 35 | 36 | json_files = glob.glob(json_path + '/*.json') 37 | 38 | counter = 0 39 | 40 | with open(out_file, 'w') as outfile: 41 | for fname in json_files: 42 | counter += 1 43 | 44 | if counter % 1024 == 0: 45 | print("Merging at ", counter, flush=True) 46 | 47 | with open(fname, 'r') as infile: 48 | for row in infile: 49 | each_row = json.loads(row) 50 | outfile.write(row) 51 | 52 | 53 | print("Merged file", out_file, flush=True) 54 | 55 | 56 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | import json 18 | import time 19 | import sys 20 | 21 | 22 | if __name__ == '__main__': 23 | 24 | url_filename = sys.argv[1] 25 | data_filename = sys.argv[2] 26 | output_filename = sys.argv[3] 27 | 28 | urls = set() 29 | with open(url_filename, 'r') as f: 30 | for line in f: 31 | myjson = json.loads(line) 32 | for key in myjson: 33 | this_urls = myjson[key] 34 | for i in range(1, len(this_urls)): 35 | urls.add(this_urls[i]) 36 | print('will be removing {} urls'.format(len(urls)), flush=True) 37 | 38 | written_docs = 0 39 | removed_docs = 0 40 | removed_chars = 0 41 | start_time = time.time() 42 | with open(output_filename, 'wb') as fout: 43 | with open(data_filename, 'r') as fin: 44 | for line in fin: 45 | try: 46 | myjson = json.loads(line) 47 | url = myjson['url'] 48 | if url in urls: 49 | print('removing', myjson) 50 | removed_docs += 1 51 | removed_chars += len(myjson['text']) 52 | continue 53 | myjson = json.dumps(myjson, ensure_ascii=False) 54 | fout.write(myjson.encode('utf-8')) 55 | fout.write('\n'.encode('utf-8')) 56 | written_docs += 1 57 | if written_docs % 10000 == 0: 58 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 59 | '| removed: {} (char: {})'.format( 60 | time.time() - start_time, 61 | written_docs, removed_docs, removed_chars)) 62 | except Exception as e: 63 | print('[SKIPPING]', line, e) 64 | 65 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 66 | '| removed: {} (char: {})'.format( 67 | time.time() - start_time, 68 | written_docs, removed_docs, removed_chars)) 69 | print('done :-)') 70 | -------------------------------------------------------------------------------- /tools/tb/tb-remove-events-by-group.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # this script removes events from tensorboard log files by group names 5 | # it does the removal in place (so make back ups!) 6 | # 7 | # example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-remove-events-by-group.py {} "batch-size" \; 10 | # 11 | # which would match any of "batch-size/batch-size", "batch-size/batch-size vs samples", etc. 12 | # 13 | # more than one group can be removed - use `;` as a separator: 14 | # 15 | # tb-remove-events-by-group.py events.out.tfevents.1 "batch-size;grad-norm" 16 | # 17 | # this script is derived from https://stackoverflow.com/a/60080531/9201239 18 | # 19 | # Important: this script requires CUDA environment.
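# (presumably because tensorflow must be importable; the script itself masks all GPUs via CUDA_VISIBLE_DEVICES below, so no GPU is actually used)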
20 | 21 | from pathlib import Path 22 | import os 23 | import re 24 | import shlex 25 | import sys 26 | 27 | # avoid using the GPU 28 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 29 | # disable logging 30 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 31 | import tensorflow as tf 32 | from tensorflow.core.util.event_pb2 import Event 33 | 34 | 35 | def is_tag_matching_group(tag, groups_to_remove): 36 | for group in groups_to_remove: 37 | if tag.startswith(group): 38 | return True 39 | return False 40 | 41 | 42 | def remove_events(input_file, groups_to_remove): 43 | new_file = input_file + ".new" 44 | # Make a record writer 45 | with tf.io.TFRecordWriter(new_file) as writer: 46 | # Iterate event records 47 | for rec in tf.data.TFRecordDataset([input_file]): 48 | # Read event 49 | ev = Event() 50 | ev.MergeFromString(rec.numpy()) 51 | # Check if it is a summary event 52 | if ev.summary: 53 | orig_values = [v for v in ev.summary.value] 54 | filtered_values = [v for v in orig_values if not is_tag_matching_group(v.tag, groups_to_remove)] 55 | #print(f"filtered_values={len(filtered_values)}, orig_values={len(orig_values)}") 56 | if len(filtered_values) != len(orig_values): 57 | # for v in orig_values: 58 | # print(v) 59 | del ev.summary.value[:] 60 | ev.summary.value.extend(filtered_values) 61 | writer.write(ev.SerializeToString()) 62 | os.rename(new_file, input_file) 63 | 64 | def remove_events_dir(input_file, groups_to_remove): 65 | # Write removed events 66 | remove_events(input_file, groups_to_remove) 67 | 68 | if __name__ == '__main__': 69 | if len(sys.argv) != 3: 70 | print(f'{sys.argv[0]} <input file> <groups to remove>', 71 | file=sys.stderr) 72 | sys.exit(1) 73 | input_file, groups_to_remove = sys.argv[1:] 74 | print(input_file, shlex.quote(groups_to_remove)) 75 | groups_to_remove = groups_to_remove.split(';') 76 | remove_events_dir(input_file, groups_to_remove) 77 | -------------------------------------------------------------------------------- /tools/tb/tb-remove-events-by-tag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # this script removes events from tensorboard log files by specific tag names 5 | # it does the removal in place (so make back ups!) 6 | # 7 | # example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-remove-events-by-tag.py {} "batch-size/batch-size" \; 10 | # 11 | # more than one tag can be removed - use `;` as a separator: 12 | # 13 | # tb-remove-events-by-tag.py events.out.tfevents.1 "batch-size/batch-size;batch-size/batch-size vs samples" 14 | # 15 | # this script is derived from https://stackoverflow.com/a/60080531/9201239 16 | # 17 | # Important: this script requires CUDA environment.
18 | 19 | import shlex 20 | import sys 21 | from pathlib import Path 22 | import os 23 | # avoid using the GPU 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 25 | # disable logging 26 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 27 | import tensorflow as tf 28 | from tensorflow.core.util.event_pb2 import Event 29 | 30 | def remove_events(input_file, tags_to_remove): 31 | new_file = input_file + ".new" 32 | # Make a record writer 33 | with tf.io.TFRecordWriter(new_file) as writer: 34 | # Iterate event records 35 | for rec in tf.data.TFRecordDataset([input_file]): 36 | # Read event 37 | ev = Event() 38 | ev.MergeFromString(rec.numpy()) 39 | # Check if it is a summary event 40 | if ev.summary: 41 | orig_values = [v for v in ev.summary.value] 42 | filtered_values = [v for v in orig_values if v.tag not in tags_to_remove] 43 | #print(f"filtered_values={len(filtered_values)}, orig_values={len(orig_values)}") 44 | if len(filtered_values) != len(orig_values): 45 | # for v in orig_values: 46 | # print(v) 47 | del ev.summary.value[:] 48 | ev.summary.value.extend(filtered_values) 49 | writer.write(ev.SerializeToString()) 50 | os.rename(new_file, input_file) 51 | 52 | def remove_events_dir(input_file, tags_to_remove): 53 | # Write removed events 54 | remove_events(input_file, tags_to_remove) 55 | 56 | if __name__ == '__main__': 57 | if len(sys.argv) != 3: 58 | print(f'{sys.argv[0]} <input file> <tags to remove>', 59 | file=sys.stderr) 60 | sys.exit(1) 61 | input_file, tags_to_remove = sys.argv[1:] 62 | print(input_file, shlex.quote(tags_to_remove)) 63 | tags_to_remove = tags_to_remove.split(';') 64 | remove_events_dir(input_file, tags_to_remove) 65 | -------------------------------------------------------------------------------- /tools/tb/tb-rename-events.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # this script renames event names in tensorboard log files 5 | # it does the rename in place (so make back ups!) 6 | # 7 | # example: 8 | # 9 | # find . -name "*.tfevents*" -exec tb-rename-events.py {} "iteration-time" "iteration-time/iteration-time" \; 10 | # 11 | # more than one old tag can be remapped to one new tag - use `;` as a separator: 12 | # 13 | # tb-rename-events.py events.out.tfevents.1 "training loss;validation loss" "loss" 14 | # 15 | # this script is derived from https://stackoverflow.com/a/60080531/9201239 16 | # 17 | # Important: this script requires CUDA environment.
18 | 19 | import shlex 20 | import sys 21 | from pathlib import Path 22 | import os 23 | # avoid using the GPU 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '' 25 | # disable logging 26 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 27 | import tensorflow as tf 28 | from tensorflow.core.util.event_pb2 import Event 29 | 30 | def rename_events(input_file, old_tags, new_tag): 31 | new_file = input_file + ".new" 32 | # Make a record writer 33 | with tf.io.TFRecordWriter(new_file) as writer: 34 | # Iterate event records 35 | for rec in tf.data.TFRecordDataset([input_file]): 36 | # Read event 37 | ev = Event() 38 | ev.MergeFromString(rec.numpy()) 39 | # Check if it is a summary 40 | #print(ev) 41 | if ev.summary: 42 | # Iterate summary values 43 | for v in ev.summary.value: 44 | #print(v) 45 | # Check if the tag should be renamed 46 | if v.tag in old_tags: 47 | # Rename with new tag name 48 | v.tag = new_tag 49 | writer.write(ev.SerializeToString()) 50 | os.rename(new_file, input_file) 51 | 52 | def rename_events_dir(input_file, old_tags, new_tag): 53 | # Write renamed events 54 | rename_events(input_file, old_tags, new_tag) 55 | 56 | if __name__ == '__main__': 57 | if len(sys.argv) != 4: 58 | print(f'{sys.argv[0]} <input file> <old tags> <new tag>', 59 | file=sys.stderr) 60 | sys.exit(1) 61 | input_file, old_tags, new_tag = sys.argv[1:] 62 | print(input_file, shlex.quote(old_tags), shlex.quote(new_tag)) 63 | old_tags = old_tags.split(';') 64 | rename_events_dir(input_file, old_tags, new_tag) 65 | --------------------------------------------------------------------------------