├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── enhancement.md │ ├── question.md │ └── regression.md └── workflows │ └── stale.yml ├── .gitignore ├── .gitlab-ci.yml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── distrib_optimizer.md └── images │ └── distrib_optimizer │ ├── data_flow.png │ └── sharding_scheme.png ├── examples ├── detxoify_lm │ ├── README.md │ ├── annotations │ │ ├── filter-selfgeneration.py │ │ ├── perspective_api_annotate.py │ │ └── preprocess.sh │ ├── finetune_gpt.py │ ├── finetune_gpt_distributed-1.3b.sh │ ├── generate-1.3b.sh │ ├── generate_samples_gpt.py │ ├── perspective_api.py │ └── self_generation │ │ └── selfgenerate-1.3b-unconditional.sh ├── evaluate_retriever_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── finetune_retriever_distributed.sh ├── merge_mp_bert.sh ├── msdp │ ├── README.md │ ├── data_processing.sh │ ├── eval_knwl_generation.sh │ ├── eval_resp_generation.sh │ ├── prep_resp_gen.sh │ ├── prompt_knwl_gen.sh │ └── prompt_resp_gen.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_ict.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh ├── run_text_generation_server_345M.sh ├── run_text_generation_server_345M_8_tensor_parallel.sh └── sc21 │ ├── CONFIG.sh │ ├── README.md │ ├── SBATCH.sh │ ├── SRUN.sh │ ├── run_figure_11.sh │ ├── run_figure_12.sh │ ├── run_figure_13.sh │ ├── run_figure_14.sh │ ├── run_figure_15.sh │ ├── run_figure_16.sh │ ├── run_figure_17.sh │ ├── run_figure_18.sh │ └── run_table_1.sh ├── exp2.sh ├── images ├── Achieved_petaFLOPs.png └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── core │ ├── README.md │ ├── __init__.py │ ├── dist_checkpointing │ │ ├── __init__.py │ │ ├── core.py │ │ ├── dict_utils.py │ │ ├── mapping.py │ │ ├── optimizer.py │ │ ├── serialization.py │ │ ├── strategies │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── tensorstore.py │ │ │ ├── two_stage.py │ │ │ └── zarr.py │ │ └── utils.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── inference_params.py │ ├── model_parallel_config.py │ ├── models │ │ ├── __init__.py │ │ ├── common │ │ │ └── rotary_pos_embedding.py │ │ └── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_embedding.py │ │ │ └── gpt_model.py │ ├── package_info.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── p2p_communication.py │ │ ├── schedules.py │ │ ├── sp_utils.py │ │ └── split_solver.py │ ├── requirements.txt │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── custom_layers │ │ │ └── transformer_engine.py │ │ ├── dot_product_attention.py │ │ ├── enums.py │ │ ├── identity_op.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ 
├── ict_dataset.py │ ├── image_folder.py │ ├── indexed_dataset.py │ ├── multimodal_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── dist_signal_handler.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vision │ │ ├── classification.py │ │ ├── dino.py │ │ ├── esvit_swin_backbone.py │ │ ├── inpainting.py │ │ ├── knn_monitor.py │ │ ├── mit_backbone.py │ │ ├── swin_backbone.py │ │ ├── utils.py │ │ └── vit_backbone.py ├── mpu │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── distrib_optimizer.py │ ├── grad_scaler.py │ └── optimizer.py ├── optimizer_param_scheduler.py ├── static │ └── index.html ├── text_generation │ ├── __init__.py │ ├── api.py │ ├── beam_utils.py │ ├── communication.py │ ├── forward_step.py │ ├── generation.py │ ├── sampling.py │ └── tokenization.py ├── text_generation_server.py ├── timers.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── picture ├── .DS_Store ├── 13bx32A100_memory.svg ├── 2.7bx8A100_memory.pdf ├── 2.7bx8A100_memory.svg ├── 30bx32A100_memory.svg ├── 30bx64A100_memory.svg ├── 32x7b zhihu_throughput.tex ├── 7bx32A100_memory.svg ├── Raycast (2).dmg ├── seq1f1b_memory.pdf ├── seq1f1b_memory.png ├── seq1f1b_memory.svg ├── seq1f1b_original.png └── seq1f1b_zerobubble.pdf ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_gpt_core.py ├── pretrain_ict.py ├── pretrain_retro.py ├── pretrain_t5.py ├── pretrain_vision_classify.py ├── pretrain_vision_dino.py ├── pretrain_vision_inpaint.py ├── pyproject.toml ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── __init__.py ├── functional_tests │ ├── __init__.py │ ├── python_test_utils │ │ ├── __init__.py │ │ ├── 
check_slurm_job_completion.py │ │ ├── get_test_results_from_tensorboard_logs.py │ │ ├── test_ci_pipeline.py │ │ └── test_resume_checkpoint_pipeline.py │ ├── shell_test_utils │ │ └── jobwait.sh │ ├── test_results │ │ ├── bert │ │ │ ├── bert_tp1_pp2_1nodes_50steps.json │ │ │ ├── bert_tp1_pp4_1nodes_50steps.json │ │ │ ├── bert_tp2_pp2_1nodes_50steps.json │ │ │ └── bert_tp4_pp1_1nodes_50steps.json │ │ └── gpt3 │ │ │ ├── gpt3_tp1_pp2_1nodes_50steps.json │ │ │ ├── gpt3_tp1_pp2_1nodes_50steps_core_enabled.json │ │ │ ├── gpt3_tp1_pp2_1nodes_50steps_core_enabled_rope_embeddings.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled_disable_bias_linear.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled_sequence_parallel.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled_swiglu.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled_untie_embeddings_and_outputs.json │ │ │ ├── gpt3_tp2_pp2_1nodes_50steps.json │ │ │ ├── gpt3_tp2_pp2_1nodes_50steps_core_enabled.json │ │ │ ├── gpt3_tp2_pp2_1nodes_50steps_te_enabled.json │ │ │ ├── gpt3_tp4_pp1_1nodes_50steps.json │ │ │ └── gpt3_tp4_pp1_1nodes_50steps_core_enabled.json │ └── test_scripts │ │ ├── bert │ │ ├── pretrain_bert_distributed_resume_checkpoint_test.sh │ │ ├── pretrain_bert_distributed_test.sh │ │ ├── sbatch_bert_distributed_resume_checkpoint_test.sh │ │ └── sbatch_bert_distributed_test.sh │ │ └── gpt3 │ │ ├── pretrain_gpt3_distributed_resume_checkpoint_test.sh │ │ ├── pretrain_gpt3_distributed_test.sh │ │ ├── sbatch_gpt3_distributed_resume_checkpoint_test.sh │ │ └── sbatch_gpt3_distributed_test.sh └── unit_tests │ ├── __init__.py │ ├── models │ ├── __init__.py │ ├── test_gpt_embedding.py │ └── test_gpt_model.py │ ├── pipeline_parallel │ ├── __init__.py │ └── test_schedules.py │ ├── tensor_parallel │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_parallel_state.py │ ├── test_utilities.py │ ├── test_utils.py │ └── transformer │ ├── __init__.py │ ├── test_attention.py │ ├── test_core_attention.py │ ├── test_mlp.py │ ├── test_module.py │ ├── test_transformer_block.py │ └── test_transformer_layer.py └── tools ├── autoformat.sh ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py ├── huggingface.py └── utils.py ├── checkpoint_loader_megatron.py ├── checkpoint_saver_megatron.py ├── checkpoint_util.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── retro ├── README.md ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── db │ ├── __init__.py │ ├── build.py │ ├── dataset.py │ └── utils.py ├── examples │ ├── preprocess_data.sh │ └── pretrain_model.sh ├── external_libs.py ├── index │ ├── __init__.py │ ├── build.py │ ├── factory.py │ ├── index.py │ ├── indexes │ │ ├── __init__.py │ │ ├── faiss_base.py │ │ └── faiss_par_add.py │ └── utils.py ├── main.py ├── query │ ├── __init__.py │ ├── chunk_dataset.py │ ├── query.py │ ├── retro_dataset.py │ └── utils.py └── utils.py ├── run_text_generation_server.py └── text_generation_cli.py /.coveragerc: 
-------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: BUG 3 | about: Report a bug that needs attention 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 21 | 22 | **Environment (please complete the following information):** 23 | - Megatron-LM commit ID 24 | - PyTorch version 25 | - CUDA version 26 | - NCCL version 27 | 28 | **Proposed fix** 29 | If you have a proposal for how to fix the issue state it here or link to a PR. 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: ENHANCEMENT 3 | about: Suggest an idea to improve this project 4 | title: "[ENHANCEMENT]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Proposed implementation** 20 | If you have a proposed implementation for the feature state it here or link to a PR. 21 | 22 | **Additional context** 23 | Add any other context or screenshots about the feature request here. 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: QUESTION 3 | about: Ask a question about Megatron-LM that is not a bug, regression or enhancement 4 | request 5 | title: "[QUESTION]" 6 | labels: '' 7 | assignees: '' 8 | 9 | --- 10 | 11 | **Your question** 12 | Ask a clear and concise question about Megatron-LM. 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/regression.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: REGRESSION 3 | about: Report a regression in speed or accuracy due to a Megatron-LM update 4 | title: "[REGRESSION]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the regression** 11 | A clear and concise description of what the regression is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Previous performance** 17 | What speed or accuracy did you previously see. 
18 | 19 | **New performance** 20 | What speed or accuracy do you see after the update. 21 | 22 | **Stack trace/logs** 23 | If applicable, add the stack trace or logs related to the regression. 24 | 25 | **Environment (please complete the following information):** 26 | - Previous Megatron-LM commit ID 27 | - New Megatron-LM commit ID 28 | - Previous PyTorch version 29 | - New PyTorch version 30 | - Previous CUDA version 31 | - New CUDA version 32 | - Previous NCCL version 33 | - New NCCL version 34 | 35 | **Proposed fix** 36 | If you have a proposal for how to fix the issue state it here or link to a PR. 37 | 38 | **Additional context** 39 | Add any other context about the problem here. 40 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '15 18 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v5 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | days-before-stale: 60 25 | stale-issue-message: 'Marking as stale. No activity in 60 days.' 26 | stale-pr-message: 'Marking as stale. No activity in 60 days.' 27 | stale-issue-label: 'stale' 28 | stale-pr-label: 'stale' 29 | remove-stale-when-updated: true 30 | operations-per-run: 1000 31 | days-before-close: -1 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | slurm* 8 | logs 9 | *.npy 10 | *.bin 11 | *.idx 12 | *.log 13 | *.op 14 | *nsys-rep 15 | *tfevents* 16 | */index-cache/* 17 | *.pt 18 | -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/docs/images/distrib_optimizer/data_flow.png -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/sharding_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/docs/images/distrib_optimizer/sharding_scheme.png -------------------------------------------------------------------------------- /examples/detxoify_lm/annotations/filter-selfgeneration.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from typing import Dict, Optional, List 4 | 5 | import joblib 6 | from googleapiclient import discovery 7 | from googleapiclient.errors import HttpError 8 | 9 | import argparse 10 | 11 | from tqdm import tqdm 12 | 13 | parser = argparse.ArgumentParser(description='Process some integers.') 14 | parser.add_argument('--data-path', type=str, default='', 15 | help='data path to load the jsonl') 16 | 
parser.add_argument('--out-path', type=str, default='', 17 | help='output path to write the filtered jsonl') 18 | parser.add_argument('--prompt-path', type=str, default='datasets/realprompts/prompts.jsonl', 19 | help='data path to load the prompt jsonl') 20 | parser.add_argument('--workers', type=int, default=10, 21 | help='Number of worker processes to launch') 22 | 23 | 24 | def get_corpus_scores(lines): 25 | scores = [] 26 | corpus = [] 27 | 28 | for line in tqdm(lines): 29 | data = json.loads(line) 30 | score = data['score'] 31 | if score is not None: 32 | scores.append(score['toxicity']) 33 | corpus.append(data) 34 | return scores, corpus 35 | 36 | 37 | def main(): 38 | args = parser.parse_args() 39 | 40 | with open(args.data_path) as f: 41 | lines = f.readlines() 42 | 43 | print(f"total line number: {len(lines)}") 44 | 45 | scores, corpus = get_corpus_scores(lines) 46 | import numpy as np 47 | scores = np.array(scores) 48 | indices = np.argsort(scores) 49 | 50 | print(f"total valid samples: {len(scores)}") 51 | 52 | print(f"Avg toxicity (all): {np.mean(scores)} +- {np.std(scores)}") 53 | print(f"Avg toxicity (toxic): {np.mean(scores[scores > 0.5])} +- {np.std(scores[scores > 0.5])}") 54 | print(f"Toxic Percentage {sum(scores > 0.5) / len(scores)}") 55 | print(f"Avg toxicity (nontoxic): {np.mean(scores[scores <= 0.5])} +- {np.std(scores[scores <= 0.5])}") 56 | print(f"Nontoxic Percentage {sum(scores <= 0.5) / len(scores)}") 57 | 58 | samples_left = len(lines) // 2 59 | print(f"After filtering: {samples_left} samples are left") 60 | nontoxic_indices = indices[:samples_left] 61 | print(f"Avg toxicity (filtered): {np.mean(scores[nontoxic_indices])} +- {np.std(scores[nontoxic_indices])}") 62 | print(f"Toxicity Range (filtered): {np.min(scores[nontoxic_indices])} ~ {np.max(scores[nontoxic_indices])}") 63 | nontoxic_data = [corpus[ind] for ind in nontoxic_indices] 64 | print(f"Total samples after filtering: {len(nontoxic_data)}") 65 | print(f"Examples: {nontoxic_data[:3]}") 66 | 67 | from sklearn.utils import shuffle 68 | nontoxic_data = shuffle(nontoxic_data) 69 | 70 | with open(args.out_path, 'w') as f: 71 | for x in nontoxic_data: 72 | f.write(json.dumps(x) + '\n') 73 | 74 | 75 | main() -------------------------------------------------------------------------------- /examples/detxoify_lm/annotations/preprocess.sh: -------------------------------------------------------------------------------- 1 | VOCAB_FILE=gpt2-vocab.json 2 | MERGE_FILE=gpt2-merges.txt 3 | 4 | python3 tools/preprocess_data.py \ 5 | --input $1 \ 6 | --output-prefix $2 \ 7 | --vocab-file $VOCAB_FILE \ 8 | --merge-file $MERGE_FILE \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --append-eod --workers 20 --chunk-size 25 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Change for multinode config 4 | GPUS_PER_NODE=16 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=$(($RANDOM + 1024)) 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | # input 12 | DATA_PATH=$1 13 | SHARE_DATA=$PWD # current work dir 14 | FINETUNED_PATH="$SHARE_DATA/$2" 15 | lr=$3 16 | bs=$4 17 | iter=$5 18 | CHECKPOINT_PATH=$6 19 | 20 | # vocab 21 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 22 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 23 | 24 | # tensorboard 25 | TENSORBOARD_DIR="$SHARE_DATA/tensorboard/$2" 26 | mkdir -p ${TENSORBOARD_DIR} 27 | 28 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 29 | 30 | python -m torch.distributed.run $DISTRIBUTED_ARGS \ 31 | examples/detxoify_lm/finetune_gpt.py \ 32 | --num-layers 24 \ 33 | --hidden-size 2048 \ 34 | --num-attention-heads 32 \ 35 | --micro-batch-size 4 \ 36 | --global-batch-size $bs \ 37 | --seq-length 2048 \ 38 | --max-position-embeddings 2048 \ 39 | --train-iters $iter \ 40 | --save $FINETUNED_PATH \ 41 | --load $CHECKPOINT_PATH \ 42 | --data-path $DATA_PATH \ 43 | --data-path2 ${DATA_BLEND} \ 44 | --vocab-file $VOCAB_FILE \ 45 | --merge-file $MERGE_FILE \ 46 | --data-impl mmap \ 47 | --split 100,0,0 \ 48 | --distributed-backend nccl \ 49 | --lr-decay-style constant \ 50 | --lr $lr \ 51 | --clip-grad 1.0 \ 52 | --weight-decay 0.1 \ 53 | --adam-beta1 0.9 \ 54 | --adam-beta2 0.95 \ 55 | --checkpoint-activations \ 56 | --log-interval 1 \ 57 | --save-interval 78 \ 58 | --eval-interval 78 \ 59 | --eval-iters 50 \ 60 | --fp16 \ 61 | --DDP-impl local \ 62 | --finetune --no-load-optim \ 63 | --log-validation-ppl-to-tensorboard \ 64 | --tensorboard-dir ${TENSORBOARD_DIR} 65 | -------------------------------------------------------------------------------- /examples/detxoify_lm/generate-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | VOCAB_FILE=gpt2-vocab.json 4 | MERGE_FILE=gpt2-merges.txt 5 | 6 | GPUS_PER_NODE=1 7 | # Change for multinode config 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=$(($RANDOM + 1024)) 10 | NNODES=1 11 | NODE_RANK=0 12 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 13 | NUM_SAMPLES=$(wc -l < $1) 14 | PREFIX=$(basename $2) 15 | SEED=$(($RANDOM)) 16 | OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl 17 | 18 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 19 | 20 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 21 | --tensor-model-parallel-size 1 \ 22 | --num-layers 24 \ 23 | --hidden-size 2048 \ 24 | --load $CHECKPOINT_PATH \ 25 | --num-attention-heads 32 \ 26 | --max-position-embeddings 2048 \ 27 | --tokenizer-type GPT2BPETokenizer \ 28 | --fp16 \ 29 | --micro-batch-size 400 \ 30 | --seq-length 2048 \ 31 | --out-seq-length 20 \ 32 | --temperature 1.0 \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --sample-input-file $1 \ 36 | --sample-output-file $OUTPUT \ 37 | --num-samples $NUM_SAMPLES \ 38 | --max-tokens-to-oom 1200000 \ 39 | --top_p 0.9 \ 40 | --seed $SEED 41 | 42 | -------------------------------------------------------------------------------- /examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | SHARE_DATA=$PWD # current work dir 4 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 5 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 6 | 7 | GPUS_PER_NODE=1 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=$(($RANDOM + 1024)) 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | SEED=$3 15 | SUFFIX=$(basename $CHECKPOINT_PATH) 16 | save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/ 17 | mkdir -p $save_dir 18 | echo $save_dir/$SEED.out 19 | 20 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 21 | 22 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 23 | --tensor-model-parallel-size 1 \ 24 | --num-layers 24 \ 25 | --hidden-size 2048 \ 26 | --load $CHECKPOINT_PATH \ 27 | --num-attention-heads 32 \ 28 | --max-position-embeddings 2048 \ 29 | --tokenizer-type GPT2BPETokenizer \ 30 | --fp16 \ 31 | --micro-batch-size 150 \ 32 | --seq-length 2048 \ 33 | --out-seq-length 1000 \ 34 | --temperature 1.0 \ 35 | --vocab-file $VOCAB_FILE \ 36 | --merge-file $MERGE_FILE \ 37 | --num-samples $1 \ 38 | --top_p 0.9 \ 39 | --max-tokens-to-oom 1200000 \ 40 | --genfile $save_dir/$SEED.out \ 41 | --seed $SEED 42 | 43 | -------------------------------------------------------------------------------- /examples/evaluate_retriever_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model or a finetuned model for Natural Question task 5 | 6 | # Datasets can be downloaded from the following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task RETRIEVER-EVAL \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --seq-length 512 \ 24 | --max-position-embeddings 512 \ 25 | --load ${CHECKPOINT_PATH} \ 26 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 27 | --embedding-path ${EMBEDDING_PATH} \ 28 | --retriever-seq-length 256 \ 29 | --vocab-file bert-vocab.txt\ 30 | --qa-data-test ${QA_FILE} \ 31 | --faiss-use-gpu \ 32 | --retriever-report-topk-accuracies 1 5 20 100 \ 33 | --fp16 \ 34 | --indexer-log-interval 1000 \ 35 | --indexer-batch-size 128 36 | 37 | 38 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | 
--tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --seq-length 1024 \ 33 | --max-position-embeddings 1024 \ 34 | --log-interval 10 \ 35 | --fp16 \ 36 | --no-load-optim \ 37 | --no-load-rng 38 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --lr 5.0e-5 \ 33 | --lr-decay-style linear \ 34 | --lr-warmup-fraction 0.065 \ 35 | --seq-length 512 \ 36 | --max-position-embeddings 512 \ 37 | --save-interval 500000 \ 38 | --save $CHECKPOINT_PATH \ 39 | --log-interval 10 \ 40 | --eval-interval 100 \ 41 | --eval-iters 50 \ 42 | --weight-decay 1.0e-1 \ 43 | --fp16 44 | -------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --lr 1.0e-5 \ 33 | --lr-decay-style linear \ 34 | --lr-warmup-fraction 0.06 \ 35 | --seq-length 512 \ 36 | --max-position-embeddings 512 \ 37 | --save-interval 100000 \ 38 | --save $CHECKPOINT_PATH \ 39 | --log-interval 10 \ 40 | --eval-interval 100 \ 41 | --eval-iters 50 \ 42 | --weight-decay 1.0e-1 \ 43 | --clip-grad 1.0 \ 44 | --hidden-dropout 0.1 \ 45 | --attention-dropout 0.1 \ 46 | --fp16 47 | -------------------------------------------------------------------------------- /examples/finetune_retriever_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Finetune a BERT or pretrained ICT model using 
Google natural question data 4 | # Datasets can be downloaded from the following link: 5 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= 16 | 17 | # Load either of the below 18 | BERT_LOAD_PATH= 19 | PRETRAINED_CHECKPOINT= 20 | 21 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 22 | --task RET-FINETUNE-NQ \ 23 | --train-with-neg \ 24 | --train-hard-neg 1 \ 25 | --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \ 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --tensor-model-parallel-size 1 \ 30 | --tokenizer-type BertWordPieceLowerCase \ 31 | --train-data nq-train.json \ 32 | --valid-data nq-dev.json \ 33 | --save ${CHECKPOINT_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --vocab-file bert-vocab.txt \ 36 | --bert-load ${BERT_LOAD_PATH} \ 37 | --save-interval 5000 \ 38 | --log-interval 10 \ 39 | --eval-interval 20000 \ 40 | --eval-iters 100 \ 41 | --indexer-log-interval 1000 \ 42 | --faiss-use-gpu \ 43 | --DDP-impl torch \ 44 | --fp16 \ 45 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 46 | --seq-length 512 \ 47 | --retriever-seq-length 256 \ 48 | --max-position-embeddings 512 \ 49 | --retriever-score-scaling \ 50 | --epochs 80 \ 51 | --micro-batch-size 8 \ 52 | --eval-micro-batch-size 16 \ 53 | --indexer-batch-size 128 \ 54 | --lr 2e-5 \ 55 | --lr-warmup-fraction 0.01 \ 56 | --weight-decay 1e-1 57 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | This directory contains all the scripts of multi-stage prompting for knowledgeable dialogue generation that includes data preparation, and knowledge and response generations. More details are available on [`knowledgeable task directory`](../../tasks/msdp). 5 | 6 | -------------------------------------------------------------------------------- /examples/msdp/eval_knwl_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores. 
5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_knowledge_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_knowledge_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ############################################ 32 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 33 | ############################################ 34 | 35 | # We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 36 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 37 | 38 | # To evaluate on these metrics, please setup the environments based on 39 | # the nlg-eval github, and run the corresponding evaluation commands. 40 | 41 | nlg-eval \ 42 | --hypothesis= \ 43 | --references= 44 | -------------------------------------------------------------------------------- /examples/msdp/eval_resp_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores. 5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_response_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_response_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ########################## 32 | # Evaluate the KF1 scores. 33 | ########################## 34 | 35 | MODEL_GEN_PATH= \ 36 | (e.g., /testseen_response_generations.txt) 37 | GROUND_TRUTH_PATH= \ 38 | (e.g., /testseen_knowledge_reference.txt) 39 | 40 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 2048 \ 45 | --max-position-embeddings 2048 \ 46 | --micro-batch-size 4 \ 47 | --task MSDP-EVAL-F1 \ 48 | --guess-file ${MODEL_GEN_PATH} \ 49 | --answer-file ${GROUND_TRUTH_PATH} 50 | 51 | 52 | ############################################ 53 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 54 | ############################################ 55 | 56 | # We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 57 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 58 | 59 | # To evaluate on these metrics, please setup the environments based on 60 | # the nlg-eval github, and run the corresponding evaluation commands. 
61 | 62 | nlg-eval \ 63 | --hypothesis= \ 64 | --references= 65 | -------------------------------------------------------------------------------- /examples/msdp/prep_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Preparing the input file for the response generation (second-stage prompting) 4 | 5 | DIR=`pwd` 6 | 7 | TEST_FILE= \ 8 | (e.g., /testseen_processed.txt) 9 | KNOWLEDGE_FILE= \ 10 | (e.g., /testseen_knowledge_generations.txt) 11 | PROCESSED_FILE= \ 12 | (e.g., /testseen_processed_with_generated_knowledge.txt) 13 | 14 | python ${DIR}/tasks/msdp/preprocessing.py \ 15 | --func prepare_input \ 16 | --test_file ${TEST_FILE} \ 17 | --knwl_gen_file ${KNOWLEDGE_FILE} \ 18 | --processed_file ${PROCESSED_FILE} 19 | -------------------------------------------------------------------------------- /examples/msdp/prompt_knwl_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge 4 | # The input contains prompts and current dialogue context, the output is the relevant knowledge 5 | # The size of the pretrained language model is 357M 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= (e.g., /357m) 16 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 17 | MERGE_PATH= (e.g., /gpt2-merges.txt) 18 | INPUT_PATH= \ 19 | (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /testseen_knowledge_prompts.json) 22 | OUTPUT_PATH= \ 23 | (e.g., /testseen_knowledge_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type knowledge \ 42 | --num-prompt-examples 10 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/msdp/prompt_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-2: Prompt a pretrained language model to generate the corresponding response 4 | # The input contains prompts, current dialogue context, and generated knowledge in Stage-1 5 | # The output is the corresponding response. 
6 | # The size of the pretrained language model is 357M 7 | 8 | WORLD_SIZE=8 9 | 10 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 11 | --nnodes 1 \ 12 | --node_rank 0 \ 13 | --master_addr localhost \ 14 | --master_port 6000" 15 | 16 | CHECKPOINT_PATH= (e.g., /357m) 17 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 18 | MERGE_PATH= (e.g., /gpt2-merges.txt) 19 | INPUT_PATH= (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /response_prompts.txt) 22 | OUTPUT_PATH= \ 23 | (e.g., /output_testseen_response_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type response \ 42 | --num-prompt-examples 20 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | CHECKPOINT_PATH= 6 | VOCAB_FILE=/bert-vocab.txt 7 | DATA_PATH=_text_sentence 8 | 9 | BERT_ARGS=" 10 | --num-layers 24 \ 11 | --hidden-size 1024 \ 12 | --num-attention-heads 16 \ 13 | --seq-length 512 \ 14 | --max-position-embeddings 512 \ 15 | --micro-batch-size 4 \ 16 | --global-batch-size 8 \ 17 | --lr 0.0001 \ 18 | --train-iters 2000000 \ 19 | --lr-decay-iters 990000 \ 20 | --lr-decay-style linear \ 21 | --min-lr 0.00001 \ 22 | --weight-decay 1e-2 \ 23 | --lr-warmup-fraction .01 \ 24 | --clip-grad 1.0 \ 25 | --fp16 26 | " 27 | 28 | DATA_ARGS=" 29 | --data-path $DATA_PATH \ 30 | --vocab-file $VOCAB_FILE \ 31 | --data-impl mmap \ 32 | --split 949,50,1 33 | " 34 | 35 | OUTPUT_ARGS=" 36 | --log-interval 100 \ 37 | --save-interval 10000 \ 38 | --eval-interval 1000 \ 39 | --eval-iters 10 40 | " 41 | 42 | torchrun pretrain_bert.py \ 43 | $BERT_ARGS \ 44 | $DATA_ARGS \ 45 | $OUTPUT_ARGS \ 46 | --save $CHECKPOINT_PATH \ 47 | --load $CHECKPOINT_PATH 48 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/bert-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | BERT_ARGS=" 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 512 \ 30 | --max-position-embeddings 512 \ 31 | --micro-batch-size 4 \ 32 | --global-batch-size 32 \ 33 | --lr 0.0001 \ 34 | --train-iters 1000000 \ 35 | --lr-decay-iters 
990000 \ 36 | --lr-decay-style linear \ 37 | --min-lr 1.0e-5 \ 38 | --weight-decay 1e-2 \ 39 | --lr-warmup-fraction .01 \ 40 | --clip-grad 1.0 \ 41 | --fp16 42 | " 43 | 44 | DATA_ARGS=" 45 | --data-path $DATA_PATH \ 46 | --vocab-file $VOCAB_FILE \ 47 | --data-impl mmap \ 48 | --split 949,50,1 49 | " 50 | 51 | OUTPUT_ARGS=" 52 | --log-interval 100 \ 53 | --save-interval 10000 \ 54 | --eval-interval 1000 \ 55 | --eval-iters 10 56 | " 57 | 58 | torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ 59 | $BERT_ARGS \ 60 | $DATA_ARGS \ 61 | $OUTPUT_ARGS \ 62 | --distributed-backend nccl \ 63 | --save $CHECKPOINT_PATH \ 64 | --load $CHECKPOINT_PATH 65 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/bert-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | BERT_ARGS=" 26 | --tensor-model-parallel-size 2 \ 27 | --pipeline-model-parallel-size 2 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --seq-length 512 \ 32 | --max-position-embeddings 512 \ 33 | --micro-batch-size 2 \ 34 | --global-batch-size 16 \ 35 | --lr 0.0001 \ 36 | --train-iters 1000000 \ 37 | --lr-decay-iters 990000 \ 38 | --lr-decay-style linear \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --lr-warmup-fraction .01 \ 42 | --clip-grad 1.0 \ 43 | --fp16 44 | " 45 | 46 | DATA_ARGS=" 47 | --data-path $DATA_PATH \ 48 | --vocab-file $VOCAB_FILE \ 49 | --data-impl mmap \ 50 | --split 949,50,1 51 | " 52 | 53 | OUTPUT_ARGS=" 54 | --log-interval 100 \ 55 | --save-interval 10000 \ 56 | --eval-interval 1000 \ 57 | --eval-iters 10 58 | " 59 | 60 | torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ 61 | $BERT_ARGS \ 62 | $DATA_ARGS \ 63 | $OUTPUT_ARGS \ 64 | --distributed-backend nccl \ 65 | --save $CHECKPOINT_PATH \ 66 | --load $CHECKPOINT_PATH 67 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | CHECKPOINT_PATH= 8 | VOCAB_FILE=/gpt2-vocab.json 9 | MERGE_FILE=/gpt2-merges.txt 10 | DATA_PATH=_text_document 11 | 12 | GPT_ARGS=" 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 1024 \ 17 | --max-position-embeddings 1024 \ 18 | --micro-batch-size 4 \ 19 | --global-batch-size 8 \ 20 | --lr 0.00015 \ 21 | --train-iters 500000 \ 22 | --lr-decay-iters 320000 \ 23 | --lr-decay-style cosine \ 24 | --min-lr 1.0e-5 \ 25 | --weight-decay 1e-2 \ 26 | --lr-warmup-fraction .01 \ 27 | --clip-grad 1.0 \ 28 | --fp16 29 | " 30 | 31 | DATA_ARGS=" 32 | --data-path $DATA_PATH \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 37 | " 38 | 39 | OUTPUT_ARGS=" 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | 
--eval-iters 10 44 | " 45 | 46 | torchrun pretrain_gpt.py \ 47 | $GPT_ARGS \ 48 | $DATA_ARGS \ 49 | $OUTPUT_ARGS \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH 52 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="" 13 | DATASET_2="" 14 | DATASET_3="" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file \ 40 | --merge-file \ 41 | --save-interval 1000 \ 42 | --save \ 43 | --load \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir \ 51 | --fp16 " 52 | 53 | 54 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 55 | 56 | 57 | srun -l \ 58 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 59 | --container-mounts "" \ 60 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 61 | 62 | 63 | set +x 64 | 65 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | GPUS_PER_NODE=8 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=6000 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | 15 | CHECKPOINT_PATH= 16 | VOCAB_FILE=/gpt2-vocab.json 17 | MERGE_FILE=/gpt2-merges.txt 18 | DATA_PATH=_text_document 19 | 20 | DISTRIBUTED_ARGS=" 21 | --nproc_per_node $GPUS_PER_NODE \ 22 | --nnodes $NNODES \ 23 | --node_rank $NODE_RANK \ 24 | --master_addr $MASTER_ADDR \ 25 | --master_port $MASTER_PORT 26 | " 27 | 28 | GPT_ARGS=" 29 | --num-layers 24 \ 30 | --hidden-size 1024 \ 31 | --num-attention-heads 16 \ 32 | --seq-length 1024 \ 33 | --max-position-embeddings 1024 \ 34 | --micro-batch-size 8 \ 35 | --global-batch-size 64 \ 36 | --lr 0.00015 \ 37 | --train-iters 500000 \ 38 | --lr-decay-iters 320000 \ 39 | --lr-decay-style cosine \ 40 | --min-lr 1.0e-5 \ 41 | --weight-decay 1e-2 \ 42 | --lr-warmup-fraction .01 \ 43 | --clip-grad 1.0 \ 44 | --fp16 45 | " 46 | 47 | DATA_ARGS=" 48 | --data-path $DATA_PATH \ 49 | --vocab-file $VOCAB_FILE \ 50 | --merge-file $MERGE_FILE \ 51 | --data-impl mmap \ 52 | --split 949,50,1 53 | " 54 | 55 | OUTPUT_ARGS=" 56 | --log-interval 100 \ 57 | --save-interval 10000 \ 58 | --eval-interval 1000 \ 59 | --eval-iters 10 60 | " 61 | 62 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 63 | $GPT_ARGS \ 64 | 
$DATA_ARGS \ 65 | $OUTPUT_ARGS \ 66 | --distributed-backend nccl \ 67 | --save $CHECKPOINT_PATH \ 68 | --load $CHECKPOINT_PATH 69 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | GPUS_PER_NODE=8 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=6000 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | 15 | CHECKPOINT_PATH= 16 | VOCAB_FILE=/gpt2-vocab.json 17 | MERGE_FILE=/gpt2-merges.txt 18 | DATA_PATH=_text_document 19 | 20 | DISTRIBUTED_ARGS=" 21 | --nproc_per_node $GPUS_PER_NODE \ 22 | --nnodes $NNODES \ 23 | --node_rank $NODE_RANK \ 24 | --master_addr $MASTER_ADDR \ 25 | --master_port $MASTER_PORT 26 | " 27 | 28 | GPT_ARGS=" 29 | --tensor-model-parallel-size 2 \ 30 | --pipeline-model-parallel-size 2 \ 31 | --sequence-parallel \ 32 | --num-layers 24 \ 33 | --hidden-size 1024 \ 34 | --num-attention-heads 16 \ 35 | --seq-length 1024 \ 36 | --max-position-embeddings 1024 \ 37 | --micro-batch-size 4 \ 38 | --global-batch-size 16 \ 39 | --lr 0.00015 \ 40 | --train-iters 500000 \ 41 | --lr-decay-iters 320000 \ 42 | --lr-decay-style cosine \ 43 | --min-lr 1.0e-5 \ 44 | --weight-decay 1e-2 \ 45 | --lr-warmup-fraction .01 \ 46 | --clip-grad 1.0 \ 47 | --fp16 48 | " 49 | 50 | DATA_ARGS=" 51 | --data-path $DATA_PATH \ 52 | --vocab-file $VOCAB_FILE \ 53 | --merge-file $MERGE_FILE \ 54 | --data-impl mmap \ 55 | --split 949,50,1 56 | " 57 | 58 | OUTPUT_ARGS=" 59 | --log-interval 100 \ 60 | --save-interval 10000 \ 61 | --eval-interval 1000 \ 62 | --eval-iters 10 63 | " 64 | 65 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 66 | $GPT_ARGS \ 67 | $DATA_ARGS \ 68 | $OUTPUT_ARGS \ 69 | --distributed-backend nccl \ 70 | --save $CHECKPOINT_PATH \ 71 | --load $CHECKPOINT_PATH 72 | 73 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | CHECKPOINT_PATH= 6 | VOCAB_FILE=/t5-vocab.txt 7 | DATA_PATH=_text_sentence 8 | 9 | T5_ARGS=" 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --max-position-embeddings 512 \ 18 | --micro-batch-size 16 \ 19 | --global-batch-size 16 \ 20 | --lr 0.0001 \ 21 | --train-iters 1000000 \ 22 | --lr-decay-iters 1000000 \ 23 | --lr-decay-style linear \ 24 | --min-lr 0.00001 \ 25 | --weight-decay 1e-2 \ 26 | --lr-warmup-fraction .01 \ 27 | --clip-grad 1.0 \ 28 | --fp16 \ 29 | --vocab-extra-ids 100 30 | " 31 | 32 | DATA_ARGS=" 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 37 | " 38 | 39 | OUTPUT_ARGS=" 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 44 | " 45 | 46 | torchrun pretrain_t5.py \ 47 | $T5_ARGS \ 48 | $DATA_ARGS \ 49 | $OUTPUT_ARGS \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH 52 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/t5-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | T5_ARGS=" 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --kv-channels 64 \ 30 | --ffn-hidden-size 3072 \ 31 | --encoder-seq-length 512 \ 32 | --decoder-seq-length 128 \ 33 | --max-position-embeddings 512 \ 34 | --micro-batch-size 16 \ 35 | --global-batch-size 128 \ 
36 | --lr 0.0001 \ 37 | --train-iters 1000000 \ 38 | --lr-decay-iters 1000000 \ 39 | --lr-decay-style linear \ 40 | --min-lr 0.00001 \ 41 | --weight-decay 1e-2 \ 42 | --lr-warmup-fraction .01 \ 43 | --clip-grad 1.0 \ 44 | --fp16 \ 45 | --vocab-extra-ids 100 46 | " 47 | 48 | DATA_ARGS=" 49 | --data-path $DATA_PATH \ 50 | --vocab-file $VOCAB_FILE \ 51 | --data-impl mmap \ 52 | --split 949,50,1 53 | " 54 | 55 | OUTPUT_ARGS=" 56 | --log-interval 100 \ 57 | --save-interval 10000 \ 58 | --eval-interval 1000 \ 59 | --eval-iters 10 60 | " 61 | 62 | torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ 63 | $T5_ARGS \ 64 | $DATA_ARGS \ 65 | $OUTPUT_ARGS \ 66 | --distributed-backend nccl \ 67 | --save $CHECKPOINT_PATH \ 68 | --load $CHECKPOINT_PATH 69 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/t5-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | T5_ARGS=" 26 | --tensor-model-parallel-size 2 \ 27 | --num-layers 12 \ 28 | --hidden-size 768 \ 29 | --num-attention-heads 12 \ 30 | --kv-channels 64 \ 31 | --ffn-hidden-size 3072 \ 32 | --encoder-seq-length 512 \ 33 | --decoder-seq-length 128 \ 34 | --max-position-embeddings 512 \ 35 | --micro-batch-size 16 \ 36 | --global-batch-size 128 \ 37 | --lr 0.0001 \ 38 | --train-iters 1000000 \ 39 | --lr-decay-iters 1000000 \ 40 | --lr-decay-style linear \ 41 | --min-lr 0.00001 \ 42 | --weight-decay 1e-2 \ 43 | --lr-warmup-fraction .01 \ 44 | --clip-grad 1.0 \ 45 | --fp16 \ 46 | --vocab-extra-ids 100 47 | " 48 | 49 | DATA_ARGS=" 50 | --data-path $DATA_PATH \ 51 | --vocab-file $VOCAB_FILE \ 52 | --data-impl mmap \ 53 | --split 949,50,1 54 | " 55 | 56 | OUTPUT_ARGS=" 57 | --log-interval 100 \ 58 | --save-interval 10000 \ 59 | --eval-interval 1000 \ 60 | --eval-iters 10 61 | " 62 | 63 | torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ 64 | $T5_ARGS \ 65 | $DATA_ARGS \ 66 | $OUTPUT_ARGS \ 67 | --distributed-backend nccl \ 68 | --save $CHECKPOINT_PATH \ 69 | --load $CHECKPOINT_PATH 70 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model. 
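# Illustrative query (an assumption for this sketch: the server's flask-restful /api endpoint on
# Flask's default port 5000; adjust host/port to your deployment). Once the server is up, it can
# be queried with a PUT request, e.g.:
#   curl 'http://localhost:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' \
#        -d '{"prompts": ["Hello, world"], "tokens_to_generate": 32}'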
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | pip install flask-restful 16 | 17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 18 | --tensor-model-parallel-size 1 \ 19 | --pipeline-model-parallel-size 1 \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --load ${CHECKPOINT} \ 23 | --num-attention-heads 16 \ 24 | --max-position-embeddings 1024 \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --fp16 \ 27 | --micro-batch-size 1 \ 28 | --seq-length 1024 \ 29 | --out-seq-length 1024 \ 30 | --temperature 1.0 \ 31 | --vocab-file $VOCAB_FILE \ 32 | --merge-file $MERGE_FILE \ 33 | --top_p 0.9 \ 34 | --seed 42 35 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M_8_tensor_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 8 \ 17 | --pipeline-model-parallel-size 1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --out-seq-length 1024 \ 28 | --temperature 1.0 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --top_p 0.9 \ 32 | --seed 42 33 | -------------------------------------------------------------------------------- /examples/sc21/CONFIG.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # SLURM options. 5 | export SLURM_PARTITION= 6 | export SLURM_ACCOUNT= 7 | 8 | 9 | # Source code. 10 | export MEGATRON_CODE_DIR= 11 | 12 | 13 | # This variable is used to mount the relevant part of the filesystem 14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the 15 | # launch directory already get mounted; this variable should be used to 16 | # mount the directories that contain the data and tokenizer files. 17 | export DOCKER_MOUNT_DIR= 18 | 19 | 20 | # Data and tokenizer files. 21 | MEGATRON_DATA= 22 | BPE_VOCAB_FILE= 23 | BPE_MERGE_FILE= 24 | 25 | 26 | # Megatron input parameters. 27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters 28 | # that are not listed here. 
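# For example, the run_figure_*.sh scripts in this directory export something like
#   MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# before sourcing this file.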
29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/sc21/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Figures in SC21 Paper 2 | 3 | 4 | This directory contains some of the scripts that were used to produce the 5 | results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is 6 | to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These 7 | scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the 8 | [pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other 9 | schedulers as well. 10 | 11 | 12 | ## Git commit 13 | 14 | To replicate these results use Megatron-LM commit: 6985e58938d40ad91ac07b0fddcfad8132e1447e 15 | 16 | 17 | ## Setup 18 | 19 | All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please 20 | update the unspecified values (in angle brackets `<...>`) before launching any 21 | scripts. 22 | 23 | 24 | 25 | ## Scripts 26 | 27 | Below is a list of scripts that can be used to reproduce various figures in our 28 | [paper](https://arxiv.org/pdf/2104.04473.pdf): 29 | 30 | * [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput 31 | for GPT models ranging from 1 billion to 1 trillion parameters. 32 | * [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling 33 | performance of pipeline parallelism. 34 | * [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of 35 | the interleaved schedule on a 175B GPT model. 36 | * [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of 37 | different degrees of pipeline and tensor model parallelism on a model with 38 | 162.2 billion parameters. 39 | * [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of 40 | different degrees of data and pipeline model parallelism on a model with 41 | 5.9 billion parameters. 42 | * [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of 43 | different degrees of data and tensor model parallelism on a model with 44 | 5.9 billion parameters. 45 | * [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of 46 | microbatch size. 47 | * [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of 48 | activation recomputation. 49 | * [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of 50 | the scatter-gather communication optimization. 
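
Each script only picks the configuration (the variables such as `PP`, `TP`, and `GBS` at the top of the file) and then sources [`CONFIG.sh`](./CONFIG.sh) and [`SBATCH.sh`](./SBATCH.sh), which submits [`SRUN.sh`](./SRUN.sh) to Slurm. As a rough sketch (assuming `CONFIG.sh` has been filled in and Slurm with pyxis is available), a single data point is launched with:

```bash
cd examples/sc21
bash run_figure_11.sh   # sources CONFIG.sh, then SBATCH.sh submits SRUN.sh
```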
51 | -------------------------------------------------------------------------------- /examples/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 
5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /images/Achieved_petaFLOPs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/images/Achieved_petaFLOPs.png -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_adlr_autoresume 13 | from .global_vars import get_timers 14 | from .initialize import initialize_megatron 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last) 19 | -------------------------------------------------------------------------------- /megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models. 2 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.parallel_state 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | 5 | from .inference_params import InferenceParams 6 | from .model_parallel_config import ModelParallelConfig 7 | 8 | # Alias parallel_state as mpu, its legacy name 9 | mpu = parallel_state 10 | 11 | __all__ = ["parallel_state", "tensor_parallel", "utils", "InferenceParams", "ModelParallelConfig"] 12 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersitentObject, ShardedTensor 5 | from .serialization import load, load_common_state_dict, save 6 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/core.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import json 4 | from dataclasses import asdict, dataclass 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | CONFIG_FNAME = 'metadata.json' 9 | 10 | 11 | class CheckpointingException(Exception): 12 | pass 13 | 14 | 15 | @dataclass 16 | class CheckpointingConfig: 17 | """ Documents backends used in the checkpoint. 
""" 18 | 19 | sharded_backend: str 20 | sharded_backend_version: int = 1 21 | common_backend: str = 'torch' 22 | common_backend_version: int = 1 23 | 24 | 25 | def check_is_distributed_checkpoint(checkpoint_dir): 26 | return maybe_load_config(checkpoint_dir) is not None 27 | 28 | 29 | def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConfig]: 30 | config_path = Path(checkpoint_dir, CONFIG_FNAME) 31 | if not config_path.exists(): 32 | return None 33 | with config_path.open() as f: 34 | config_dict = json.load(f) 35 | return CheckpointingConfig(**config_dict) 36 | 37 | 38 | def save_config(config: CheckpointingConfig, checkpoint_dir: str): 39 | config_path = Path(checkpoint_dir, CONFIG_FNAME) 40 | with config_path.open('w') as f: 41 | json.dump(asdict(config), f) 42 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | try: 10 | import tensorstore 11 | import zarr 12 | 13 | from .tensorstore import _import_trigger 14 | from .zarr import _import_trigger 15 | except ImportError: 16 | logger.warning('Zarr-based strategies will not be registered because of missing packages') 17 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from abc import ABC, abstractmethod 4 | from collections import defaultdict 5 | from enum import Enum 6 | from pathlib import Path 7 | from typing import Dict, List, Optional 8 | 9 | from ..mapping import CheckpointingException, ShardedStateDict, ShardedTensor, StateDict 10 | 11 | 12 | class StrategyAction(Enum): 13 | LOAD_COMMON = 'load_common' 14 | LOAD_SHARDED = 'load_sharded' 15 | SAVE_COMMON = 'save_common' 16 | SAVE_SHARDED = 'save_sharded' 17 | 18 | 19 | default_strategies = defaultdict(dict) 20 | 21 | 22 | def get_default_strategy(action: StrategyAction, backend: str, version: int): 23 | try: 24 | return default_strategies[action.value][(backend, version)] 25 | except KeyError as e: 26 | raise CheckpointingException( 27 | f'Cannot find default strategy for: {(action, backend, version)}' 28 | ) from e 29 | 30 | 31 | class LoadStrategyBase(ABC): 32 | @abstractmethod 33 | def check_backend_compatibility(self, loaded_version): 34 | raise NotImplementedError 35 | 36 | @abstractmethod 37 | def check_version_compatibility(self, loaded_version): 38 | raise NotImplementedError 39 | 40 | 41 | class SaveStrategyBase(ABC): 42 | def __init__(self, backend: str, version: int): 43 | self.backend = backend 44 | self.version = version 45 | 46 | 47 | class LoadCommonStrategy(LoadStrategyBase): 48 | @abstractmethod 49 | def load(self, checkpoint_dir: Path): 50 | raise NotImplementedError 51 | 52 | 53 | class LoadShardedStrategy(LoadStrategyBase): 54 | @abstractmethod 55 | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): 56 | raise NotImplementedError 57 | 58 | 59 | class SaveCommonStrategy(SaveStrategyBase): 60 | @abstractmethod 61 | def save(self, common_state_dict: StateDict, checkpoint_dir: Path): 62 | raise NotImplementedError 63 
| 64 | 65 | class SaveShardedStrategy(SaveStrategyBase): 66 | @abstractmethod 67 | def save(self, sharded_tensors: List[ShardedTensor], checkpoint_dir: Path): 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from typing import Tuple 4 | 5 | from .dict_utils import dict_list_map_inplace, extract_matching_values 6 | from .mapping import LocalNonpersitentObject, ShardedStateDict, ShardedTensor, StateDict 7 | 8 | 9 | def extract_sharded_tensors( 10 | sharded_state_dict: ShardedStateDict, 11 | ) -> Tuple[ShardedStateDict, StateDict]: 12 | return extract_matching_values(sharded_state_dict, lambda v: isinstance(v, ShardedTensor)) 13 | 14 | 15 | def extract_sharded_tensors_or_nonpersistent( 16 | sharded_state_dict: ShardedStateDict, 17 | ) -> Tuple[ShardedStateDict, StateDict]: 18 | return extract_matching_values( 19 | sharded_state_dict, lambda v: isinstance(v, (ShardedTensor, LocalNonpersitentObject)) 20 | ) 21 | 22 | 23 | def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix: str): 24 | def add_prefix(t): 25 | if isinstance(t, ShardedTensor): 26 | t.key = f'{prefix}.{t.key}' 27 | return t 28 | 29 | dict_list_map_inplace(add_prefix, sharded_state_dict) 30 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | encoder_or_decoder = 1 8 | encoder_and_decoder = 2 9 | retro_encoder = 3 10 | retro_decoder = 4 11 | -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from typing import Optional, Tuple 4 | 5 | import torch 6 | 7 | 8 | def _bias_dropout_add_func(x, bias, residual, prob, training): 9 | # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor 10 | # NOTE: Previously, the argument `bias` used to be passed as 11 | # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the 12 | # transformer layer but broadcasting should automatically take care of that. 13 | # Also, looking at broadcasting semantics, `expand_as` and broadcasting 14 | # seem to be identical performance-wise (both just change the view). 15 | 16 | # If we want to train mixed precision, then the output of this function 17 | # should be half precision. However, in AMP O1, the input (residual) is 18 | # in fp32, and it will up-cast the result to fp32, causing pipeline parallel 19 | # GPU communication to hang. Therefore, we need to cast residual to the same 20 | # dtype as x. 
21 | residual = residual if residual.dtype == x.dtype else residual.to(x.dtype) 22 | if bias is not None: 23 | x = x + bias 24 | out = torch.nn.functional.dropout(x, p=prob, training=training) 25 | out = residual + out 26 | return out 27 | 28 | 29 | def get_bias_dropout_add(training, fused): 30 | def unfused_bias_dropout_add(x_with_bias, residual, prob): 31 | x, bias = x_with_bias # unpack 32 | return _bias_dropout_add_func(x, bias, residual, prob, training) 33 | 34 | @torch.jit.script 35 | def bias_dropout_add_fused_train( 36 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 37 | residual: torch.Tensor, 38 | prob: float, 39 | ) -> torch.Tensor: 40 | x, bias = x_with_bias # unpack 41 | return _bias_dropout_add_func(x, bias, residual, prob, True) 42 | 43 | @torch.jit.script 44 | def bias_dropout_add_fused_inference( 45 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 46 | residual: torch.Tensor, 47 | prob: float, 48 | ) -> torch.Tensor: 49 | x, bias = x_with_bias # unpack 50 | return _bias_dropout_add_func(x, bias, residual, prob, False) 51 | 52 | if fused: 53 | # jit scripting for a nn.module (with dropout) is not 54 | # triggering the fusion kernel. For now, we use two 55 | # different nn.functional routines to account for varying 56 | # dropout semantics during training and inference phases. 57 | if training: 58 | return bias_dropout_add_fused_train 59 | else: 60 | return bias_dropout_add_fused_inference 61 | else: 62 | return unfused_bias_dropout_add 63 | -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 6 | # 1/sqrt(2*pi)-> 0.3989423 7 | # 1/sqrt(2) -> 0.70710678 8 | # sqrt(2/pi) -> 0.79788456 9 | # this function is tanh approximation of gelu 10 | # actual gelu is: 11 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 12 | 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | 20 | # gradient of tanh approximation of gelu 21 | # gradient of actual gelu is: 22 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 23 | @torch.jit.script 24 | def bias_gelu_back(g, bias, y): 25 | x = bias + y 26 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 27 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 28 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( 29 | 1 + tanh_out 30 | ) 31 | return ff * g 32 | 33 | 34 | class GeLUFunction(torch.autograd.Function): 35 | @staticmethod 36 | # bias is an optional argument 37 | def forward(ctx, input, bias): 38 | ctx.save_for_backward(input, bias) 39 | return bias_gelu(bias, input) 40 | 41 | @staticmethod 42 | def backward(ctx, grad_output): 43 | input, bias = ctx.saved_tensors 44 | tmp = bias_gelu_back(grad_output, bias, input) 45 | return tmp, tmp 46 | 47 | 48 | bias_gelu_impl = GeLUFunction.apply 49 | -------------------------------------------------------------------------------- /megatron/core/inference_params.py: -------------------------------------------------------------------------------- 1 | class InferenceParams: 2 | """Inference parameters that are passed to the main model in order 3 | to efficiently calculate and store the context during inference.""" 4 | 5 | def __init__(self, max_batch_size, max_sequence_length): 6 | self.max_sequence_length = max_sequence_length 7 | self.max_batch_size = max_batch_size 8 | self.sequence_len_offset = 0 9 | self.batch_size_offset = 0 10 | self.key_value_memory_dict = {} 11 | 12 | def swap_key_value_dict(self, batch_idx): 13 | "swap between batches" 14 | if len(self.key_value_memory_dict) == 0: 15 | raise ValueError("should not swap when dict is empty") 16 | 17 | for layer_number in self.key_value_memory_dict.keys(): 18 | inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] 19 | assert ( 20 | len(batch_idx) == inference_key_memory.shape[1] 21 | ) # make sure batch size is the same 22 | new_inference_key_memory = inference_key_memory[:, batch_idx] 23 | new_inference_value_memory = inference_value_memory[:, batch_idx] 24 | self.key_value_memory_dict[layer_number] = ( 25 | new_inference_key_memory, 26 | new_inference_value_memory, 27 | ) 28 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/rotary_pos_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
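# Illustrative summary of the math below: for position m and inverse frequency
# theta_i = 1 / 10000^(2i/dim), RoPE rotates the channel pair (x_i, x_{i+dim/2})
# by the angle m * theta_i; apply_rotary_pos_emb realizes this as
#   t * cos(freqs) + _rotate_half(t) * sin(freqs)
# where freqs is the outer product of positions and inv_freq, duplicated along
# the channel dimension by RotaryEmbedding.forward.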
2 | 3 | import importlib.util 4 | 5 | import torch 6 | from torch import einsum, nn 7 | 8 | __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] 9 | 10 | 11 | class RotaryEmbedding(nn.Module): 12 | def __init__(self, dim, seq_len_interpolation_factor=None): 13 | super().__init__() 14 | self.seq_len_interpolation_factor = seq_len_interpolation_factor 15 | inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) 16 | self.register_buffer('inv_freq', inv_freq) 17 | 18 | def forward(self, max_seq_len, offset=0): 19 | seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset 20 | if self.seq_len_interpolation_factor is not None: 21 | seq = seq.type_as(self.inv_freq) 22 | seq *= 1 / self.seq_len_interpolation_factor 23 | freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq) 24 | # first part even vector components, second part odd vector components, 25 | # 2 * dim in dimension size 26 | emb = torch.cat((freqs, freqs), dim=-1) 27 | # emb [seq_length, .., dim] 28 | return emb[:, None, None, :] 29 | 30 | 31 | def _rotate_half(x): 32 | """ 33 | change sign so the last dimension becomes [-odd, +even] 34 | """ 35 | x1, x2 = torch.chunk(x, 2, dim=-1) 36 | return torch.cat((-x2, x1), dim=-1) 37 | 38 | 39 | def apply_rotary_pos_emb(t, freqs): 40 | """ 41 | input tensor t is of shape [seq_length, ..., dim] 42 | rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] 43 | check https://kexue.fm/archives/8265 for detailed formulas 44 | """ 45 | rot_dim = freqs.shape[-1] 46 | # ideally t_pass is empty so rotary pos embedding is applied to all tensor t 47 | t, t_pass = t[..., :rot_dim], t[..., rot_dim:] 48 | 49 | # first part is cosine component 50 | # second part is sine component, need to change signs with _rotate_half method 51 | t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin()) 52 | return torch.cat((t, t_pass), dim=-1) 53 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | MAJOR = 0 5 | MINOR = 2 6 | PATCH = 0 7 | PRE_RELEASE = '' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = ( 19 | 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 20 | ) 21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 23 | __description__ = ( 24 | 'Megatron Core - a library for efficient and scalable training of transformer based models' 25 | ) 26 | __license__ = 'BSD-3' 27 | __keywords__ = ( 28 | 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 29 | ) 30 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | from .sp_utils import get_tflops 3 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/split_solver.py: -------------------------------------------------------------------------------- 1 | from sympy import symbols, Eq, solve 2 | 3 | def round_down(x, tp_size): 4 | return x // tp_size * tp_size 5 | class solver: 6 | def __init__(self, total_seqlen, config, causal=True): 7 | self.total_seqlen = total_seqlen 8 | self.config = config 9 | self.total_tflops = config.get_seq_tflops(total_seqlen, causal) 10 | 11 | 12 | def solve_partition(self, num_splits, tp_size=1): 13 | res = [] 14 | prefix = self.total_seqlen 15 | for i in range(1, num_splits): 16 | seqlen = symbols('seqlen') 17 | tflops = self.config.get_prefix_tflops(seqlen, prefix) 18 | eq = Eq(tflops, self.total_tflops / num_splits) 19 | sol = solve(eq, seqlen) 20 | sol = round_down(int(sol[0]), tp_size) 21 | res.insert(0, int(sol)) 22 | prefix -= int(sol) 23 | res.insert(0, prefix) 24 | return res 25 | 26 | 27 | if __name__ == "__main__": 28 | from sp_utils import SeqTFlops 29 | kw = { 30 | "num_layers": 24, 31 | "hidden_size": 4096, 32 | "ffn_size": 16384, 33 | "num_heads": 32, 34 | "dim_head": 128, 35 | "vocab_size": 32000 36 | } 37 | config = SeqTFlops(**kw) 38 | s = solver(16384, config) 39 | s.solve_partition(4, 2) 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_parallel_cross_entropy 2 | from .data import broadcast_data 3 | from .layers import ( 4 | ColumnParallelLinear, 5 | RowParallelLinear, 6 | VocabParallelEmbedding, 7 | copy_tensor_model_parallel_attributes, 8 | linear_with_grad_accumulation_and_async_allreduce, 9 | param_is_not_tensor_parallel_duplicate, 10 | set_defaults_if_not_set_tensor_model_parallel_attributes, 11 | 
set_tensor_model_parallel_attributes, 12 | ) 13 | from .mappings import ( 14 | copy_to_tensor_model_parallel_region, 15 | gather_from_sequence_parallel_region, 16 | gather_from_tensor_model_parallel_region, 17 | scatter_to_sequence_parallel_region, 18 | scatter_to_tensor_model_parallel_region, 19 | ) 20 | from .random import checkpoint, get_cuda_rng_tracker, model_parallel_cuda_manual_seed 21 | from .utils import ( 22 | gather_split_1d_tensor, 23 | split_tensor_along_last_dim, 24 | split_tensor_into_1d_equal_chunks, 25 | ) 26 | 27 | __all__ = [ 28 | # cross_entropy.py 29 | "vocab_parallel_cross_entropy", 30 | # data.py 31 | "broadcast_data", 32 | # layers.py 33 | "ColumnParallelLinear", 34 | "RowParallelLinear", 35 | "VocabParallelEmbedding", 36 | "set_tensor_model_parallel_attributes", 37 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 38 | "copy_tensor_model_parallel_attributes", 39 | "param_is_not_tensor_parallel_duplicate", 40 | "linear_with_grad_accumulation_and_async_allreduce", 41 | # mappings.py 42 | "copy_to_tensor_model_parallel_region", 43 | "gather_from_tensor_model_parallel_region", 44 | "gather_from_sequence_parallel_region", 45 | # "reduce_from_tensor_model_parallel_region", 46 | "scatter_to_tensor_model_parallel_region", 47 | "scatter_to_sequence_parallel_region", 48 | # random.py 49 | "checkpoint", 50 | "get_cuda_rng_tracker", 51 | "model_parallel_cuda_manual_seed", 52 | # utils.py 53 | "split_tensor_along_last_dim", 54 | "split_tensor_into_1d_equal_chunks", 55 | "gather_split_1d_tensor", 56 | ] 57 | -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .transformer_config import TransformerConfig 4 | -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp (NoOp) 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super(IdentityOp, self).__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | -------------------------------------------------------------------------------- /megatron/core/transformer/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Utilities for transformer layers.""" 4 | 5 | import torch 6 | 7 | from megatron import get_args 8 | 9 | 10 | def attention_mask_func(attention_scores, attention_mask): 11 | attention_scores.masked_fill_(attention_mask, -10000.0) 12 | return attention_scores 13 | 14 | 15 | def get_linear_layer(rows, columns, init_method): 16 | """Simple linear layer with weight initialization.""" 17 | layer = torch.nn.Linear(rows, columns) 18 | if get_args().perform_initialization: 19 | init_method(layer.weight) 20 | with torch.no_grad(): 21 | layer.bias.zero_() 22 | return layer 23 | 24 | 25 | @torch.jit.script 26 | def gelu_impl(x): 27 | """OpenAI's gelu implementation.""" 28 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) 29 | 30 | 31 | def openai_gelu(x): 32 | return gelu_impl(x) 33 | 34 | 35 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 36 | @torch.jit.script 37 | def erf_gelu(x): 38 | return ( 39 | x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype)) 40 | ) 41 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/multimodal_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from PIL import Image, UnidentifiedImageError 4 | import numpy as np 5 | import io 6 | import torch 7 | 8 | try: 9 | from torchvision.transforms import InterpolationMode 10 | BICUBIC = InterpolationMode.BICUBIC 11 | except ImportError: 12 | BICUBIC = Image.BICUBIC 13 | 14 | from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomResizedCrop, Resize 15 | 16 | def _convert_image_to_rgb(image): 17 | return image.convert("RGB") 18 | 19 | def _transform(img_h, img_w): 20 | return Compose([ 21 | ToPILImage(), 22 | RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0), interpolation=BICUBIC), 23 | _convert_image_to_rgb, 24 | ToTensor(), 25 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 26 | ]) 27 | 28 | class MultiModalDataset(torch.utils.data.Dataset): 29 | 30 | def __init__(self, name, data_prefix, indexed_dataset, 31 | num_samples, seq_length, seed, img_h, img_w): 32 | 33 | self.name = name 34 | self.indexed_dataset = indexed_dataset 35 | self.doc_idx = indexed_dataset.get_doc_idx() 36 | self.visual_transform = _transform(img_h, img_w) 37 | 38 | def __len__(self): 39 | return self.indexed_dataset.sizes.shape[0] 40 | 41 | def __getitem__(self, idx): 42 | text_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]) 43 | assert mode == 0 44 | img_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]+1) 45 | assert mode == 1 46 | img_pad = img_sample[0].item() 47 | xs = img_sample[1:].tobytes(order='C') 48 | xs = xs[:len(xs)-img_pad] 49 | 50 | img_sample = np.array(Image.open(io.BytesIO(xs))) 51 | img_sample = self.visual_transform(img_sample).reshape(-1) 52 | 53 | return {'text': np.array(text_sample, dtype=np.int64), 54 | 'img': np.array(img_sample, dtype=np.float32)} 55 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/dist_signal_handler.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 8 | world_size = torch.distributed.get_world_size() 9 | else: 10 | world_size = 1 11 | return world_size 12 | 13 | 14 | def get_device(local_rank=None): 15 | backend = torch.distributed.get_backend() 16 | if backend == 'nccl': 17 | if local_rank is None: 18 | device = torch.device('cuda') 19 | else: 20 | device = torch.device(f'cuda:{local_rank}') 21 | elif backend == 'gloo': 22 | device = torch.device('cpu') 23 | else: 24 | raise RuntimeError 25 | return device 26 | 27 | 28 | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): 29 | if not torch.distributed.is_available() or \ 30 | not torch.distributed.is_initialized(): 31 | return [item] 32 | 33 | device = get_device(local_rank) 34 | 35 | if group is not None: 36 | group_size = group.size() 37 | else: 38 | group_size = get_world_size() 39 | 40 | tensor = torch.tensor([item], device=device, dtype=dtype) 41 | output_tensors = [ 42 | torch.zeros(1, dtype=tensor.dtype, 
device=tensor.device) 43 | for _ in range(group_size) 44 | ] 45 | torch.distributed.all_gather(output_tensors, tensor, group, async_op) 46 | output = [elem.item() for elem in output_tensors] 47 | return output 48 | 49 | 50 | class DistributedSignalHandler: 51 | def __init__(self, sig=signal.SIGTERM): 52 | self.sig = sig 53 | 54 | def signals_received(self): 55 | all_received = all_gather_item( 56 | self._signal_received, dtype=torch.int32 57 | ) 58 | return all_received 59 | 60 | def __enter__(self): 61 | self._signal_received = False 62 | self.released = False 63 | self.original_handler = signal.getsignal(self.sig) 64 | 65 | def handler(signum, frame): 66 | self._signal_received = True 67 | 68 | signal.signal(self.sig, handler) 69 | 70 | return self 71 | 72 | def __exit__(self, type, value, tb): 73 | self.release() 74 | 75 | def release(self): 76 | if self.released: 77 | return False 78 | 79 | signal.signal(self.sig, self.original_handler) 80 | self.released = True 81 | return True 82 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | import pathlib 5 | import subprocess 6 | 7 | from torch.utils import cpp_extension 8 | 9 | # Setting this param to a list has a problem of generating different 10 | # compilation commands (with diferent order of architectures) and 11 | # leading to recompilation of fused kernels. Set it to empty string 12 | # to avoid recompilation and assign arch flags explicity in 13 | # extra_cuda_cflags below 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(args): 18 | 19 | # Check if cuda 11 is installed for compute capability 8.0 20 | cc_flag = [] 21 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( 22 | cpp_extension.CUDA_HOME 23 | ) 24 | if int(bare_metal_major) >= 11: 25 | cc_flag.append('-gencode') 26 | cc_flag.append('arch=compute_80,code=sm_80') 27 | if int(bare_metal_minor) >= 8: 28 | cc_flag.append('-gencode') 29 | cc_flag.append('arch=compute_90,code=sm_90') 30 | 31 | # Build path 32 | srcpath = pathlib.Path(__file__).parent.absolute() 33 | buildpath = srcpath / "build" 34 | _create_build_dir(buildpath) 35 | 36 | # Helper function to build the kernels. 
37 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 38 | return cpp_extension.load( 39 | name=name, 40 | sources=sources, 41 | build_directory=buildpath, 42 | extra_cflags=[ 43 | "-O3", 44 | ], 45 | extra_cuda_cflags=[ 46 | "-O3", 47 | "-gencode", 48 | "arch=compute_70,code=sm_70", 49 | "--use_fast_math", 50 | ] 51 | + extra_cuda_flags 52 | + cc_flag, 53 | verbose=(args.rank == 0), 54 | ) 55 | 56 | 57 | def _get_cuda_bare_metal_version(cuda_dir): 58 | raw_output = subprocess.check_output( 59 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 60 | ) 61 | output = raw_output.split() 62 | release_idx = output.index("release") + 1 63 | release = output[release_idx].split(".") 64 | bare_metal_major = release[0] 65 | bare_metal_minor = release[1][0] 66 | 67 | return raw_output, bare_metal_major, bare_metal_minor 68 | 69 | 70 | def _create_build_dir(buildpath): 71 | try: 72 | os.mkdir(buildpath) 73 | except OSError: 74 | if not os.path.isdir(buildpath): 75 | print(f"Creation of the build directory {buildpath} failed") 76 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /*This code is copied fron NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | 5 | from .distributed import DistributedDataParallel 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module 11 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for models.""" 4 | 5 | import math 6 | 7 | import torch 8 | 9 | from megatron import get_args 10 | 11 | def init_method_normal(sigma): 12 | """Init method based on N(0, sigma).""" 13 | def init_(tensor): 14 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 15 | 16 | return init_ 17 | 18 | 19 | def scaled_init_method_normal(sigma, num_layers): 20 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 21 | std = sigma / math.sqrt(2.0 * num_layers) 22 | 23 | def init_(tensor): 24 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 25 | 26 | return init_ 27 | 28 | 29 | def attention_mask_func(attention_scores, attention_mask): 30 | attention_scores.masked_fill_(attention_mask, -10000.0) 31 | return attention_scores 32 | 33 | 34 | def get_linear_layer(rows, columns, init_method): 35 | """Simple linear layer with weight initialization.""" 36 | layer = torch.nn.Linear(rows, columns) 37 | if get_args().perform_initialization: 38 | init_method(layer.weight) 39 | with torch.no_grad(): 40 | layer.bias.zero_() 41 | return layer 42 | 43 | @torch.jit.script 44 | def gelu_impl(x): 45 | """OpenAI's gelu implementation.""" 46 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 47 | (1.0 + 0.044715 * x * x))) 48 | def openai_gelu(x): 49 | return gelu_impl(x) 50 | 51 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 52 | @torch.jit.script 53 | def erf_gelu(x): 54 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 55 | -------------------------------------------------------------------------------- /megatron/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | 
scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > output_h: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would more aligned if ' 23 | f'input size {(input_h, input_w)} is `x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import os 5 | import random 6 | import numpy 7 | import torch 8 | 9 | import mpu 10 | 11 | 12 | class IdentityLayer(torch.nn.Module): 13 | def __init__(self, size, scale=1.0): 14 | super(IdentityLayer, self).__init__() 15 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 16 | 17 | def forward(self): 18 | return self.weight 19 | 20 | 21 | def set_random_seed(seed): 22 | """Set random seed for reproducability.""" 23 | random.seed(seed) 24 | numpy.random.seed(seed) 25 | torch.manual_seed(seed) 26 | mpu.model_parallel_cuda_manual_seed(seed) 27 | 28 | 29 | def initialize_distributed(backend='nccl'): 30 | """Initialize torch.distributed.""" 31 | # Get local rank in case it is provided. 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--local_rank', type=int, default=None, 34 | help='local rank passed from distributed launcher') 35 | args = parser.parse_args() 36 | local_rank = args.local_rank 37 | 38 | # Get rank and world size. 39 | rank = int(os.getenv('RANK', '0')) 40 | world_size = int(os.getenv("WORLD_SIZE", '1')) 41 | 42 | print('> initializing torch.distributed with local rank: {}, ' 43 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 44 | 45 | # Set the device id. 46 | device = rank % torch.cuda.device_count() 47 | if local_rank is not None: 48 | device = local_rank 49 | torch.cuda.set_device(device) 50 | 51 | # Call the init process. 
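    # The TCP init method assembled below resolves to tcp://$MASTER_ADDR:$MASTER_PORT,
    # defaulting to tcp://localhost:6000 when those environment variables are unset.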
52 | init_method = 'tcp://' 53 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 54 | master_port = os.getenv('MASTER_PORT', '6000') 55 | init_method += master_ip + ':' + master_port 56 | torch.distributed.init_process_group( 57 | backend=backend, 58 | world_size=world_size, 59 | rank=rank, 60 | init_method=init_method) 61 | 62 | 63 | def print_separator(message): 64 | torch.distributed.barrier() 65 | filler_len = (78 - len(message)) // 2 66 | filler = '-' * filler_len 67 | string = '\n' + filler + ' {} '.format(message) + filler 68 | if torch.distributed.get_rank() == 0: 69 | print(string, flush=True) 70 | torch.distributed.barrier() 71 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | from mpu import data as data_utils 6 | import mpu 7 | import torch 8 | import functools 9 | import operator 10 | import sys 11 | sys.path.append("../..") 12 | 13 | 14 | def test_broadcast_data(tensor_model_parallel_size): 15 | 16 | if torch.distributed.get_rank() == 0: 17 | print('> testing broadcast_data with model parallel size {} ...'. 18 | format(tensor_model_parallel_size)) 19 | 20 | mpu.initialize_model_parallel(tensor_model_parallel_size) 21 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 22 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 23 | 24 | key_size_t = {'key1': [7, 11], 25 | 'key2': [8, 2, 1], 26 | 'key3': [13], 27 | 'key4': [5, 1, 2], 28 | 'key5': [5, 12]} 29 | keys = list(key_size_t.keys()) 30 | 31 | data = {} 32 | data_t = {} 33 | for key in key_size_t: 34 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 35 | data_t[key] = data[key].clone() 36 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 37 | data_t['keyX'] = data['keyX'].clone() 38 | if mpu.get_tensor_model_parallel_rank() != 0: 39 | data = None 40 | 41 | data_utils._check_data_types(keys, data_t, torch.int64) 42 | key_size, key_numel, \ 43 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 44 | for key in keys: 45 | assert key_size[key] == key_size_t[key] 46 | total_numel_t = 0 47 | for key in keys: 48 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 49 | assert key_numel[key] == target_size 50 | total_numel_t += target_size 51 | assert total_numel == total_numel_t 52 | 53 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 54 | for key in keys: 55 | tensor = data_t[key].cuda() 56 | assert data_b[key].sub(tensor).abs().max() == 0 57 | 58 | # Reset groups 59 | mpu.destroy_tensor_model_parallel() 60 | 61 | torch.distributed.barrier() 62 | if torch.distributed.get_rank() == 0: 63 | print('>> passed the test :-)') 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | initialize_distributed() 69 | world_size = torch.distributed.get_world_size() 70 | 71 | tensor_model_parallel_size = 1 72 | while tensor_model_parallel_size <= world_size: 73 | print_separator('test test broadcast data') 74 | test_broadcast_data(tensor_model_parallel_size) 75 | tensor_model_parallel_size *= 2 76 | -------------------------------------------------------------------------------- /megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA 
CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/text_generation/beam_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | ## from huggingface beam search 19 | class BeamHypotheses(object): 20 | def __init__(self, num_beams, length_penalty=1.0, early_stopping=False): 21 | """ 22 | Initialize n-best list of hypotheses. 23 | """ 24 | self.length_penalty = length_penalty 25 | self.early_stopping = early_stopping 26 | self.num_beams = num_beams 27 | self.beams = [] 28 | self.worst_score = 1e9 29 | 30 | def __len__(self): 31 | """ 32 | Number of hypotheses in the list. 33 | """ 34 | return len(self.beams) 35 | 36 | def add(self, hyp, sum_logprobs, length): 37 | """ 38 | Add a new hypothesis to the list. 39 | """ 40 | score = sum_logprobs / length ** self.length_penalty 41 | if len(self) < self.num_beams or score > self.worst_score: 42 | self.beams.append((score, hyp)) 43 | if len(self) > self.num_beams: 44 | sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) 45 | del self.beams[sorted_scores[0][1]] 46 | self.worst_score = sorted_scores[1][0] 47 | else: 48 | self.worst_score = min(score, self.worst_score) 49 | 50 | def is_done(self, best_sum_logprobs, cur_len): 51 | """ 52 | If there are enough hypotheses and that none of the hypotheses being generated 53 | can become better than the worst one in the heap, then we are done with this sentence. 54 | """ 55 | 56 | if len(self) < self.num_beams: 57 | return False 58 | elif self.early_stopping: 59 | return True 60 | else: 61 | cur_score = best_sum_logprobs / cur_len ** self.length_penalty 62 | ret = self.worst_score >= cur_score 63 | return ret 64 | 65 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /picture/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/.DS_Store -------------------------------------------------------------------------------- /picture/2.7bx8A100_memory.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/2.7bx8A100_memory.pdf -------------------------------------------------------------------------------- /picture/32x7b zhihu_throughput.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llllllllll} 2 | \toprule 3 | & & \multicolumn{4}{r}{TFLOPS/s} & \multicolumn{4}{r}{Throughput} \\ 4 | & Method & 1F1B & 1F1B-I & Seq1F1B & Seq1F1B-I & 1F1B & 1F1B-I & Seq1F1B & Seq1F1B-I \\ 5 | seqlen & Micros & & & & & & & & \\ 6 | \midrule 7 | \multirow[t]{2}{*}{32768} & 8 & 99.67±0.19 & 109.55±0.72 & \textbf{110.62±0.54} & \textbf{97.69±1.82} & 48189.47±93.97 & 52964.06±348.28 & \textbf{53484.52±259.03} & \textbf{47232.00±882.04} \\ 8 | & 16 & 114.45±0.36 & 116.46±0.76 & \textbf{115.34±0.18} & \textbf{95.55±1.60} & 55333.16±175.13 & 56304.42±365.27 & \textbf{55765.31±89.28} & \textbf{46195.24±771.99} \\ 9 | \cline{1-10} 10 | \multirow[t]{2}{*}{65536} & 8 & 107.49±0.03 & 119.96±0.16 & \textbf{124.62±0.06} & \textbf{117.75±1.26} & 37342.92±10.05 & 41676.97±57.03 & \textbf{43296.70±20.83} & \textbf{40907.87±437.09} \\ 11 | & 16 & 123.95±0.06 & 128.70±0.06 & \textbf{123.05±0.51} & \textbf{117.98±0.82} & 43063.26±20.15 & 44712.49±19.41 & \textbf{33367.34±137.63} & \textbf{40989.72±284.90} \\ 12 | \cline{1-10} 13 | \multirow[t]{2}{*}{131072} & 8 & OOM & OOM & \textbf{136.72±0.05} & \textbf{135.06±0.17} & OOM & OOM & \textbf{30392.34±10.14} & \textbf{30023.38±38.29} \\ 14 | & 16 & OOM & OOM & \textbf{142.08±0.02} & \textbf{136.58±0.19} & OOM & OOM & \textbf{31584.02±4.38} & \textbf{30362.33±42.01} \\ 15 | \cline{1-10} 16 | \bottomrule 17 | \end{tabular} 18 | -------------------------------------------------------------------------------- /picture/Raycast (2).dmg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/Raycast (2).dmg -------------------------------------------------------------------------------- /picture/seq1f1b_memory.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/seq1f1b_memory.pdf -------------------------------------------------------------------------------- /picture/seq1f1b_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/seq1f1b_memory.png -------------------------------------------------------------------------------- /picture/seq1f1b_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/seq1f1b_original.png 
-------------------------------------------------------------------------------- /picture/seq1f1b_zerobubble.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/seq1f1b_zerobubble.pdf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | [tool.isort] 4 | profile = "black" # black-compatible 5 | line_length = 100 # should match black parameters 6 | py_version = 38 # python 3.8 as a target version 7 | known_first_party = ["megatron"] # FIRSTPARTY section 8 | known_third_party = ["transformer_engine"] # THIRDPARTY section 9 | sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] 10 | default_section = "THIRDPARTY" 11 | extend_skip = ["setup.py"] 12 | 13 | [tool.black] 14 | line_length = 100 15 | skip_string_normalization = true 16 | # recongized by future versions, disallows to reformat code with incompatible versions 17 | # Matches NeMO version so people working on both codebases don't need two different version of black installed 18 | required_version = "19.10b0" 19 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE dataset.""" 4 | 5 | from abc import ABC 6 | from abc import abstractmethod 7 | 8 | from torch.utils.data import Dataset 9 | 10 | from megatron import print_rank_0 11 | from tasks.data_utils import build_sample 12 | from tasks.data_utils import build_tokens_types_paddings_from_text 13 | 14 | 15 | class GLUEAbstractDataset(ABC, Dataset): 16 | """GLUE base dataset class.""" 17 | 18 | def __init__(self, task_name, dataset_name, datapaths, 19 | tokenizer, max_seq_length): 20 | # Store inputs. 21 | self.task_name = task_name 22 | self.dataset_name = dataset_name 23 | self.tokenizer = tokenizer 24 | self.max_seq_length = max_seq_length 25 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 26 | self.dataset_name)) 27 | # Process the files. 
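        # Each datapath is parsed by the task-specific
        # process_samples_from_single_path() implemented by subclasses
        # such as MNLIDataset below, and the resulting samples are merged.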
28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | self.samples = [] 33 | for datapath in datapaths: 34 | self.samples.extend(self.process_samples_from_single_path(datapath)) 35 | print_rank_0(' >> total number of samples: {}'.format( 36 | len(self.samples))) 37 | 38 | def __len__(self): 39 | return len(self.samples) 40 | 41 | def __getitem__(self, idx): 42 | raw_sample = self.samples[idx] 43 | ids, types, paddings = build_tokens_types_paddings_from_text( 44 | raw_sample['text_a'], raw_sample['text_b'], 45 | self.tokenizer, self.max_seq_length) 46 | sample = build_sample(ids, types, paddings, 47 | raw_sample['label'], raw_sample['uid']) 48 | return sample 49 | 50 | @abstractmethod 51 | def process_samples_from_single_path(self, datapath): 52 | """Abstract method that takes a single path / filename and 53 | returns a list of dataset samples, each sample being a dict of 54 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 55 | """ 56 | pass 57 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """MNLI dataset.""" 4 | 5 | from megatron import print_rank_0 6 | from tasks.data_utils import clean_text 7 | from .data import GLUEAbstractDataset 8 | 9 | 10 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 11 | 12 | 13 | class MNLIDataset(GLUEAbstractDataset): 14 | 15 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 16 | test_label='contradiction'): 17 | self.test_label = test_label 18 | super().__init__('MNLI', name, datapaths, 19 | tokenizer, max_seq_length) 20 | 21 | def process_samples_from_single_path(self, filename): 22 | """"Implement abstract method.""" 23 | print_rank_0(' > Processing {} ...'.format(filename)) 24 | 25 | samples = [] 26 | total = 0 27 | first = True 28 | is_test = False 29 | with open(filename, 'r') as f: 30 | for line in f: 31 | row = line.strip().split('\t') 32 | if first: 33 | first = False 34 | if len(row) == 10: 35 | is_test = True 36 | print_rank_0( 37 | ' reading {}, {} and {} columns and setting ' 38 | 'labels to {}'.format( 39 | row[0].strip(), row[8].strip(), 40 | row[9].strip(), self.test_label)) 41 | else: 42 | print_rank_0(' reading {} , {}, {}, and {} columns ' 43 | '...'.format( 44 | row[0].strip(), row[8].strip(), 45 | row[9].strip(), row[-1].strip())) 46 | continue 47 | 48 | text_a = clean_text(row[8].strip()) 49 | text_b = clean_text(row[9].strip()) 50 | unique_id = int(row[0].strip()) 51 | label = row[-1].strip() 52 | if is_test: 53 | label = self.test_label 54 | 55 | assert len(text_a) > 0 56 | assert len(text_b) > 0 57 | assert label in LABELS 58 | assert unique_id >= 0 59 | 60 | sample = {'text_a': text_a, 61 | 'text_b': text_b, 62 | 'label': LABELS[label], 63 | 'uid': unique_id} 64 | total += 1 65 | samples.append(sample) 66 | 67 | if total % 50000 == 0: 68 | print_rank_0(' > processed {} so far ...'.format(total)) 69 | 70 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 71 | return samples 72 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) 
framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Model evaluation""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/msdp/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Run multi-stage dialogue prompting (MSDP).""" 4 | 5 | import os 6 | import sys 7 | sys.path.append(os.path.abspath(os.path.join( 8 | os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir))) 9 | from megatron import get_args 10 | from megatron.initialize import initialize_megatron 11 | 12 | 13 | def get_tasks_args(parser): 14 | """Provide extra arguments required for tasks.""" 15 | group = parser.add_argument_group(title='tasks') 16 | 17 | # parameters for the knowledgeable dialogue generation 18 | group.add_argument('--task', type=str, required=True, 19 | help='Task name.') 20 | group.add_argument("--sample-input-file", type=str, default=None, 21 | help='Get input from file instead of interactive mode, ' 22 | 'each line is an input.') 23 | group.add_argument("--sample-output-file", type=str, default=None, 24 | help='Output file got from --sample-input-file') 25 | group.add_argument('--prompt-file', type=str, default=None, 26 | help='prompting file') 27 | group.add_argument('--prompt-type', type=str, default=None, 28 | choices=['knowledge', 'response'], 29 | help='prompt type (knowledge or response)') 30 | group.add_argument('--num-prompt-examples', type=int, default=10, 31 | help='number of prompt examples') 32 | group.add_argument('--guess-file', type=str, default=None, 33 | help='datapath for generated sentences') 34 | group.add_argument('--answer-file', type=str, default=None, 35 | help='datapath for golden sentences') 36 | group.add_argument('--out-seq-length', type=int, default=100, 37 | help='output sequence length') 38 | group.add_argument('--api-prompt', default=False, action="store_true", 39 | help='setup model api for prompting') 40 | group.add_argument('--megatron-api-url', type=str, default=None, 41 | help='url of the megatron api') 42 | 43 | return parser 44 | 45 | 46 | if __name__ == '__main__': 47 | 48 | initialize_megatron(extra_args_provider=get_tasks_args) 49 | 50 | args = get_args() 51 | 52 | if args.num_layers_per_virtual_pipeline_stage is not None: 53 | print("Interleaved pipeline schedule is not yet supported for downstream tasks.") 54 | exit() 55 | 56 | if args.task == 'MSDP-PROMPT': 57 | from tasks.msdp.prompt import main 58 | 59 | elif args.task == 'MSDP-EVAL-F1': 60 | from tasks.msdp.evaluate import main 61 | 62 | else: 63 | raise NotImplementedError('Task {} is not implemented.'.format( 64 | args.task)) 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /tasks/msdp/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | # The following code is adapted from 3 | # https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 4 | # which is licensed under the MIT license. More details on the license can be 5 | # found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. 6 | 7 | """Provides standard metric evaluations for dialog.""" 8 | 9 | from collections import Counter 10 | from typing import List 11 | import numpy as np 12 | import re 13 | 14 | re_art = re.compile(r'\b(a|an|the)\b') 15 | re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') 16 | 17 | 18 | def normalize_answer(s): 19 | """ 20 | Lower text and remove punctuation, articles and extra whitespace. 21 | """ 22 | s = s.lower() 23 | s = re_punc.sub(' ', s) 24 | s = re_art.sub(' ', s) 25 | s = ' '.join(s.split()) 26 | return s 27 | 28 | 29 | class F1Metric: 30 | """ 31 | Helper class which computes token-level F1. 
32 | """ 33 | 34 | @staticmethod 35 | def _prec_recall_f1_score(pred_items, gold_items): 36 | """ 37 | Compute precision, recall and f1 given a set of gold and prediction items. 38 | :param pred_items: iterable of predicted values 39 | :param gold_items: iterable of gold values 40 | :return: tuple (p, r, f1) for precision, recall, f1 41 | """ 42 | common = Counter(gold_items) & Counter(pred_items) 43 | num_same = sum(common.values()) 44 | if num_same == 0: 45 | return 0, 0, 0 46 | precision = 1.0 * num_same / len(pred_items) 47 | recall = 1.0 * num_same / len(gold_items) 48 | f1 = (2 * precision * recall) / (precision + recall) 49 | return precision, recall, f1 50 | 51 | @staticmethod 52 | def compute_each_pair(guess: str, answer: str): 53 | if answer == "": 54 | return None, None, None 55 | if guess == "": 56 | return 0, 0, 0 57 | g_tokens = normalize_answer(guess).split() 58 | a_tokens = normalize_answer(answer).split() 59 | 60 | precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) 61 | return precision, recall, f1 62 | 63 | @staticmethod 64 | def compute_all_pairs(guesses: List[str], answers: List[str]): 65 | # additional augment: 66 | assert len(guesses) == len(answers) 67 | 68 | precision_list, recall_list, f1_list = [], [], [] 69 | for guess, answer in zip(guesses, answers): 70 | precision, recall, f1 = F1Metric.compute_each_pair(guess, answer) 71 | if precision is None or recall is None or f1 is None: 72 | continue 73 | precision_list.append(precision) 74 | recall_list.append(recall) 75 | f1_list.append(f1) 76 | 77 | return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) 78 | -------------------------------------------------------------------------------- /tasks/orqa/README.md: -------------------------------------------------------------------------------- 1 | ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering 2 | 3 | Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408). 4 | 5 | ## Retriever Training 6 | 7 | #### Unsupervised pretraining 8 | 1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. We construct two datasets, one with the title of every document and another with the body. 9 | 10 |
11 | python tools/preprocess_data.py \
12 |     --input /path/to/corpus.json \
13 |     --json-keys text title \
14 |     --split-sentences \
15 |     --tokenizer-type BertWordPieceLowerCase \
16 |     --vocab-file /path/to/vocab.txt \
17 |     --output-prefix corpus_indexed \
18 |     --workers 10
19 | 
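For reference, the loose JSON corpus passed via `--input` is expected to contain one document per line, each carrying the fields listed in `--json-keys`; a minimal, hypothetical two-line example:

<pre>
{"text": "Body of the first Wikipedia article ...", "title": "First article"}
{"text": "Body of the second Wikipedia article ...", "title": "Second article"}
</pre>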
20 | 21 | 2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs a single GPU 217M parameter biencoder model for ICT retriever training. Single GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script uses a pretrained BERT model and we use a total of batch size of 4096 for the ICT training. 22 | 23 | 3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf). 24 | 25 | #### Supervised finetuning 26 | 27 | 1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example for how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top [DPR training](https://arxiv.org/abs/2004.04906). 28 | 29 | 2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model. 30 | 31 | More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408). 32 | 33 | ## Reader Training 34 | 35 | The reader component will be available soon. 36 | 37 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron import get_args, print_rank_0 6 | from megatron.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Race.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.multiple_choice import MultipleChoice 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from tasks.race.data import RaceDataset 12 | from megatron.arguments import core_transformer_config_from_args 13 | 14 | 15 | def train_valid_datasets_provider(): 16 | """Provide train and validation datasets.""" 17 | args = get_args() 18 | tokenizer = get_tokenizer() 19 | 20 | train_dataset = RaceDataset('training', args.train_data, 21 | tokenizer, args.seq_length) 22 | valid_dataset = RaceDataset('validation', args.valid_data, 23 | tokenizer, args.seq_length) 24 | 25 | return train_dataset, valid_dataset 26 | 27 | 28 | def model_provider(pre_process=True, post_process=True): 29 | """Build the model.""" 30 | config = core_transformer_config_from_args(get_args()) 31 | print_rank_0('building multichoice model for RACE ...') 32 | model = MultipleChoice(config=config, 33 | num_tokentypes=2, 34 | pre_process=pre_process, 35 | post_process=post_process) 36 | 37 | return model 38 | 39 | 40 | def metrics_func_provider(): 41 | """Privde metrics callback function.""" 42 | args = get_args() 43 | tokenizer = get_tokenizer() 44 | 45 | def single_dataset_provider(datapath): 46 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 47 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 48 | 49 | return accuracy_func_provider(single_dataset_provider) 50 | 51 | 52 | def main(): 53 | 54 | finetune(train_valid_datasets_provider, model_provider, 55 | end_of_epoch_callback_provider=metrics_func_provider) 56 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.append( 9 | os.path.abspath( 10 | os.path.join( 11 | os.path.join(os.path.dirname(__file__), os.path.pardir), 12 | os.path.pardir, 13 | ) 14 | ) 15 | ) 16 | from megatron import get_args 17 | from megatron.initialize import initialize_megatron 18 | 19 | def get_tasks_args(parser): 20 | """Provide extra arguments required for tasks.""" 21 | group = parser.add_argument_group(title="tasks") 22 | 23 | group.add_argument('--task', type=str, default='segment', 24 | choices=['classify', 'segment_setr', 'segment_segformer'], 25 | help='task name.') 26 | group.add_argument("--epochs", type=int, default=None, 27 | help="Number of finetunning epochs. 
Zero results in " 28 | "evaluation only.") 29 | group.add_argument('--pretrained-checkpoint-type', type=str, default='default', 30 | choices=['default', 'external', 'constrastive'], 31 | help='Type of pretrained checkpoint') 32 | group.add_argument("--pretrained-checkpoint", type=str, default=None, 33 | help="Pretrained checkpoint used for finetunning.") 34 | group.add_argument('--seg-stride', type=int, default=None, 35 | help='sliding window stride during evaluation') 36 | return parser 37 | 38 | 39 | if __name__ == "__main__": 40 | 41 | initialize_megatron(extra_args_provider=get_tasks_args) 42 | args = get_args() 43 | 44 | if args.task == 'classify': 45 | from tasks.vision.classification.classification import main 46 | main() 47 | elif args.task == 'segment_setr': 48 | from tasks.vision.segmentation.finetune_setr import main 49 | main() 50 | elif args.task == 'segment_segformer': 51 | from tasks.vision.segmentation.finetune_segformer import main 52 | main() 53 | 54 | -------------------------------------------------------------------------------- /tasks/vision/segmentation/seg_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import math 3 | import einops 4 | import torch 5 | import apex 6 | import torch.nn.functional as F 7 | from megatron import get_args 8 | from megatron.model.module import MegatronModule 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3, mit_b5 11 | from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead 12 | 13 | 14 | class SetrSegmentationModel(MegatronModule): 15 | 16 | def __init__(self, 17 | num_classes, 18 | pre_process=True, 19 | post_process=True): 20 | super(SetrSegmentationModel, self).__init__() 21 | args = get_args() 22 | assert post_process & pre_process 23 | self.hidden_size = args.hidden_size 24 | self.num_classes = num_classes 25 | self.backbone = VitBackbone( 26 | pre_process=pre_process, 27 | post_process=post_process, 28 | class_token=False, 29 | post_layer_norm=False, 30 | drop_path_rate=0.1 31 | ) 32 | 33 | self.head = SetrSegmentationHead( 34 | self.hidden_size, 35 | self.num_classes 36 | ) 37 | 38 | def set_input_tensor(self, input_tensor): 39 | """See megatron.model.transformer.set_input_tensor()""" 40 | pass 41 | 42 | def forward(self, input): 43 | # [b hw c] 44 | hidden_states = self.backbone(input) 45 | result_final = self.head(hidden_states) 46 | return result_final 47 | 48 | 49 | class SegformerSegmentationModel(MegatronModule): 50 | 51 | def __init__(self, 52 | num_classes, 53 | pre_process=True, 54 | post_process=True): 55 | super(SegformerSegmentationModel, self).__init__() 56 | args = get_args() 57 | self.hidden_size = args.hidden_size 58 | self.num_classes = num_classes 59 | self.pre_process = pre_process 60 | self.post_process = post_process 61 | 62 | self.backbone = mit_b5() 63 | self.head = SegformerSegmentationHead( 64 | feature_strides=[4, 8, 16, 32], 65 | in_channels=[64, 128, 320, 512], 66 | embedding_dim=768, 67 | dropout_ratio=0.1 68 | ) 69 | 70 | def set_input_tensor(self, input_tensor): 71 | """See megatron.model.transformer.set_input_tensor()""" 72 | pass 73 | 74 | def forward(self, input): 75 | # [b hw c] 76 | hidden_states = self.backbone(input) 77 | hidden_states = self.head(hidden_states) 78 | return hidden_states 79 | 80 | 
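As a rough usage sketch (not taken from the repository), a finetuning task would typically expose one of the classes above through a `model_provider` callback; the snippet below assumes `initialize_megatron()` has already been called so that `get_args()` works inside the constructor, and the class count of 19 is only an illustrative value:

```python
# Hypothetical sketch: wires SegformerSegmentationModel into the usual
# model_provider pattern used by the finetuning entry points.
from tasks.vision.segmentation.seg_models import SegformerSegmentationModel

def model_provider(pre_process=True, post_process=True):
    # 19 classes (e.g. Cityscapes) is an assumed value, not a repository default.
    return SegformerSegmentationModel(num_classes=19,
                                      pre_process=pre_process,
                                      post_process=post_process)
```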
-------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? ") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/functional_tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/functional_tests/python_test_utils/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/check_slurm_job_completion.py: 
-------------------------------------------------------------------------------- 1 | """Check if a given slurm job id completed successfully 2 | Usage: 3 | python3 check_slurm_job_completion.py 4 | """ 5 | 6 | import sys 7 | import subprocess 8 | 9 | 10 | cmd = f"sacct -j {sys.argv[1]}" 11 | result = subprocess.check_output(cmd, shell=True).decode().split() 12 | assert len(result) > 14, "JOB state not available." 13 | 14 | status = result[19] 15 | exit_code = result[20] 16 | 17 | assert status == "COMPLETED", f"Job {sys.argv[1]} not completed." 18 | assert exit_code == "0:0", f"Job {sys.argv[1]} did not exit successfully." 19 | 20 | -------------------------------------------------------------------------------- /tests/functional_tests/shell_test_utils/jobwait.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | JOBID=$1 4 | echo "Job id : $JOBID" 5 | 6 | if [[ $JOBID -eq "" ]]; then 7 | exit 1 8 | fi 9 | 10 | sleep 10s 11 | 12 | while true; do 13 | export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1` 14 | case "${STATE}" in 15 | PENDING|RUNNING|REQUEUED) 16 | echo "Job is still in $STATE" 17 | sleep 15s 18 | ;; 19 | *) 20 | sleep 30s 21 | echo "Exiting with SLURM job status '${STATE}'" 22 | exit 0 23 | ;; 24 | esac 25 | done 26 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50685, 10.49816, 10.47982, 10.48566, 10.49535, 10.46661, 10.42394, 10.30692, 10.15978, 9.96955]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [19029.0, 19175.0, 22073.0, 18475.0, 20839.0, 23168.0, 22721.0]}, "iteration_timing_avg": 0.4121861764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46274, 10.31498, 10.17119, 9.97324]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22611.0, 20670.0, 26082.0, 23626.0, 21993.0, 21751.0, 23179.0]}, "iteration_timing_avg": 0.874113823529412} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44877, 10.43852, 10.44018, 10.44114, 10.45622, 10.44144, 10.39044, 10.25684, 10.133, 9.95743]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [28181.0, 20629.0, 28241.0, 26287.0, 24057.0, 21016.0, 21238.0]}, "iteration_timing_avg": 0.7704600000000001} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.48784, 10.4873, 10.50417, 10.49446, 10.47819, 10.41361, 10.28135, 
10.14425, 9.94149]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26852.0, 19803.0, 25492.0, 24594.0, 21586.0, 19658.0, 20766.0]}, "iteration_timing_avg": 1.4250708823529417} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 44, "step_interval": 5, "values": [10.84008, 10.89053, 10.90905, 10.87934, 10.86562, 10.83752, 10.64582, 10.62397, 10.53554]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [2078.0, 2320.0, 2519.0, 2248.0, 2127.0, 1987.0]}, "iteration_timing_avg": 0.09863333333333332} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps_core_enabled.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [10.83091, 10.87024, 10.89161, 10.81277, 10.6858, 10.61231, 10.09495, 10.21817]}, "num-zeros": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [1545.0, 1840.0, 1867.0, 1825.0, 1867.0, 1668.0, 1576.0, 1932.0]}, "iteration_timing_avg": 0.09399846153846156} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps_core_enabled_rope_embeddings.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [10.84538, 10.87913, 10.90387, 10.8235, 10.67913, 10.60602, 10.06785, 10.19695]}, "num-zeros": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [1722.0, 2142.0, 2149.0, 1966.0, 2038.0, 1914.0, 1745.0, 1956.0]}, "iteration_timing_avg": 0.10455653846153849} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 47, "step_interval": 5, "values": [10.81244, 10.87437, 10.90132, 10.84946, 10.84826, 10.81872, 10.61811, 10.61053, 10.52823, 10.22961]}, "num-zeros": {"start_step": 0, "end_step": 30, "step_interval": 5, "values": [2356.0, 2601.0, 2778.0, 2282.0, 2350.0, 2782.0]}, "iteration_timing_avg": 0.12793593749999999} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [10.79471, 10.86601, 10.89073, 10.78482, 10.6587, 10.58125]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [1609.0, 1850.0, 1921.0, 1942.0, 1853.0, 1674.0]}, "iteration_timing_avg": 0.12440000000000001} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_disable_bias_linear.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [10.79474, 10.86607, 
10.8908, 10.7851, 10.65905, 10.58193]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [1587.0, 1824.0, 2006.0, 1919.0, 1874.0, 1646.0]}, "iteration_timing_avg": 0.12088222222222227} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_sequence_parallel.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 41, "step_interval": 5, "values": [10.79471, 10.86601, 10.89073, 10.78482, 10.6587, 10.58125, 10.0813, 10.19422, 10.13437]}, "num-zeros": {"start_step": 0, "end_step": 41, "step_interval": 5, "values": [1609.0, 1850.0, 1921.0, 1942.0, 1853.0, 1674.0, 1544.0, 1884.0, 2438.0]}, "iteration_timing_avg": 0.12650857142857144} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_swiglu.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [10.73442, 10.82095, 10.84047, 10.75831, 10.70386, 10.63718, 10.20959, 10.36611]}, "num-zeros": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [2625.0, 2815.0, 2837.0, 2870.0, 2755.0, 2617.0, 2345.0, 2529.0]}, "iteration_timing_avg": 0.1255659259259259} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_untie_embeddings_and_outputs.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [10.89427, 10.9106, 10.917, 10.84465, 10.70825, 10.63519, 10.15543, 10.26206]}, "num-zeros": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [22727188.0, 23020756.0, 22501138.0, 22830610.0, 22739638.0, 22547160.0, 22955250.0, 22589434.0]}, "iteration_timing_avg": 0.12411037037037034} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85543, 10.89355, 10.87608, 10.87365, 10.88042, 10.84182, 10.67177, 10.62854, 10.52511, 10.25229]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2470.0, 2444.0, 2570.0, 2192.0, 2241.0, 2574.0, 2476.0]}, "iteration_timing_avg": 0.14008088235294117} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps_core_enabled.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92215, 10.93714, 10.89742, 10.87588, 10.75165, 10.65713, 10.1606, 10.24967, 10.15339, 9.84198]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1655.0, 1837.0, 1968.0, 1854.0, 1811.0, 1810.0, 1593.0, 1997.0, 2315.0, 2343.0]}, "iteration_timing_avg": 0.13743323529411763} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps_te_enabled.json: 
-------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8559, 10.89255, 10.8665, 10.81693, 10.69856, 10.60955, 10.10845, 10.21443, 10.12855, 9.80126]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1693.0, 1878.0, 1977.0, 1871.0, 2022.0, 1716.0, 1646.0, 2006.0, 2280.0, 2365.0]}, "iteration_timing_avg": 0.12973323529411762} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85921, 10.8797, 10.87381, 10.88658, 10.88912, 10.84826, 10.68571, 10.62946, 10.54289, 10.26918]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2289.0, 2368.0, 2427.0, 2023.0, 2234.0, 2501.0, 2316.0]}, "iteration_timing_avg": 0.20419529411764706} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps_core_enabled.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86168, 10.88879, 10.87894, 10.8312, 10.71384, 10.61221, 10.13333, 10.23204, 10.16051, 9.83654]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1854.0, 2137.0, 2162.0, 2176.0, 2072.0, 1947.0, 1702.0, 2222.0, 2457.0, 2535.0]}, "iteration_timing_avg": 0.20128235294117644} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -x 3 | 4 | DATA_PATH=$1 5 | CHECKPOINT_PATH=$2 6 | TENSORBOARD_DIR=$3 7 | TP_SIZE=$4 8 | PP_SIZE=$5 9 | NNODES=$6 10 | MAX_STEPS=$7 11 | VP_SIZE=$8 12 | GPUS_PER_NODE=8 13 | # Change for multinode config 14 | MASTER_ADDR=localhost 15 | MASTER_PORT=6000 16 | NODE_RANK=0 17 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 18 | export CUDA_DEVICE_MAX_CONNECTIONS=1 19 | 20 | 21 | # Runs the "345M" parameter model 22 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES" 23 | 24 | torchrun $DISTRIBUTED_ARGS \ 25 | pretrain_bert.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --log-params-norm \ 30 | --log-num-zeros-in-grad \ 31 | --log-validation-ppl-to-tensorboard \ 32 | --log-timers-to-tensorboard \ 33 | --tensorboard-dir ${TENSORBOARD_DIR} \ 34 | --micro-batch-size 4 \ 35 | --global-batch-size 128 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --train-iters $MAX_STEPS \ 39 | --timing-log-level 2 \ 40 | --lr-decay-iters 990000 \ 41 | --save $CHECKPOINT_PATH \ 42 | --load $CHECKPOINT_PATH \ 43 | --data-path $DATA_PATH \ 44 | --vocab-file /workspace/data/bert_data/vocab.txt \ 45 | --data-impl mmap \ 46 | --split 949,50,1 \ 47 | --distributed-backend nccl \ 48 | --lr 0.0001 \ 49 | --min-lr 0.00001 \ 50 | --lr-warmup-fraction 0.01 \ 51 | --log-interval 1 \ 52 | --save-interval 10000 \ 53 | --eval-interval 1000 \ 54 | --eval-iters 10 \ 55 | --tensor-model-parallel-size $TP_SIZE \ 56 | --pipeline-model-parallel-size $PP_SIZE \ 57 | ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ 58 | --no-gradient-accumulation-fusion \ 59 | --fp16 -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr_nlp_llmnext 5 | #SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | echo 'Running tests using $PYTORCH_IMAGE image' 14 | 15 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 16 | ls 17 | cd /workspace/megatron-lm 18 | ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr_nlp_llmnext 5 | #SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | echo 'Running tests using $PYTORCH_IMAGE image' 14 | 15 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out 
--container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 16 | ls 17 | cd /workspace/megatron-lm 18 | ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -x 3 | 4 | DATA_PATH=$1 5 | CHECKPOINT_PATH=$2 6 | TENSORBOARD_DIR=$3 7 | USE_TE=$4 8 | TP_SIZE=$5 9 | PP_SIZE=$6 10 | NNODES=$7 11 | MAX_STEPS=$8 12 | USE_CORE=$9 13 | VP_SIZE=${10} 14 | MBS=${11} 15 | GBS=${12} 16 | ADDITIONAL_PARAMS=${13} 17 | GPUS_PER_NODE=8 18 | # Change for multinode config 19 | MASTER_ADDR=localhost 20 | MASTER_PORT=6000 21 | NODE_RANK=0 22 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 23 | export CUDA_DEVICE_MAX_CONNECTIONS=1 24 | 25 | TRANSFORMER_IMPL=local 26 | TRAINING_DTYPE=fp16 27 | CALLING_SCRIPT=pretrain_gpt.py 28 | 29 | if [[ $USE_CORE -eq 1 ]]; then 30 | echo "Running using megatron core" 31 | TRANSFORMER_IMPL=local 32 | TRAINING_DTYPE=bf16 33 | CALLING_SCRIPT=pretrain_gpt_core.py 34 | export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 35 | fi 36 | 37 | if [[ $USE_TE -eq 1 ]]; then 38 | echo "Running with TransformerEngine ..." 39 | TRANSFORMER_IMPL=transformer_engine 40 | TRAINING_DTYPE=bf16 41 | else 42 | echo "Running with local transformer implementation ..." 43 | fi 44 | 45 | # Runs the "345M" parameter model 46 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES" 47 | 48 | torchrun $DISTRIBUTED_ARGS \ 49 | $CALLING_SCRIPT \ 50 | --num-layers 12 \ 51 | --hidden-size 512 \ 52 | --num-attention-heads 8 \ 53 | --log-params-norm \ 54 | --log-num-zeros-in-grad \ 55 | --log-validation-ppl-to-tensorboard \ 56 | --log-timers-to-tensorboard \ 57 | --tensorboard-dir ${TENSORBOARD_DIR} \ 58 | --micro-batch-size ${MBS:-4} \ 59 | --global-batch-size ${GBS:-32} \ 60 | --seq-length 1024 \ 61 | --max-position-embeddings 1024 \ 62 | --train-iters $MAX_STEPS \ 63 | --timing-log-level 2 \ 64 | --lr-decay-iters 320000 \ 65 | --save $CHECKPOINT_PATH \ 66 | --load $CHECKPOINT_PATH \ 67 | --data-path $DATA_PATH \ 68 | --vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \ 69 | --merge-file /workspace/data/gpt3_data/gpt2-merges.txt \ 70 | --data-impl mmap \ 71 | --split 949,50,1 \ 72 | --distributed-backend nccl \ 73 | --lr 0.00015 \ 74 | --lr-decay-style cosine \ 75 | --min-lr 1.0e-5 \ 76 | --weight-decay 1e-2 \ 77 | --clip-grad 1.0 \ 78 | --lr-warmup-fraction .01 \ 79 | --log-interval 1 \ 80 | --save-interval 10000 \ 81 | --eval-interval 1000 \ 82 | --eval-iters 10 \ 83 | --transformer-impl $TRANSFORMER_IMPL \ 84 | --tensor-model-parallel-size $TP_SIZE \ 85 | --pipeline-model-parallel-size $PP_SIZE \ 86 | ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ 87 | ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ 88 | --no-gradient-accumulation-fusion \ 89 | --${TRAINING_DTYPE} 90 | -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | 
#SBATCH --account=adlr_nlp_llmnext 5 | #SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | echo "Running tests using $PYTORCH_IMAGE image" 14 | 15 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 16 | ls 17 | cd /workspace/megatron-lm 18 | ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr_nlp_llmnext 5 | #SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | if [[ -z $MBS ]]; then MBS=4; fi 14 | if [[ -z $GBS ]]; then GBS=32; fi 15 | 16 | if [[ -z $VP_SIZE ]]; then VP_SIZE="" ; fi 17 | 18 | echo "Running tests using $PYTORCH_IMAGE image" 19 | 20 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 21 | ls 22 | cd /workspace/megatron-lm 23 | ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $USE_CORE \"$VP_SIZE\" \"$MBS\" \"$GBS\" \"$ADDITIONAL_PARAMS\"" 24 | -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/unit_tests/models/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/models/test_gpt_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2 | 3 | import pytest 4 | 5 | import torch 6 | 7 | from megatron.core.transformer.transformer_config import TransformerConfig 8 | from megatron.core.models.gpt.gpt_embedding import GPTEmbedding 9 | from tests.unit_tests.test_utilities import Utils 10 | 11 | class TestGPTEmbedding: 12 | 13 | def setup_method(self, method): 14 | Utils.initialize_model_parallel(1,1) 15 | transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) 16 | self.gpt_embedding = GPTEmbedding(config=transformer_config, vocab_size=100, max_sequence_length=4, add_position_embedding=True) 17 | 18 | def teardown_method(self, method): 19 | Utils.destroy_model_parallel() 20 | 21 | def test_constructor(self): 22 | assert isinstance(self.gpt_embedding, GPTEmbedding) 23 | num_weights = sum([p.numel() for p in self.gpt_embedding.parameters()]) 24 | assert num_weights == 1248 25 | 26 | def test_zero_parameters(self): 27 | sum_weights = sum([p.sum() for p in self.gpt_embedding.parameters()]) 28 | assert sum_weights != 0 29 | self.gpt_embedding.zero_parameters() 30 | sum_weights = sum([p.sum() for p in self.gpt_embedding.parameters()]) 31 | assert sum_weights == 0 32 | 33 | def test_cpu_forward(self): 34 | input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)) 35 | position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)) 36 | embeddings = self.gpt_embedding(input_ids, position_ids) 37 | assert embeddings.device.type == 'cpu' 38 | assert embeddings.shape[0] == self.gpt_embedding.max_sequence_length 39 | assert embeddings.shape[1] == input_ids.shape[0] 40 | assert embeddings.shape[2] == self.gpt_embedding.config.hidden_size 41 | 42 | def test_gpu_forward(self): 43 | self.gpt_embedding.cuda() 44 | input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() 45 | position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() 46 | embeddings = self.gpt_embedding(input_ids, position_ids) 47 | assert embeddings.device.type == 'cuda' 48 | assert embeddings.shape[0] == self.gpt_embedding.max_sequence_length 49 | assert embeddings.shape[1] == input_ids.shape[0] 50 | assert embeddings.shape[2] == self.gpt_embedding.config.hidden_size -------------------------------------------------------------------------------- /tests/unit_tests/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/unit_tests/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | import numpy as np 5 | 6 | def test_vocab_parallel_cross_entropy(): 7 | Utils.initialize_model_parallel(4,2) 8 | vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() 9 | target = torch.arange(0,32,2).cuda() 10 | output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) 11 | expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, 12 | 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() 13 | assert(torch.equal(torch.round(expected_output), torch.round(output))) 14 | 
Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_data.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.data import broadcast_data 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | 5 | def test_broadcast_data(): 6 | Utils.initialize_model_parallel(2,4) 7 | input_data = { 8 | 0 : torch.ones((8,8)).cuda() * 0.0, 9 | 1 : torch.ones((8,8)).cuda() * 1.0, 10 | 2 : torch.ones((8,8)).cuda() * 2.0, 11 | 3 : torch.ones((8,8)).cuda() * 3.0, 12 | 4 : torch.ones((8,8)).cuda() * 4.0, 13 | 5 : torch.ones((8,8)).cuda() * 5.0, 14 | 6 : torch.ones((8,8)).cuda() * 6.0, 15 | 7 : torch.ones((8,8)).cuda() * 7.0 16 | } 17 | dtype = torch.float32 18 | actual_output = broadcast_data([0,1],input_data, dtype) 19 | assert(torch.equal(actual_output[0], input_data[0])) 20 | assert(torch.equal(actual_output[1], input_data[1])) 21 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_random.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.random import CudaRNGStatesTracker 2 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 3 | from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER 4 | from megatron.core.tensor_parallel.random import checkpoint 5 | from tests.unit_tests.test_utilities import Utils 6 | import pytest 7 | import torch 8 | 9 | def test_cuda_rng_states_tracker(): 10 | rng_tracker = CudaRNGStatesTracker() 11 | rng_tracker.set_states({"state1":1234}) 12 | assert(rng_tracker.get_states()["state1"] == 1234) 13 | rng_tracker.reset() 14 | assert(rng_tracker.get_states() == {}) 15 | seed = 1111 16 | rng_tracker.add("state2",seed) 17 | with pytest.raises(Exception): 18 | assert(rng_tracker.add("state3",seed)) 19 | with pytest.raises(Exception): 20 | assert(rng_tracker.add("state2",111)) 21 | assert(rng_tracker.get_states()['state2'] is not None) 22 | with pytest.raises(Exception): 23 | assert() 24 | 25 | rng_tracker.fork("state2") 26 | torch.cuda.manual_seed(seed) 27 | rng_state = torch.cuda.get_rng_state() 28 | assert torch.equal(rng_tracker.get_states()['state2'], rng_state) 29 | 30 | def test_model_parallel_cuda_manual_seed(): 31 | Utils.initialize_model_parallel(4,2) 32 | model_parallel_cuda_manual_seed(0) 33 | assert(_CUDA_RNG_STATE_TRACKER.get_states()['model-parallel-rng'] is not None) 34 | Utils.destroy_model_parallel() 35 | 36 | def test_checkpoint(): 37 | def test_forward(*input): 38 | return input[0]+input[1] 39 | assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) 40 | Utils.initialize_model_parallel() 41 | input1 = torch.ones((4,4)) 42 | checkpoint(test_forward, True, input1, torch.ones((4,4))*2) 43 | assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) 44 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import megatron.core.tensor_parallel.utils as util 3 | import megatron.core.parallel_state as ps 4 | from tests.unit_tests.test_utilities import Utils 5 | 6 | rank = Utils.rank 7 | 8 | def 
test_split_tensor_along_last_dim(): 9 | input_tensor = torch.rand((3,4)) 10 | assert torch.equal(input_tensor[:, 0:2], util.split_tensor_along_last_dim(input_tensor,2)[0]) 11 | assert torch.equal(input_tensor[:, 2:], util.split_tensor_along_last_dim(input_tensor,2)[1]) 12 | 13 | def test_split_tensor_into_1d_equal_chunks(): 14 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 15 | input_tensor = torch.rand((3,4)) 16 | output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor) 17 | if rank % 2 == 0 : 18 | start = 0 19 | end = int(input_tensor.numel()/2) 20 | else : 21 | start = int(input_tensor.numel()/2) 22 | end = input_tensor.numel() 23 | 24 | assert torch.equal(output_tensor, input_tensor.flatten()[start:end]) 25 | Utils.destroy_model_parallel() 26 | 27 | def test_gather_split_1d_tensor(): 28 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 29 | input_tensor = torch.ones((2,4)).cuda() * rank 30 | actual_output_tensor = util.gather_split_1d_tensor(input_tensor) 31 | if rank %2 == 0: 32 | expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1)) 33 | else : 34 | expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten())) 35 | assert(torch.equal(actual_output_tensor, expected_output_tensor)) 36 | Utils.destroy_model_parallel() 37 | 38 | def test_vocab(): 39 | global_vocab_size = 1600 40 | per_partition_vocab_size = 1600 / Utils.world_size 41 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size))) 42 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size))) 43 | -------------------------------------------------------------------------------- /tests/unit_tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/unit_tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import megatron.core.parallel_state as ps 4 | 5 | class Utils: 6 | 7 | world_size = torch.cuda.device_count() 8 | rank = int(os.environ['LOCAL_RANK']) 9 | 10 | @staticmethod 11 | def initialize_distributed(): 12 | print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') 13 | torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) 14 | init_method = 'tcp://' 15 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 16 | master_port = os.getenv('MASTER_PORT', '6000') 17 | init_method += master_ip + ':' + master_port 18 | torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) 19 | 20 | @staticmethod 21 | def destroy_model_parallel(): 22 | ps.destroy_model_parallel() 23 | torch.distributed.barrier() 24 | 25 | @staticmethod 26 | def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): 27 | ps.destroy_model_parallel() 28 | if not torch.distributed.is_initialized(): 29 | Utils.initialize_distributed() 30 |
ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) -------------------------------------------------------------------------------- /tests/unit_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import megatron.core.utils as util 4 | import numpy as np 5 | 6 | def test_divide_properly(): 7 | assert util.divide(4,2) == 2 8 | 9 | def test_divide_improperly(): 10 | with pytest.raises(AssertionError): 11 | util.divide(4,5) 12 | 13 | def test_global_memory_buffer(): 14 | global_memory_buffer = util.GlobalMemoryBuffer() 15 | obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor") 16 | expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device()) 17 | assert torch.equal(obtained_tensor, expected_tensor) 18 | 19 | def test_make_viewless_tensor(): 20 | inp = torch.rand((3,4)) 21 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True))) 22 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False))) 23 | 24 | def test_safely_set_viewless_tensor_data(): 25 | tensor = torch.zeros((3,4)) 26 | new_data_tensor = torch.tensor(np.random.rand(3,4)) 27 | util.safely_set_viewless_tensor_data(tensor, new_data_tensor) 28 | assert(torch.equal(tensor, new_data_tensor)) 29 | 30 | def test_assert_viewless_tensor(): 31 | tensor = torch.rand((3,4)) 32 | assert(torch.equal(util.assert_viewless_tensor(tensor), tensor)) 33 | input_tensor_list=[tensor,tensor,tensor] 34 | output_tensor_list = util.assert_viewless_tensor(input_tensor_list) 35 | for inp,out in zip(input_tensor_list, output_tensor_list): 36 | assert(torch.equal(inp,out)) 37 | -------------------------------------------------------------------------------- /tests/unit_tests/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/unit_tests/transformer/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/transformer/test_core_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | import pytest 5 | 6 | import torch 7 | 8 | from megatron.core.transformer.attention import CrossAttention 9 | """ 10 | 11 | @pytest.fixture 12 | def core_attention(transformer_config): 13 | return CrossAttention(transformer_config) 14 | 15 | 16 | class TestCoreAttention: 17 | def test_constructor(self, core_attention): 18 | assert isinstance(core_attention, CrossAttention) 19 | assert core_attention.layer_number == 1 20 | 21 | num_weights = sum([p.numel() for p in core_attention.parameters()]) 22 | assert num_weights == 0 23 | 24 | def test_cpu_forward(self, core_attention): 25 | # we can't currently do this because the global memory buffer is on GPU 26 | pass 27 | 28 | def test_gpu_forward(self, core_attention): 29 | 30 | # destroy_global_memory_buffer() 31 | # _set_global_memory_buffer() 32 | # model_parallel_cuda_manual_seed(123) 33 | 34 | core_attention.cuda() 35 | config = core_attention.config 36 | sequence_length = 32 37 | micro_batch_size = 2 38 | # query_layer (float): [sequence_length, micro_batch_size, num_attention_heads, hidden_size / num_attention_heads] 39 | query_layer = torch.ones( 40 | ( 41 | sequence_length, 42 | micro_batch_size, 43 | config.num_attention_heads, 44 | config.hidden_size // config.num_attention_heads, 45 | ) 46 | ).cuda() 47 | 48 | key_layer = torch.ones_like(query_layer).cuda() 49 | 50 | value_layer = torch.ones_like(query_layer).cuda() 51 | 52 | attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() 53 | 54 | context_layer = core_attention( 55 | query_layer=query_layer, key_layer=key_layer, value_layer=value_layer, attention_mask=attention_mask 56 | ) 57 | 58 | assert context_layer.shape[0] == sequence_length 59 | assert context_layer.shape[1] == micro_batch_size 60 | assert context_layer.shape[2] == config.hidden_size 61 | assert context_layer.device.type == 'cuda' 62 | assert context_layer.dtype == torch.float32 63 | 64 | """ -------------------------------------------------------------------------------- /tests/unit_tests/transformer/test_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import pytest 4 | 5 | import torch 6 | 7 | from megatron.core.transformer.mlp import MLP 8 | from tests.unit_tests.test_utilities import Utils 9 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 10 | from megatron.core.transformer.transformer_config import TransformerConfig 11 | 12 | class TestParallelMLP: 13 | 14 | def setup_method(self, method): 15 | Utils.initialize_model_parallel(1,1) 16 | model_parallel_cuda_manual_seed(123) 17 | transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) 18 | self.mlp = MLP(transformer_config) 19 | 20 | def teardown_method(self, method): 21 | Utils.destroy_model_parallel() 22 | 23 | def test_constructor(self): 24 | assert isinstance(self.mlp, MLP) 25 | 26 | num_weights = sum([p.numel() for p in self.mlp.parameters()]) 27 | assert num_weights == 1236 28 | 29 | """ 30 | def test_cpu_forward(self, mlp): 31 | # [sequence length, micro batch size, hidden size] 32 | hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) 33 | output, output_bias = mlp(hidden_states) 34 | assert output.shape[0] == 32 35 | assert output.shape[1] == 2 36 | assert output.shape[2] == mlp.config.hidden_size 37 | assert output_bias.shape[0] == mlp.config.hidden_size 38 | assert output.dtype == torch.float32 39 | """ 40 | 41 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") 42 | def test_gpu_forward(self): 43 | mlp = self.mlp 44 | mlp.cuda() 45 | # [sequence length, batch size, hidden size] 46 | hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) 47 | hidden_states = hidden_states.cuda() 48 | output, output_bias = mlp(hidden_states) 49 | assert output.shape[0] == 32 50 | assert output.shape[1] == 2 51 | assert output.shape[2] == mlp.config.hidden_size 52 | assert output_bias.shape[0] == mlp.config.hidden_size 53 | assert output.dtype == torch.float32 54 | assert output.device.type == 'cuda' 55 | assert output_bias.device.type == 'cuda' 56 | 57 | -------------------------------------------------------------------------------- /tests/unit_tests/transformer/test_transformer_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | import pytest 5 | 6 | import torch 7 | 8 | from megatron.core.transformer.transformer_config import TransformerConfig 9 | from megatron.core.transformer.transformer_layer import TransformerLayer 10 | from tests.unit_tests.test_utilities import Utils 11 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 12 | from megatron.core.transformer.transformer_config import TransformerConfig 13 | 14 | 15 | 16 | class TestParallelTransformerLayer: 17 | 18 | def setup_method(self, method): 19 | Utils.initialize_model_parallel(1,1) 20 | model_parallel_cuda_manual_seed(123) 21 | transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) 22 | self.parallel_transformer_layer = TransformerLayer(transformer_config) 23 | 24 | def teardown_method(self, method): 25 | Utils.destroy_model_parallel() 26 | 27 | def test_constructor(self): 28 | parallel_transformer_layer = self.parallel_transformer_layer 29 | assert isinstance(parallel_transformer_layer, TransformerLayer) 30 | assert parallel_transformer_layer.layer_number == 1 31 | 32 | num_weights = sum([p.numel() for p in parallel_transformer_layer.parameters()]) 33 | assert num_weights == 1884 34 | 35 | def test_gpu_forward(self): 36 | parallel_transformer_layer = self.parallel_transformer_layer 37 | config: TransformerConfig = parallel_transformer_layer.config 38 | sequence_length = 32 39 | micro_batch_size = 2 40 | parallel_transformer_layer.cuda() 41 | 42 | # [sequence length, batch size, hidden size] 43 | hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) 44 | hidden_states = hidden_states.cuda() 45 | 46 | attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() 47 | 48 | hidden_states = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) 49 | assert hidden_states.shape[0] == sequence_length 50 | assert hidden_states.shape[1] == micro_batch_size 51 | assert hidden_states.shape[2] == config.hidden_size 52 | -------------------------------------------------------------------------------- /tools/autoformat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 4 | 5 | # for now we just format core 6 | 7 | black ${SCRIPT_DIR}/../megatron/core 8 | isort ${SCRIPT_DIR}/../megatron/core 9 | -------------------------------------------------------------------------------- /tools/bert_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder 4 | -------------------------------------------------------------------------------- /tools/bert_embedding/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from megatron import get_args, get_tokenizer 7 | from megatron.data.bert_dataset import build_training_sample 8 | 9 | 10 | class BertEmbeddingDataset(torch.utils.data.Dataset): 11 | '''Dataset to convert a text dataset to Bert tokens.''' 12 | 13 | def __init__(self, text_dataset, max_seq_length): 14 | 15 | super().__init__() 16 | 17 | args = get_args() 18 | 19 | # Dataset, tokenizer. 
20 | self.text_dataset = text_dataset 21 | self.bert_tokenizer = get_tokenizer() 22 | 23 | # Params to store. 24 | self.max_seq_length = max_seq_length 25 | self.seed = args.seed 26 | self.masked_lm_prob = args.mask_prob 27 | 28 | # Vocab stuff. 29 | self.vocab_id_list = list(self.bert_tokenizer.inv_vocab.keys()) 30 | self.vocab_id_to_token_dict = self.bert_tokenizer.inv_vocab 31 | self.cls_id = self.bert_tokenizer.cls 32 | self.sep_id = self.bert_tokenizer.sep 33 | self.mask_id = self.bert_tokenizer.mask 34 | self.pad_id = self.bert_tokenizer.pad 35 | 36 | def __len__(self): 37 | return len(self.text_dataset) 38 | 39 | def __getitem__(self, idx): 40 | 41 | # Text. 42 | text_sample = self.text_dataset[idx] 43 | text = text_sample["text"] 44 | text = text.replace("<|endoftext|>", "") 45 | 46 | # Bert/Wordpiece tokens (+truncate). 47 | bert_token_ids = self.bert_tokenizer.tokenize(text) 48 | bert_token_ids = bert_token_ids[:self.max_seq_length - 2] # cls+sep. 49 | if not bert_token_ids: 50 | bert_token_ids = [ self.bert_tokenizer.pad_id ] # hack when empty seq 51 | 52 | # Note that this rng state should be numpy and not python since 53 | # python randint is inclusive whereas the numpy one is exclusive. 54 | # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 55 | np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) 56 | 57 | # Build sample. 58 | sample = build_training_sample([bert_token_ids], 59 | len(bert_token_ids), 60 | len(bert_token_ids) + 2, # for cls+sep 61 | self.vocab_id_list, 62 | self.vocab_id_to_token_dict, 63 | self.cls_id, self.sep_id, 64 | self.mask_id, self.pad_id, 65 | self.masked_lm_prob, np_rng, 66 | binary_head=False) 67 | sample["seq_length"] = len(sample["text"]) 68 | return sample 69 | -------------------------------------------------------------------------------- /tools/bert_embedding/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "h5py", 7 | "transformers", # for huggingface bert 8 | ] 9 | 10 | for lib in required_libs: 11 | try: 12 | globals()[lib] = importlib.import_module(lib) 13 | except ImportError as e: 14 | raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. 
Tried importing '{lib}'.") 15 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = ['autopep8', '--max-line-length', '100', '--aggressive', '--in-place', py_path] 32 | subprocess.check_call(command) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/merge_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 6 | os.path.pardir))) 7 | 8 | from megatron.data import indexed_dataset 9 | 10 | 11 | def main(args): 12 | 13 | prefixes = set() 14 | for basename in os.listdir(args.input): 15 | prefix, ext = os.path.splitext(basename) 16 | 17 | if prefix in prefixes: 18 | continue 19 | 20 | if not os.path.isfile(os.path.join(args.input, basename)): 21 | continue 22 | 23 | ext_pair = '.bin' if ext == '.idx' else '.idx' 24 | assert os.path.isfile(os.path.join(args.input, prefix) + ext_pair), \ 25 | f'ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}' 26 | 27 | prefixes.add(prefix) 28 | 29 | builder = None 30 | for prefix in sorted(prefixes): 31 | if builder is None: 32 | dataset = indexed_dataset.make_dataset(os.path.join(args.input, prefix), 'infer') 33 | 34 | if isinstance(dataset, indexed_dataset.MMapIndexedDataset): 35 | builder = indexed_dataset.MMapIndexedDatasetBuilder(args.output_prefix + '.bin', dtype=dataset._index.dtype) 36 | else: 37 | builder = indexed_dataset.IndexedDatasetBuilder(args.output_prefix + '.bin') 38 | 39 | del dataset 40 | 41 | builder.merge_file_(os.path.join(args.input, prefix)) 42 | 43 | builder.finalize(args.output_prefix + '.idx') 44 | 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser() 48 | 49 | group = parser.add_argument_group(title='input data') 50 | group.add_argument('--input', type=str, required=True, 51 | help='Path to directory containing all document files to merge') 52 | 53 | group = parser.add_argument_group(title='output data') 54 | group.add_argument('--output-prefix', type=str, required=True, 55 | help='Path to binary output file without suffix') 56 | 57 | args = parser.parse_args() 58 | 59 | assert
os.path.isdir(args.input), \ 60 | f'ERROR: {args.input} is not a directory or does not exist' 61 | 62 | assert os.path.isdir(os.path.dirname(args.output_prefix)), \ 63 | f'ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist' 64 | 65 | main(args) 66 | 67 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | 8 | """ 9 | This code adds id to each json object in a json file. User can add prefix 10 | to the ids. 11 | """ 12 | 13 | if __name__ == '__main__': 14 | 15 | print('parsing the arguments ...') 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 19 | ' json file where id needs to be added') 20 | parser.add_argument('--output-file', type=str, default=None, help=\ 21 | 'Output file name with id') 22 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 23 | 'Id prefix') 24 | parser.add_argument('--log-interval', type=int, default=100, 25 | help='Log interval') 26 | args = parser.parse_args() 27 | 28 | print('Adding ids to dataset ...') 29 | 30 | f_input = open(args.input_file, 'r', encoding='utf-8') 31 | f_output = open(args.output_file, 'wb') 32 | 33 | unique_ids = 1 34 | start_time = time.time() 35 | for row in f_input: 36 | each_row = json.loads(row) 37 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 38 | each_row['adlr_id'] = adlr_id_string 39 | myjson = json.dumps(each_row, ensure_ascii=False) 40 | 41 | f_output.write(myjson.encode('utf-8')) 42 | f_output.write('\n'.encode('utf-8')) 43 | 44 | if unique_ids % args.log_interval == 0: 45 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 46 | unique_ids, time.time() - start_time), flush=True) 47 | 48 | unique_ids += 1 49 | 50 | # Close the file. 51 | f_input.close() 52 | f_output.close() 53 | 54 | print('done :-)', flush=True) 55 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | import glob 5 | import sys 6 | import json 7 | import argparse 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--json_path", type=str, default=".", 13 | help="path where all the json files are located") 14 | 15 | parser.add_argument("--output_file", type=str, default="merged_output.json", 16 | help="filename where the merged json should go") 17 | 18 | args = parser.parse_args() 19 | 20 | json_path = args.json_path 21 | out_file = args.output_file 22 | 23 | json_files = glob.glob(json_path + '/*.json') 24 | 25 | counter = 0 26 | 27 | with open(out_file, 'w') as outfile: 28 | for fname in json_files: 29 | counter += 1 30 | 31 | if counter % 1024 == 0: 32 | print("Merging at ", counter, flush=True) 33 | 34 | with open(fname, 'r') as infile: 35 | for row in infile: 36 | each_row = json.loads(row) 37 | outfile.write(row) 38 | 39 | 40 | print("Merged file", out_file, flush=True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import json 5 | import time 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | url_filename = sys.argv[1] 12 | data_filename = sys.argv[2] 13 | output_filename = sys.argv[3] 14 | 15 | urls = set() 16 | with open(url_filename, 'r') as f: 17 | for line in f: 18 | myjson = json.loads(line) 19 | for key in myjson: 20 | this_urls = myjson[key] 21 | for i in range(1, len(this_urls)): 22 | urls.add(this_urls[i]) 23 | print('will be removing {} urls'.format(len(urls)), flush=True) 24 | 25 | written_docs = 0 26 | removed_docs = 0 27 | removed_chars = 0 28 | start_time = time.time() 29 | with open(output_filename, 'wb') as fout: 30 | with open(data_filename, 'r') as fin: 31 | for line in fin: 32 | try: 33 | myjson = json.loads(line) 34 | url = myjson['url'] 35 | if url in urls: 36 | print('removing', myjson) 37 | removed_docs += 1 38 | removed_chars += len(myjson['text']) 39 | continue 40 | myjson = json.dumps(myjson, ensure_ascii=False) 41 | fout.write(myjson.encode('utf-8')) 42 | fout.write('\n'.encode('utf-8')) 43 | written_docs += 1 44 | if written_docs % 10000 == 0: 45 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 46 | '| removed: {} (char: {})'.format( 47 | time.time() - start_time, 48 | written_docs, removed_docs, removed_chars)) 49 | except Exception as e: 50 | print('[SKIPPING]', line, e) 51 | 52 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 53 | '| removed: {} (char: {})'.format( 54 | time.time() - start_time, 55 | written_docs, removed_docs, removed_chars)) 56 | print('done :-)') 57 | -------------------------------------------------------------------------------- /tools/retro/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .cli import retro 4 | -------------------------------------------------------------------------------- /tools/retro/cli/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | from . 
import retro 6 | 7 | 8 | if __name__ == "__main__": 9 | retro.init(os.environ["RETRO_WORKDIR"]) 10 | -------------------------------------------------------------------------------- /tools/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import build_db 4 | -------------------------------------------------------------------------------- /tools/retro/db/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import json 4 | import numpy as np 5 | import torch 6 | from tqdm import tqdm 7 | 8 | from megatron import get_args, print_rank_0 9 | from tools.retro.external_libs import h5py 10 | from tools.retro.utils import get_gpt_tokenizer 11 | 12 | 13 | class DBDataset(torch.utils.data.Dataset): 14 | '''Dataset for iterating chunks. 15 | 16 | Requires: 17 | - List of indexed datasets 18 | - Chunk index array, with format: 19 | [dataset_idx, doc_id, start_idx, end_idx, bert_length]) 20 | ''' 21 | 22 | def __init__(self, db_path, indexed_datasets, chunks, max_chunk_length): 23 | 24 | assert chunks.shape[1] == 5, "expected 5 columns (dataset_idx, " \ 25 | "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); " \ 26 | "found %d columns." % chunks.shape[1] 27 | 28 | self.db_path = db_path 29 | self.indexed_datasets = indexed_datasets 30 | self.chunks = chunks 31 | self.doc_chunk_map = None 32 | 33 | self.max_chunk_length = max_chunk_length 34 | self.eod_token_id = get_gpt_tokenizer().eod 35 | 36 | def __len__(self): 37 | return self.chunks.shape[0] 38 | 39 | def __getitem__(self, chunk_id): 40 | 41 | # Chunk start/end indexes. 42 | indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = \ 43 | [ value.item() for value in self.chunks[chunk_id] ] 44 | chunk_length = token_end_idx - token_start_idx 45 | indexed_dataset = self.indexed_datasets[indexed_dataset_id] 46 | 47 | # Chunk token ids. 48 | token_ids = indexed_dataset.get(doc_id, 49 | offset=token_start_idx, 50 | length=chunk_length) 51 | 52 | # Extend chunks to max_chunk_length by padding with EOD tokens. 53 | if chunk_length != self.max_chunk_length: 54 | assert chunk_length < self.max_chunk_length, "invalid chunk len." 55 | token_ids = token_ids.tolist() 56 | token_ids += [self.eod_token_id] * \ 57 | (self.max_chunk_length - chunk_length) 58 | 59 | return { 60 | "doc_id" : doc_id, 61 | "text" : np.array(token_ids, dtype=np.int64), 62 | } 63 | 64 | def load_doc_tuples(self): 65 | '''Load the dataset & document ids. 66 | 67 | Load the dataset id & document id of each chunk in the database, to 68 | be used for causality filtering during querying. 69 | ''' 70 | self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32") 71 | block_size = int(1e6) 72 | for start_idx in tqdm(range(0, len(self), block_size)): 73 | end_idx = min(len(self), start_idx + block_size) 74 | self.doc_tuples[start_idx:end_idx]=self.chunks[start_idx:end_idx,:2] 75 | -------------------------------------------------------------------------------- /tools/retro/examples/pretrain_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | 5 | unset NCCL_DEBUG 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | 8 | ######## GPT or Retro?. ######## 9 | 10 | # 0 : GPT. 11 | # 1 : Retro 12 | 13 | ADD_RETRIEVER=1 14 | 15 | ######## Megatron, Retro dirs. 
######## 16 | 17 | REPO_DIR="" 18 | RETRO_WORKDIR="" 19 | 20 | ######## Data. ######## 21 | 22 | DATA_BLEND="" 23 | 24 | ######## Args. ######## 25 | 26 | ARGS=" \ 27 | --log-interval 1 \ 28 | --use-flash-attn \ 29 | --apply-layernorm-1p \ 30 | --untie-embeddings-and-output-weights \ 31 | --disable-bias-linear \ 32 | --no-position-embedding \ 33 | --use-rotary-position-embeddings \ 34 | --rotary-percent 0.5 \ 35 | --swiglu \ 36 | --attention-dropout 0.0 \ 37 | --hidden-dropout 0.0 \ 38 | --exit-duration-in-mins 220 \ 39 | --tensor-model-parallel-size 1 \ 40 | --pipeline-model-parallel-size 1 \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 512 \ 45 | --max-position-embeddings 512 \ 46 | --micro-batch-size 16 \ 47 | --global-batch-size 256 \ 48 | --train-samples 200000 \ 49 | --lr-decay-samples 175000 \ 50 | --lr-warmup-samples 10000 \ 51 | --lr 2.5e-5 \ 52 | --min-lr 2.5e-6 \ 53 | --lr-decay-style cosine \ 54 | --eval-iters 50 \ 55 | --eval-interval 2000 \ 56 | --tokenizer-type GPTSentencePieceTokenizer \ 57 | --tokenizer-model \ 58 | --data-path ${DATA_BLEND} \ 59 | --split 98,2,0 \ 60 | --clip-grad 1.0 \ 61 | --weight-decay 0.1 \ 62 | --adam-beta1 0.9 \ 63 | --adam-beta2 0.95 \ 64 | --init-method-std 0.007 \ 65 | --log-params-norm \ 66 | --log-num-zeros-in-grad \ 67 | --bf16 \ 68 | --DDP-impl local \ 69 | " 70 | 71 | ######## Retro. ######## 72 | 73 | if [ "$ADD_RETRIEVER" = "0" ]; then 74 | SCRIPT=pretrain_gpt.py 75 | else 76 | ARGS="${ARGS} \ 77 | --retro-workdir ${RETRO_WORKDIR} \ 78 | --retro-add-retriever \ 79 | " 80 | SCRIPT=pretrain_retro.py 81 | fi 82 | 83 | ######## Command. ######## 84 | 85 | NPROCS=8 86 | CMD="\ 87 | pwd && cd ${REPO_DIR} && pwd && \ 88 | export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \ 89 | python -m torch.distributed.run \ 90 | --nproc_per_node ${NPROCS} \ 91 | --nnodes 1 \ 92 | --node_rank ${NODE_RANK} \ 93 | --master_addr ${MASTER_ADDR} \ 94 | --master_port 6000 \ 95 | ${SCRIPT} ${ARGS} \ 96 | " 97 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 98 | echo "CMD = '$CMD'." 99 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 100 | eval $CMD 101 | -------------------------------------------------------------------------------- /tools/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "faiss", 7 | "h5py", 8 | "transformers", # for huggingface bert 9 | ] 10 | 11 | for lib in required_libs: 12 | try: 13 | globals()[lib] = importlib.import_module(lib) 14 | except ImportError as e: 15 | raise Exception(f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'.") 16 | -------------------------------------------------------------------------------- /tools/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import add_to_index, build_index, train_index 4 | # from .index import Index 5 | -------------------------------------------------------------------------------- /tools/retro/index/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .indexes import FaissBaseIndex, FaissParallelAddIndex 4 | 5 | 6 | class IndexFactory: 7 | '''Get index. 
8 | 9 | Index type generally read from argument '--retro-index-ty'. 10 | ''' 11 | 12 | @classmethod 13 | def get_index_class(cls, index_type): 14 | return { 15 | "faiss-base" : FaissBaseIndex, 16 | "faiss-par-add" : FaissParallelAddIndex, 17 | }[index_type] 18 | 19 | @classmethod 20 | def get_index(cls, index_type): 21 | index_class = cls.get_index_class(index_type) 22 | index = index_class() 23 | return index 24 | -------------------------------------------------------------------------------- /tools/retro/index/index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import abc 4 | import numpy as np 5 | import os 6 | import torch 7 | 8 | from megatron import get_retro_args 9 | from tools.retro.external_libs import faiss 10 | 11 | from .utils import get_index_dir 12 | 13 | 14 | class Index(abc.ABC): 15 | 16 | '''Abstract base class for indexes. 17 | 18 | *Note* : While currently only Faiss-based classes are implemented, in the 19 | future, this class will be extended with other types of indexes that have 20 | different performance-accuracy trade-offs. 21 | 22 | The primary methods to override are: 23 | - train() : Train index on the sampled training chunks. 24 | - add() : Add all training chunks to index. 25 | ''' 26 | 27 | @classmethod 28 | def c_verbose(cls, index, v): 29 | '''Make index object verbose.''' 30 | assert isinstance(v, bool) 31 | faiss.ParameterSpace().set_index_parameter(index, "verbose", v) 32 | 33 | def get_empty_index_path(self): 34 | args = get_retro_args() 35 | return os.path.join( 36 | get_index_dir(), 37 | "empty_%.3f.faissindex" % args.retro_index_train_load_fraction, 38 | ) 39 | 40 | def get_empty_index(self): 41 | return faiss.read_index(self.get_empty_index_path()) 42 | 43 | def get_added_index_path(self): 44 | args = get_retro_args() 45 | return os.path.join( 46 | get_index_dir(), 47 | "added_%.3f_%.3f.faissindex" % ( 48 | args.retro_index_train_load_fraction, 49 | args.retro_index_add_load_fraction, 50 | ), 51 | ) 52 | 53 | def get_added_index(self): 54 | return faiss.read_index(self.get_added_index_path()) 55 | 56 | @abc.abstractmethod 57 | def train(self, *args): 58 | pass 59 | 60 | @abc.abstractmethod 61 | def add(self, *args): 62 | pass 63 | 64 | def embed_text_dataset_block(self, embedder, text_dataset, _range): 65 | '''Embed a range of a text dataset.''' 66 | sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range)) 67 | return embedder.embed_text_dataset(sub_dataset) 68 | -------------------------------------------------------------------------------- /tools/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .faiss_base import FaissBaseIndex 4 | from .faiss_par_add import FaissParallelAddIndex 5 | -------------------------------------------------------------------------------- /tools/retro/index/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import concurrent 4 | import gc 5 | import glob 6 | import numpy as np 7 | import os 8 | import psutil 9 | import time 10 | import torch 11 | from tqdm import tqdm 12 | 13 | from megatron import get_retro_args, print_rank_0 14 | from tools.retro.db.utils import get_indexed_dataset_infos 15 | from tools.retro.external_libs import h5py 16 | 17 | 18 | def get_index_dir(): 19 | """Create sub-directory for this index.""" 20 | 21 | args = get_retro_args() 22 | 23 | # Directory path. 24 | index_dir_path = os.path.join( 25 | args.retro_workdir, 26 | "index", 27 | args.retro_index_type, 28 | args.retro_index_str, 29 | ) 30 | 31 | # Make directory. 32 | os.makedirs(index_dir_path, exist_ok=True) 33 | 34 | return index_dir_path 35 | 36 | 37 | def num_samples_to_block_ranges(num_samples): 38 | '''Split a range (length num_samples) into sequence of block ranges 39 | of size block_size.''' 40 | args = get_retro_args() 41 | block_size = args.retro_block_size 42 | start_idxs = list(range(0, num_samples, block_size)) 43 | end_idxs = [min(num_samples, s + block_size) for s in start_idxs] 44 | ranges = list(zip(start_idxs, end_idxs)) 45 | return ranges 46 | 47 | 48 | def get_training_data_root_dir(): 49 | args = get_retro_args() 50 | return os.path.join(args.retro_workdir, "index", "train_emb") 51 | 52 | 53 | def get_training_data_block_dir(): 54 | return os.path.join(get_training_data_root_dir(), "blocks") 55 | 56 | 57 | def get_training_data_block_paths(): 58 | return sorted(glob.glob(get_training_data_block_dir() + "/*.hdf5")) 59 | 60 | 61 | def get_training_data_merged_path(): 62 | args = get_retro_args() 63 | return os.path.join(get_training_data_root_dir(), 64 | "train_%.3f.bin" % args.retro_index_train_load_fraction) 65 | 66 | 67 | def get_added_codes_dir(): 68 | return os.path.join(get_index_dir(), "add_codes") 69 | 70 | 71 | def get_added_code_paths(): 72 | return sorted(glob.glob(get_added_codes_dir() + "/*.hdf5")) 73 | -------------------------------------------------------------------------------- /tools/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .query import query_pretraining_neighbors 4 | -------------------------------------------------------------------------------- /tools/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import hashlib 4 | import os 5 | 6 | from megatron import get_retro_args 7 | 8 | 9 | def get_query_workdir(): 10 | args = get_retro_args() 11 | return os.path.join(args.retro_workdir, "query") 12 | 13 | 14 | def get_neighbor_dirname(key, dataset): 15 | hashes = ",".join([ d.desc_hash for d in dataset.datasets ]) 16 | hash = hashlib.md5(hashes.encode()).hexdigest() 17 | return os.path.join(get_query_workdir(), os.path.basename(f"{key}_{hash}")) 18 | -------------------------------------------------------------------------------- /tools/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import os 4 | import torch 5 | import types 6 | 7 | from megatron import get_retro_args 8 | from megatron.tokenizer.tokenizer import ( 9 | _BertWordPieceTokenizer, 10 | _GPT2BPETokenizer, 11 | _GPTSentencePieceTokenizer, 12 | ) 13 | 14 | 15 | def get_args_path(workdir): 16 | '''Argument copy stored within retro workdir.''' 17 | return os.path.join(workdir, "args.json") 18 | 19 | 20 | def get_num_chunks_per_sample(): 21 | '''Compute seq_length // chunk_length.''' 22 | args = get_retro_args() 23 | sample_length = args.retro_gpt_seq_length 24 | chunk_length = args.retro_gpt_chunk_length 25 | assert sample_length % chunk_length == 0 26 | return sample_length // chunk_length 27 | 28 | 29 | def get_gpt_tokenizer(): 30 | '''GPT (BPE) tokenizer.''' 31 | args = get_retro_args() 32 | tokenizer_type = args.retro_gpt_tokenizer_type 33 | if tokenizer_type == "GPT2BPETokenizer": 34 | assert args.retro_gpt_vocab_file and args.retro_gpt_merge_file 35 | return _GPT2BPETokenizer( 36 | vocab_file=args.retro_gpt_vocab_file, 37 | merge_file=args.retro_gpt_merge_file, 38 | ) 39 | elif tokenizer_type == 'GPTSentencePieceTokenizer': 40 | assert args.retro_gpt_tokenizer_model is not None 41 | return _GPTSentencePieceTokenizer(args.retro_gpt_tokenizer_model) 42 | else: 43 | raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type) 44 | 45 | 46 | def get_bert_tokenizer(): 47 | '''Bert (Wordpiece) tokenizer.''' 48 | args = get_retro_args() 49 | lower_case = { 50 | "BertWordPieceLowerCase" : True, 51 | "BertWordPieceCase" : False, 52 | }[args.retro_bert_tokenizer_type] 53 | return _BertWordPieceTokenizer( 54 | vocab_file=args.retro_bert_vocab_file, 55 | lower_case=lower_case, 56 | ) 57 | 58 | 59 | class GPTToTextDataset(torch.utils.data.Dataset): 60 | '''Dataset to convert GPT tokens to text.''' 61 | 62 | def __init__(self, gpt_dataset): 63 | 64 | super().__init__() 65 | 66 | self.gpt_dataset = gpt_dataset 67 | self.gpt_tokenizer = get_gpt_tokenizer() 68 | 69 | def __len__(self): 70 | return len(self.gpt_dataset) 71 | 72 | def __getitem__(self, idx): 73 | gpt_token_ids = self.gpt_dataset[idx]["text"].tolist() 74 | text = self.gpt_tokenizer.detokenize(gpt_token_ids) 75 | return {"text": text} 76 | -------------------------------------------------------------------------------- /tools/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import sys 3 | import json 4 | import requests 5 | 6 | 7 | if __name__ == "__main__": 8 | url = sys.argv[1] 9 | url = 'http://' + url + '/api' 10 | headers = {'Content-Type': 'application/json'} 11 | 12 | while True: 13 | sentence = input("Enter prompt: ") 14 | tokens_to_generate = int(eval(input("Enter number of tokens to generate: "))) 15 | 16 | data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate} 17 | response = requests.put(url, data=json.dumps(data), headers=headers) 18 | 19 | if response.status_code != 200: 20 | print(f"Error {response.status_code}: {response.json()['message']}") 21 | else: 22 | print("Megatron Response: ") 23 | print(response.json()['text'][0]) 24 | --------------------------------------------------------------------------------
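Usage note: the functional-test entry points above take positional arguments only, in the order documented at the top of pretrain_gpt3_distributed_test.sh (DATA_PATH, CHECKPOINT_PATH, TENSORBOARD_DIR, USE_TE, TP_SIZE, PP_SIZE, NNODES, MAX_STEPS, USE_CORE, VP_SIZE, MBS, GBS, ADDITIONAL_PARAMS). As a minimal sketch, assuming a single 8-GPU node and the container paths hard-coded in the script (the vocab and merge files under /workspace/data/gpt3_data/), a run roughly matching the tp1_pp4 core-enabled sequence-parallel result file above could look like:

  # args: DATA_PATH CHECKPOINT_PATH TENSORBOARD_DIR USE_TE TP_SIZE PP_SIZE NNODES MAX_STEPS USE_CORE VP_SIZE MBS GBS ADDITIONAL_PARAMS
  bash ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh \
      /workspace/data/gpt3_data/my-gpt3_00_text_document \
      /workspace/checkpoints /workspace/logs \
      0 1 4 1 50 1 "" 4 32 "--sequence-parallel"

The data, checkpoint, and log paths here are illustrative placeholders taken from the sbatch wrappers, not values the script itself requires; outside the CI container the hard-coded --vocab-file and --merge-file paths would need to be edited.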