├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── enhancement.md │ ├── question.md │ └── regression.md └── workflows │ └── stale.yml ├── .gitignore ├── .gitlab-ci.yml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── distrib_optimizer.md └── images │ └── distrib_optimizer │ ├── data_flow.png │ └── sharding_scheme.png ├── examples ├── detxoify_lm │ ├── README.md │ ├── annotations │ │ ├── filter-selfgeneration.py │ │ ├── perspective_api_annotate.py │ │ └── preprocess.sh │ ├── finetune_gpt.py │ ├── finetune_gpt_distributed-1.3b.sh │ ├── generate-1.3b.sh │ ├── generate_samples_gpt.py │ ├── perspective_api.py │ └── self_generation │ │ └── selfgenerate-1.3b-unconditional.sh ├── evaluate_retriever_nq.sh ├── evaluate_zeroshot_gpt.sh ├── finetune_mnli_distributed.sh ├── finetune_race_distributed.sh ├── finetune_retriever_distributed.sh ├── merge_mp_bert.sh ├── msdp │ ├── README.md │ ├── data_processing.sh │ ├── eval_knwl_generation.sh │ ├── eval_resp_generation.sh │ ├── prep_resp_gen.sh │ ├── prompt_knwl_gen.sh │ └── prompt_resp_gen.sh ├── pretrain_bert.sh ├── pretrain_bert_distributed.sh ├── pretrain_bert_distributed_with_mp.sh ├── pretrain_gpt.sh ├── pretrain_gpt3_175B.sh ├── pretrain_gpt_distributed.sh ├── pretrain_gpt_distributed_with_mp.sh ├── pretrain_ict.sh ├── pretrain_t5.sh ├── pretrain_t5_distributed.sh ├── pretrain_t5_distributed_with_mp.sh ├── run_text_generation_server_345M.sh ├── run_text_generation_server_345M_8_tensor_parallel.sh └── sc21 │ ├── CONFIG.sh │ ├── README.md │ ├── SBATCH.sh │ ├── SRUN.sh │ ├── run_figure_11.sh │ ├── run_figure_12.sh │ ├── run_figure_13.sh │ ├── run_figure_14.sh │ ├── run_figure_15.sh │ ├── run_figure_16.sh │ ├── run_figure_17.sh │ ├── run_figure_18.sh │ └── run_table_1.sh ├── exp2.sh ├── images ├── Achieved_petaFLOPs.png └── cases_april2021.png ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── core │ ├── README.md │ ├── __init__.py │ ├── dist_checkpointing │ │ ├── __init__.py │ │ ├── core.py │ │ ├── dict_utils.py │ │ ├── mapping.py │ │ ├── optimizer.py │ │ ├── serialization.py │ │ ├── strategies │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── tensorstore.py │ │ │ ├── two_stage.py │ │ │ └── zarr.py │ │ └── utils.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── inference_params.py │ ├── model_parallel_config.py │ ├── models │ │ ├── __init__.py │ │ ├── common │ │ │ └── rotary_pos_embedding.py │ │ └── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_embedding.py │ │ │ └── gpt_model.py │ ├── package_info.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── p2p_communication.py │ │ ├── schedules.py │ │ ├── sp_utils.py │ │ └── split_solver.py │ ├── requirements.txt │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── custom_layers │ │ │ └── transformer_engine.py │ │ ├── dot_product_attention.py │ │ ├── enums.py │ │ ├── identity_op.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── data │ ├── Makefile │ ├── __init__.py │ ├── autoaugment.py │ ├── bert_dataset.py │ ├── biencoder_dataset_utils.py │ ├── blendable_dataset.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── gpt_dataset.py │ ├── helpers.cpp │ 
├── ict_dataset.py │ ├── image_folder.py │ ├── indexed_dataset.py │ ├── multimodal_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ ├── t5_dataset.py │ ├── test │ │ ├── test_indexed_dataset.py │ │ └── test_preprocess_data.sh │ └── vit_dataset.py ├── dist_signal_handler.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── distributed.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vision │ │ ├── classification.py │ │ ├── dino.py │ │ ├── esvit_swin_backbone.py │ │ ├── inpainting.py │ │ ├── knn_monitor.py │ │ ├── mit_backbone.py │ │ ├── swin_backbone.py │ │ ├── utils.py │ │ └── vit_backbone.py ├── mpu │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py ├── optimizer │ ├── __init__.py │ ├── clip_grads.py │ ├── distrib_optimizer.py │ ├── grad_scaler.py │ └── optimizer.py ├── optimizer_param_scheduler.py ├── static │ └── index.html ├── text_generation │ ├── __init__.py │ ├── api.py │ ├── beam_utils.py │ ├── communication.py │ ├── forward_step.py │ ├── generation.py │ ├── sampling.py │ └── tokenization.py ├── text_generation_server.py ├── timers.py ├── tokenizer │ ├── __init__.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── picture ├── .DS_Store ├── 13bx32A100_memory.svg ├── 2.7bx8A100_memory.pdf ├── 2.7bx8A100_memory.svg ├── 30bx32A100_memory.svg ├── 30bx64A100_memory.svg ├── 32x7b zhihu_throughput.tex ├── 7bx32A100_memory.svg ├── Raycast (2).dmg ├── seq1f1b_memory.pdf ├── seq1f1b_memory.png ├── seq1f1b_memory.svg ├── seq1f1b_original.png └── seq1f1b_zerobubble.pdf ├── pretrain_bert.py ├── pretrain_gpt.py ├── pretrain_gpt_core.py ├── pretrain_ict.py ├── pretrain_retro.py ├── pretrain_t5.py ├── pretrain_vision_classify.py ├── pretrain_vision_dino.py ├── pretrain_vision_inpaint.py ├── pyproject.toml ├── setup.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── tokenizers.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tests ├── __init__.py ├── functional_tests │ ├── __init__.py │ ├── python_test_utils │ │ ├── __init__.py │ │ ├── 
check_slurm_job_completion.py │ │ ├── get_test_results_from_tensorboard_logs.py │ │ ├── test_ci_pipeline.py │ │ └── test_resume_checkpoint_pipeline.py │ ├── shell_test_utils │ │ └── jobwait.sh │ ├── test_results │ │ ├── bert │ │ │ ├── bert_tp1_pp2_1nodes_50steps.json │ │ │ ├── bert_tp1_pp4_1nodes_50steps.json │ │ │ ├── bert_tp2_pp2_1nodes_50steps.json │ │ │ └── bert_tp4_pp1_1nodes_50steps.json │ │ └── gpt3 │ │ │ ├── gpt3_tp1_pp2_1nodes_50steps.json │ │ │ ├── gpt3_tp1_pp2_1nodes_50steps_core_enabled.json │ │ │ ├── gpt3_tp1_pp2_1nodes_50steps_core_enabled_rope_embeddings.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled_disable_bias_linear.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled_sequence_parallel.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled_swiglu.json │ │ │ ├── gpt3_tp1_pp4_1nodes_50steps_core_enabled_untie_embeddings_and_outputs.json │ │ │ ├── gpt3_tp2_pp2_1nodes_50steps.json │ │ │ ├── gpt3_tp2_pp2_1nodes_50steps_core_enabled.json │ │ │ ├── gpt3_tp2_pp2_1nodes_50steps_te_enabled.json │ │ │ ├── gpt3_tp4_pp1_1nodes_50steps.json │ │ │ └── gpt3_tp4_pp1_1nodes_50steps_core_enabled.json │ └── test_scripts │ │ ├── bert │ │ ├── pretrain_bert_distributed_resume_checkpoint_test.sh │ │ ├── pretrain_bert_distributed_test.sh │ │ ├── sbatch_bert_distributed_resume_checkpoint_test.sh │ │ └── sbatch_bert_distributed_test.sh │ │ └── gpt3 │ │ ├── pretrain_gpt3_distributed_resume_checkpoint_test.sh │ │ ├── pretrain_gpt3_distributed_test.sh │ │ ├── sbatch_gpt3_distributed_resume_checkpoint_test.sh │ │ └── sbatch_gpt3_distributed_test.sh └── unit_tests │ ├── __init__.py │ ├── models │ ├── __init__.py │ ├── test_gpt_embedding.py │ └── test_gpt_model.py │ ├── pipeline_parallel │ ├── __init__.py │ └── test_schedules.py │ ├── tensor_parallel │ ├── test_cross_entropy.py │ ├── test_data.py │ ├── test_mappings.py │ ├── test_random.py │ └── test_tensor_parallel_utils.py │ ├── test_basic.py │ ├── test_parallel_state.py │ ├── test_utilities.py │ ├── test_utils.py │ └── transformer │ ├── __init__.py │ ├── test_attention.py │ ├── test_core_attention.py │ ├── test_mlp.py │ ├── test_module.py │ ├── test_transformer_block.py │ └── test_transformer_layer.py └── tools ├── autoformat.sh ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py ├── huggingface.py └── utils.py ├── checkpoint_loader_megatron.py ├── checkpoint_saver_megatron.py ├── checkpoint_util.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── retro ├── README.md ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── db │ ├── __init__.py │ ├── build.py │ ├── dataset.py │ └── utils.py ├── examples │ ├── preprocess_data.sh │ └── pretrain_model.sh ├── external_libs.py ├── index │ ├── __init__.py │ ├── build.py │ ├── factory.py │ ├── index.py │ ├── indexes │ │ ├── __init__.py │ │ ├── faiss_base.py │ │ └── faiss_par_add.py │ └── utils.py ├── main.py ├── query │ ├── __init__.py │ ├── chunk_dataset.py │ ├── query.py │ ├── retro_dataset.py │ └── utils.py └── utils.py ├── run_text_generation_server.py └── text_generation_cli.py /.coveragerc: 
-------------------------------------------------------------------------------- 1 | [html] 2 | directory = coverage 3 | 4 | [run] 5 | data_file = .coverage_$LOCAL_RANK 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: BUG 3 | about: Report a bug that needs attention 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Stack trace/logs** 20 | If applicable, add the stack trace or logs from the time of the error. 21 | 22 | **Environment (please complete the following information):** 23 | - Megatron-LM commit ID 24 | - PyTorch version 25 | - CUDA version 26 | - NCCL version 27 | 28 | **Proposed fix** 29 | If you have a proposal for how to fix the issue state it here or link to a PR. 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: ENHANCEMENT 3 | about: Suggest an idea to improve this project 4 | title: "[ENHANCEMENT]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Proposed implementation** 20 | If you have a proposed implementation for the feature state it here or link to a PR. 21 | 22 | **Additional context** 23 | Add any other context or screenshots about the feature request here. 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: QUESTION 3 | about: Ask a question about Megatron-LM that is not a bug, regression or enhancement 4 | request 5 | title: "[QUESTION]" 6 | labels: '' 7 | assignees: '' 8 | 9 | --- 10 | 11 | **Your question** 12 | Ask a clear and concise question about Megatron-LM. 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/regression.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: REGRESSION 3 | about: Report a regression in speed or accuracy due to a Megatron-LM update 4 | title: "[REGRESSION]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the regression** 11 | A clear and concise description of what the regression is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior. The easier it is to reproduce the faster it will get maintainer attention. 15 | 16 | **Previous performance** 17 | What speed or accuracy did you previously see. 
18 | 19 | **New performance** 20 | What speed or accuracy do you see after the update. 21 | 22 | **Stack trace/logs** 23 | If applicable, add the stack trace or logs related to the regression. 24 | 25 | **Environment (please complete the following information):** 26 | - Previous Megatron-LM commit ID 27 | - New Megatron-LM commit ID 28 | - Previous PyTorch version 29 | - New PyTorch version 30 | - Previous CUDA version 31 | - New CUDA version 32 | - Previous NCCL version 33 | - New NCCL version 34 | 35 | **Proposed fix** 36 | If you have a proposal for how to fix the issue state it here or link to a PR. 37 | 38 | **Additional context** 39 | Add any other context about the problem here. 40 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '15 18 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v5 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | days-before-stale: 60 25 | stale-issue-message: 'Marking as stale. No activity in 60 days.' 26 | stale-pr-message: 'Marking as stale. No activity in 60 days.' 27 | stale-issue-label: 'stale' 28 | stale-pr-label: 'stale' 29 | remove-stale-when-updated: true 30 | operations-per-run: 1000 31 | days-before-close: -1 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | slurm* 8 | logs 9 | *.npy 10 | *.bin 11 | *.idx 12 | *.log 13 | *.op 14 | *nsys-rep 15 | *tfevents* 16 | */index-cache/* 17 | *.pt 18 | -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/data_flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/docs/images/distrib_optimizer/data_flow.png -------------------------------------------------------------------------------- /docs/images/distrib_optimizer/sharding_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/docs/images/distrib_optimizer/sharding_scheme.png -------------------------------------------------------------------------------- /examples/detxoify_lm/annotations/filter-selfgeneration.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from typing import Dict, Optional, List 4 | 5 | import joblib 6 | from googleapiclient import discovery 7 | from googleapiclient.errors import HttpError 8 | 9 | import argparse 10 | 11 | from tqdm import tqdm 12 | 13 | parser = argparse.ArgumentParser(description='Process some integers.') 14 | parser.add_argument('--data-path', type=str, default='', 15 | help='data path to load the jsonl') 16 | 
parser.add_argument('--out-path', type=str, default='', 17 | help='output path to write the filtered jsonl') 18 | parser.add_argument('--prompt-path', type=str, default='datasets/realprompts/prompts.jsonl', 19 | help='data path to load the prompt jsonl') 20 | parser.add_argument('--workers', type=int, default=10, 21 | help='Number of worker processes to launch') 22 | 23 | 24 | def get_corpus_scores(lines): 25 | scores = [] 26 | corpus = [] 27 | 28 | for line in tqdm(lines): 29 | data = json.loads(line) 30 | score = data['score'] 31 | if score is not None: 32 | scores.append(score['toxicity']) 33 | corpus.append(data) 34 | return scores, corpus 35 | 36 | 37 | def main(): 38 | args = parser.parse_args() 39 | 40 | with open(args.data_path) as f: 41 | lines = f.readlines() 42 | 43 | print(f"total line number: {len(lines)}") 44 | 45 | scores, corpus = get_corpus_scores(lines) 46 | import numpy as np 47 | scores = np.array(scores) 48 | indices = np.argsort(scores) 49 | 50 | print(f"total valid samples: {len(scores)}") 51 | 52 | print(f"Avg toxicity (all): {np.mean(scores)} +- {np.std(scores)}") 53 | print(f"Avg toxicity (toxic): {np.mean(scores[scores > 0.5])} +- {np.std(scores[scores > 0.5])}") 54 | print(f"Toxic Percentage {sum(scores > 0.5) / len(scores)}") 55 | print(f"Avg toxicity (nontoxic): {np.mean(scores[scores <= 0.5])} +- {np.std(scores[scores <= 0.5])}") 56 | print(f"Nontoxic Percentage {sum(scores <= 0.5) / len(scores)}") 57 | 58 | samples_left = len(lines) // 2 59 | print(f"After filtering: {samples_left} samples are left") 60 | nontoxic_indices = indices[:samples_left] 61 | print(f"Avg toxicity (filtered): {np.mean(scores[nontoxic_indices])} +- {np.std(scores[nontoxic_indices])}") 62 | print(f"Toxicity Range (filtered): {np.min(scores[nontoxic_indices])} ~ {np.max(scores[nontoxic_indices])}") 63 | nontoxic_data = [corpus[ind] for ind in nontoxic_indices] 64 | print(f"Total samples after filtering: {len(nontoxic_data)}") 65 | print(f"Examples: {nontoxic_data[:3]}") 66 | 67 | from sklearn.utils import shuffle 68 | nontoxic_data = shuffle(nontoxic_data) 69 | 70 | with open(args.out_path, 'w') as f: 71 | for x in nontoxic_data: 72 | f.write(json.dumps(x) + '\n') 73 | 74 | 75 | main() -------------------------------------------------------------------------------- /examples/detxoify_lm/annotations/preprocess.sh: -------------------------------------------------------------------------------- 1 | VOCAB_FILE=gpt2-vocab.json 2 | MERGE_FILE=gpt2-merges.txt 3 | 4 | python3 tools/preprocess_data.py \ 5 | --input $1 \ 6 | --output-prefix $2 \ 7 | --vocab-file $VOCAB_FILE \ 8 | --merge-file $MERGE_FILE \ 9 | --tokenizer-type GPT2BPETokenizer \ 10 | --append-eod --workers 20 --chunk-size 25 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /examples/detxoify_lm/finetune_gpt_distributed-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | # Change for multinode config 4 | GPUS_PER_NODE=16 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=$(($RANDOM + 1024)) 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | # input 12 | DATA_PATH=$1 13 | SHARE_DATA=$PWD # current work dir 14 | FINETUNED_PATH="$SHARE_DATA/$2" 15 | lr=$3 16 | bs=$4 17 | iter=$5 18 | CHECKPOINT_PATH=$6 19 | 20 | # vocab 21 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 22 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 23 | 24 | # tensorboard 25 | TENSORBOARD_DIR="$SHARE_DATA/tensorboard/$2" 26 | mkdir -p ${TENSORBOARD_DIR} 27 | 28 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 29 | 30 | python -m torch.distributed.run $DISTRIBUTED_ARGS \ 31 | examples/detxoify_lm/finetune_gpt.py \ 32 | --num-layers 24 \ 33 | --hidden-size 2048 \ 34 | --num-attention-heads 32 \ 35 | --micro-batch-size 4 \ 36 | --global-batch-size $bs \ 37 | --seq-length 2048 \ 38 | --max-position-embeddings 2048 \ 39 | --train-iters $iter \ 40 | --save $FINETUNED_PATH \ 41 | --load $CHECKPOINT_PATH \ 42 | --data-path $DATA_PATH \ 43 | --data-path2 ${DATA_BLEND} \ 44 | --vocab-file $VOCAB_FILE \ 45 | --merge-file $MERGE_FILE \ 46 | --data-impl mmap \ 47 | --split 100,0,0 \ 48 | --distributed-backend nccl \ 49 | --lr-decay-style constant \ 50 | --lr $lr \ 51 | --clip-grad 1.0 \ 52 | --weight-decay 0.1 \ 53 | --adam-beta1 0.9 \ 54 | --adam-beta2 0.95 \ 55 | --checkpoint-activations \ 56 | --log-interval 1 \ 57 | --save-interval 78 \ 58 | --eval-interval 78 \ 59 | --eval-iters 50 \ 60 | --fp16 \ 61 | --DDP-impl local \ 62 | --finetune --no-load-optim \ 63 | --log-validation-ppl-to-tensorboard \ 64 | --tensorboard-dir ${TENSORBOARD_DIR} 65 | -------------------------------------------------------------------------------- /examples/detxoify_lm/generate-1.3b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | VOCAB_FILE=gpt2-vocab.json 4 | MERGE_FILE=gpt2-merges.txt 5 | 6 | GPUS_PER_NODE=1 7 | # Change for multinode config 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=$(($RANDOM + 1024)) 10 | NNODES=1 11 | NODE_RANK=0 12 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 13 | NUM_SAMPLES=$(wc -l < $1) 14 | PREFIX=$(basename $2) 15 | SEED=$(($RANDOM)) 16 | OUTPUT=$1_output_"$PREFIX"_seed_"$SEED".jsonl 17 | 18 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 19 | 20 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 21 | --tensor-model-parallel-size 1 \ 22 | --num-layers 24 \ 23 | --hidden-size 2048 \ 24 | --load $CHECKPOINT_PATH \ 25 | --num-attention-heads 32 \ 26 | --max-position-embeddings 2048 \ 27 | --tokenizer-type GPT2BPETokenizer \ 28 | --fp16 \ 29 | --micro-batch-size 400 \ 30 | --seq-length 2048 \ 31 | --out-seq-length 20 \ 32 | --temperature 1.0 \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --sample-input-file $1 \ 36 | --sample-output-file $OUTPUT \ 37 | --num-samples $NUM_SAMPLES \ 38 | --max-tokens-to-oom 1200000 \ 39 | --top_p 0.9 \ 40 | --seed $SEED 41 | 42 | -------------------------------------------------------------------------------- /examples/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | CHECKPOINT_PATH=$2 # Your model ckpt 3 | SHARE_DATA=$PWD # current work dir 4 | VOCAB_FILE=gpt2-vocab.json # Your gpt-2 vocab 5 | MERGE_FILE=gpt2-merges.txt # Your gpt-2 merge file 6 | 7 | GPUS_PER_NODE=1 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=$(($RANDOM + 1024)) 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | SEED=$3 15 | SUFFIX=$(basename $CHECKPOINT_PATH) 16 | save_dir=$SHARE_DATA/selfgeneration/unconditional_generation_$SUFFIX/ 17 | mkdir -p $save_dir 18 | echo $save_dir/$SEED.out 19 | 20 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 21 | 22 | python -m torch.distributed.run $DISTRIBUTED_ARGS examples/detxoify_lm/generate_samples_gpt.py \ 23 | --tensor-model-parallel-size 1 \ 24 | --num-layers 24 \ 25 | --hidden-size 2048 \ 26 | --load $CHECKPOINT_PATH \ 27 | --num-attention-heads 32 \ 28 | --max-position-embeddings 2048 \ 29 | --tokenizer-type GPT2BPETokenizer \ 30 | --fp16 \ 31 | --micro-batch-size 150 \ 32 | --seq-length 2048 \ 33 | --out-seq-length 1000 \ 34 | --temperature 1.0 \ 35 | --vocab-file $VOCAB_FILE \ 36 | --merge-file $MERGE_FILE \ 37 | --num-samples $1 \ 38 | --top_p 0.9 \ 39 | --max-tokens-to-oom 1200000 \ 40 | --genfile $save_dir/$SEED.out \ 41 | --seed $SEED 42 | 43 | -------------------------------------------------------------------------------- /examples/evaluate_retriever_nq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Evaluate natural question test data given Wikipedia embeddings and pretrained 4 | # ICT model or a finetuned model for Natural Question task 5 | 6 | # Datasets can be downloaded from the following link: 7 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 8 | 9 | EVIDENCE_DATA_DIR= 10 | EMBEDDING_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | QA_FILE= 14 | 15 | python tasks/main.py \ 16 | --task RETRIEVER-EVAL \ 17 | --tokenizer-type BertWordPieceLowerCase \ 18 | --num-layers 12 \ 19 | --hidden-size 768 \ 20 | --num-attention-heads 12 \ 21 | --tensor-model-parallel-size 1 \ 22 | --micro-batch-size 128 \ 23 | --seq-length 512 \ 24 | --max-position-embeddings 512 \ 25 | --load ${CHECKPOINT_PATH} \ 26 | --evidence-data-path ${EVIDENCE_DATA_DIR} \ 27 | --embedding-path ${EMBEDDING_PATH} \ 28 | --retriever-seq-length 256 \ 29 | --vocab-file bert-vocab.txt\ 30 | --qa-data-test ${QA_FILE} \ 31 | --faiss-use-gpu \ 32 | --retriever-report-topk-accuracies 1 5 20 100 \ 33 | --fp16 \ 34 | --indexer-log-interval 1000 \ 35 | --indexer-batch-size 128 36 | 37 | 38 | -------------------------------------------------------------------------------- /examples/evaluate_zeroshot_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TASK="LAMBADA" 12 | 13 | VALID_DATA= 14 | VOCAB_FILE=gpt2-vocab.json 15 | MERGE_FILE=gpt2-merges.txt 16 | CHECKPOINT=checkpoints/gpt2_345m 17 | 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 20 | --task $TASK \ 21 | --valid-data $VALID_DATA \ 22 | --tokenizer-type GPT2BPETokenizer \ 23 | --strict-lambada \ 24 | --vocab-file $VOCAB_FILE \ 25 | --merge-file $MERGE_FILE \ 26 | --load $CHECKPOINT \ 27 | 
--tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --batch-size 8 \ 32 | --seq-length 1024 \ 33 | --max-position-embeddings 1024 \ 34 | --log-interval 10 \ 35 | --fp16 \ 36 | --no-load-optim \ 37 | --no-load-rng 38 | -------------------------------------------------------------------------------- /examples/finetune_mnli_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/glue_data/MNLI/train.tsv" 12 | VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \ 13 | data/glue_data/MNLI/dev_mismatched.tsv" 14 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 15 | VOCAB_FILE=bert-vocab.txt 16 | CHECKPOINT_PATH=checkpoints/bert_345m_mnli 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task MNLI \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 5 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 8 \ 32 | --lr 5.0e-5 \ 33 | --lr-decay-style linear \ 34 | --lr-warmup-fraction 0.065 \ 35 | --seq-length 512 \ 36 | --max-position-embeddings 512 \ 37 | --save-interval 500000 \ 38 | --save $CHECKPOINT_PATH \ 39 | --log-interval 10 \ 40 | --eval-interval 100 \ 41 | --eval-iters 50 \ 42 | --weight-decay 1.0e-1 \ 43 | --fp16 44 | -------------------------------------------------------------------------------- /examples/finetune_race_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORLD_SIZE=8 4 | 5 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 6 | --nnodes 1 \ 7 | --node_rank 0 \ 8 | --master_addr localhost \ 9 | --master_port 6000" 10 | 11 | TRAIN_DATA="data/RACE/train/middle" 12 | VALID_DATA="data/RACE/dev/middle \ 13 | data/RACE/dev/high" 14 | VOCAB_FILE=bert-vocab.txt 15 | PRETRAINED_CHECKPOINT=checkpoints/bert_345m 16 | CHECKPOINT_PATH=checkpoints/bert_345m_race 17 | 18 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 19 | --task RACE \ 20 | --seed 1234 \ 21 | --train-data $TRAIN_DATA \ 22 | --valid-data $VALID_DATA \ 23 | --tokenizer-type BertWordPieceLowerCase \ 24 | --vocab-file $VOCAB_FILE \ 25 | --epochs 3 \ 26 | --pretrained-checkpoint $PRETRAINED_CHECKPOINT \ 27 | --tensor-model-parallel-size 1 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --micro-batch-size 4 \ 32 | --lr 1.0e-5 \ 33 | --lr-decay-style linear \ 34 | --lr-warmup-fraction 0.06 \ 35 | --seq-length 512 \ 36 | --max-position-embeddings 512 \ 37 | --save-interval 100000 \ 38 | --save $CHECKPOINT_PATH \ 39 | --log-interval 10 \ 40 | --eval-interval 100 \ 41 | --eval-iters 50 \ 42 | --weight-decay 1.0e-1 \ 43 | --clip-grad 1.0 \ 44 | --hidden-dropout 0.1 \ 45 | --attention-dropout 0.1 \ 46 | --fp16 47 | -------------------------------------------------------------------------------- /examples/finetune_retriever_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Finetune a BERT or pretrained ICT model using 
Google natural question data 4 | # Datasets can be downloaded from the following link: 5 | # https://github.com/facebookresearch/DPR/blob/master/data/download_data.py 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= 16 | 17 | # Load either of the below 18 | BERT_LOAD_PATH= 19 | PRETRAINED_CHECKPOINT= 20 | 21 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \ 22 | --task RET-FINETUNE-NQ \ 23 | --train-with-neg \ 24 | --train-hard-neg 1 \ 25 | --pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \ 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --tensor-model-parallel-size 1 \ 30 | --tokenizer-type BertWordPieceLowerCase \ 31 | --train-data nq-train.json \ 32 | --valid-data nq-dev.json \ 33 | --save ${CHECKPOINT_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --vocab-file bert-vocab.txt \ 36 | --bert-load ${BERT_LOAD_PATH} \ 37 | --save-interval 5000 \ 38 | --log-interval 10 \ 39 | --eval-interval 20000 \ 40 | --eval-iters 100 \ 41 | --indexer-log-interval 1000 \ 42 | --faiss-use-gpu \ 43 | --DDP-impl torch \ 44 | --fp16 \ 45 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 46 | --seq-length 512 \ 47 | --retriever-seq-length 256 \ 48 | --max-position-embeddings 512 \ 49 | --retriever-score-scaling \ 50 | --epochs 80 \ 51 | --micro-batch-size 8 \ 52 | --eval-micro-batch-size 16 \ 53 | --indexer-batch-size 128 \ 54 | --lr 2e-5 \ 55 | --lr-warmup-fraction 0.01 \ 56 | --weight-decay 1e-1 57 | -------------------------------------------------------------------------------- /examples/merge_mp_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TENSOR_MODEL_PARALLEL_SIZE=2 4 | 5 | VOCAB_FILE=bert-vocab.txt 6 | CHECKPOINT_PATH=checkpoints/bert_345m 7 | 8 | WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \ 9 | --model-type BERT \ 10 | --tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \ 11 | --tokenizer-type BertWordPieceLowerCase \ 12 | --vocab-file $VOCAB_FILE \ 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 512 \ 17 | --max-position-embeddings 512 \ 18 | --load $CHECKPOINT_PATH 19 | -------------------------------------------------------------------------------- /examples/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | This directory contains all the scripts of multi-stage prompting for knowledgeable dialogue generation that includes data preparation, and knowledge and response generations. More details are available on [`knowledgeable task directory`](../../tasks/msdp). 5 | 6 | -------------------------------------------------------------------------------- /examples/msdp/eval_knwl_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores. 
5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_knowledge_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_knowledge_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ############################################ 32 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 33 | ############################################ 34 | 35 | # We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 36 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 37 | 38 | # To evaluate on these metrics, please setup the environments based on 39 | # the nlg-eval github, and run the corresponding evaluation commands. 40 | 41 | nlg-eval \ 42 | --hypothesis= \ 43 | --references= 44 | -------------------------------------------------------------------------------- /examples/msdp/eval_resp_generation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######################### 4 | # Evaluate the F1 scores. 5 | ######################### 6 | 7 | WORLD_SIZE=1 8 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 9 | --nnodes 1 \ 10 | --node_rank 0 \ 11 | --master_addr localhost \ 12 | --master_port 6000" 13 | 14 | MODEL_GEN_PATH= \ 15 | (e.g., /testseen_response_generations.txt) 16 | GROUND_TRUTH_PATH= \ 17 | (e.g., /testseen_response_reference.txt) 18 | 19 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --num-attention-heads 16 \ 23 | --seq-length 2048 \ 24 | --max-position-embeddings 2048 \ 25 | --micro-batch-size 4 \ 26 | --task MSDP-EVAL-F1 \ 27 | --guess-file ${MODEL_GEN_PATH} \ 28 | --answer-file ${GROUND_TRUTH_PATH} 29 | 30 | 31 | ########################## 32 | # Evaluate the KF1 scores. 33 | ########################## 34 | 35 | MODEL_GEN_PATH= \ 36 | (e.g., /testseen_response_generations.txt) 37 | GROUND_TRUTH_PATH= \ 38 | (e.g., /testseen_knowledge_reference.txt) 39 | 40 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 2048 \ 45 | --max-position-embeddings 2048 \ 46 | --micro-batch-size 4 \ 47 | --task MSDP-EVAL-F1 \ 48 | --guess-file ${MODEL_GEN_PATH} \ 49 | --answer-file ${GROUND_TRUTH_PATH} 50 | 51 | 52 | ############################################ 53 | # Evaluate BLEU, METEOR, and ROUGE-L scores. 54 | ############################################ 55 | 56 | # We follow the nlg-eval (https://github.com/Maluuba/nlg-eval) to 57 | # evaluate the BLEU, METEOR, and ROUGE-L scores. 58 | 59 | # To evaluate on these metrics, please setup the environments based on 60 | # the nlg-eval github, and run the corresponding evaluation commands. 
61 | 62 | nlg-eval \ 63 | --hypothesis= \ 64 | --references= 65 | -------------------------------------------------------------------------------- /examples/msdp/prep_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Preparing the input file for the response generation (second-stage prompting) 4 | 5 | DIR=`pwd` 6 | 7 | TEST_FILE= \ 8 | (e.g., /testseen_processed.txt) 9 | KNOWLEDGE_FILE= \ 10 | (e.g., /testseen_knowledge_generations.txt) 11 | PROCESSED_FILE= \ 12 | (e.g., /testseen_processed_with_generated_knowledge.txt) 13 | 14 | python ${DIR}/tasks/msdp/preprocessing.py \ 15 | --func prepare_input \ 16 | --test_file ${TEST_FILE} \ 17 | --knwl_gen_file ${KNOWLEDGE_FILE} \ 18 | --processed_file ${PROCESSED_FILE} 19 | -------------------------------------------------------------------------------- /examples/msdp/prompt_knwl_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-1: Prompt a pretrained language model to generate the context-relevant knowledge 4 | # The input contains prompts and current dialogue context, the output is the relevant knowledge 5 | # The size of the pretrained language model is 357M 6 | 7 | WORLD_SIZE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 10 | --nnodes 1 \ 11 | --node_rank 0 \ 12 | --master_addr localhost \ 13 | --master_port 6000" 14 | 15 | CHECKPOINT_PATH= (e.g., /357m) 16 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 17 | MERGE_PATH= (e.g., /gpt2-merges.txt) 18 | INPUT_PATH= \ 19 | (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /testseen_knowledge_prompts.json) 22 | OUTPUT_PATH= \ 23 | (e.g., /testseen_knowledge_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type knowledge \ 42 | --num-prompt-examples 10 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/msdp/prompt_resp_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stage-2: Prompt a pretrained language model to generate the corresponding response 4 | # The input contains prompts, current dialogue context, and generated knowledge in Stage-1 5 | # The output is the corresponding response. 
6 | # The size of the pretrained language model is 357M 7 | 8 | WORLD_SIZE=8 9 | 10 | DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \ 11 | --nnodes 1 \ 12 | --node_rank 0 \ 13 | --master_addr localhost \ 14 | --master_port 6000" 15 | 16 | CHECKPOINT_PATH= (e.g., /357m) 17 | VOCAB_PATH= (e.g., /gpt2-vocab.json) 18 | MERGE_PATH= (e.g., /gpt2-merges.txt) 19 | INPUT_PATH= (e.g., /testseen_processed.txt) 20 | PROMPT_PATH= \ 21 | (e.g., /response_prompts.txt) 22 | OUTPUT_PATH= \ 23 | (e.g., /output_testseen_response_generations.txt) 24 | 25 | python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/msdp/main.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 2048 \ 30 | --max-position-embeddings 2048 \ 31 | --micro-batch-size 1 \ 32 | --vocab-file ${VOCAB_PATH} \ 33 | --merge-file ${MERGE_PATH} \ 34 | --load ${CHECKPOINT_PATH} \ 35 | --fp16 \ 36 | --DDP-impl torch \ 37 | --tokenizer-type GPT2BPETokenizer \ 38 | --sample-input-file ${INPUT_PATH} \ 39 | --sample-output-file ${OUTPUT_PATH} \ 40 | --prompt-file ${PROMPT_PATH} \ 41 | --prompt-type response \ 42 | --num-prompt-examples 20 \ 43 | --task MSDP-PROMPT 44 | 45 | # NOTE: If you use api for the model generation, please use 46 | # the "--api-prompt" flag (setting this value as True). 47 | -------------------------------------------------------------------------------- /examples/pretrain_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | CHECKPOINT_PATH= 6 | VOCAB_FILE=/bert-vocab.txt 7 | DATA_PATH=_text_sentence 8 | 9 | BERT_ARGS=" 10 | --num-layers 24 \ 11 | --hidden-size 1024 \ 12 | --num-attention-heads 16 \ 13 | --seq-length 512 \ 14 | --max-position-embeddings 512 \ 15 | --micro-batch-size 4 \ 16 | --global-batch-size 8 \ 17 | --lr 0.0001 \ 18 | --train-iters 2000000 \ 19 | --lr-decay-iters 990000 \ 20 | --lr-decay-style linear \ 21 | --min-lr 0.00001 \ 22 | --weight-decay 1e-2 \ 23 | --lr-warmup-fraction .01 \ 24 | --clip-grad 1.0 \ 25 | --fp16 26 | " 27 | 28 | DATA_ARGS=" 29 | --data-path $DATA_PATH \ 30 | --vocab-file $VOCAB_FILE \ 31 | --data-impl mmap \ 32 | --split 949,50,1 33 | " 34 | 35 | OUTPUT_ARGS=" 36 | --log-interval 100 \ 37 | --save-interval 10000 \ 38 | --eval-interval 1000 \ 39 | --eval-iters 10 40 | " 41 | 42 | torchrun pretrain_bert.py \ 43 | $BERT_ARGS \ 44 | $DATA_ARGS \ 45 | $OUTPUT_ARGS \ 46 | --save $CHECKPOINT_PATH \ 47 | --load $CHECKPOINT_PATH 48 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/bert-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | BERT_ARGS=" 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --seq-length 512 \ 30 | --max-position-embeddings 512 \ 31 | --micro-batch-size 4 \ 32 | --global-batch-size 32 \ 33 | --lr 0.0001 \ 34 | --train-iters 1000000 \ 35 | --lr-decay-iters 
990000 \ 36 | --lr-decay-style linear \ 37 | --min-lr 1.0e-5 \ 38 | --weight-decay 1e-2 \ 39 | --lr-warmup-fraction .01 \ 40 | --clip-grad 1.0 \ 41 | --fp16 42 | " 43 | 44 | DATA_ARGS=" 45 | --data-path $DATA_PATH \ 46 | --vocab-file $VOCAB_FILE \ 47 | --data-impl mmap \ 48 | --split 949,50,1 49 | " 50 | 51 | OUTPUT_ARGS=" 52 | --log-interval 100 \ 53 | --save-interval 10000 \ 54 | --eval-interval 1000 \ 55 | --eval-iters 10 56 | " 57 | 58 | torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ 59 | $BERT_ARGS \ 60 | $DATA_ARGS \ 61 | $OUTPUT_ARGS \ 62 | --distributed-backend nccl \ 63 | --save $CHECKPOINT_PATH \ 64 | --load $CHECKPOINT_PATH 65 | -------------------------------------------------------------------------------- /examples/pretrain_bert_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/bert-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | BERT_ARGS=" 26 | --tensor-model-parallel-size 2 \ 27 | --pipeline-model-parallel-size 2 \ 28 | --num-layers 24 \ 29 | --hidden-size 1024 \ 30 | --num-attention-heads 16 \ 31 | --seq-length 512 \ 32 | --max-position-embeddings 512 \ 33 | --micro-batch-size 2 \ 34 | --global-batch-size 16 \ 35 | --lr 0.0001 \ 36 | --train-iters 1000000 \ 37 | --lr-decay-iters 990000 \ 38 | --lr-decay-style linear \ 39 | --min-lr 1.0e-5 \ 40 | --weight-decay 1e-2 \ 41 | --lr-warmup-fraction .01 \ 42 | --clip-grad 1.0 \ 43 | --fp16 44 | " 45 | 46 | DATA_ARGS=" 47 | --data-path $DATA_PATH \ 48 | --vocab-file $VOCAB_FILE \ 49 | --data-impl mmap \ 50 | --split 949,50,1 51 | " 52 | 53 | OUTPUT_ARGS=" 54 | --log-interval 100 \ 55 | --save-interval 10000 \ 56 | --eval-interval 1000 \ 57 | --eval-iters 10 58 | " 59 | 60 | torchrun $DISTRIBUTED_ARGS pretrain_bert.py \ 61 | $BERT_ARGS \ 62 | $DATA_ARGS \ 63 | $OUTPUT_ARGS \ 64 | --distributed-backend nccl \ 65 | --save $CHECKPOINT_PATH \ 66 | --load $CHECKPOINT_PATH 67 | -------------------------------------------------------------------------------- /examples/pretrain_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | CHECKPOINT_PATH= 8 | VOCAB_FILE=/gpt2-vocab.json 9 | MERGE_FILE=/gpt2-merges.txt 10 | DATA_PATH=_text_document 11 | 12 | GPT_ARGS=" 13 | --num-layers 24 \ 14 | --hidden-size 1024 \ 15 | --num-attention-heads 16 \ 16 | --seq-length 1024 \ 17 | --max-position-embeddings 1024 \ 18 | --micro-batch-size 4 \ 19 | --global-batch-size 8 \ 20 | --lr 0.00015 \ 21 | --train-iters 500000 \ 22 | --lr-decay-iters 320000 \ 23 | --lr-decay-style cosine \ 24 | --min-lr 1.0e-5 \ 25 | --weight-decay 1e-2 \ 26 | --lr-warmup-fraction .01 \ 27 | --clip-grad 1.0 \ 28 | --fp16 29 | " 30 | 31 | DATA_ARGS=" 32 | --data-path $DATA_PATH \ 33 | --vocab-file $VOCAB_FILE \ 34 | --merge-file $MERGE_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 37 | " 38 | 39 | OUTPUT_ARGS=" 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | 
--eval-iters 10 44 | " 45 | 46 | torchrun pretrain_gpt.py \ 47 | $GPT_ARGS \ 48 | $DATA_ARGS \ 49 | $OUTPUT_ARGS \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH 52 | -------------------------------------------------------------------------------- /examples/pretrain_gpt3_175B.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #SBATCH --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b 5 | 6 | 7 | DIR=`pwd` 8 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 9 | mkdir -p $DIR/logs 10 | 11 | 12 | DATASET_1="" 13 | DATASET_2="" 14 | DATASET_3="" 15 | DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" 16 | 17 | 18 | options=" \ 19 | --tensor-model-parallel-size 8 \ 20 | --pipeline-model-parallel-size 16 \ 21 | --num-layers 96 \ 22 | --hidden-size 12288 \ 23 | --num-attention-heads 96 \ 24 | --seq-length 2048 \ 25 | --max-position-embeddings 2048 \ 26 | --micro-batch-size 1 \ 27 | --global-batch-size 1536 \ 28 | --rampup-batch-size 16 16 5859375 \ 29 | --train-samples 146484375 \ 30 | --lr-decay-samples 126953125 \ 31 | --lr-warmup-samples 183105 \ 32 | --lr 6.0e-5 \ 33 | --min-lr 6.0e-6 \ 34 | --lr-decay-style cosine \ 35 | --log-interval 10 \ 36 | --eval-iters 40 \ 37 | --eval-interval 1000 \ 38 | --data-path ${DATASET} \ 39 | --vocab-file \ 40 | --merge-file \ 41 | --save-interval 1000 \ 42 | --save \ 43 | --load \ 44 | --split 98,2,0 \ 45 | --clip-grad 1.0 \ 46 | --weight-decay 0.1 \ 47 | --adam-beta1 0.9 \ 48 | --adam-beta2 0.95 \ 49 | --init-method-std 0.006 \ 50 | --tensorboard-dir \ 51 | --fp16 " 52 | 53 | 54 | run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}" 55 | 56 | 57 | srun -l \ 58 | --container-image "nvcr.io/nvidia/pytorch:20.12-py3" \ 59 | --container-mounts "" \ 60 | --output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}" 61 | 62 | 63 | set +x 64 | 65 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | GPUS_PER_NODE=8 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=6000 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | 15 | CHECKPOINT_PATH= 16 | VOCAB_FILE=/gpt2-vocab.json 17 | MERGE_FILE=/gpt2-merges.txt 18 | DATA_PATH=_text_document 19 | 20 | DISTRIBUTED_ARGS=" 21 | --nproc_per_node $GPUS_PER_NODE \ 22 | --nnodes $NNODES \ 23 | --node_rank $NODE_RANK \ 24 | --master_addr $MASTER_ADDR \ 25 | --master_port $MASTER_PORT 26 | " 27 | 28 | GPT_ARGS=" 29 | --num-layers 24 \ 30 | --hidden-size 1024 \ 31 | --num-attention-heads 16 \ 32 | --seq-length 1024 \ 33 | --max-position-embeddings 1024 \ 34 | --micro-batch-size 8 \ 35 | --global-batch-size 64 \ 36 | --lr 0.00015 \ 37 | --train-iters 500000 \ 38 | --lr-decay-iters 320000 \ 39 | --lr-decay-style cosine \ 40 | --min-lr 1.0e-5 \ 41 | --weight-decay 1e-2 \ 42 | --lr-warmup-fraction .01 \ 43 | --clip-grad 1.0 \ 44 | --fp16 45 | " 46 | 47 | DATA_ARGS=" 48 | --data-path $DATA_PATH \ 49 | --vocab-file $VOCAB_FILE \ 50 | --merge-file $MERGE_FILE \ 51 | --data-impl mmap \ 52 | --split 949,50,1 53 | " 54 | 55 | OUTPUT_ARGS=" 56 | --log-interval 100 \ 57 | --save-interval 10000 \ 58 | --eval-interval 1000 \ 59 | --eval-iters 10 60 | " 61 | 62 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 63 | $GPT_ARGS \ 64 | 
$DATA_ARGS \ 65 | $OUTPUT_ARGS \ 66 | --distributed-backend nccl \ 67 | --save $CHECKPOINT_PATH \ 68 | --load $CHECKPOINT_PATH 69 | -------------------------------------------------------------------------------- /examples/pretrain_gpt_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs the "345M" parameter model 4 | 5 | export CUDA_DEVICE_MAX_CONNECTIONS=1 6 | 7 | GPUS_PER_NODE=8 8 | # Change for multinode config 9 | MASTER_ADDR=localhost 10 | MASTER_PORT=6000 11 | NNODES=1 12 | NODE_RANK=0 13 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 14 | 15 | CHECKPOINT_PATH= 16 | VOCAB_FILE=/gpt2-vocab.json 17 | MERGE_FILE=/gpt2-merges.txt 18 | DATA_PATH=_text_document 19 | 20 | DISTRIBUTED_ARGS=" 21 | --nproc_per_node $GPUS_PER_NODE \ 22 | --nnodes $NNODES \ 23 | --node_rank $NODE_RANK \ 24 | --master_addr $MASTER_ADDR \ 25 | --master_port $MASTER_PORT 26 | " 27 | 28 | GPT_ARGS=" 29 | --tensor-model-parallel-size 2 \ 30 | --pipeline-model-parallel-size 2 \ 31 | --sequence-parallel \ 32 | --num-layers 24 \ 33 | --hidden-size 1024 \ 34 | --num-attention-heads 16 \ 35 | --seq-length 1024 \ 36 | --max-position-embeddings 1024 \ 37 | --micro-batch-size 4 \ 38 | --global-batch-size 16 \ 39 | --lr 0.00015 \ 40 | --train-iters 500000 \ 41 | --lr-decay-iters 320000 \ 42 | --lr-decay-style cosine \ 43 | --min-lr 1.0e-5 \ 44 | --weight-decay 1e-2 \ 45 | --lr-warmup-fraction .01 \ 46 | --clip-grad 1.0 \ 47 | --fp16 48 | " 49 | 50 | DATA_ARGS=" 51 | --data-path $DATA_PATH \ 52 | --vocab-file $VOCAB_FILE \ 53 | --merge-file $MERGE_FILE \ 54 | --data-impl mmap \ 55 | --split 949,50,1 56 | " 57 | 58 | OUTPUT_ARGS=" 59 | --log-interval 100 \ 60 | --save-interval 10000 \ 61 | --eval-interval 1000 \ 62 | --eval-iters 10 63 | " 64 | 65 | torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ 66 | $GPT_ARGS \ 67 | $DATA_ARGS \ 68 | $OUTPUT_ARGS \ 69 | --distributed-backend nccl \ 70 | --save $CHECKPOINT_PATH \ 71 | --load $CHECKPOINT_PATH 72 | 73 | -------------------------------------------------------------------------------- /examples/pretrain_ict.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # Runs the "217M" parameter biencoder model for ICT retriever 4 | 5 | RANK=0 6 | WORLD_SIZE=1 7 | 8 | PRETRAINED_BERT_PATH= 9 | TEXT_DATA_PATH= 10 | TITLE_DATA_PATH= 11 | CHECKPOINT_PATH= 12 | 13 | 14 | python pretrain_ict.py \ 15 | --num-layers 12 \ 16 | --hidden-size 768 \ 17 | --num-attention-heads 12 \ 18 | --tensor-model-parallel-size 1 \ 19 | --micro-batch-size 32 \ 20 | --seq-length 256 \ 21 | --max-position-embeddings 512 \ 22 | --train-iters 100000 \ 23 | --vocab-file bert-vocab.txt \ 24 | --tokenizer-type BertWordPieceLowerCase \ 25 | --DDP-impl torch \ 26 | --bert-load ${PRETRAINED_BERT_PATH} \ 27 | --log-interval 100 \ 28 | --eval-interval 1000 \ 29 | --eval-iters 10 \ 30 | --retriever-report-topk-accuracies 1 5 10 20 100 \ 31 | --retriever-score-scaling \ 32 | --load $CHECKPOINT_PATH \ 33 | --save $CHECKPOINT_PATH \ 34 | --data-path ${TEXT_DATA_PATH} \ 35 | --titles-data-path ${TITLE_DATA_PATH} \ 36 | --lr 0.0001 \ 37 | --lr-decay-style linear \ 38 | --weight-decay 1e-2 \ 39 | --clip-grad 1.0 \ 40 | --lr-warmup-fraction 0.01 \ 41 | --save-interval 4000 \ 42 | --exit-interval 8000 \ 43 | --query-in-block-prob 0.1 \ 44 | --fp16 45 | -------------------------------------------------------------------------------- /examples/pretrain_t5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | CHECKPOINT_PATH= 6 | VOCAB_FILE=/t5-vocab.txt 7 | DATA_PATH=_text_sentence 8 | 9 | T5_ARGS=" 10 | --num-layers 12 \ 11 | --hidden-size 768 \ 12 | --num-attention-heads 12 \ 13 | --kv-channels 64 \ 14 | --ffn-hidden-size 3072 \ 15 | --encoder-seq-length 512 \ 16 | --decoder-seq-length 128 \ 17 | --max-position-embeddings 512 \ 18 | --micro-batch-size 16 \ 19 | --global-batch-size 16 \ 20 | --lr 0.0001 \ 21 | --train-iters 1000000 \ 22 | --lr-decay-iters 1000000 \ 23 | --lr-decay-style linear \ 24 | --min-lr 0.00001 \ 25 | --weight-decay 1e-2 \ 26 | --lr-warmup-fraction .01 \ 27 | --clip-grad 1.0 \ 28 | --fp16 \ 29 | --vocab-extra-ids 100 30 | " 31 | 32 | DATA_ARGS=" 33 | --data-path $DATA_PATH \ 34 | --vocab-file $VOCAB_FILE \ 35 | --data-impl mmap \ 36 | --split 949,50,1 37 | " 38 | 39 | OUTPUT_ARGS=" 40 | --log-interval 100 \ 41 | --save-interval 10000 \ 42 | --eval-interval 1000 \ 43 | --eval-iters 10 44 | " 45 | 46 | torchrun pretrain_t5.py \ 47 | $T5_ARGS \ 48 | $DATA_ARGS \ 49 | $OUTPUT_ARGS \ 50 | --save $CHECKPOINT_PATH \ 51 | --load $CHECKPOINT_PATH 52 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/t5-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | T5_ARGS=" 26 | --num-layers 12 \ 27 | --hidden-size 768 \ 28 | --num-attention-heads 12 \ 29 | --kv-channels 64 \ 30 | --ffn-hidden-size 3072 \ 31 | --encoder-seq-length 512 \ 32 | --decoder-seq-length 128 \ 33 | --max-position-embeddings 512 \ 34 | --micro-batch-size 16 \ 35 | --global-batch-size 128 \ 
36 | --lr 0.0001 \ 37 | --train-iters 1000000 \ 38 | --lr-decay-iters 1000000 \ 39 | --lr-decay-style linear \ 40 | --min-lr 0.00001 \ 41 | --weight-decay 1e-2 \ 42 | --lr-warmup-fraction .01 \ 43 | --clip-grad 1.0 \ 44 | --fp16 \ 45 | --vocab-extra-ids 100 46 | " 47 | 48 | DATA_ARGS=" 49 | --data-path $DATA_PATH \ 50 | --vocab-file $VOCAB_FILE \ 51 | --data-impl mmap \ 52 | --split 949,50,1 53 | " 54 | 55 | OUTPUT_ARGS=" 56 | --log-interval 100 \ 57 | --save-interval 10000 \ 58 | --eval-interval 1000 \ 59 | --eval-iters 10 60 | " 61 | 62 | torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ 63 | $T5_ARGS \ 64 | $DATA_ARGS \ 65 | $OUTPUT_ARGS \ 66 | --distributed-backend nccl \ 67 | --save $CHECKPOINT_PATH \ 68 | --load $CHECKPOINT_PATH 69 | -------------------------------------------------------------------------------- /examples/pretrain_t5_distributed_with_mp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_DEVICE_MAX_CONNECTIONS=1 4 | 5 | GPUS_PER_NODE=8 6 | # Change for multinode config 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=6000 9 | NNODES=1 10 | NODE_RANK=0 11 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 12 | 13 | CHECKPOINT_PATH= 14 | VOCAB_FILE=/t5-vocab.txt 15 | DATA_PATH=_text_sentence 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | T5_ARGS=" 26 | --tensor-model-parallel-size 2 \ 27 | --num-layers 12 \ 28 | --hidden-size 768 \ 29 | --num-attention-heads 12 \ 30 | --kv-channels 64 \ 31 | --ffn-hidden-size 3072 \ 32 | --encoder-seq-length 512 \ 33 | --decoder-seq-length 128 \ 34 | --max-position-embeddings 512 \ 35 | --micro-batch-size 16 \ 36 | --global-batch-size 128 \ 37 | --lr 0.0001 \ 38 | --train-iters 1000000 \ 39 | --lr-decay-iters 1000000 \ 40 | --lr-decay-style linear \ 41 | --min-lr 0.00001 \ 42 | --weight-decay 1e-2 \ 43 | --lr-warmup-fraction .01 \ 44 | --clip-grad 1.0 \ 45 | --fp16 \ 46 | --vocab-extra-ids 100 47 | " 48 | 49 | DATA_ARGS=" 50 | --data-path $DATA_PATH \ 51 | --vocab-file $VOCAB_FILE \ 52 | --data-impl mmap \ 53 | --split 949,50,1 54 | " 55 | 56 | OUTPUT_ARGS=" 57 | --log-interval 100 \ 58 | --save-interval 10000 \ 59 | --eval-interval 1000 \ 60 | --eval-iters 10 61 | " 62 | 63 | torchrun $DISTRIBUTED_ARGS pretrain_t5.py \ 64 | $T5_ARGS \ 65 | $DATA_ARGS \ 66 | $OUTPUT_ARGS \ 67 | --distributed-backend nccl \ 68 | --save $CHECKPOINT_PATH \ 69 | --load $CHECKPOINT_PATH 70 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model. 
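# Illustrative query (an assumption for this sketch: the server's flask-restful /api endpoint on
# Flask's default port 5000; adjust host/port to your deployment). Once the server is up, it can
# be queried with a PUT request, e.g.:
#   curl 'http://localhost:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' \
#        -d '{"prompts": ["Hello, world"], "tokens_to_generate": 32}'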
3 | DISTRIBUTED_ARGS="--nproc_per_node 1 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | export CUDA_DEVICE_MAX_CONNECTIONS=1 14 | 15 | pip install flask-restful 16 | 17 | torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 18 | --tensor-model-parallel-size 1 \ 19 | --pipeline-model-parallel-size 1 \ 20 | --num-layers 24 \ 21 | --hidden-size 1024 \ 22 | --load ${CHECKPOINT} \ 23 | --num-attention-heads 16 \ 24 | --max-position-embeddings 1024 \ 25 | --tokenizer-type GPT2BPETokenizer \ 26 | --fp16 \ 27 | --micro-batch-size 1 \ 28 | --seq-length 1024 \ 29 | --out-seq-length 1024 \ 30 | --temperature 1.0 \ 31 | --vocab-file $VOCAB_FILE \ 32 | --merge-file $MERGE_FILE \ 33 | --top_p 0.9 \ 34 | --seed 42 35 | -------------------------------------------------------------------------------- /examples/run_text_generation_server_345M_8_tensor_parallel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This example will start serving the 345M model that is partitioned 8 way tensor parallel 3 | DISTRIBUTED_ARGS="--nproc_per_node 8 \ 4 | --nnodes 1 \ 5 | --node_rank 0 \ 6 | --master_addr localhost \ 7 | --master_port 6000" 8 | 9 | CHECKPOINT= 10 | VOCAB_FILE= 11 | MERGE_FILE= 12 | 13 | pip install flask-restful 14 | 15 | python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \ 16 | --tensor-model-parallel-size 8 \ 17 | --pipeline-model-parallel-size 1 \ 18 | --num-layers 24 \ 19 | --hidden-size 1024 \ 20 | --load ${CHECKPOINT} \ 21 | --num-attention-heads 16 \ 22 | --max-position-embeddings 1024 \ 23 | --tokenizer-type GPT2BPETokenizer \ 24 | --fp16 \ 25 | --micro-batch-size 1 \ 26 | --seq-length 1024 \ 27 | --out-seq-length 1024 \ 28 | --temperature 1.0 \ 29 | --vocab-file $VOCAB_FILE \ 30 | --merge-file $MERGE_FILE \ 31 | --top_p 0.9 \ 32 | --seed 42 33 | -------------------------------------------------------------------------------- /examples/sc21/CONFIG.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # SLURM options. 5 | export SLURM_PARTITION= 6 | export SLURM_ACCOUNT= 7 | 8 | 9 | # Source code. 10 | export MEGATRON_CODE_DIR= 11 | 12 | 13 | # This variable is used to mount the relevant part of the filesystem 14 | # inside the docker container. Note that the `MEGATRON_CODE_DIR` and the 15 | # launch directory already get mounted; this variable should be used to 16 | # mount the directories that contain the data and tokenizer files. 17 | export DOCKER_MOUNT_DIR= 18 | 19 | 20 | # Data and tokenizer files. 21 | MEGATRON_DATA= 22 | BPE_VOCAB_FILE= 23 | BPE_MERGE_FILE= 24 | 25 | 26 | # Megatron input parameters. 27 | # `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters 28 | # that are not listed here. 
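# For example, the run_figure_*.sh scripts in this directory export something like
#   MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# before sourcing this file.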
29 | export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \ 30 | --tensor-model-parallel-size ${TP} \ 31 | --pipeline-model-parallel-size ${PP} \ 32 | --micro-batch-size ${MBS} \ 33 | --global-batch-size ${GBS} \ 34 | --num-layers ${NLS} \ 35 | --hidden-size ${HS} \ 36 | --num-attention-heads ${NAH} \ 37 | --DDP-impl ${DDP} \ 38 | --data-path ${MEGATRON_DATA} \ 39 | --vocab-file ${BPE_VOCAB_FILE} \ 40 | --merge-file ${BPE_MERGE_FILE} \ 41 | --log-interval 5 \ 42 | --seq-length 2048 \ 43 | --max-position-embeddings 2048 \ 44 | --train-iters 500 \ 45 | --lr-decay-iters 320 \ 46 | --lr 0.0001 \ 47 | --min-lr 0.00001 \ 48 | --lr-decay-style cosine \ 49 | --lr-warmup-fraction 0.01 \ 50 | --split 969,30,1 \ 51 | --eval-iters 100 \ 52 | --eval-interval 1000 \ 53 | --clip-grad 1.0 \ 54 | --fp16 \ 55 | --loss-scale 8192 " 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/sc21/README.md: -------------------------------------------------------------------------------- 1 | # Reproducing Figures in SC21 Paper 2 | 3 | 4 | This directory contains some of the scripts that were used to produce the 5 | results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is 6 | to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These 7 | scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the 8 | [pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other 9 | schedulers as well. 10 | 11 | 12 | ## Git commit 13 | 14 | To replicate these results use Megatron-LM commit: 6985e58938d40ad91ac07b0fddcfad8132e1447e 15 | 16 | 17 | ## Setup 18 | 19 | All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please 20 | update the unspecified values (in angle brackets `<...>`) before launching any 21 | scripts. 22 | 23 | 24 | 25 | ## Scripts 26 | 27 | Below is a list of scripts that can be used to reproduce various figures in our 28 | [paper](https://arxiv.org/pdf/2104.04473.pdf): 29 | 30 | * [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput 31 | for GPT models ranging from 1 billion to 1 trillion parameters. 32 | * [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling 33 | performance of pipeline parallelism. 34 | * [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of 35 | the interleaved schedule on a 175B GPT model. 36 | * [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of 37 | different degrees of pipeline and tensor model parallelism on a model with 38 | 162.2 billion parameters. 39 | * [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of 40 | different degrees of data and pipeline model parallelism on a model with 41 | 5.9 billion parameters. 42 | * [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of 43 | different degrees of data and tensor model parallelism on a model with 44 | 5.9 billion parameters. 45 | * [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of 46 | microbatch size. 47 | * [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of 48 | activation recomputation. 49 | * [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of 50 | the scatter-gather communication optimization. 
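
Each script only picks the configuration (the variables such as `PP`, `TP`, and `GBS` at the top of the file) and then sources [`CONFIG.sh`](./CONFIG.sh) and [`SBATCH.sh`](./SBATCH.sh), which submits [`SRUN.sh`](./SRUN.sh) to Slurm. As a rough sketch (assuming `CONFIG.sh` has been filled in and Slurm with pyxis is available), a single data point is launched with:

```bash
cd examples/sc21
bash run_figure_11.sh   # sources CONFIG.sh, then SBATCH.sh submits SRUN.sh
```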
51 | -------------------------------------------------------------------------------- /examples/sc21/SBATCH.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | sbatch -p ${SLURM_PARTITION} \ 5 | -A ${SLURM_ACCOUNT} \ 6 | --job-name=${JOB_NAME} \ 7 | --nodes=${NNODES} \ 8 | --export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh 9 | 10 | exit 0 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /examples/sc21/SRUN.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8 4 | 5 | 6 | THIS_DIR=`pwd` 7 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 8 | mkdir -p ${THIS_DIR}/logs 9 | 10 | 11 | CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}" 12 | 13 | 14 | srun -l \ 15 | --container-image "nvcr.io#nvidia/pytorch:20.12-py3" \ 16 | --container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \ 17 | --output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}" 18 | 19 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [1, 2, 4, 8]. 8 | PP=1 9 | 10 | # Batch size (global batch size) options = [8, 128]. 11 | GBS=8 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel size options. 18 | NLS=$((3*PP)) 19 | NNODES=${PP} 20 | 21 | 22 | # Other params. 23 | TP=8 24 | MBS=1 25 | HS=20480 26 | NAH=128 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Interleaved schedule options = [YES, NO]. 8 | INTERLEAVED=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set interleaved schedule options. 18 | if [ ${INTERLEAVED} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${INTERLEAVED} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_13.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 128]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and tensor-parallel size options. 18 | TP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | MBS=1 23 | NLS=32 24 | HS=20480 25 | NAH=128 26 | DDP=local 27 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 28 | NNODES=8 29 | 30 | 31 | # Name of the job. 32 | export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS} 33 | 34 | 35 | # Import the configs. 36 | . `pwd`/CONFIG.sh 37 | 38 | 39 | # Submit the job. 40 | . `pwd`/SBATCH.sh 41 | 42 | 43 | exit 0 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_14.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Pipeline-parallel size options = [2, 4, 8, 16, 32]. 8 | PP=2 9 | 10 | # Batch size (global batch size) options = [32, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set pipeline-parallel and data-parallel size options. 18 | DP=$((64/PP)) 19 | 20 | 21 | # Other params. 22 | TP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Tensor-parallel size options = [2, 4, 8, 16, 32]. 8 | TP=2 9 | 10 | # Batch size (global batch size) options = [32, 128, 512]. 11 | GBS=32 12 | 13 | 14 | 15 | 16 | 17 | # Set tensor-parallel and data-parallel size options. 18 | DP=$((64/TP)) 19 | 20 | 21 | # Other params. 22 | PP=1 23 | MBS=1 24 | NLS=32 25 | HS=3840 26 | NAH=32 27 | DDP=local 28 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 29 | NNODES=8 30 | 31 | 32 | # Name of the job. 33 | export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS} 34 | 35 | 36 | # Import the configs. 37 | . `pwd`/CONFIG.sh 38 | 39 | 40 | # Submit the job. 41 | . `pwd`/SBATCH.sh 42 | 43 | 44 | exit 0 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 
5 | # ================================ 6 | 7 | # Microbatch size options = [1, 2, 4, 8]. 8 | MBS=1 9 | 10 | # Batch size (global batch size) options = [128, 512]. 11 | GBS=128 12 | 13 | 14 | 15 | 16 | 17 | # Other params. 18 | TP=8 19 | PP=8 20 | NLS=32 21 | HS=15360 22 | NAH=128 23 | DDP=local 24 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 25 | NNODES=8 26 | 27 | 28 | # Name of the job. 29 | export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS} 30 | 31 | 32 | # Import the configs. 33 | . `pwd`/CONFIG.sh 34 | 35 | 36 | # Submit the job. 37 | . `pwd`/SBATCH.sh 38 | 39 | 40 | exit 0 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_17.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Activation recomputation options = [YES, NO]. 8 | ACTIVATION_RECOMPUTATION=YES 9 | 10 | # Batch size (global batch size) options = [1, 2, 4, ..., 256]. 11 | GBS=1 12 | 13 | 14 | 15 | 16 | 17 | # Set activation recomputation. 18 | if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform " 20 | elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="" 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=16 31 | MBS=1 32 | NLS=80 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=16 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . `pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /examples/sc21/run_figure_18.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ================================ 4 | # Choose the case to run. 5 | # ================================ 6 | 7 | # Scatter-gather communication optimization options = [YES, NO]. 8 | SCATTER_GATHER=YES 9 | 10 | # Batch size (global batch size) options = [12, 24, 36, ..., 60]. 11 | GBS=12 12 | 13 | 14 | 15 | 16 | 17 | # Set scatter-gather communication optimization options. 18 | if [ ${SCATTER_GATHER} == "YES" ]; then 19 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 " 20 | elif [ ${SCATTER_GATHER} == "NO" ]; then 21 | MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline " 22 | else 23 | echo "Invalid configuration" 24 | exit 1 25 | fi 26 | 27 | 28 | # Other params. 29 | TP=8 30 | PP=12 31 | MBS=1 32 | NLS=96 33 | HS=12288 34 | NAH=96 35 | DDP=local 36 | NNODES=12 37 | 38 | 39 | # Name of the job. 40 | export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS} 41 | 42 | 43 | # Import the configs. 44 | . `pwd`/CONFIG.sh 45 | 46 | 47 | # Submit the job. 48 | . 
`pwd`/SBATCH.sh 49 | 50 | 51 | exit 0 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /images/Achieved_petaFLOPs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/images/Achieved_petaFLOPs.png -------------------------------------------------------------------------------- /images/cases_april2021.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/images/cases_april2021.png -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_adlr_autoresume 13 | from .global_vars import get_timers 14 | from .initialize import initialize_megatron 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last) 19 | -------------------------------------------------------------------------------- /megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models. 2 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.parallel_state 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | 5 | from .inference_params import InferenceParams 6 | from .model_parallel_config import ModelParallelConfig 7 | 8 | # Alias parallel_state as mpu, its legacy name 9 | mpu = parallel_state 10 | 11 | __all__ = ["parallel_state", "tensor_parallel", "utils", "InferenceParams", "ModelParallelConfig"] 12 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersitentObject, ShardedTensor 5 | from .serialization import load, load_common_state_dict, save 6 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/core.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import json 4 | from dataclasses import asdict, dataclass 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | CONFIG_FNAME = 'metadata.json' 9 | 10 | 11 | class CheckpointingException(Exception): 12 | pass 13 | 14 | 15 | @dataclass 16 | class CheckpointingConfig: 17 | """ Documents backends used in the checkpoint. 
""" 18 | 19 | sharded_backend: str 20 | sharded_backend_version: int = 1 21 | common_backend: str = 'torch' 22 | common_backend_version: int = 1 23 | 24 | 25 | def check_is_distributed_checkpoint(checkpoint_dir): 26 | return maybe_load_config(checkpoint_dir) is not None 27 | 28 | 29 | def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConfig]: 30 | config_path = Path(checkpoint_dir, CONFIG_FNAME) 31 | if not config_path.exists(): 32 | return None 33 | with config_path.open() as f: 34 | config_dict = json.load(f) 35 | return CheckpointingConfig(**config_dict) 36 | 37 | 38 | def save_config(config: CheckpointingConfig, checkpoint_dir: str): 39 | config_path = Path(checkpoint_dir, CONFIG_FNAME) 40 | with config_path.open('w') as f: 41 | json.dump(asdict(config), f) 42 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | try: 10 | import tensorstore 11 | import zarr 12 | 13 | from .tensorstore import _import_trigger 14 | from .zarr import _import_trigger 15 | except ImportError: 16 | logger.warning('Zarr-based strategies will not be registered because of missing packages') 17 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from abc import ABC, abstractmethod 4 | from collections import defaultdict 5 | from enum import Enum 6 | from pathlib import Path 7 | from typing import Dict, List, Optional 8 | 9 | from ..mapping import CheckpointingException, ShardedStateDict, ShardedTensor, StateDict 10 | 11 | 12 | class StrategyAction(Enum): 13 | LOAD_COMMON = 'load_common' 14 | LOAD_SHARDED = 'load_sharded' 15 | SAVE_COMMON = 'save_common' 16 | SAVE_SHARDED = 'save_sharded' 17 | 18 | 19 | default_strategies = defaultdict(dict) 20 | 21 | 22 | def get_default_strategy(action: StrategyAction, backend: str, version: int): 23 | try: 24 | return default_strategies[action.value][(backend, version)] 25 | except KeyError as e: 26 | raise CheckpointingException( 27 | f'Cannot find default strategy for: {(action, backend, version)}' 28 | ) from e 29 | 30 | 31 | class LoadStrategyBase(ABC): 32 | @abstractmethod 33 | def check_backend_compatibility(self, loaded_version): 34 | raise NotImplementedError 35 | 36 | @abstractmethod 37 | def check_version_compatibility(self, loaded_version): 38 | raise NotImplementedError 39 | 40 | 41 | class SaveStrategyBase(ABC): 42 | def __init__(self, backend: str, version: int): 43 | self.backend = backend 44 | self.version = version 45 | 46 | 47 | class LoadCommonStrategy(LoadStrategyBase): 48 | @abstractmethod 49 | def load(self, checkpoint_dir: Path): 50 | raise NotImplementedError 51 | 52 | 53 | class LoadShardedStrategy(LoadStrategyBase): 54 | @abstractmethod 55 | def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path): 56 | raise NotImplementedError 57 | 58 | 59 | class SaveCommonStrategy(SaveStrategyBase): 60 | @abstractmethod 61 | def save(self, common_state_dict: StateDict, checkpoint_dir: Path): 62 | raise NotImplementedError 63 
| 64 | 65 | class SaveShardedStrategy(SaveStrategyBase): 66 | @abstractmethod 67 | def save(self, sharded_tensors: List[ShardedTensor], checkpoint_dir: Path): 68 | raise NotImplementedError 69 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from typing import Tuple 4 | 5 | from .dict_utils import dict_list_map_inplace, extract_matching_values 6 | from .mapping import LocalNonpersitentObject, ShardedStateDict, ShardedTensor, StateDict 7 | 8 | 9 | def extract_sharded_tensors( 10 | sharded_state_dict: ShardedStateDict, 11 | ) -> Tuple[ShardedStateDict, StateDict]: 12 | return extract_matching_values(sharded_state_dict, lambda v: isinstance(v, ShardedTensor)) 13 | 14 | 15 | def extract_sharded_tensors_or_nonpersistent( 16 | sharded_state_dict: ShardedStateDict, 17 | ) -> Tuple[ShardedStateDict, StateDict]: 18 | return extract_matching_values( 19 | sharded_state_dict, lambda v: isinstance(v, (ShardedTensor, LocalNonpersitentObject)) 20 | ) 21 | 22 | 23 | def add_prefix_for_sharding(sharded_state_dict: ShardedStateDict, prefix: str): 24 | def add_prefix(t): 25 | if isinstance(t, ShardedTensor): 26 | t.key = f'{prefix}.{t.key}' 27 | return t 28 | 29 | dict_list_map_inplace(add_prefix, sharded_state_dict) 30 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | encoder_or_decoder = 1 8 | encoder_and_decoder = 2 9 | retro_encoder = 3 10 | retro_decoder = 4 11 | -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from typing import Optional, Tuple 4 | 5 | import torch 6 | 7 | 8 | def _bias_dropout_add_func(x, bias, residual, prob, training): 9 | # type: (Tensor, Optional[Tensor], Tensor, float, bool) -> Tensor 10 | # NOTE: Previously, the argument `bias` used to be passed as 11 | # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the 12 | # transformer layer but broadcasting should automatically take care of that. 13 | # Also, looking at broadcasting semantics, `expand_as` and broadcasting 14 | # seem to be identical performance-wise (both just change the view). 15 | 16 | # If we want to train mixed precision, then the output of this function 17 | # should be half precision. However, in AMP O1, the input (residual) is 18 | # in fp32, and it will up-cast the result to fp32, causing pipeline parallel 19 | # GPU communication to hang. Therefore, we need to cast residual to the same 20 | # dtype as x. 
21 | residual = residual if residual.dtype == x.dtype else residual.to(x.dtype) 22 | if bias is not None: 23 | x = x + bias 24 | out = torch.nn.functional.dropout(x, p=prob, training=training) 25 | out = residual + out 26 | return out 27 | 28 | 29 | def get_bias_dropout_add(training, fused): 30 | def unfused_bias_dropout_add(x_with_bias, residual, prob): 31 | x, bias = x_with_bias # unpack 32 | return _bias_dropout_add_func(x, bias, residual, prob, training) 33 | 34 | @torch.jit.script 35 | def bias_dropout_add_fused_train( 36 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 37 | residual: torch.Tensor, 38 | prob: float, 39 | ) -> torch.Tensor: 40 | x, bias = x_with_bias # unpack 41 | return _bias_dropout_add_func(x, bias, residual, prob, True) 42 | 43 | @torch.jit.script 44 | def bias_dropout_add_fused_inference( 45 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], 46 | residual: torch.Tensor, 47 | prob: float, 48 | ) -> torch.Tensor: 49 | x, bias = x_with_bias # unpack 50 | return _bias_dropout_add_func(x, bias, residual, prob, False) 51 | 52 | if fused: 53 | # jit scripting for a nn.module (with dropout) is not 54 | # triggering the fusion kernel. For now, we use two 55 | # different nn.functional routines to account for varying 56 | # dropout semantics during training and inference phases. 57 | if training: 58 | return bias_dropout_add_fused_train 59 | else: 60 | return bias_dropout_add_fused_inference 61 | else: 62 | return unfused_bias_dropout_add 63 | -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 6 | # 1/sqrt(2*pi)-> 0.3989423 7 | # 1/sqrt(2) -> 0.70710678 8 | # sqrt(2/pi) -> 0.79788456 9 | # this function is tanh approximation of gelu 10 | # actual gelu is: 11 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 12 | 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | 20 | # gradient of tanh approximation of gelu 21 | # gradient of actual gelu is: 22 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 23 | @torch.jit.script 24 | def bias_gelu_back(g, bias, y): 25 | x = bias + y 26 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 27 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 28 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( 29 | 1 + tanh_out 30 | ) 31 | return ff * g 32 | 33 | 34 | class GeLUFunction(torch.autograd.Function): 35 | @staticmethod 36 | # bias is an optional argument 37 | def forward(ctx, input, bias): 38 | ctx.save_for_backward(input, bias) 39 | return bias_gelu(bias, input) 40 | 41 | @staticmethod 42 | def backward(ctx, grad_output): 43 | input, bias = ctx.saved_tensors 44 | tmp = bias_gelu_back(grad_output, bias, input) 45 | return tmp, tmp 46 | 47 | 48 | bias_gelu_impl = GeLUFunction.apply 49 | -------------------------------------------------------------------------------- /megatron/core/inference_params.py: -------------------------------------------------------------------------------- 1 | class InferenceParams: 2 | """Inference parameters that are passed to the main model in order 3 | to efficiently calculate and store the context during inference.""" 4 | 5 | def __init__(self, max_batch_size, max_sequence_length): 6 | self.max_sequence_length = max_sequence_length 7 | self.max_batch_size = max_batch_size 8 | self.sequence_len_offset = 0 9 | self.batch_size_offset = 0 10 | self.key_value_memory_dict = {} 11 | 12 | def swap_key_value_dict(self, batch_idx): 13 | "swap between batches" 14 | if len(self.key_value_memory_dict) == 0: 15 | raise ValueError("should not swap when dict is empty") 16 | 17 | for layer_number in self.key_value_memory_dict.keys(): 18 | inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] 19 | assert ( 20 | len(batch_idx) == inference_key_memory.shape[1] 21 | ) # make sure batch size is the same 22 | new_inference_key_memory = inference_key_memory[:, batch_idx] 23 | new_inference_value_memory = inference_value_memory[:, batch_idx] 24 | self.key_value_memory_dict[layer_number] = ( 25 | new_inference_key_memory, 26 | new_inference_value_memory, 27 | ) 28 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/rotary_pos_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
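# Illustrative summary of the math below: for position m and inverse frequency
# theta_i = 1 / 10000^(2i/dim), RoPE rotates the channel pair (x_i, x_{i+dim/2})
# by the angle m * theta_i; apply_rotary_pos_emb realizes this as
#   t * cos(freqs) + _rotate_half(t) * sin(freqs)
# where freqs is the outer product of positions and inv_freq, duplicated along
# the channel dimension by RotaryEmbedding.forward.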
2 | 3 | import importlib.util 4 | 5 | import torch 6 | from torch import einsum, nn 7 | 8 | __all__ = ['RotaryEmbedding', 'apply_rotary_pos_emb'] 9 | 10 | 11 | class RotaryEmbedding(nn.Module): 12 | def __init__(self, dim, seq_len_interpolation_factor=None): 13 | super().__init__() 14 | self.seq_len_interpolation_factor = seq_len_interpolation_factor 15 | inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) 16 | self.register_buffer('inv_freq', inv_freq) 17 | 18 | def forward(self, max_seq_len, offset=0): 19 | seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset 20 | if self.seq_len_interpolation_factor is not None: 21 | seq = seq.type_as(self.inv_freq) 22 | seq *= 1 / self.seq_len_interpolation_factor 23 | freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq) 24 | # first part even vector components, second part odd vector components, 25 | # 2 * dim in dimension size 26 | emb = torch.cat((freqs, freqs), dim=-1) 27 | # emb [seq_length, .., dim] 28 | return emb[:, None, None, :] 29 | 30 | 31 | def _rotate_half(x): 32 | """ 33 | change sign so the last dimension becomes [-odd, +even] 34 | """ 35 | x1, x2 = torch.chunk(x, 2, dim=-1) 36 | return torch.cat((-x2, x1), dim=-1) 37 | 38 | 39 | def apply_rotary_pos_emb(t, freqs): 40 | """ 41 | input tensor t is of shape [seq_length, ..., dim] 42 | rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] 43 | check https://kexue.fm/archives/8265 for detailed formulas 44 | """ 45 | rot_dim = freqs.shape[-1] 46 | # ideally t_pass is empty so rotary pos embedding is applied to all tensor t 47 | t, t_pass = t[..., :rot_dim], t[..., rot_dim:] 48 | 49 | # first part is cosine component 50 | # second part is sine component, need to change signs with _rotate_half method 51 | t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin()) 52 | return torch.cat((t, t_pass), dim=-1) 53 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | MAJOR = 0 5 | MINOR = 2 6 | PATCH = 0 7 | PRE_RELEASE = '' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = ( 19 | 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 20 | ) 21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 23 | __description__ = ( 24 | 'Megatron Core - a library for efficient and scalable training of transformer based models' 25 | ) 26 | __license__ = 'BSD-3' 27 | __keywords__ = ( 28 | 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 29 | ) 30 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | from .sp_utils import get_tflops 3 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/split_solver.py: -------------------------------------------------------------------------------- 1 | from sympy import symbols, Eq, solve 2 | 3 | def round_down(x, tp_size): 4 | return x // tp_size * tp_size 5 | class solver: 6 | def __init__(self, total_seqlen, config, causal=True): 7 | self.total_seqlen = total_seqlen 8 | self.config = config 9 | self.total_tflops = config.get_seq_tflops(total_seqlen, causal) 10 | 11 | 12 | def solve_partition(self, num_splits, tp_size=1): 13 | res = [] 14 | prefix = self.total_seqlen 15 | for i in range(1, num_splits): 16 | seqlen = symbols('seqlen') 17 | tflops = self.config.get_prefix_tflops(seqlen, prefix) 18 | eq = Eq(tflops, self.total_tflops / num_splits) 19 | sol = solve(eq, seqlen) 20 | sol = round_down(int(sol[0]), tp_size) 21 | res.insert(0, int(sol)) 22 | prefix -= int(sol) 23 | res.insert(0, prefix) 24 | return res 25 | 26 | 27 | if __name__ == "__main__": 28 | from sp_utils import SeqTFlops 29 | kw = { 30 | "num_layers": 24, 31 | "hidden_size": 4096, 32 | "ffn_size": 16384, 33 | "num_heads": 32, 34 | "dim_head": 128, 35 | "vocab_size": 32000 36 | } 37 | config = SeqTFlops(**kw) 38 | s = solver(16384, config) 39 | s.solve_partition(4, 2) 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_parallel_cross_entropy 2 | from .data import broadcast_data 3 | from .layers import ( 4 | ColumnParallelLinear, 5 | RowParallelLinear, 6 | VocabParallelEmbedding, 7 | copy_tensor_model_parallel_attributes, 8 | linear_with_grad_accumulation_and_async_allreduce, 9 | param_is_not_tensor_parallel_duplicate, 10 | set_defaults_if_not_set_tensor_model_parallel_attributes, 11 | 
set_tensor_model_parallel_attributes, 12 | ) 13 | from .mappings import ( 14 | copy_to_tensor_model_parallel_region, 15 | gather_from_sequence_parallel_region, 16 | gather_from_tensor_model_parallel_region, 17 | scatter_to_sequence_parallel_region, 18 | scatter_to_tensor_model_parallel_region, 19 | ) 20 | from .random import checkpoint, get_cuda_rng_tracker, model_parallel_cuda_manual_seed 21 | from .utils import ( 22 | gather_split_1d_tensor, 23 | split_tensor_along_last_dim, 24 | split_tensor_into_1d_equal_chunks, 25 | ) 26 | 27 | __all__ = [ 28 | # cross_entropy.py 29 | "vocab_parallel_cross_entropy", 30 | # data.py 31 | "broadcast_data", 32 | # layers.py 33 | "ColumnParallelLinear", 34 | "RowParallelLinear", 35 | "VocabParallelEmbedding", 36 | "set_tensor_model_parallel_attributes", 37 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 38 | "copy_tensor_model_parallel_attributes", 39 | "param_is_not_tensor_parallel_duplicate", 40 | "linear_with_grad_accumulation_and_async_allreduce", 41 | # mappings.py 42 | "copy_to_tensor_model_parallel_region", 43 | "gather_from_tensor_model_parallel_region", 44 | "gather_from_sequence_parallel_region", 45 | # "reduce_from_tensor_model_parallel_region", 46 | "scatter_to_tensor_model_parallel_region", 47 | "scatter_to_sequence_parallel_region", 48 | # random.py 49 | "checkpoint", 50 | "get_cuda_rng_tracker", 51 | "model_parallel_cuda_manual_seed", 52 | # utils.py 53 | "split_tensor_along_last_dim", 54 | "split_tensor_into_1d_equal_chunks", 55 | "gather_split_1d_tensor", 56 | ] 57 | -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .transformer_config import TransformerConfig 4 | -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp (NoOp) 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super(IdentityOp, self).__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | -------------------------------------------------------------------------------- /megatron/core/transformer/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Utilities for transformer layers.""" 4 | 5 | import torch 6 | 7 | from megatron import get_args 8 | 9 | 10 | def attention_mask_func(attention_scores, attention_mask): 11 | attention_scores.masked_fill_(attention_mask, -10000.0) 12 | return attention_scores 13 | 14 | 15 | def get_linear_layer(rows, columns, init_method): 16 | """Simple linear layer with weight initialization.""" 17 | layer = torch.nn.Linear(rows, columns) 18 | if get_args().perform_initialization: 19 | init_method(layer.weight) 20 | with torch.no_grad(): 21 | layer.bias.zero_() 22 | return layer 23 | 24 | 25 | @torch.jit.script 26 | def gelu_impl(x): 27 | """OpenAI's gelu implementation.""" 28 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) 29 | 30 | 31 | def openai_gelu(x): 32 | return gelu_impl(x) 33 | 34 | 35 | # This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 36 | @torch.jit.script 37 | def erf_gelu(x): 38 | return ( 39 | x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype) + torch.ones_like(x).to(dtype=x.dtype)) 40 | ) 41 | -------------------------------------------------------------------------------- /megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /megatron/data/multimodal_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from PIL import Image, UnidentifiedImageError 4 | import numpy as np 5 | import io 6 | import torch 7 | 8 | try: 9 | from torchvision.transforms import InterpolationMode 10 | BICUBIC = InterpolationMode.BICUBIC 11 | except ImportError: 12 | BICUBIC = Image.BICUBIC 13 | 14 | from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomResizedCrop, Resize 15 | 16 | def _convert_image_to_rgb(image): 17 | return image.convert("RGB") 18 | 19 | def _transform(img_h, img_w): 20 | return Compose([ 21 | ToPILImage(), 22 | RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0), interpolation=BICUBIC), 23 | _convert_image_to_rgb, 24 | ToTensor(), 25 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 26 | ]) 27 | 28 | class MultiModalDataset(torch.utils.data.Dataset): 29 | 30 | def __init__(self, name, data_prefix, indexed_dataset, 31 | num_samples, seq_length, seed, img_h, img_w): 32 | 33 | self.name = name 34 | self.indexed_dataset = indexed_dataset 35 | self.doc_idx = indexed_dataset.get_doc_idx() 36 | self.visual_transform = _transform(img_h, img_w) 37 | 38 | def __len__(self): 39 | return self.indexed_dataset.sizes.shape[0] 40 | 41 | def __getitem__(self, idx): 42 | text_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]) 43 | assert mode == 0 44 | img_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]+1) 45 | assert mode == 1 46 | img_pad = img_sample[0].item() 47 | xs = img_sample[1:].tobytes(order='C') 48 | xs = xs[:len(xs)-img_pad] 49 | 50 | img_sample = np.array(Image.open(io.BytesIO(xs))) 51 | img_sample = self.visual_transform(img_sample).reshape(-1) 52 | 53 | return {'text': np.array(text_sample, dtype=np.int64), 54 | 'img': np.array(img_sample, dtype=np.float32)} 55 | -------------------------------------------------------------------------------- /megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /megatron/dist_signal_handler.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 8 | world_size = torch.distributed.get_world_size() 9 | else: 10 | world_size = 1 11 | return world_size 12 | 13 | 14 | def get_device(local_rank=None): 15 | backend = torch.distributed.get_backend() 16 | if backend == 'nccl': 17 | if local_rank is None: 18 | device = torch.device('cuda') 19 | else: 20 | device = torch.device(f'cuda:{local_rank}') 21 | elif backend == 'gloo': 22 | device = torch.device('cpu') 23 | else: 24 | raise RuntimeError 25 | return device 26 | 27 | 28 | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): 29 | if not torch.distributed.is_available() or \ 30 | not torch.distributed.is_initialized(): 31 | return [item] 32 | 33 | device = get_device(local_rank) 34 | 35 | if group is not None: 36 | group_size = group.size() 37 | else: 38 | group_size = get_world_size() 39 | 40 | tensor = torch.tensor([item], device=device, dtype=dtype) 41 | output_tensors = [ 42 | torch.zeros(1, dtype=tensor.dtype, 
device=tensor.device) 43 | for _ in range(group_size) 44 | ] 45 | torch.distributed.all_gather(output_tensors, tensor, group, async_op) 46 | output = [elem.item() for elem in output_tensors] 47 | return output 48 | 49 | 50 | class DistributedSignalHandler: 51 | def __init__(self, sig=signal.SIGTERM): 52 | self.sig = sig 53 | 54 | def signals_received(self): 55 | all_received = all_gather_item( 56 | self._signal_received, dtype=torch.int32 57 | ) 58 | return all_received 59 | 60 | def __enter__(self): 61 | self._signal_received = False 62 | self.released = False 63 | self.original_handler = signal.getsignal(self.sig) 64 | 65 | def handler(signum, frame): 66 | self._signal_received = True 67 | 68 | signal.signal(self.sig, handler) 69 | 70 | return self 71 | 72 | def __exit__(self, type, value, tb): 73 | self.release() 74 | 75 | def release(self): 76 | if self.released: 77 | return False 78 | 79 | signal.signal(self.sig, self.original_handler) 80 | self.released = True 81 | return True 82 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | import pathlib 5 | import subprocess 6 | 7 | from torch.utils import cpp_extension 8 | 9 | # Setting this param to a list has a problem of generating different 10 | # compilation commands (with diferent order of architectures) and 11 | # leading to recompilation of fused kernels. Set it to empty string 12 | # to avoid recompilation and assign arch flags explicity in 13 | # extra_cuda_cflags below 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(args): 18 | 19 | # Check if cuda 11 is installed for compute capability 8.0 20 | cc_flag = [] 21 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( 22 | cpp_extension.CUDA_HOME 23 | ) 24 | if int(bare_metal_major) >= 11: 25 | cc_flag.append('-gencode') 26 | cc_flag.append('arch=compute_80,code=sm_80') 27 | if int(bare_metal_minor) >= 8: 28 | cc_flag.append('-gencode') 29 | cc_flag.append('arch=compute_90,code=sm_90') 30 | 31 | # Build path 32 | srcpath = pathlib.Path(__file__).parent.absolute() 33 | buildpath = srcpath / "build" 34 | _create_build_dir(buildpath) 35 | 36 | # Helper function to build the kernels. 
37 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 38 | return cpp_extension.load( 39 | name=name, 40 | sources=sources, 41 | build_directory=buildpath, 42 | extra_cflags=[ 43 | "-O3", 44 | ], 45 | extra_cuda_cflags=[ 46 | "-O3", 47 | "-gencode", 48 | "arch=compute_70,code=sm_70", 49 | "--use_fast_math", 50 | ] 51 | + extra_cuda_flags 52 | + cc_flag, 53 | verbose=(args.rank == 0), 54 | ) 55 | 56 | 57 | def _get_cuda_bare_metal_version(cuda_dir): 58 | raw_output = subprocess.check_output( 59 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 60 | ) 61 | output = raw_output.split() 62 | release_idx = output.index("release") + 1 63 | release = output[release_idx].split(".") 64 | bare_metal_major = release[0] 65 | bare_metal_minor = release[1][0] 66 | 67 | return raw_output, bare_metal_major, bare_metal_minor 68 | 69 | 70 | def _create_build_dir(buildpath): 71 | try: 72 | os.mkdir(buildpath) 73 | except OSError: 74 | if not os.path.isdir(buildpath): 75 | print(f"Creation of the build directory {buildpath} failed") 76 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /*This code is copied fron NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | 5 | from .distributed import DistributedDataParallel 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module 11 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import torch 4 | 5 | 6 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 7 | # 1/sqrt(2*pi)-> 0.3989423 8 | # 1/sqrt(2) -> 0.70710678 9 | # sqrt(2/pi) -> 0.79788456 10 | # this function is tanh approximation of gelu 11 | # actual gelu is: 12 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 13 | 14 | @torch.jit.script 15 | def bias_gelu(bias, y): 16 | x = bias + y 17 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 18 | 19 | # gradient of tanh approximation of gelu 20 | # gradient of actual gelu is: 21 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 22 | @torch.jit.script 23 | def bias_gelu_back(g, bias, y): 24 | x = bias + y 25 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 26 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 27 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 28 | return ff*g 29 | 30 | class GeLUFunction(torch.autograd.Function): 31 | @staticmethod 32 | # bias is an optional argument 33 | def forward(ctx, input, bias): 34 | ctx.save_for_backward(input, bias) 35 | return bias_gelu(bias, input) 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, bias = ctx.saved_tensors 40 | tmp = bias_gelu_back(grad_output, bias, input) 41 | return tmp, tmp 42 | 43 | bias_gelu_impl = GeLUFunction.apply 44 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Utilities for models.""" 4 | 5 | import math 6 | 7 | import torch 8 | 9 | from megatron import get_args 10 | 11 | def init_method_normal(sigma): 12 | """Init method based on N(0, sigma).""" 13 | def init_(tensor): 14 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 15 | 16 | return init_ 17 | 18 | 19 | def scaled_init_method_normal(sigma, num_layers): 20 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 21 | std = sigma / math.sqrt(2.0 * num_layers) 22 | 23 | def init_(tensor): 24 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 25 | 26 | return init_ 27 | 28 | 29 | def attention_mask_func(attention_scores, attention_mask): 30 | attention_scores.masked_fill_(attention_mask, -10000.0) 31 | return attention_scores 32 | 33 | 34 | def get_linear_layer(rows, columns, init_method): 35 | """Simple linear layer with weight initialization.""" 36 | layer = torch.nn.Linear(rows, columns) 37 | if get_args().perform_initialization: 38 | init_method(layer.weight) 39 | with torch.no_grad(): 40 | layer.bias.zero_() 41 | return layer 42 | 43 | @torch.jit.script 44 | def gelu_impl(x): 45 | """OpenAI's gelu implementation.""" 46 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 47 | (1.0 + 0.044715 * x * x))) 48 | def openai_gelu(x): 49 | return gelu_impl(x) 50 | 51 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 52 | @torch.jit.script 53 | def erf_gelu(x): 54 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 55 | -------------------------------------------------------------------------------- /megatron/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | 
scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > output_h: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would more aligned if ' 23 | f'input size {(input_h, input_w)} is `x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import os 5 | import random 6 | import numpy 7 | import torch 8 | 9 | import mpu 10 | 11 | 12 | class IdentityLayer(torch.nn.Module): 13 | def __init__(self, size, scale=1.0): 14 | super(IdentityLayer, self).__init__() 15 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 16 | 17 | def forward(self): 18 | return self.weight 19 | 20 | 21 | def set_random_seed(seed): 22 | """Set random seed for reproducability.""" 23 | random.seed(seed) 24 | numpy.random.seed(seed) 25 | torch.manual_seed(seed) 26 | mpu.model_parallel_cuda_manual_seed(seed) 27 | 28 | 29 | def initialize_distributed(backend='nccl'): 30 | """Initialize torch.distributed.""" 31 | # Get local rank in case it is provided. 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--local_rank', type=int, default=None, 34 | help='local rank passed from distributed launcher') 35 | args = parser.parse_args() 36 | local_rank = args.local_rank 37 | 38 | # Get rank and world size. 39 | rank = int(os.getenv('RANK', '0')) 40 | world_size = int(os.getenv("WORLD_SIZE", '1')) 41 | 42 | print('> initializing torch.distributed with local rank: {}, ' 43 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 44 | 45 | # Set the device id. 46 | device = rank % torch.cuda.device_count() 47 | if local_rank is not None: 48 | device = local_rank 49 | torch.cuda.set_device(device) 50 | 51 | # Call the init process. 
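    # The TCP init method assembled below resolves to tcp://$MASTER_ADDR:$MASTER_PORT,
    # defaulting to tcp://localhost:6000 when those environment variables are unset.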
52 | init_method = 'tcp://' 53 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 54 | master_port = os.getenv('MASTER_PORT', '6000') 55 | init_method += master_ip + ':' + master_port 56 | torch.distributed.init_process_group( 57 | backend=backend, 58 | world_size=world_size, 59 | rank=rank, 60 | init_method=init_method) 61 | 62 | 63 | def print_separator(message): 64 | torch.distributed.barrier() 65 | filler_len = (78 - len(message)) // 2 66 | filler = '-' * filler_len 67 | string = '\n' + filler + ' {} '.format(message) + filler 68 | if torch.distributed.get_rank() == 0: 69 | print(string, flush=True) 70 | torch.distributed.barrier() 71 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | from mpu import data as data_utils 6 | import mpu 7 | import torch 8 | import functools 9 | import operator 10 | import sys 11 | sys.path.append("../..") 12 | 13 | 14 | def test_broadcast_data(tensor_model_parallel_size): 15 | 16 | if torch.distributed.get_rank() == 0: 17 | print('> testing broadcast_data with model parallel size {} ...'. 18 | format(tensor_model_parallel_size)) 19 | 20 | mpu.initialize_model_parallel(tensor_model_parallel_size) 21 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 22 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 23 | 24 | key_size_t = {'key1': [7, 11], 25 | 'key2': [8, 2, 1], 26 | 'key3': [13], 27 | 'key4': [5, 1, 2], 28 | 'key5': [5, 12]} 29 | keys = list(key_size_t.keys()) 30 | 31 | data = {} 32 | data_t = {} 33 | for key in key_size_t: 34 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 35 | data_t[key] = data[key].clone() 36 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 37 | data_t['keyX'] = data['keyX'].clone() 38 | if mpu.get_tensor_model_parallel_rank() != 0: 39 | data = None 40 | 41 | data_utils._check_data_types(keys, data_t, torch.int64) 42 | key_size, key_numel, \ 43 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 44 | for key in keys: 45 | assert key_size[key] == key_size_t[key] 46 | total_numel_t = 0 47 | for key in keys: 48 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 49 | assert key_numel[key] == target_size 50 | total_numel_t += target_size 51 | assert total_numel == total_numel_t 52 | 53 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 54 | for key in keys: 55 | tensor = data_t[key].cuda() 56 | assert data_b[key].sub(tensor).abs().max() == 0 57 | 58 | # Reset groups 59 | mpu.destroy_tensor_model_parallel() 60 | 61 | torch.distributed.barrier() 62 | if torch.distributed.get_rank() == 0: 63 | print('>> passed the test :-)') 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | initialize_distributed() 69 | world_size = torch.distributed.get_world_size() 70 | 71 | tensor_model_parallel_size = 1 72 | while tensor_model_parallel_size <= world_size: 73 | print_separator('test test broadcast data') 74 | test_broadcast_data(tensor_model_parallel_size) 75 | tensor_model_parallel_size *= 2 76 | -------------------------------------------------------------------------------- /megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA 
CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/text_generation/beam_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | ## from huggingface beam search 19 | class BeamHypotheses(object): 20 | def __init__(self, num_beams, length_penalty=1.0, early_stopping=False): 21 | """ 22 | Initialize n-best list of hypotheses. 23 | """ 24 | self.length_penalty = length_penalty 25 | self.early_stopping = early_stopping 26 | self.num_beams = num_beams 27 | self.beams = [] 28 | self.worst_score = 1e9 29 | 30 | def __len__(self): 31 | """ 32 | Number of hypotheses in the list. 33 | """ 34 | return len(self.beams) 35 | 36 | def add(self, hyp, sum_logprobs, length): 37 | """ 38 | Add a new hypothesis to the list. 39 | """ 40 | score = sum_logprobs / length ** self.length_penalty 41 | if len(self) < self.num_beams or score > self.worst_score: 42 | self.beams.append((score, hyp)) 43 | if len(self) > self.num_beams: 44 | sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) 45 | del self.beams[sorted_scores[0][1]] 46 | self.worst_score = sorted_scores[1][0] 47 | else: 48 | self.worst_score = min(score, self.worst_score) 49 | 50 | def is_done(self, best_sum_logprobs, cur_len): 51 | """ 52 | If there are enough hypotheses and that none of the hypotheses being generated 53 | can become better than the worst one in the heap, then we are done with this sentence. 54 | """ 55 | 56 | if len(self) < self.num_beams: 57 | return False 58 | elif self.early_stopping: 59 | return True 60 | else: 61 | cur_score = best_sum_logprobs / cur_len ** self.length_penalty 62 | ret = self.worst_score >= cur_score 63 | return ret 64 | 65 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /picture/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/.DS_Store -------------------------------------------------------------------------------- /picture/2.7bx8A100_memory.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/2.7bx8A100_memory.pdf -------------------------------------------------------------------------------- /picture/32x7b zhihu_throughput.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llllllllll} 2 | \toprule 3 | & & \multicolumn{4}{r}{TFLOPS/s} & \multicolumn{4}{r}{Throughput} \\ 4 | & Method & 1F1B & 1F1B-I & Seq1F1B & Seq1F1B-I & 1F1B & 1F1B-I & Seq1F1B & Seq1F1B-I \\ 5 | seqlen & Micros & & & & & & & & \\ 6 | \midrule 7 | \multirow[t]{2}{*}{32768} & 8 & 99.67±0.19 & 109.55±0.72 & \textbf{110.62±0.54} & \textbf{97.69±1.82} & 48189.47±93.97 & 52964.06±348.28 & \textbf{53484.52±259.03} & \textbf{47232.00±882.04} \\ 8 | & 16 & 114.45±0.36 & 116.46±0.76 & \textbf{115.34±0.18} & \textbf{95.55±1.60} & 55333.16±175.13 & 56304.42±365.27 & \textbf{55765.31±89.28} & \textbf{46195.24±771.99} \\ 9 | \cline{1-10} 10 | \multirow[t]{2}{*}{65536} & 8 & 107.49±0.03 & 119.96±0.16 & \textbf{124.62±0.06} & \textbf{117.75±1.26} & 37342.92±10.05 & 41676.97±57.03 & \textbf{43296.70±20.83} & \textbf{40907.87±437.09} \\ 11 | & 16 & 123.95±0.06 & 128.70±0.06 & \textbf{123.05±0.51} & \textbf{117.98±0.82} & 43063.26±20.15 & 44712.49±19.41 & \textbf{33367.34±137.63} & \textbf{40989.72±284.90} \\ 12 | \cline{1-10} 13 | \multirow[t]{2}{*}{131072} & 8 & OOM & OOM & \textbf{136.72±0.05} & \textbf{135.06±0.17} & OOM & OOM & \textbf{30392.34±10.14} & \textbf{30023.38±38.29} \\ 14 | & 16 & OOM & OOM & \textbf{142.08±0.02} & \textbf{136.58±0.19} & OOM & OOM & \textbf{31584.02±4.38} & \textbf{30362.33±42.01} \\ 15 | \cline{1-10} 16 | \bottomrule 17 | \end{tabular} 18 | -------------------------------------------------------------------------------- /picture/Raycast (2).dmg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/Raycast (2).dmg -------------------------------------------------------------------------------- /picture/seq1f1b_memory.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/seq1f1b_memory.pdf -------------------------------------------------------------------------------- /picture/seq1f1b_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/seq1f1b_memory.png -------------------------------------------------------------------------------- /picture/seq1f1b_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/seq1f1b_original.png 
-------------------------------------------------------------------------------- /picture/seq1f1b_zerobubble.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/picture/seq1f1b_zerobubble.pdf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | [tool.isort] 4 | profile = "black" # black-compatible 5 | line_length = 100 # should match black parameters 6 | py_version = 38 # python 3.8 as a target version 7 | known_first_party = ["megatron"] # FIRSTPARTY section 8 | known_third_party = ["transformer_engine"] # THIRDPARTY section 9 | sections = ["FUTURE", "STDLIB", "THIRDPARTY", "FIRSTPARTY", "LOCALFOLDER"] 10 | default_section = "THIRDPARTY" 11 | extend_skip = ["setup.py"] 12 | 13 | [tool.black] 14 | line_length = 100 15 | skip_string_normalization = true 16 | # recongized by future versions, disallows to reformat code with incompatible versions 17 | # Matches NeMO version so people working on both codebases don't need two different version of black installed 18 | required_version = "19.10b0" 19 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE dataset.""" 4 | 5 | from abc import ABC 6 | from abc import abstractmethod 7 | 8 | from torch.utils.data import Dataset 9 | 10 | from megatron import print_rank_0 11 | from tasks.data_utils import build_sample 12 | from tasks.data_utils import build_tokens_types_paddings_from_text 13 | 14 | 15 | class GLUEAbstractDataset(ABC, Dataset): 16 | """GLUE base dataset class.""" 17 | 18 | def __init__(self, task_name, dataset_name, datapaths, 19 | tokenizer, max_seq_length): 20 | # Store inputs. 21 | self.task_name = task_name 22 | self.dataset_name = dataset_name 23 | self.tokenizer = tokenizer 24 | self.max_seq_length = max_seq_length 25 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 26 | self.dataset_name)) 27 | # Process the files. 
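        # Each datapath is parsed by the task-specific
        # process_samples_from_single_path() implemented by subclasses
        # such as MNLIDataset below, and the resulting samples are merged.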
28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | self.samples = [] 33 | for datapath in datapaths: 34 | self.samples.extend(self.process_samples_from_single_path(datapath)) 35 | print_rank_0(' >> total number of samples: {}'.format( 36 | len(self.samples))) 37 | 38 | def __len__(self): 39 | return len(self.samples) 40 | 41 | def __getitem__(self, idx): 42 | raw_sample = self.samples[idx] 43 | ids, types, paddings = build_tokens_types_paddings_from_text( 44 | raw_sample['text_a'], raw_sample['text_b'], 45 | self.tokenizer, self.max_seq_length) 46 | sample = build_sample(ids, types, paddings, 47 | raw_sample['label'], raw_sample['uid']) 48 | return sample 49 | 50 | @abstractmethod 51 | def process_samples_from_single_path(self, datapath): 52 | """Abstract method that takes a single path / filename and 53 | returns a list of dataset samples, each sample being a dict of 54 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 55 | """ 56 | pass 57 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """MNLI dataset.""" 4 | 5 | from megatron import print_rank_0 6 | from tasks.data_utils import clean_text 7 | from .data import GLUEAbstractDataset 8 | 9 | 10 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 11 | 12 | 13 | class MNLIDataset(GLUEAbstractDataset): 14 | 15 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 16 | test_label='contradiction'): 17 | self.test_label = test_label 18 | super().__init__('MNLI', name, datapaths, 19 | tokenizer, max_seq_length) 20 | 21 | def process_samples_from_single_path(self, filename): 22 | """"Implement abstract method.""" 23 | print_rank_0(' > Processing {} ...'.format(filename)) 24 | 25 | samples = [] 26 | total = 0 27 | first = True 28 | is_test = False 29 | with open(filename, 'r') as f: 30 | for line in f: 31 | row = line.strip().split('\t') 32 | if first: 33 | first = False 34 | if len(row) == 10: 35 | is_test = True 36 | print_rank_0( 37 | ' reading {}, {} and {} columns and setting ' 38 | 'labels to {}'.format( 39 | row[0].strip(), row[8].strip(), 40 | row[9].strip(), self.test_label)) 41 | else: 42 | print_rank_0(' reading {} , {}, {}, and {} columns ' 43 | '...'.format( 44 | row[0].strip(), row[8].strip(), 45 | row[9].strip(), row[-1].strip())) 46 | continue 47 | 48 | text_a = clean_text(row[8].strip()) 49 | text_b = clean_text(row[9].strip()) 50 | unique_id = int(row[0].strip()) 51 | label = row[-1].strip() 52 | if is_test: 53 | label = self.test_label 54 | 55 | assert len(text_a) > 0 56 | assert len(text_b) > 0 57 | assert label in LABELS 58 | assert unique_id >= 0 59 | 60 | sample = {'text_a': text_a, 61 | 'text_b': text_b, 62 | 'label': LABELS[label], 63 | 'uid': unique_id} 64 | total += 1 65 | samples.append(sample) 66 | 67 | if total % 50000 == 0: 68 | print_rank_0(' > processed {} so far ...'.format(total)) 69 | 70 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 71 | return samples 72 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) 
framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Model evaluation""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/msdp/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Run multi-stage dialogue prompting (MSDP).""" 4 | 5 | import os 6 | import sys 7 | sys.path.append(os.path.abspath(os.path.join( 8 | os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir))) 9 | from megatron import get_args 10 | from megatron.initialize import initialize_megatron 11 | 12 | 13 | def get_tasks_args(parser): 14 | """Provide extra arguments required for tasks.""" 15 | group = parser.add_argument_group(title='tasks') 16 | 17 | # parameters for the knowledgeable dialogue generation 18 | group.add_argument('--task', type=str, required=True, 19 | help='Task name.') 20 | group.add_argument("--sample-input-file", type=str, default=None, 21 | help='Get input from file instead of interactive mode, ' 22 | 'each line is an input.') 23 | group.add_argument("--sample-output-file", type=str, default=None, 24 | help='Output file got from --sample-input-file') 25 | group.add_argument('--prompt-file', type=str, default=None, 26 | help='prompting file') 27 | group.add_argument('--prompt-type', type=str, default=None, 28 | choices=['knowledge', 'response'], 29 | help='prompt type (knowledge or response)') 30 | group.add_argument('--num-prompt-examples', type=int, default=10, 31 | help='number of prompt examples') 32 | group.add_argument('--guess-file', type=str, default=None, 33 | help='datapath for generated sentences') 34 | group.add_argument('--answer-file', type=str, default=None, 35 | help='datapath for golden sentences') 36 | group.add_argument('--out-seq-length', type=int, default=100, 37 | help='output sequence length') 38 | group.add_argument('--api-prompt', default=False, action="store_true", 39 | help='setup model api for prompting') 40 | group.add_argument('--megatron-api-url', type=str, default=None, 41 | help='url of the megatron api') 42 | 43 | return parser 44 | 45 | 46 | if __name__ == '__main__': 47 | 48 | initialize_megatron(extra_args_provider=get_tasks_args) 49 | 50 | args = get_args() 51 | 52 | if args.num_layers_per_virtual_pipeline_stage is not None: 53 | print("Interleaved pipeline schedule is not yet supported for downstream tasks.") 54 | exit() 55 | 56 | if args.task == 'MSDP-PROMPT': 57 | from tasks.msdp.prompt import main 58 | 59 | elif args.task == 'MSDP-EVAL-F1': 60 | from tasks.msdp.evaluate import main 61 | 62 | else: 63 | raise NotImplementedError('Task {} is not implemented.'.format( 64 | args.task)) 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /tasks/msdp/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | # The following code is adapted from 3 | # https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 4 | # which is licensed under the MIT license. More details on the license can be 5 | # found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. 6 | 7 | """Provides standard metric evaluations for dialog.""" 8 | 9 | from collections import Counter 10 | from typing import List 11 | import numpy as np 12 | import re 13 | 14 | re_art = re.compile(r'\b(a|an|the)\b') 15 | re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') 16 | 17 | 18 | def normalize_answer(s): 19 | """ 20 | Lower text and remove punctuation, articles and extra whitespace. 21 | """ 22 | s = s.lower() 23 | s = re_punc.sub(' ', s) 24 | s = re_art.sub(' ', s) 25 | s = ' '.join(s.split()) 26 | return s 27 | 28 | 29 | class F1Metric: 30 | """ 31 | Helper class which computes token-level F1. 
32 | """ 33 | 34 | @staticmethod 35 | def _prec_recall_f1_score(pred_items, gold_items): 36 | """ 37 | Compute precision, recall and f1 given a set of gold and prediction items. 38 | :param pred_items: iterable of predicted values 39 | :param gold_items: iterable of gold values 40 | :return: tuple (p, r, f1) for precision, recall, f1 41 | """ 42 | common = Counter(gold_items) & Counter(pred_items) 43 | num_same = sum(common.values()) 44 | if num_same == 0: 45 | return 0, 0, 0 46 | precision = 1.0 * num_same / len(pred_items) 47 | recall = 1.0 * num_same / len(gold_items) 48 | f1 = (2 * precision * recall) / (precision + recall) 49 | return precision, recall, f1 50 | 51 | @staticmethod 52 | def compute_each_pair(guess: str, answer: str): 53 | if answer == "": 54 | return None, None, None 55 | if guess == "": 56 | return 0, 0, 0 57 | g_tokens = normalize_answer(guess).split() 58 | a_tokens = normalize_answer(answer).split() 59 | 60 | precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) 61 | return precision, recall, f1 62 | 63 | @staticmethod 64 | def compute_all_pairs(guesses: List[str], answers: List[str]): 65 | # additional augment: 66 | assert len(guesses) == len(answers) 67 | 68 | precision_list, recall_list, f1_list = [], [], [] 69 | for guess, answer in zip(guesses, answers): 70 | precision, recall, f1 = F1Metric.compute_each_pair(guess, answer) 71 | if precision is None or recall is None or f1 is None: 72 | continue 73 | precision_list.append(precision) 74 | recall_list.append(recall) 75 | f1_list.append(f1) 76 | 77 | return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) 78 | -------------------------------------------------------------------------------- /tasks/orqa/README.md: -------------------------------------------------------------------------------- 1 | ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering 2 | 3 | Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408). 4 | 5 | ## Retriever Training 6 | 7 | #### Unsupervised pretraining 8 | 1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. We construct two datasets, one with the title of every document and another with the body. 9 | 10 |
11 | python tools/preprocess_data.py \
12 |     --input /path/to/corpus.json \
13 |     --json-keys text title \
14 |     --split-sentences \
15 |     --tokenizer-type BertWordPieceLowerCase \
16 |     --vocab-file /path/to/vocab.txt \
17 |     --output-prefix corpus_indexed \
18 |     --workers 10
19 | 
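For reference, the loose JSON corpus passed via `--input` is expected to contain one document per line, each carrying the fields listed in `--json-keys`; a minimal, hypothetical two-line example:

<pre>
{"text": "Body of the first Wikipedia article ...", "title": "First article"}
{"text": "Body of the second Wikipedia article ...", "title": "Second article"}
</pre>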
20 | 21 | 2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs a single GPU 217M parameter biencoder model for ICT retriever training. Single GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script uses a pretrained BERT model and we use a total of batch size of 4096 for the ICT training. 22 | 23 | 3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf). 24 | 25 | #### Supervised finetuning 26 | 27 | 1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example for how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top [DPR training](https://arxiv.org/abs/2004.04906). 28 | 29 | 2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model. 30 | 31 | More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408). 32 | 33 | ## Reader Training 34 | 35 | The reader component will be available soon. 36 | 37 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron import get_args, print_rank_0 6 | from megatron.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Race.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.multiple_choice import MultipleChoice 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from tasks.race.data import RaceDataset 12 | from megatron.arguments import core_transformer_config_from_args 13 | 14 | 15 | def train_valid_datasets_provider(): 16 | """Provide train and validation datasets.""" 17 | args = get_args() 18 | tokenizer = get_tokenizer() 19 | 20 | train_dataset = RaceDataset('training', args.train_data, 21 | tokenizer, args.seq_length) 22 | valid_dataset = RaceDataset('validation', args.valid_data, 23 | tokenizer, args.seq_length) 24 | 25 | return train_dataset, valid_dataset 26 | 27 | 28 | def model_provider(pre_process=True, post_process=True): 29 | """Build the model.""" 30 | config = core_transformer_config_from_args(get_args()) 31 | print_rank_0('building multichoice model for RACE ...') 32 | model = MultipleChoice(config=config, 33 | num_tokentypes=2, 34 | pre_process=pre_process, 35 | post_process=post_process) 36 | 37 | return model 38 | 39 | 40 | def metrics_func_provider(): 41 | """Privde metrics callback function.""" 42 | args = get_args() 43 | tokenizer = get_tokenizer() 44 | 45 | def single_dataset_provider(datapath): 46 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 47 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 48 | 49 | return accuracy_func_provider(single_dataset_provider) 50 | 51 | 52 | def main(): 53 | 54 | finetune(train_valid_datasets_provider, model_provider, 55 | end_of_epoch_callback_provider=metrics_func_provider) 56 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.append( 9 | os.path.abspath( 10 | os.path.join( 11 | os.path.join(os.path.dirname(__file__), os.path.pardir), 12 | os.path.pardir, 13 | ) 14 | ) 15 | ) 16 | from megatron import get_args 17 | from megatron.initialize import initialize_megatron 18 | 19 | def get_tasks_args(parser): 20 | """Provide extra arguments required for tasks.""" 21 | group = parser.add_argument_group(title="tasks") 22 | 23 | group.add_argument('--task', type=str, default='segment', 24 | choices=['classify', 'segment_setr', 'segment_segformer'], 25 | help='task name.') 26 | group.add_argument("--epochs", type=int, default=None, 27 | help="Number of finetunning epochs. 
Zero results in " 28 | "evaluation only.") 29 | group.add_argument('--pretrained-checkpoint-type', type=str, default='default', 30 | choices=['default', 'external', 'constrastive'], 31 | help='Type of pretrained checkpoint') 32 | group.add_argument("--pretrained-checkpoint", type=str, default=None, 33 | help="Pretrained checkpoint used for finetunning.") 34 | group.add_argument('--seg-stride', type=int, default=None, 35 | help='sliding window stride during evaluation') 36 | return parser 37 | 38 | 39 | if __name__ == "__main__": 40 | 41 | initialize_megatron(extra_args_provider=get_tasks_args) 42 | args = get_args() 43 | 44 | if args.task == 'classify': 45 | from tasks.vision.classification.classification import main 46 | main() 47 | elif args.task == 'segment_setr': 48 | from tasks.vision.segmentation.finetune_setr import main 49 | main() 50 | elif args.task == 'segment_segformer': 51 | from tasks.vision.segmentation.finetune_segformer import main 52 | main() 53 | 54 | -------------------------------------------------------------------------------- /tasks/vision/segmentation/seg_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import math 3 | import einops 4 | import torch 5 | import apex 6 | import torch.nn.functional as F 7 | from megatron import get_args 8 | from megatron.model.module import MegatronModule 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3, mit_b5 11 | from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead 12 | 13 | 14 | class SetrSegmentationModel(MegatronModule): 15 | 16 | def __init__(self, 17 | num_classes, 18 | pre_process=True, 19 | post_process=True): 20 | super(SetrSegmentationModel, self).__init__() 21 | args = get_args() 22 | assert post_process & pre_process 23 | self.hidden_size = args.hidden_size 24 | self.num_classes = num_classes 25 | self.backbone = VitBackbone( 26 | pre_process=pre_process, 27 | post_process=post_process, 28 | class_token=False, 29 | post_layer_norm=False, 30 | drop_path_rate=0.1 31 | ) 32 | 33 | self.head = SetrSegmentationHead( 34 | self.hidden_size, 35 | self.num_classes 36 | ) 37 | 38 | def set_input_tensor(self, input_tensor): 39 | """See megatron.model.transformer.set_input_tensor()""" 40 | pass 41 | 42 | def forward(self, input): 43 | # [b hw c] 44 | hidden_states = self.backbone(input) 45 | result_final = self.head(hidden_states) 46 | return result_final 47 | 48 | 49 | class SegformerSegmentationModel(MegatronModule): 50 | 51 | def __init__(self, 52 | num_classes, 53 | pre_process=True, 54 | post_process=True): 55 | super(SegformerSegmentationModel, self).__init__() 56 | args = get_args() 57 | self.hidden_size = args.hidden_size 58 | self.num_classes = num_classes 59 | self.pre_process = pre_process 60 | self.post_process = post_process 61 | 62 | self.backbone = mit_b5() 63 | self.head = SegformerSegmentationHead( 64 | feature_strides=[4, 8, 16, 32], 65 | in_channels=[64, 128, 320, 512], 66 | embedding_dim=768, 67 | dropout_ratio=0.1 68 | ) 69 | 70 | def set_input_tensor(self, input_tensor): 71 | """See megatron.model.transformer.set_input_tensor()""" 72 | pass 73 | 74 | def forward(self, input): 75 | # [b hw c] 76 | hidden_states = self.backbone(input) 77 | hidden_states = self.head(hidden_states) 78 | return hidden_states 79 | 80 | 
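As a rough usage sketch (not taken from the repository), a finetuning task would typically expose one of the classes above through a `model_provider` callback; the snippet below assumes `initialize_megatron()` has already been called so that `get_args()` works inside the constructor, and the class count of 19 is only an illustrative value:

```python
# Hypothetical sketch: wires SegformerSegmentationModel into the usual
# model_provider pattern used by the finetuning entry points.
from tasks.vision.segmentation.seg_models import SegformerSegmentationModel

def model_provider(pre_process=True, post_process=True):
    # 19 classes (e.g. Cityscapes) is an assumed value, not a repository default.
    return SegformerSegmentationModel(num_classes=19,
                                      pre_process=pre_process,
                                      post_process=post_process)
```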
-------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? ") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/functional_tests/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/functional_tests/python_test_utils/__init__.py -------------------------------------------------------------------------------- /tests/functional_tests/python_test_utils/check_slurm_job_completion.py: 
-------------------------------------------------------------------------------- 1 | """Check if a given slurm job id completed successfully 2 | Usage: 3 | python3 check_slurm_job_completion.py 4 | """ 5 | 6 | import sys 7 | import subprocess 8 | 9 | 10 | cmd = f"sacct -j {sys.argv[1]}" 11 | result = subprocess.check_output(cmd, shell=True).decode().split() 12 | assert len(result) > 14, "JOB state not available." 13 | 14 | status = result[19] 15 | exit_code = result[20] 16 | 17 | assert status == "COMPLETED", f"Job {sys.argv[1]} not completed." 18 | assert exit_code == "0:0", f"Job {sys.argv[1]} did not exit successfully." 19 | 20 | -------------------------------------------------------------------------------- /tests/functional_tests/shell_test_utils/jobwait.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | JOBID=$1 4 | echo "Job id : $JOBID" 5 | 6 | if [[ $JOBID -eq "" ]]; then 7 | exit 1 8 | fi 9 | 10 | sleep 10s 11 | 12 | while true; do 13 | export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1` 14 | case "${STATE}" in 15 | PENDING|RUNNING|REQUEUED) 16 | echo "Job is still in $STATE" 17 | sleep 15s 18 | ;; 19 | *) 20 | sleep 30s 21 | echo "Exiting with SLURM job status '${STATE}'" 22 | exit 0 23 | ;; 24 | esac 25 | done 26 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp1_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50685, 10.49816, 10.47982, 10.48566, 10.49535, 10.46661, 10.42394, 10.30692, 10.15978, 9.96955]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [19029.0, 19175.0, 22073.0, 18475.0, 20839.0, 23168.0, 22721.0]}, "iteration_timing_avg": 0.4121861764705882} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp1_pp4_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46274, 10.31498, 10.17119, 9.97324]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22611.0, 20670.0, 26082.0, 23626.0, 21993.0, 21751.0, 23179.0]}, "iteration_timing_avg": 0.874113823529412} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp2_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44877, 10.43852, 10.44018, 10.44114, 10.45622, 10.44144, 10.39044, 10.25684, 10.133, 9.95743]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [28181.0, 20629.0, 28241.0, 26287.0, 24057.0, 21016.0, 21238.0]}, "iteration_timing_avg": 0.7704600000000001} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/bert/bert_tp4_pp1_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.48784, 10.4873, 10.50417, 10.49446, 10.47819, 10.41361, 10.28135, 
10.14425, 9.94149]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26852.0, 19803.0, 25492.0, 24594.0, 21586.0, 19658.0, 20766.0]}, "iteration_timing_avg": 1.4250708823529417} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 44, "step_interval": 5, "values": [10.84008, 10.89053, 10.90905, 10.87934, 10.86562, 10.83752, 10.64582, 10.62397, 10.53554]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [2078.0, 2320.0, 2519.0, 2248.0, 2127.0, 1987.0]}, "iteration_timing_avg": 0.09863333333333332} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps_core_enabled.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [10.83091, 10.87024, 10.89161, 10.81277, 10.6858, 10.61231, 10.09495, 10.21817]}, "num-zeros": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [1545.0, 1840.0, 1867.0, 1825.0, 1867.0, 1668.0, 1576.0, 1932.0]}, "iteration_timing_avg": 0.09399846153846156} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp2_1nodes_50steps_core_enabled_rope_embeddings.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [10.84538, 10.87913, 10.90387, 10.8235, 10.67913, 10.60602, 10.06785, 10.19695]}, "num-zeros": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [1722.0, 2142.0, 2149.0, 1966.0, 2038.0, 1914.0, 1745.0, 1956.0]}, "iteration_timing_avg": 0.10455653846153849} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 47, "step_interval": 5, "values": [10.81244, 10.87437, 10.90132, 10.84946, 10.84826, 10.81872, 10.61811, 10.61053, 10.52823, 10.22961]}, "num-zeros": {"start_step": 0, "end_step": 30, "step_interval": 5, "values": [2356.0, 2601.0, 2778.0, 2282.0, 2350.0, 2782.0]}, "iteration_timing_avg": 0.12793593749999999} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [10.79471, 10.86601, 10.89073, 10.78482, 10.6587, 10.58125]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [1609.0, 1850.0, 1921.0, 1942.0, 1853.0, 1674.0]}, "iteration_timing_avg": 0.12440000000000001} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_disable_bias_linear.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [10.79474, 10.86607, 
10.8908, 10.7851, 10.65905, 10.58193]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [1587.0, 1824.0, 2006.0, 1919.0, 1874.0, 1646.0]}, "iteration_timing_avg": 0.12088222222222227} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_sequence_parallel.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 41, "step_interval": 5, "values": [10.79471, 10.86601, 10.89073, 10.78482, 10.6587, 10.58125, 10.0813, 10.19422, 10.13437]}, "num-zeros": {"start_step": 0, "end_step": 41, "step_interval": 5, "values": [1609.0, 1850.0, 1921.0, 1942.0, 1853.0, 1674.0, 1544.0, 1884.0, 2438.0]}, "iteration_timing_avg": 0.12650857142857144} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_swiglu.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [10.73442, 10.82095, 10.84047, 10.75831, 10.70386, 10.63718, 10.20959, 10.36611]}, "num-zeros": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [2625.0, 2815.0, 2837.0, 2870.0, 2755.0, 2617.0, 2345.0, 2529.0]}, "iteration_timing_avg": 0.1255659259259259} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp1_pp4_1nodes_50steps_core_enabled_untie_embeddings_and_outputs.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [10.89427, 10.9106, 10.917, 10.84465, 10.70825, 10.63519, 10.15543, 10.26206]}, "num-zeros": {"start_step": 0, "end_step": 40, "step_interval": 5, "values": [22727188.0, 23020756.0, 22501138.0, 22830610.0, 22739638.0, 22547160.0, 22955250.0, 22589434.0]}, "iteration_timing_avg": 0.12411037037037034} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85543, 10.89355, 10.87608, 10.87365, 10.88042, 10.84182, 10.67177, 10.62854, 10.52511, 10.25229]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2470.0, 2444.0, 2570.0, 2192.0, 2241.0, 2574.0, 2476.0]}, "iteration_timing_avg": 0.14008088235294117} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps_core_enabled.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.92215, 10.93714, 10.89742, 10.87588, 10.75165, 10.65713, 10.1606, 10.24967, 10.15339, 9.84198]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1655.0, 1837.0, 1968.0, 1854.0, 1811.0, 1810.0, 1593.0, 1997.0, 2315.0, 2343.0]}, "iteration_timing_avg": 0.13743323529411763} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp2_pp2_1nodes_50steps_te_enabled.json: 
-------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.8559, 10.89255, 10.8665, 10.81693, 10.69856, 10.60955, 10.10845, 10.21443, 10.12855, 9.80126]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1693.0, 1878.0, 1977.0, 1871.0, 2022.0, 1716.0, 1646.0, 2006.0, 2280.0, 2365.0]}, "iteration_timing_avg": 0.12973323529411762} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.85921, 10.8797, 10.87381, 10.88658, 10.88912, 10.84826, 10.68571, 10.62946, 10.54289, 10.26918]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2289.0, 2368.0, 2427.0, 2023.0, 2234.0, 2501.0, 2316.0]}, "iteration_timing_avg": 0.20419529411764706} -------------------------------------------------------------------------------- /tests/functional_tests/test_results/gpt3/gpt3_tp4_pp1_1nodes_50steps_core_enabled.json: -------------------------------------------------------------------------------- 1 | {"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86168, 10.88879, 10.87894, 10.8312, 10.71384, 10.61221, 10.13333, 10.23204, 10.16051, 9.83654]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [1854.0, 2137.0, 2162.0, 2176.0, 2072.0, 1947.0, 1702.0, 2222.0, 2457.0, 2535.0]}, "iteration_timing_avg": 0.20128235294117644} 2 | -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -x 3 | 4 | DATA_PATH=$1 5 | CHECKPOINT_PATH=$2 6 | TENSORBOARD_DIR=$3 7 | TP_SIZE=$4 8 | PP_SIZE=$5 9 | NNODES=$6 10 | MAX_STEPS=$7 11 | VP_SIZE=$8 12 | GPUS_PER_NODE=8 13 | # Change for multinode config 14 | MASTER_ADDR=localhost 15 | MASTER_PORT=6000 16 | NODE_RANK=0 17 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 18 | export CUDA_DEVICE_MAX_CONNECTIONS=1 19 | 20 | 21 | # Runs the "345M" parameter model 22 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES" 23 | 24 | torchrun $DISTRIBUTED_ARGS \ 25 | pretrain_bert.py \ 26 | --num-layers 24 \ 27 | --hidden-size 1024 \ 28 | --num-attention-heads 16 \ 29 | --log-params-norm \ 30 | --log-num-zeros-in-grad \ 31 | --log-validation-ppl-to-tensorboard \ 32 | --log-timers-to-tensorboard \ 33 | --tensorboard-dir ${TENSORBOARD_DIR} \ 34 | --micro-batch-size 4 \ 35 | --global-batch-size 128 \ 36 | --seq-length 512 \ 37 | --max-position-embeddings 512 \ 38 | --train-iters $MAX_STEPS \ 39 | --timing-log-level 2 \ 40 | --lr-decay-iters 990000 \ 41 | --save $CHECKPOINT_PATH \ 42 | --load $CHECKPOINT_PATH \ 43 | --data-path $DATA_PATH \ 44 | --vocab-file /workspace/data/bert_data/vocab.txt \ 45 | --data-impl mmap \ 46 | --split 949,50,1 \ 47 | --distributed-backend nccl \ 48 | --lr 0.0001 \ 49 | --min-lr 0.00001 \ 50 | --lr-warmup-fraction 0.01 \ 51 | --log-interval 1 \ 52 | --save-interval 10000 \ 53 | --eval-interval 1000 \ 54 | --eval-iters 10 \ 55 | --tensor-model-parallel-size $TP_SIZE \ 56 | --pipeline-model-parallel-size $PP_SIZE \ 57 | ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ 58 | --no-gradient-accumulation-fusion \ 59 | --fp16 -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr_nlp_llmnext 5 | #SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | echo 'Running tests using $PYTORCH_IMAGE image' 14 | 15 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 16 | ls 17 | cd /workspace/megatron-lm 18 | ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr_nlp_llmnext 5 | #SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | echo 'Running tests using $PYTORCH_IMAGE image' 14 | 15 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out 
--container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 16 | ls 17 | cd /workspace/megatron-lm 18 | ./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -x 3 | 4 | DATA_PATH=$1 5 | CHECKPOINT_PATH=$2 6 | TENSORBOARD_DIR=$3 7 | USE_TE=$4 8 | TP_SIZE=$5 9 | PP_SIZE=$6 10 | NNODES=$7 11 | MAX_STEPS=$8 12 | USE_CORE=$9 13 | VP_SIZE=${10} 14 | MBS=${11} 15 | GBS=${12} 16 | ADDITIONAL_PARAMS=${13} 17 | GPUS_PER_NODE=8 18 | # Change for multinode config 19 | MASTER_ADDR=localhost 20 | MASTER_PORT=6000 21 | NODE_RANK=0 22 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 23 | export CUDA_DEVICE_MAX_CONNECTIONS=1 24 | 25 | TRANSFORMER_IMPL=local 26 | TRAINING_DTYPE=fp16 27 | CALLING_SCRIPT=pretrain_gpt.py 28 | 29 | if [[ $USE_CORE -eq 1 ]]; then 30 | echo "Running using megatron core" 31 | TRANSFORMER_IMPL=local 32 | TRAINING_DTYPE=bf16 33 | CALLING_SCRIPT=pretrain_gpt_core.py 34 | export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 35 | fi 36 | 37 | if [[ $USE_TE -eq 1 ]]; then 38 | echo "Running with TransformerEngine ..." 39 | TRANSFORMER_IMPL=transformer_engine 40 | TRAINING_DTYPE=bf16 41 | else 42 | echo "Running with local transformer implementation ..." 43 | fi 44 | 45 | # Runs the "345M" parameter model 46 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES" 47 | 48 | torchrun $DISTRIBUTED_ARGS \ 49 | $CALLING_SCRIPT \ 50 | --num-layers 12 \ 51 | --hidden-size 512 \ 52 | --num-attention-heads 8 \ 53 | --log-params-norm \ 54 | --log-num-zeros-in-grad \ 55 | --log-validation-ppl-to-tensorboard \ 56 | --log-timers-to-tensorboard \ 57 | --tensorboard-dir ${TENSORBOARD_DIR} \ 58 | --micro-batch-size ${MBS:-4} \ 59 | --global-batch-size ${GBS:-32} \ 60 | --seq-length 1024 \ 61 | --max-position-embeddings 1024 \ 62 | --train-iters $MAX_STEPS \ 63 | --timing-log-level 2 \ 64 | --lr-decay-iters 320000 \ 65 | --save $CHECKPOINT_PATH \ 66 | --load $CHECKPOINT_PATH \ 67 | --data-path $DATA_PATH \ 68 | --vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \ 69 | --merge-file /workspace/data/gpt3_data/gpt2-merges.txt \ 70 | --data-impl mmap \ 71 | --split 949,50,1 \ 72 | --distributed-backend nccl \ 73 | --lr 0.00015 \ 74 | --lr-decay-style cosine \ 75 | --min-lr 1.0e-5 \ 76 | --weight-decay 1e-2 \ 77 | --clip-grad 1.0 \ 78 | --lr-warmup-fraction .01 \ 79 | --log-interval 1 \ 80 | --save-interval 10000 \ 81 | --eval-interval 1000 \ 82 | --eval-iters 10 \ 83 | --transformer-impl $TRANSFORMER_IMPL \ 84 | --tensor-model-parallel-size $TP_SIZE \ 85 | --pipeline-model-parallel-size $PP_SIZE \ 86 | ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \ 87 | ${ADDITIONAL_PARAMS:+$ADDITIONAL_PARAMS} \ 88 | --no-gradient-accumulation-fusion \ 89 | --${TRAINING_DTYPE} 90 | -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_resume_checkpoint_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | 
#SBATCH --account=adlr_nlp_llmnext 5 | #SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | echo "Running tests using $PYTORCH_IMAGE image" 14 | 15 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 16 | ls 17 | cd /workspace/megatron-lm 18 | ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES" -------------------------------------------------------------------------------- /tests/functional_tests/test_scripts/gpt3/sbatch_gpt3_distributed_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parameters 4 | #SBATCH --account=adlr_nlp_llmnext 5 | #SBATCH --job-name=adlr_nlp_llmnext-ci:megatron-job 6 | #SBATCH --nodes=1 7 | #SBATCH --partition=luna 8 | 9 | DATA_PATH=/workspace/data/gpt3_data/my-gpt3_00_text_document 10 | CHECKPOINT_PATH=/workspace/checkpoints 11 | TENSORBOARD_DIR=/workspace/logs 12 | 13 | if [[ -z $MBS ]]; then MBS=4; fi 14 | if [[ -z $GBS ]]; then GBS=32; fi 15 | 16 | if [[ -z $VP_SIZE ]]; then VP_SIZE="" ; fi 17 | 18 | echo "Running tests using $PYTORCH_IMAGE image" 19 | 20 | srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image $PYTORCH_IMAGE --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c " 21 | ls 22 | cd /workspace/megatron-lm 23 | ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $USE_TE $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $USE_CORE \"$VP_SIZE\" \"$MBS\" \"$GBS\" \"$ADDITIONAL_PARAMS\"" 24 | -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/unit_tests/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/unit_tests/models/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/models/test_gpt_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
2 | 3 | import pytest 4 | 5 | import torch 6 | 7 | from megatron.core.transformer.transformer_config import TransformerConfig 8 | from megatron.core.models.gpt.gpt_embedding import GPTEmbedding 9 | from tests.unit_tests.test_utilities import Utils 10 | 11 | class TestGPTEmbedding: 12 | 13 | def setup_method(self, method): 14 | Utils.initialize_model_parallel(1,1) 15 | transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) 16 | self.gpt_embedding = GPTEmbedding(config=transformer_config, vocab_size=100, max_sequence_length=4, add_position_embedding=True) 17 | 18 | def teardown_method(self, method): 19 | Utils.destroy_model_parallel() 20 | 21 | def test_constructor(self): 22 | assert isinstance(self.gpt_embedding, GPTEmbedding) 23 | num_weights = sum([p.numel() for p in self.gpt_embedding.parameters()]) 24 | assert num_weights == 1248 25 | 26 | def test_zero_parameters(self): 27 | sum_weights = sum([p.sum() for p in self.gpt_embedding.parameters()]) 28 | assert sum_weights != 0 29 | self.gpt_embedding.zero_parameters() 30 | sum_weights = sum([p.sum() for p in self.gpt_embedding.parameters()]) 31 | assert sum_weights == 0 32 | 33 | def test_cpu_forward(self): 34 | input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)) 35 | position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)) 36 | embeddings = self.gpt_embedding(input_ids, position_ids) 37 | assert embeddings.device.type == 'cpu' 38 | assert embeddings.shape[0] == self.gpt_embedding.max_sequence_length 39 | assert embeddings.shape[1] == input_ids.shape[0] 40 | assert embeddings.shape[2] == self.gpt_embedding.config.hidden_size 41 | 42 | def test_gpu_forward(self): 43 | self.gpt_embedding.cuda() 44 | input_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() 45 | position_ids = torch.tensor([0, 1, 2, 3], dtype=torch.int64).repeat((2, 1)).cuda() 46 | embeddings = self.gpt_embedding(input_ids, position_ids) 47 | assert embeddings.device.type == 'cuda' 48 | assert embeddings.shape[0] == self.gpt_embedding.max_sequence_length 49 | assert embeddings.shape[1] == input_ids.shape[0] 50 | assert embeddings.shape[2] == self.gpt_embedding.config.hidden_size -------------------------------------------------------------------------------- /tests/unit_tests/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/unit_tests/pipeline_parallel/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | import numpy as np 5 | 6 | def test_vocab_parallel_cross_entropy(): 7 | Utils.initialize_model_parallel(4,2) 8 | vocab_parallel_logits = torch.range(0,7).repeat(16,4).cuda() 9 | target = torch.arange(0,32,2).cuda() 10 | output = vocab_parallel_cross_entropy(vocab_parallel_logits, target) 11 | expected_output = torch.tensor([10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309, 12 | 10.2309, 8.2309, 6.2309, 4.2309, 10.2309, 8.2309, 6.2309, 4.2309]).cuda() 13 | assert(torch.equal(torch.round(expected_output), torch.round(output))) 14 | 
Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_data.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.data import broadcast_data 2 | import torch 3 | from tests.unit_tests.test_utilities import Utils 4 | 5 | def test_broadcast_data(): 6 | Utils.initialize_model_parallel(2,4) 7 | input_data = { 8 | 0 : torch.ones((8,8)).cuda() * 0.0, 9 | 1 : torch.ones((8,8)).cuda() * 1.0, 10 | 2 : torch.ones((8,8)).cuda() * 2.0, 11 | 3 : torch.ones((8,8)).cuda() * 3.0, 12 | 4 : torch.ones((8,8)).cuda() * 4.0, 13 | 5 : torch.ones((8,8)).cuda() * 5.0, 14 | 6 : torch.ones((8,8)).cuda() * 6.0, 15 | 7 : torch.ones((8,8)).cuda() * 7.0 16 | } 17 | dtype = torch.float32 18 | actual_output = broadcast_data([0,1],input_data, dtype) 19 | assert(torch.equal(actual_output[0], input_data[0])) 20 | assert(torch.equal(actual_output[1], input_data[1])) 21 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_random.py: -------------------------------------------------------------------------------- 1 | from megatron.core.tensor_parallel.random import CudaRNGStatesTracker 2 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 3 | from megatron.core.tensor_parallel.random import _CUDA_RNG_STATE_TRACKER 4 | from megatron.core.tensor_parallel.random import checkpoint 5 | from tests.unit_tests.test_utilities import Utils 6 | import pytest 7 | import torch 8 | 9 | def test_cuda_rng_states_tracker(): 10 | rng_tracker = CudaRNGStatesTracker() 11 | rng_tracker.set_states({"state1":1234}) 12 | assert(rng_tracker.get_states()["state1"] == 1234) 13 | rng_tracker.reset() 14 | assert(rng_tracker.get_states() == {}) 15 | seed = 1111 16 | rng_tracker.add("state2",seed) 17 | with pytest.raises(Exception): 18 | assert(rng_tracker.add("state3",seed)) 19 | with pytest.raises(Exception): 20 | assert(rng_tracker.add("state2",111)) 21 | assert(rng_tracker.get_states()['state2'] is not None) 22 | with pytest.raises(Exception): 23 | assert() 24 | 25 | rng_tracker.fork("state2") 26 | torch.cuda.manual_seed(seed) 27 | rng_state = torch.cuda.get_rng_state() 28 | assert torch.equal(rng_tracker.get_states()['state2'], rng_state) 29 | 30 | def test_model_parallel_cuda_manual_seed(): 31 | Utils.initialize_model_parallel(4,2) 32 | model_parallel_cuda_manual_seed(0) 33 | assert(_CUDA_RNG_STATE_TRACKER.get_states()['model-parallel-rng'] is not None) 34 | Utils.destroy_model_parallel() 35 | 36 | def test_checkpoint(): 37 | def test_forward(*input): 38 | return input[0]+input[1] 39 | assert(torch.equal(torch.ones(16)*3,checkpoint(test_forward, None, torch.ones(16), torch.ones(16)*2))) 40 | Utils.initialize_model_parallel() 41 | input1 = torch.ones((4,4)) 42 | checkpoint(test_forward, True, input1, torch.ones((4,4))*2) 43 | assert(torch.equal(torch.ones(input1.numel()).cuda(), input1)) 44 | Utils.destroy_model_parallel() -------------------------------------------------------------------------------- /tests/unit_tests/tensor_parallel/test_tensor_parallel_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import megatron.core.tensor_parallel.utils as util 3 | import megatron.core.parallel_state as ps 4 | from tests.unit_tests.test_utilities import Utils 5 | 6 | rank = Utils.rank 7 | 8 | def 
test_split_tensor_along_last_dim(): 9 | input_tensor = torch.rand((3,4)) 10 | assert torch.equal(input_tensor[:, 0:2], util.split_tensor_along_last_dim(input_tensor,2)[0]) 11 | assert torch.equal(input_tensor[:, 2:], util.split_tensor_along_last_dim(input_tensor,2)[1]) 12 | 13 | def test_split_tensor_into_1d_equal_chunks(): 14 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 15 | input_tensor = torch.rand((3,4)) 16 | output_tensor = util.split_tensor_into_1d_equal_chunks(input_tensor) 17 | if rank % 2 == 0 : 18 | start = 0 19 | end = int(input_tensor.numel()/2) 20 | else : 21 | start = int(input_tensor.numel()/2) 22 | end = input_tensor.numel() 23 | 24 | assert torch.equal(output_tensor, input_tensor.flatten()[start:end]) 25 | Utils.destroy_model_parallel() 26 | 27 | def test_gather_split_1d_tensor(): 28 | Utils.initialize_model_parallel(tensor_model_parallel_size=2, pipeline_model_parallel_size=4) 29 | input_tensor = torch.ones((2,4)).cuda() * rank 30 | actual_output_tensor = util.gather_split_1d_tensor(input_tensor) 31 | if rank %2 == 0: 32 | expected_output_tensor = torch.concat((input_tensor.flatten(), input_tensor.flatten() + 1)) 33 | else : 34 | expected_output_tensor = torch.concat((input_tensor.flatten() - 1, input_tensor.flatten())) 35 | assert(torch.equal(actual_output_tensor, expected_output_tensor)) 36 | Utils.destroy_model_parallel() 37 | 38 | def test_vocab(): 39 | global_vocab_size = 1600 40 | per_partition_vocab_size = 1600 / Utils.world_size 41 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_per_partition_vocab_size(global_vocab_size // Utils.world_size, rank, Utils.world_size))) 42 | assert((rank * per_partition_vocab_size, (rank + 1)* per_partition_vocab_size) == (util.VocabUtility.vocab_range_from_global_vocab_size(global_vocab_size, rank, Utils.world_size))) 43 | -------------------------------------------------------------------------------- /tests/unit_tests/test_basic.py: -------------------------------------------------------------------------------- 1 | def test_import(): 2 | import megatron 3 | 4 | -------------------------------------------------------------------------------- /tests/unit_tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import megatron.core.parallel_state as ps 4 | 5 | class Utils: 6 | 7 | world_size = torch.cuda.device_count() 8 | rank = int(os.environ['LOCAL_RANK']) 9 | 10 | @staticmethod 11 | def initialize_distributed(): 12 | print(f'Initializing torch.distributed with rank: {Utils.rank}, world_size: {Utils.world_size}') 13 | torch.cuda.set_device(Utils.rank % torch.cuda.device_count()) 14 | init_method = 'tcp://' 15 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 16 | master_port = os.getenv('MASTER_PORT', '6000') 17 | init_method += master_ip + ':' + master_port 18 | torch.distributed.init_process_group(backend='nccl', world_size=Utils.world_size, rank=Utils.rank, init_method=init_method) 19 | 20 | @staticmethod 21 | def destroy_model_parallel(): 22 | ps.destroy_model_parallel() 23 | torch.distributed.barrier() 24 | 25 | @staticmethod 26 | def initialize_model_parallel(tensor_model_parallel_size = 1, pipeline_model_parallel_size = 1, virtual_pipeline_model_parallel_size = None, pipeline_model_parallel_split_rank = None): 27 | ps.destroy_model_parallel() 28 | if not torch.distributed.is_initialized(): 29 | Utils.initialize_distributed() 30 |
ps.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, virtual_pipeline_model_parallel_size, pipeline_model_parallel_split_rank) -------------------------------------------------------------------------------- /tests/unit_tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import megatron.core.utils as util 4 | import numpy as np 5 | 6 | def test_divide_properly(): 7 | assert util.divide(4,2) == 2 8 | 9 | def test_divide_improperly(): 10 | with pytest.raises(AssertionError): 11 | util.divide(4,5) 12 | 13 | def test_global_memory_buffer(): 14 | global_memory_buffer = util.GlobalMemoryBuffer() 15 | obtained_tensor = global_memory_buffer.get_tensor((3,2), torch.float32, "test_tensor") 16 | expected_tensor = torch.empty((3,2), dtype=torch.float32, device=torch.cuda.current_device()) 17 | assert torch.equal(obtained_tensor, expected_tensor) 18 | 19 | def test_make_viewless_tensor(): 20 | inp = torch.rand((3,4)) 21 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, True))) 22 | assert(torch.equal(inp, util.make_viewless_tensor(inp, True, False))) 23 | 24 | def test_safely_set_viewless_tensor_data(): 25 | tensor = torch.zeros((3,4)) 26 | new_data_tensor = torch.tensor(np.random.rand(3,4)) 27 | util.safely_set_viewless_tensor_data(tensor, new_data_tensor) 28 | assert(torch.equal(tensor, new_data_tensor)) 29 | 30 | def test_assert_viewless_tensor(): 31 | tensor = torch.rand((3,4)) 32 | assert(torch.equal(util.assert_viewless_tensor(tensor), tensor)) 33 | input_tensor_list=[tensor,tensor,tensor] 34 | output_tensor_list = util.assert_viewless_tensor(input_tensor_list) 35 | for inp,out in zip(input_tensor_list, output_tensor_list): 36 | assert(torch.equal(inp,out)) 37 | -------------------------------------------------------------------------------- /tests/unit_tests/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MayDomine/Seq1F1B/f54b286c75d5510994cb3802d1a8a5c71124d59b/tests/unit_tests/transformer/__init__.py -------------------------------------------------------------------------------- /tests/unit_tests/transformer/test_core_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | import pytest 5 | 6 | import torch 7 | 8 | from megatron.core.transformer.attention import CrossAttention 9 | """ 10 | 11 | @pytest.fixture 12 | def core_attention(transformer_config): 13 | return CrossAttention(transformer_config) 14 | 15 | 16 | class TestCoreAttention: 17 | def test_constructor(self, core_attention): 18 | assert isinstance(core_attention, CrossAttention) 19 | assert core_attention.layer_number == 1 20 | 21 | num_weights = sum([p.numel() for p in core_attention.parameters()]) 22 | assert num_weights == 0 23 | 24 | def test_cpu_forward(self, core_attention): 25 | # we can't currently do this because the global memory buffer is on GPU 26 | pass 27 | 28 | def test_gpu_forward(self, core_attention): 29 | 30 | # destroy_global_memory_buffer() 31 | # _set_global_memory_buffer() 32 | # model_parallel_cuda_manual_seed(123) 33 | 34 | core_attention.cuda() 35 | config = core_attention.config 36 | sequence_length = 32 37 | micro_batch_size = 2 38 | # query_layer (float): [sequence_length, micro_batch_size, num_attention_heads, hidden_size / num_attention_heads] 39 | query_layer = torch.ones( 40 | ( 41 | sequence_length, 42 | micro_batch_size, 43 | config.num_attention_heads, 44 | config.hidden_size // config.num_attention_heads, 45 | ) 46 | ).cuda() 47 | 48 | key_layer = torch.ones_like(query_layer).cuda() 49 | 50 | value_layer = torch.ones_like(query_layer).cuda() 51 | 52 | attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() 53 | 54 | context_layer = core_attention( 55 | query_layer=query_layer, key_layer=key_layer, value_layer=value_layer, attention_mask=attention_mask 56 | ) 57 | 58 | assert context_layer.shape[0] == sequence_length 59 | assert context_layer.shape[1] == micro_batch_size 60 | assert context_layer.shape[2] == config.hidden_size 61 | assert context_layer.device.type == 'cuda' 62 | assert context_layer.dtype == torch.float32 63 | 64 | """ -------------------------------------------------------------------------------- /tests/unit_tests/transformer/test_mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import pytest 4 | 5 | import torch 6 | 7 | from megatron.core.transformer.mlp import MLP 8 | from tests.unit_tests.test_utilities import Utils 9 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 10 | from megatron.core.transformer.transformer_config import TransformerConfig 11 | 12 | class TestParallelMLP: 13 | 14 | def setup_method(self, method): 15 | Utils.initialize_model_parallel(1,1) 16 | model_parallel_cuda_manual_seed(123) 17 | transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) 18 | self.mlp = MLP(transformer_config) 19 | 20 | def teardown_method(self, method): 21 | Utils.destroy_model_parallel() 22 | 23 | def test_constructor(self): 24 | assert isinstance(self.mlp, MLP) 25 | 26 | num_weights = sum([p.numel() for p in self.mlp.parameters()]) 27 | assert num_weights == 1236 28 | 29 | """ 30 | def test_cpu_forward(self, mlp): 31 | # [sequence length, micro batch size, hidden size] 32 | hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) 33 | output, output_bias = mlp(hidden_states) 34 | assert output.shape[0] == 32 35 | assert output.shape[1] == 2 36 | assert output.shape[2] == mlp.config.hidden_size 37 | assert output_bias.shape[0] == mlp.config.hidden_size 38 | assert output.dtype == torch.float32 39 | """ 40 | 41 | @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") 42 | def test_gpu_forward(self): 43 | mlp = self.mlp 44 | mlp.cuda() 45 | # [sequence length, batch size, hidden size] 46 | hidden_states = torch.ones((32, 2, mlp.config.hidden_size)) 47 | hidden_states = hidden_states.cuda() 48 | output, output_bias = mlp(hidden_states) 49 | assert output.shape[0] == 32 50 | assert output.shape[1] == 2 51 | assert output.shape[2] == mlp.config.hidden_size 52 | assert output_bias.shape[0] == mlp.config.hidden_size 53 | assert output.dtype == torch.float32 54 | assert output.device.type == 'cuda' 55 | assert output_bias.device.type == 'cuda' 56 | 57 | -------------------------------------------------------------------------------- /tests/unit_tests/transformer/test_transformer_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | import pytest 5 | 6 | import torch 7 | 8 | from megatron.core.transformer.transformer_config import TransformerConfig 9 | from megatron.core.transformer.transformer_layer import TransformerLayer 10 | from tests.unit_tests.test_utilities import Utils 11 | from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed 12 | from megatron.core.transformer.transformer_config import TransformerConfig 13 | 14 | 15 | 16 | class TestParallelTransformerLayer: 17 | 18 | def setup_method(self, method): 19 | Utils.initialize_model_parallel(1,1) 20 | model_parallel_cuda_manual_seed(123) 21 | transformer_config = TransformerConfig(num_layers=2, hidden_size=12, num_attention_heads=4, use_cpu_initialization=True) 22 | self.parallel_transformer_layer = TransformerLayer(transformer_config) 23 | 24 | def teardown_method(self, method): 25 | Utils.destroy_model_parallel() 26 | 27 | def test_constructor(self): 28 | parallel_transformer_layer = self.parallel_transformer_layer 29 | assert isinstance(parallel_transformer_layer, TransformerLayer) 30 | assert parallel_transformer_layer.layer_number == 1 31 | 32 | num_weights = sum([p.numel() for p in parallel_transformer_layer.parameters()]) 33 | assert num_weights == 1884 34 | 35 | def test_gpu_forward(self): 36 | parallel_transformer_layer = self.parallel_transformer_layer 37 | config: TransformerConfig = parallel_transformer_layer.config 38 | sequence_length = 32 39 | micro_batch_size = 2 40 | parallel_transformer_layer.cuda() 41 | 42 | # [sequence length, batch size, hidden size] 43 | hidden_states = torch.ones((sequence_length, micro_batch_size, config.hidden_size)) 44 | hidden_states = hidden_states.cuda() 45 | 46 | attention_mask = torch.ones((1, 1, sequence_length, sequence_length), dtype=bool).cuda() 47 | 48 | hidden_states = parallel_transformer_layer(hidden_states=hidden_states, attention_mask=attention_mask) 49 | assert hidden_states.shape[0] == sequence_length 50 | assert hidden_states.shape[1] == micro_batch_size 51 | assert hidden_states.shape[2] == config.hidden_size 52 | -------------------------------------------------------------------------------- /tools/autoformat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 4 | 5 | # for now we just format core 6 | 7 | black ${SCRIPT_DIR}/../megatron/core 8 | isort ${SCRIPT_DIR}/../megatron/core 9 | -------------------------------------------------------------------------------- /tools/bert_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder 4 | -------------------------------------------------------------------------------- /tools/bert_embedding/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from megatron import get_args, get_tokenizer 7 | from megatron.data.bert_dataset import build_training_sample 8 | 9 | 10 | class BertEmbeddingDataset(torch.utils.data.Dataset): 11 | '''Dataset to convert a text dataset to Bert tokens.''' 12 | 13 | def __init__(self, text_dataset, max_seq_length): 14 | 15 | super().__init__() 16 | 17 | args = get_args() 18 | 19 | # Dataset, tokenizer. 
20 | self.text_dataset = text_dataset 21 | self.bert_tokenizer = get_tokenizer() 22 | 23 | # Params to store. 24 | self.max_seq_length = max_seq_length 25 | self.seed = args.seed 26 | self.masked_lm_prob = args.mask_prob 27 | 28 | # Vocab stuff. 29 | self.vocab_id_list = list(self.bert_tokenizer.inv_vocab.keys()) 30 | self.vocab_id_to_token_dict = self.bert_tokenizer.inv_vocab 31 | self.cls_id = self.bert_tokenizer.cls 32 | self.sep_id = self.bert_tokenizer.sep 33 | self.mask_id = self.bert_tokenizer.mask 34 | self.pad_id = self.bert_tokenizer.pad 35 | 36 | def __len__(self): 37 | return len(self.text_dataset) 38 | 39 | def __getitem__(self, idx): 40 | 41 | # Text. 42 | text_sample = self.text_dataset[idx] 43 | text = text_sample["text"] 44 | text = text.replace("<|endoftext|>", "") 45 | 46 | # Bert/Wordpiece tokens (+truncate). 47 | bert_token_ids = self.bert_tokenizer.tokenize(text) 48 | bert_token_ids = bert_token_ids[:self.max_seq_length - 2] # cls+sep. 49 | if not bert_token_ids: 50 | bert_token_ids = [ self.bert_tokenizer.pad_id ] # hack when empty seq 51 | 52 | # Note that this rng state should be numpy and not python since 53 | # python randint is inclusive whereas the numpy one is exclusive. 54 | # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 55 | np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) 56 | 57 | # Build sample. 58 | sample = build_training_sample([bert_token_ids], 59 | len(bert_token_ids), 60 | len(bert_token_ids) + 2, # for cls+sep 61 | self.vocab_id_list, 62 | self.vocab_id_to_token_dict, 63 | self.cls_id, self.sep_id, 64 | self.mask_id, self.pad_id, 65 | self.masked_lm_prob, np_rng, 66 | binary_head=False) 67 | sample["seq_length"] = len(sample["text"]) 68 | return sample 69 | -------------------------------------------------------------------------------- /tools/bert_embedding/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "h5py", 7 | "transformers", # for huggingface bert 8 | ] 9 | 10 | for lib in required_libs: 11 | try: 12 | globals()[lib] = importlib.import_module(lib) 13 | except ImportError as e: 14 | raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. 
Tried importing '{lib}'.") 15 | -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = ['autopep8', '--max-line-length', '100', '--aggressive', '--in-place', py_path] 32 | subprocess.check_call(command) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/merge_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), 6 | os.path.pardir))) 7 | 8 | from megatron.data import indexed_dataset 9 | 10 | 11 | def main(args): 12 | 13 | prefixes = set() 14 | for basename in os.listdir(args.input): 15 | prefix, ext = os.path.splitext(basename) 16 | 17 | if prefix in prefixes: 18 | continue 19 | 20 | if not os.path.isfile(os.path.join(args.input, basename)): 21 | continue 22 | 23 | ext_pair = '.bin' if ext == '.idx' else '.idx' 24 | assert os.path.isfile(os.path.join(args.input, prefix) + ext_pair), \ 25 | f'ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}' 26 | 27 | prefixes.add(prefix) 28 | 29 | builder = None 30 | for prefix in sorted(prefixes): 31 | if builder is None: 32 | dataset = indexed_dataset.make_dataset(os.path.join(args.input, prefix), 'infer') 33 | 34 | if isinstance(dataset, indexed_dataset.MMapIndexedDataset): 35 | builder = indexed_dataset.MMapIndexedDatasetBuilder(args.output_prefix + '.bin', dtype=dataset._index.dtype) 36 | else: 37 | builder = indexed_dataset.IndexedDatasetBuilder(args.output_prefix + '.bin') 38 | 39 | del dataset 40 | 41 | builder.merge_file_(os.path.join(args.input, prefix)) 42 | 43 | builder.finalize(args.output_prefix + '.idx') 44 | 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser() 48 | 49 | group = parser.add_argument_group(title='input data') 50 | group.add_argument('--input', type=str, required=True, 51 | help='Path to directory containing all document files to merge') 52 | 53 | group = parser.add_argument_group(title='output data') 54 | group.add_argument('--output-prefix', type=str, required=True, 55 | help='Path to binary output file without suffix') 56 | 57 | args = parser.parse_args() 58 | 59 | assert
os.path.isdir(args.input), \ 60 | f'ERROR: {args.input} is not a directory or does not exist' 61 | 62 | assert os.path.isdir(os.path.dirname(args.output_prefix)), \ 63 | f'ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist' 64 | 65 | main(args) 66 | 67 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | 8 | """ 9 | This code adds id to each json object in a json file. User can add prefix 10 | to the ids. 11 | """ 12 | 13 | if __name__ == '__main__': 14 | 15 | print('parsing the arguments ...') 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 19 | ' json file where id needs to be added') 20 | parser.add_argument('--output-file', type=str, default=None, help=\ 21 | 'Output file name with id') 22 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 23 | 'Id prefix') 24 | parser.add_argument('--log-interval', type=int, default=100, 25 | help='Log interval') 26 | args = parser.parse_args() 27 | 28 | print('Adding ids to dataset ...') 29 | 30 | f_input = open(args.input_file, 'r', encoding='utf-8') 31 | f_output = open(args.output_file, 'wb') 32 | 33 | unique_ids = 1 34 | start_time = time.time() 35 | for row in f_input: 36 | each_row = json.loads(row) 37 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 38 | each_row['adlr_id'] = adlr_id_string 39 | myjson = json.dumps(each_row, ensure_ascii=False) 40 | 41 | f_output.write(myjson.encode('utf-8')) 42 | f_output.write('\n'.encode('utf-8')) 43 | 44 | if unique_ids % args.log_interval == 0: 45 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 46 | unique_ids, time.time() - start_time), flush=True) 47 | 48 | unique_ids += 1 49 | 50 | # Close the file. 51 | f_input.close() 52 | f_output.close() 53 | 54 | print('done :-)', flush=True) 55 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | import glob 5 | import sys 6 | import json 7 | import argparse 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--json_path", type=str, default=".", 13 | help="path where all the json files are located") 14 | 15 | parser.add_argument("--output_file", type=str, default="merged_output.json", 16 | help="filename where the merged json should go") 17 | 18 | args = parser.parse_args() 19 | 20 | json_path = args.json_path 21 | out_file = args.output_file 22 | 23 | json_files = glob.glob(json_path + '/*.json') 24 | 25 | counter = 0 26 | 27 | with open(out_file, 'w') as outfile: 28 | for fname in json_files: 29 | counter += 1 30 | 31 | if counter % 1024 == 0: 32 | print("Merging at ", counter, flush=True) 33 | 34 | with open(fname, 'r') as infile: 35 | for row in infile: 36 | each_row = json.loads(row) 37 | outfile.write(row) 38 | 39 | 40 | print("Merged file", out_file, flush=True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import json 5 | import time 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | url_filename = sys.argv[1] 12 | data_filename = sys.argv[2] 13 | output_filename = sys.argv[3] 14 | 15 | urls = set() 16 | with open(url_filename, 'r') as f: 17 | for line in f: 18 | myjson = json.loads(line) 19 | for key in myjson: 20 | this_urls = myjson[key] 21 | for i in range(1, len(this_urls)): 22 | urls.add(this_urls[i]) 23 | print('will be removing {} urls'.format(len(urls)), flush=True) 24 | 25 | written_docs = 0 26 | removed_docs = 0 27 | removed_chars = 0 28 | start_time = time.time() 29 | with open(output_filename, 'wb') as fout: 30 | with open(data_filename, 'r') as fin: 31 | for line in fin: 32 | try: 33 | myjson = json.loads(line) 34 | url = myjson['url'] 35 | if url in urls: 36 | print('removing', myjson) 37 | removed_docs += 1 38 | removed_chars += len(myjson['text']) 39 | continue 40 | myjson = json.dumps(myjson, ensure_ascii=False) 41 | fout.write(myjson.encode('utf-8')) 42 | fout.write('\n'.encode('utf-8')) 43 | written_docs += 1 44 | if written_docs % 10000 == 0: 45 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 46 | '| removed: {} (char: {})'.format( 47 | time.time() - start_time, 48 | written_docs, removed_docs, removed_chars)) 49 | except Exception as e: 50 | print('[SKIPPING]', line, e) 51 | 52 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 53 | '| removed: {} (char: {})'.format( 54 | time.time() - start_time, 55 | written_docs, removed_docs, removed_chars)) 56 | print('done :-)') 57 | -------------------------------------------------------------------------------- /tools/retro/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .cli import retro 4 | -------------------------------------------------------------------------------- /tools/retro/cli/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | from . 
import retro 6 | 7 | 8 | if __name__ == "__main__": 9 | retro.init(os.environ["RETRO_WORKDIR"]) 10 | -------------------------------------------------------------------------------- /tools/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import build_db 4 | -------------------------------------------------------------------------------- /tools/retro/db/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import json 4 | import numpy as np 5 | import torch 6 | from tqdm import tqdm 7 | 8 | from megatron import get_args, print_rank_0 9 | from tools.retro.external_libs import h5py 10 | from tools.retro.utils import get_gpt_tokenizer 11 | 12 | 13 | class DBDataset(torch.utils.data.Dataset): 14 | '''Dataset for iterating chunks. 15 | 16 | Requires: 17 | - List of indexed datasets 18 | - Chunk index array, with format: 19 | [dataset_idx, doc_id, start_idx, end_idx, bert_length]) 20 | ''' 21 | 22 | def __init__(self, db_path, indexed_datasets, chunks, max_chunk_length): 23 | 24 | assert chunks.shape[1] == 5, "expected 5 columns (dataset_idx, " \ 25 | "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); " \ 26 | "found %d columns." % chunks.shape[1] 27 | 28 | self.db_path = db_path 29 | self.indexed_datasets = indexed_datasets 30 | self.chunks = chunks 31 | self.doc_chunk_map = None 32 | 33 | self.max_chunk_length = max_chunk_length 34 | self.eod_token_id = get_gpt_tokenizer().eod 35 | 36 | def __len__(self): 37 | return self.chunks.shape[0] 38 | 39 | def __getitem__(self, chunk_id): 40 | 41 | # Chunk start/end indexes. 42 | indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = \ 43 | [ value.item() for value in self.chunks[chunk_id] ] 44 | chunk_length = token_end_idx - token_start_idx 45 | indexed_dataset = self.indexed_datasets[indexed_dataset_id] 46 | 47 | # Chunk token ids. 48 | token_ids = indexed_dataset.get(doc_id, 49 | offset=token_start_idx, 50 | length=chunk_length) 51 | 52 | # Extend chunks to max_chunk_length by padding with EOD tokens. 53 | if chunk_length != self.max_chunk_length: 54 | assert chunk_length < self.max_chunk_length, "invalid chunk len." 55 | token_ids = token_ids.tolist() 56 | token_ids += [self.eod_token_id] * \ 57 | (self.max_chunk_length - chunk_length) 58 | 59 | return { 60 | "doc_id" : doc_id, 61 | "text" : np.array(token_ids, dtype=np.int64), 62 | } 63 | 64 | def load_doc_tuples(self): 65 | '''Load the dataset & document ids. 66 | 67 | Load the dataset id & document id of each chunk in the database, to 68 | be used for causality filtering during querying. 69 | ''' 70 | self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32") 71 | block_size = int(1e6) 72 | for start_idx in tqdm(range(0, len(self), block_size)): 73 | end_idx = min(len(self), start_idx + block_size) 74 | self.doc_tuples[start_idx:end_idx]=self.chunks[start_idx:end_idx,:2] 75 | -------------------------------------------------------------------------------- /tools/retro/examples/pretrain_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | 5 | unset NCCL_DEBUG 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | 8 | ######## GPT or Retro?. ######## 9 | 10 | # 0 : GPT. 11 | # 1 : Retro 12 | 13 | ADD_RETRIEVER=1 14 | 15 | ######## Megatron, Retro dirs. 
######## 16 | 17 | REPO_DIR="" 18 | RETRO_WORKDIR="" 19 | 20 | ######## Data. ######## 21 | 22 | DATA_BLEND="" 23 | 24 | ######## Args. ######## 25 | 26 | ARGS=" \ 27 | --log-interval 1 \ 28 | --use-flash-attn \ 29 | --apply-layernorm-1p \ 30 | --untie-embeddings-and-output-weights \ 31 | --disable-bias-linear \ 32 | --no-position-embedding \ 33 | --use-rotary-position-embeddings \ 34 | --rotary-percent 0.5 \ 35 | --swiglu \ 36 | --attention-dropout 0.0 \ 37 | --hidden-dropout 0.0 \ 38 | --exit-duration-in-mins 220 \ 39 | --tensor-model-parallel-size 1 \ 40 | --pipeline-model-parallel-size 1 \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 512 \ 45 | --max-position-embeddings 512 \ 46 | --micro-batch-size 16 \ 47 | --global-batch-size 256 \ 48 | --train-samples 200000 \ 49 | --lr-decay-samples 175000 \ 50 | --lr-warmup-samples 10000 \ 51 | --lr 2.5e-5 \ 52 | --min-lr 2.5e-6 \ 53 | --lr-decay-style cosine \ 54 | --eval-iters 50 \ 55 | --eval-interval 2000 \ 56 | --tokenizer-type GPTSentencePieceTokenizer \ 57 | --tokenizer-model \ 58 | --data-path ${DATA_BLEND} \ 59 | --split 98,2,0 \ 60 | --clip-grad 1.0 \ 61 | --weight-decay 0.1 \ 62 | --adam-beta1 0.9 \ 63 | --adam-beta2 0.95 \ 64 | --init-method-std 0.007 \ 65 | --log-params-norm \ 66 | --log-num-zeros-in-grad \ 67 | --bf16 \ 68 | --DDP-impl local \ 69 | " 70 | 71 | ######## Retro. ######## 72 | 73 | if [ "$ADD_RETRIEVER" = "0" ]; then 74 | SCRIPT=pretrain_gpt.py 75 | else 76 | ARGS="${ARGS} \ 77 | --retro-workdir ${RETRO_WORKDIR} \ 78 | --retro-add-retriever \ 79 | " 80 | SCRIPT=pretrain_retro.py 81 | fi 82 | 83 | ######## Command. ######## 84 | 85 | NPROCS=8 86 | CMD="\ 87 | pwd && cd ${REPO_DIR} && pwd && \ 88 | export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \ 89 | python -m torch.distributed.run \ 90 | --nproc_per_node ${NPROCS} \ 91 | --nnodes 1 \ 92 | --node_rank ${NODE_RANK} \ 93 | --master_addr ${MASTER_ADDR} \ 94 | --master_port 6000 \ 95 | ${SCRIPT} ${ARGS} \ 96 | " 97 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 98 | echo "CMD = '$CMD'." 99 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 100 | eval $CMD 101 | -------------------------------------------------------------------------------- /tools/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "faiss", 7 | "h5py", 8 | "transformers", # for huggingface bert 9 | ] 10 | 11 | for lib in required_libs: 12 | try: 13 | globals()[lib] = importlib.import_module(lib) 14 | except ImportError as e: 15 | raise Exception(f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'.") 16 | -------------------------------------------------------------------------------- /tools/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import add_to_index, build_index, train_index 4 | # from .index import Index 5 | -------------------------------------------------------------------------------- /tools/retro/index/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .indexes import FaissBaseIndex, FaissParallelAddIndex 4 | 5 | 6 | class IndexFactory: 7 | '''Get index. 
8 | 9 | Index type generally read from argument '--retro-index-ty'. 10 | ''' 11 | 12 | @classmethod 13 | def get_index_class(cls, index_type): 14 | return { 15 | "faiss-base" : FaissBaseIndex, 16 | "faiss-par-add" : FaissParallelAddIndex, 17 | }[index_type] 18 | 19 | @classmethod 20 | def get_index(cls, index_type): 21 | index_class = cls.get_index_class(index_type) 22 | index = index_class() 23 | return index 24 | -------------------------------------------------------------------------------- /tools/retro/index/index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import abc 4 | import numpy as np 5 | import os 6 | import torch 7 | 8 | from megatron import get_retro_args 9 | from tools.retro.external_libs import faiss 10 | 11 | from .utils import get_index_dir 12 | 13 | 14 | class Index(abc.ABC): 15 | 16 | '''Abstract base class for indexes. 17 | 18 | *Note* : While currently only Faiss-based classes are implemented, in the 19 | future, this class will be extended with other types of indexes that have 20 | different performance-accuracy trade-offs. 21 | 22 | The primary methods to override are: 23 | - train() : Train index on the sampled training chunks. 24 | - add() : Add all training chunks to index. 25 | ''' 26 | 27 | @classmethod 28 | def c_verbose(cls, index, v): 29 | '''Make index object verbose.''' 30 | assert isinstance(v, bool) 31 | faiss.ParameterSpace().set_index_parameter(index, "verbose", v) 32 | 33 | def get_empty_index_path(self): 34 | args = get_retro_args() 35 | return os.path.join( 36 | get_index_dir(), 37 | "empty_%.3f.faissindex" % args.retro_index_train_load_fraction, 38 | ) 39 | 40 | def get_empty_index(self): 41 | return faiss.read_index(self.get_empty_index_path()) 42 | 43 | def get_added_index_path(self): 44 | args = get_retro_args() 45 | return os.path.join( 46 | get_index_dir(), 47 | "added_%.3f_%.3f.faissindex" % ( 48 | args.retro_index_train_load_fraction, 49 | args.retro_index_add_load_fraction, 50 | ), 51 | ) 52 | 53 | def get_added_index(self): 54 | return faiss.read_index(self.get_added_index_path()) 55 | 56 | @abc.abstractmethod 57 | def train(self, *args): 58 | pass 59 | 60 | @abc.abstractmethod 61 | def add(self, *args): 62 | pass 63 | 64 | def embed_text_dataset_block(self, embedder, text_dataset, _range): 65 | '''Embed a range of a text dataset.''' 66 | sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range)) 67 | return embedder.embed_text_dataset(sub_dataset) 68 | -------------------------------------------------------------------------------- /tools/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .faiss_base import FaissBaseIndex 4 | from .faiss_par_add import FaissParallelAddIndex 5 | -------------------------------------------------------------------------------- /tools/retro/index/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import concurrent 4 | import gc 5 | import glob 6 | import numpy as np 7 | import os 8 | import psutil 9 | import time 10 | import torch 11 | from tqdm import tqdm 12 | 13 | from megatron import get_retro_args, print_rank_0 14 | from tools.retro.db.utils import get_indexed_dataset_infos 15 | from tools.retro.external_libs import h5py 16 | 17 | 18 | def get_index_dir(): 19 | """Create sub-directory for this index.""" 20 | 21 | args = get_retro_args() 22 | 23 | # Directory path. 24 | index_dir_path = os.path.join( 25 | args.retro_workdir, 26 | "index", 27 | args.retro_index_type, 28 | args.retro_index_str, 29 | ) 30 | 31 | # Make directory. 32 | os.makedirs(index_dir_path, exist_ok=True) 33 | 34 | return index_dir_path 35 | 36 | 37 | def num_samples_to_block_ranges(num_samples): 38 | '''Split a range (length num_samples) into sequence of block ranges 39 | of size block_size.''' 40 | args = get_retro_args() 41 | block_size = args.retro_block_size 42 | start_idxs = list(range(0, num_samples, block_size)) 43 | end_idxs = [min(num_samples, s + block_size) for s in start_idxs] 44 | ranges = list(zip(start_idxs, end_idxs)) 45 | return ranges 46 | 47 | 48 | def get_training_data_root_dir(): 49 | args = get_retro_args() 50 | return os.path.join(args.retro_workdir, "index", "train_emb") 51 | 52 | 53 | def get_training_data_block_dir(): 54 | return os.path.join(get_training_data_root_dir(), "blocks") 55 | 56 | 57 | def get_training_data_block_paths(): 58 | return sorted(glob.glob(get_training_data_block_dir() + "/*.hdf5")) 59 | 60 | 61 | def get_training_data_merged_path(): 62 | args = get_retro_args() 63 | return os.path.join(get_training_data_root_dir(), 64 | "train_%.3f.bin" % args.retro_index_train_load_fraction) 65 | 66 | 67 | def get_added_codes_dir(): 68 | return os.path.join(get_index_dir(), "add_codes") 69 | 70 | 71 | def get_added_code_paths(): 72 | return sorted(glob.glob(get_added_codes_dir() + "/*.hdf5")) 73 | -------------------------------------------------------------------------------- /tools/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .query import query_pretraining_neighbors 4 | -------------------------------------------------------------------------------- /tools/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import hashlib 4 | import os 5 | 6 | from megatron import get_retro_args 7 | 8 | 9 | def get_query_workdir(): 10 | args = get_retro_args() 11 | return os.path.join(args.retro_workdir, "query") 12 | 13 | 14 | def get_neighbor_dirname(key, dataset): 15 | hashes = ",".join([ d.desc_hash for d in dataset.datasets ]) 16 | hash = hashlib.md5(hashes.encode()).hexdigest() 17 | return os.path.join(get_query_workdir(), os.path.basename(f"{key}_{hash}")) 18 | -------------------------------------------------------------------------------- /tools/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import os 4 | import torch 5 | import types 6 | 7 | from megatron import get_retro_args 8 | from megatron.tokenizer.tokenizer import ( 9 | _BertWordPieceTokenizer, 10 | _GPT2BPETokenizer, 11 | _GPTSentencePieceTokenizer, 12 | ) 13 | 14 | 15 | def get_args_path(workdir): 16 | '''Argument copy stored within retro workdir.''' 17 | return os.path.join(workdir, "args.json") 18 | 19 | 20 | def get_num_chunks_per_sample(): 21 | '''Compute seq_length // chunk_length.''' 22 | args = get_retro_args() 23 | sample_length = args.retro_gpt_seq_length 24 | chunk_length = args.retro_gpt_chunk_length 25 | assert sample_length % chunk_length == 0 26 | return sample_length // chunk_length 27 | 28 | 29 | def get_gpt_tokenizer(): 30 | '''GPT (BPE) tokenizer.''' 31 | args = get_retro_args() 32 | tokenizer_type = args.retro_gpt_tokenizer_type 33 | if tokenizer_type == "GPT2BPETokenizer": 34 | assert args.retro_gpt_vocab_file and args.retro_gpt_merge_file 35 | return _GPT2BPETokenizer( 36 | vocab_file=args.retro_gpt_vocab_file, 37 | merge_file=args.retro_gpt_merge_file, 38 | ) 39 | elif tokenizer_type == 'GPTSentencePieceTokenizer': 40 | assert args.retro_gpt_tokenizer_model is not None 41 | return _GPTSentencePieceTokenizer(args.retro_gpt_tokenizer_model) 42 | else: 43 | raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type) 44 | 45 | 46 | def get_bert_tokenizer(): 47 | '''Bert (Wordpiece) tokenizer.''' 48 | args = get_retro_args() 49 | lower_case = { 50 | "BertWordPieceLowerCase" : True, 51 | "BertWordPieceCase" : False, 52 | }[args.retro_bert_tokenizer_type] 53 | return _BertWordPieceTokenizer( 54 | vocab_file=args.retro_bert_vocab_file, 55 | lower_case=lower_case, 56 | ) 57 | 58 | 59 | class GPTToTextDataset(torch.utils.data.Dataset): 60 | '''Dataset to convert GPT tokens to text.''' 61 | 62 | def __init__(self, gpt_dataset): 63 | 64 | super().__init__() 65 | 66 | self.gpt_dataset = gpt_dataset 67 | self.gpt_tokenizer = get_gpt_tokenizer() 68 | 69 | def __len__(self): 70 | return len(self.gpt_dataset) 71 | 72 | def __getitem__(self, idx): 73 | gpt_token_ids = self.gpt_dataset[idx]["text"].tolist() 74 | text = self.gpt_tokenizer.detokenize(gpt_token_ids) 75 | return {"text": text} 76 | -------------------------------------------------------------------------------- /tools/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import sys 3 | import json 4 | import requests 5 | 6 | 7 | if __name__ == "__main__": 8 | url = sys.argv[1] 9 | url = 'http://' + url + '/api' 10 | headers = {'Content-Type': 'application/json'} 11 | 12 | while True: 13 | sentence = input("Enter prompt: ") 14 | tokens_to_generate = int(eval(input("Enter number of tokens to generate: "))) 15 | 16 | data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate} 17 | response = requests.put(url, data=json.dumps(data), headers=headers) 18 | 19 | if response.status_code != 200: 20 | print(f"Error {response.status_code}: {response.json()['message']}") 21 | else: 22 | print("Megatron Response: ") 23 | print(response.json()['text'][0]) 24 | --------------------------------------------------------------------------------
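Usage note: the functional-test entry points above take positional arguments only, in the order documented at the top of pretrain_gpt3_distributed_test.sh (DATA_PATH, CHECKPOINT_PATH, TENSORBOARD_DIR, USE_TE, TP_SIZE, PP_SIZE, NNODES, MAX_STEPS, USE_CORE, VP_SIZE, MBS, GBS, ADDITIONAL_PARAMS). As a minimal sketch, assuming a single 8-GPU node and the container paths hard-coded in the script (the vocab and merge files under /workspace/data/gpt3_data/), a run roughly matching the tp1_pp4 core-enabled sequence-parallel result file above could look like:

  # args: DATA_PATH CHECKPOINT_PATH TENSORBOARD_DIR USE_TE TP_SIZE PP_SIZE NNODES MAX_STEPS USE_CORE VP_SIZE MBS GBS ADDITIONAL_PARAMS
  bash ./tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh \
      /workspace/data/gpt3_data/my-gpt3_00_text_document \
      /workspace/checkpoints /workspace/logs \
      0 1 4 1 50 1 "" 4 32 "--sequence-parallel"

The data, checkpoint, and log paths here are illustrative placeholders taken from the sbatch wrappers, not values the script itself requires; outside the CI container the hard-coded --vocab-file and --merge-file paths would need to be edited.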