├── .gitignore ├── LICENSE ├── README.md ├── assets ├── animation-LQ.gif ├── animation.gif ├── c4-blend-llama2.sh ├── c4-blend-llama3.1.sh ├── c4-blend-llama3.sh ├── exp_results.png ├── mask-learning.gif ├── mask-sampling.png └── teaser.png ├── docs ├── export_hf.md └── preprocess_c4.md ├── eval_llama_ppl.py ├── learnable_sparsity ├── __init__.py ├── differentiable_mask.py └── ste.py ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── core │ ├── README.md │ ├── __init__.py │ ├── datasets │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── bert_dataset.py │ │ ├── blended_dataset.py │ │ ├── blended_megatron_dataset_builder.py │ │ ├── blended_megatron_dataset_config.py │ │ ├── gpt_dataset.py │ │ ├── helpers.cpp │ │ ├── indexed_dataset.py │ │ ├── masked_dataset.py │ │ ├── megatron_dataset.py │ │ ├── megatron_tokenizer.py │ │ ├── multimodal_dataset.py │ │ ├── readme.md │ │ ├── t5_dataset.py │ │ └── utils.py │ ├── dist_checkpointing │ │ ├── __init__.py │ │ ├── core.py │ │ ├── dict_utils.py │ │ ├── mapping.py │ │ ├── optimizer.py │ │ ├── serialization.py │ │ ├── strategies │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── tensorstore.py │ │ │ ├── two_stage.py │ │ │ └── zarr.py │ │ └── utils.py │ ├── distributed │ │ ├── __init__.py │ │ ├── distributed_data_parallel.py │ │ ├── finalize_model_grads.py │ │ └── grad_buffer.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_bias_swiglu.py │ │ ├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── inference_params.py │ ├── jit.py │ ├── model_parallel_config.py │ ├── models │ │ ├── T5 │ │ │ ├── __init__.py │ │ │ ├── t5_model.py │ │ │ └── t5_spec.py │ │ ├── __init__.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── bert_layer_specs.py │ │ │ ├── bert_lm_head.py │ │ │ ├── bert_model.py │ │ │ └── pooler.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── embeddings │ │ │ │ ├── __init__.py │ │ │ │ ├── language_model_embedding.py │ │ │ │ └── rotary_pos_embedding.py │ │ │ ├── language_module │ │ │ │ ├── __init__.py │ │ │ │ └── language_module.py │ │ │ └── vision_module │ │ │ │ ├── __init__.py │ │ │ │ └── vision_module.py │ │ ├── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_layer_specs.py │ │ │ └── gpt_model.py │ │ ├── retro │ │ │ ├── __init__.py │ │ │ ├── base_attention.py │ │ │ ├── config.py │ │ │ ├── decoder_attention.py │ │ │ ├── decoder_spec.py │ │ │ ├── encoder_attention.py │ │ │ ├── encoder_spec.py │ │ │ └── model.py │ │ └── vision │ │ │ ├── __init__.py │ │ │ └── clip_vit_model.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── clip_grads.py │ │ ├── distrib_optimizer.py │ │ ├── grad_scaler.py │ │ ├── optimizer.py │ │ └── optimizer_config.py │ ├── package_info.py │ ├── packed_seq_params.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── p2p_communication.py │ │ └── schedules.py │ ├── requirements.txt │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── timers.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── custom_layers │ │ │ ├── __init__.py │ │ │ └── transformer_engine.py │ │ ├── dot_product_attention.py │ │ ├── enums.py │ │ ├── identity_op.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── moe │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── grouped_gemm_util.py │ │ │ ├── moe_layer.py │ │ │ ├── moe_utils.py │ │ │ ├── router.py │ │ │ └── token_dispatcher.py │ │ ├── spec_utils.py │ │ ├── transformer_block.py │ │ 
├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── data │ ├── __init__.py │ ├── autoaugment.py │ ├── biencoder_dataset_utils.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── ict_dataset.py │ ├── image_folder.py │ ├── multimodal_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ └── vit_dataset.py ├── dist_signal_handler.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── log_handler.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── rms_norm.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vision │ │ ├── classification.py │ │ ├── dino.py │ │ ├── esvit_swin_backbone.py │ │ ├── inpainting.py │ │ ├── knn_monitor.py │ │ ├── mit_backbone.py │ │ ├── swin_backbone.py │ │ ├── utils.py │ │ └── vit_backbone.py ├── mpu │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py ├── optimizer_param_scheduler.py ├── static │ └── index.html ├── text_generation │ ├── __init__.py │ ├── api.py │ ├── beam_utils.py │ ├── communication.py │ ├── forward_step.py │ ├── generation.py │ ├── sampling.py │ └── tokenization.py ├── text_generation_server.py ├── theoretical_memory_usage.py ├── tokenizer │ ├── __init__.py │ ├── auto_tokenization.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_gpt.py ├── pretrain_maskllm.py ├── run_docker.sh ├── scripts ├── data │ ├── download_c4.py │ ├── prepare_c4_megatron_llama2.sh │ ├── prepare_c4_megatron_llama3.1.sh │ └── prepare_c4_megatron_llama3.sh ├── learnable_sparsity │ ├── llama2_13b_mask_only_tp8_c4.sh │ ├── llama2_7b_mask_only_tp8_c4.sh │ ├── llama3.1_8b_mask_only_tp8_c4.sh │ └── llama3_8b_mask_only_tp8_c4.sh ├── oneshot │ ├── run_llama2_13b_prune_tp8.sh │ ├── run_llama2_7b_prune_tp8.sh │ ├── run_llama3.1_8b_prune_tp8.sh │ └── run_llama3_8b_prune_tp8.sh ├── ppl │ ├── evaluate_llama2_wikitext2.sh │ ├── evaluate_llama3.1_wikitext2.sh │ └── evaluate_llama3_wikitext2.sh └── tools │ ├── convert_llama2_13b_hf_to_megatron.sh │ ├── convert_llama2_7b_hf_to_megatron.sh │ ├── convert_llama2_7b_tp8_to_tp1.sh │ ├── convert_llama3.1_8b_hf_to_megatron.sh │ ├── convert_llama3.1_8b_tp8_to_tp1.sh │ ├── convert_llama3_8b_hf_to_megatron.sh │ ├── convert_llama3_8b_tp8_to_tp1.sh │ ├── download_llama2_13b_hf.py │ ├── download_llama2_7b_hf.py │ ├── download_llama3.1_8b_hf.py │ └── download_llama3_8b_hf.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── latency │ ├── datasets.py │ ├── detokenizer.py │ └── test_latency.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── 
tokenizers.py ├── pruning │ ├── datasets.py │ ├── detokenizer.py │ ├── exclude_layers.py │ ├── layerwrapper.py │ ├── optimizer.py │ ├── prune_main_llama.py │ ├── prune_main_subdomain.py │ ├── run_sensitivity.py │ └── sparsity │ │ ├── __init__.py │ │ ├── core.py │ │ └── utils │ │ ├── __init__.py │ │ ├── datautils.py │ │ └── modelutils.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tool_apply_sparsity.py ├── tool_compress_mask.py ├── tool_compute_mask_hf.py ├── tool_export_to_hf.py ├── tool_trim_learnable_sparsity.py └── tools ├── __init__.py ├── autoformat.sh ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py ├── huggingface.py └── utils.py ├── checkpoint ├── loader_llama2_hf.py ├── loader_megatron.py ├── saver_megatron.py └── util.py ├── download_c4.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── prepare_c4_megatron.sh ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── retro ├── README.md ├── build_db.md ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── db │ ├── __init__.py │ ├── build.py │ ├── dataset.py │ └── utils.py ├── examples │ ├── Dockerfile │ ├── preprocess_data.sh │ └── pretrain_model.sh ├── external_libs.py ├── index │ ├── __init__.py │ ├── build.py │ ├── factory.py │ ├── index.py │ ├── indexes │ │ ├── __init__.py │ │ ├── faiss_base.py │ │ └── faiss_par_add.py │ └── utils.py ├── main.py ├── query │ ├── __init__.py │ ├── chunk_dataset.py │ ├── multi_split_gpt_dataset.py │ ├── query.py │ ├── retro_dataset.py │ └── utils.py ├── sft │ ├── README.md │ ├── dataset_conv.py │ ├── open_inst.sh │ ├── sft_retro.py │ └── sft_retro_lm.sh ├── text_generation │ ├── evaluate.py │ ├── metrics.py │ ├── retro_api.py │ ├── retro_generate.sh │ ├── retro_generation.py │ └── retro_text_generation.py └── utils.py ├── run_text_generation_server.py └── text_generation_cli.py /.gitignore: -------------------------------------------------------------------------------- 1 | assets/cache 2 | assets/checkpoints 3 | assets/data 4 | CACHE 5 | output 6 | __pycache__ 7 | megatron/fused_kernels/build 8 | megatron/core/datasets/helpers.cpython-310-x86_64-linux-gnu.so 9 | wandb -------------------------------------------------------------------------------- /assets/animation-LQ.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/animation-LQ.gif -------------------------------------------------------------------------------- /assets/animation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/animation.gif -------------------------------------------------------------------------------- /assets/c4-blend-llama2.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # multilingual datasets 4 | C4_HOME=assets/data/c4_llama2_pretokenized 5 | DATA_BLEND="" 6 | for i in {00000..00019}; do # 1/20 7 | DATA_BLEND="${DATA_BLEND} 0.05 ${C4_HOME}/c4_llama2_${i}_text_document" 8 | done 9 | -------------------------------------------------------------------------------- /assets/c4-blend-llama3.1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # multilingual datasets 4 | C4_HOME=assets/data/c4_llama3.1_pretokenized 5 | DATA_BLEND="" 6 | for i in {00000..00019}; do # 1/20 7 | DATA_BLEND="${DATA_BLEND} 0.05 ${C4_HOME}/c4_llama3.1_${i}_text_document" 8 | done 9 | -------------------------------------------------------------------------------- /assets/c4-blend-llama3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # multilingual datasets 4 | C4_HOME=assets/data/c4_llama3_pretokenized 5 | DATA_BLEND="" 6 | for i in {00000..00019}; do # 1/20 7 | DATA_BLEND="${DATA_BLEND} 0.05 ${C4_HOME}/c4_llama3_${i}_text_document" 8 | done 9 | -------------------------------------------------------------------------------- /assets/exp_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/exp_results.png -------------------------------------------------------------------------------- /assets/mask-learning.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/mask-learning.gif -------------------------------------------------------------------------------- /assets/mask-sampling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/mask-sampling.png -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/teaser.png -------------------------------------------------------------------------------- /learnable_sparsity/__init__.py: -------------------------------------------------------------------------------- 1 | from .differentiable_mask import ColumnParallelLinearSparse, RowParallelLinearSparse, convert_to_sparse_model, DifferentiableMask 2 | from . import ste -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_wandb_writer 13 | from .global_vars import get_one_logger 14 | from .global_vars import get_adlr_autoresume 15 | from .global_vars import get_timers 16 | from .initialize import initialize_megatron 17 | 18 | from .utils import (print_rank_0, 19 | is_last_rank, 20 | print_rank_last) 21 | -------------------------------------------------------------------------------- /megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models. 2 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.tensor_parallel 2 | import megatron.core.utils 3 | from megatron.core import parallel_state 4 | from megatron.core.distributed import DistributedDataParallel 5 | from megatron.core.inference_params import InferenceParams 6 | from megatron.core.model_parallel_config import ModelParallelConfig 7 | from megatron.core.timers import Timers 8 | 9 | # Alias parallel_state as mpu, its legacy name 10 | mpu = parallel_state 11 | 12 | __all__ = [ 13 | "parallel_state", 14 | "tensor_parallel", 15 | "utils", 16 | "DistributedDataParallel", 17 | "InferenceParams", 18 | "ModelParallelConfig", 19 | "Timers", 20 | ] 21 | -------------------------------------------------------------------------------- /megatron/core/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/datasets/__init__.py -------------------------------------------------------------------------------- /megatron/core/datasets/multimodal_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from dataclasses import dataclass 4 | from typing import Dict 5 | 6 | import numpy 7 | import torch 8 | 9 | from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset 10 | 11 | 12 | @dataclass 13 | class MultimodalDatasetConfig(GPTDatasetConfig): 14 | """Configuration object for Megatron Core Multimodal datasets. 15 | 16 | 17 | Note: This is unused at the moment and may be missing features. Follow-up changes will use this. 18 | 19 | Attributes: 20 | image_h (int): Image height. 21 | image_w (int): Image width. 
22 | """ 23 | 24 | image_h: int = None 25 | image_w: int = None 26 | 27 | def __post_init__(self) -> None: 28 | super().__post_init__() 29 | 30 | assert self.image_h is not None 31 | assert self.image_w is not None 32 | 33 | 34 | class MockMultimodalDataset(MockGPTDataset): 35 | """Mock multimodal dataset. 36 | 37 | 38 | This is unused at the moment and may be missing features. Follow-up changes will use this. 39 | """ 40 | 41 | def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: 42 | """Return a sample that contains a dummy image, text sequence and the associated labels and cost and attention masks. 43 | 44 | Args: 45 | idx (int): The integer seed for mock data generation. 46 | 47 | Returns: 48 | Dict[str, numpy.ndarray]: The mock data. 49 | """ 50 | # Get a text sample. 51 | sample = super().__getitem__(idx) 52 | 53 | # Add mock input image. 54 | sample["image"] = torch.zeros( 55 | (3, self.config.image_h, self.config.image_w), dtype=torch.float32 56 | ) 57 | 58 | return sample 59 | -------------------------------------------------------------------------------- /megatron/core/datasets/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import logging 4 | from enum import Enum 5 | from typing import Any, List 6 | 7 | import numpy 8 | import torch 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Split(Enum): 14 | train = 0 15 | valid = 1 16 | test = 2 17 | 18 | 19 | def compile_helpers(): 20 | """Compile C++ helper functions at runtime. Make sure this is invoked on a single process. 21 | """ 22 | import os 23 | import subprocess 24 | 25 | command = ["make", "-C", os.path.abspath(os.path.dirname(__file__))] 26 | if subprocess.run(command).returncode != 0: 27 | import sys 28 | 29 | log_single_rank(logger, logging.ERROR, "Failed to compile the C++ dataset helper functions") 30 | sys.exit(1) 31 | 32 | 33 | def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): 34 | """If torch distributed is initialized, log only on rank 35 | 36 | Args: 37 | logger (logging.Logger): The logger to write the logs 38 | 39 | args (Tuple[Any]): All logging.Logger.log positional arguments 40 | 41 | rank (int, optional): The rank to write on. Defaults to 0. 42 | 43 | kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments 44 | """ 45 | if torch.distributed.is_initialized(): 46 | if torch.distributed.get_rank() == rank: 47 | logger.log(*args, **kwargs) 48 | else: 49 | logger.log(*args, **kwargs) 50 | 51 | 52 | def normalize(weights: List[float]) -> List[float]: 53 | """Do non-exponentiated normalization 54 | 55 | Args: 56 | weights (List[float]): The weights 57 | 58 | Returns: 59 | List[float]: The normalized weights 60 | """ 61 | w = numpy.array(weights, dtype=numpy.float64) 62 | w_sum = numpy.sum(w) 63 | w = (w / w_sum).tolist() 64 | return w 65 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersitentObject, ShardedTensor 5 | from .serialization import ( 6 | load, 7 | load_common_state_dict, 8 | load_plain_tensors, 9 | load_tensors_metadata, 10 | save, 11 | ) 12 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/core.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Module for managing distributed checkpoints metadata. """ 4 | 5 | import json 6 | from dataclasses import asdict, dataclass 7 | from pathlib import Path 8 | from typing import Optional 9 | 10 | CONFIG_FNAME = 'metadata.json' 11 | 12 | 13 | class CheckpointingException(Exception): 14 | """ Base checkpointing related exception """ 15 | 16 | pass 17 | 18 | 19 | @dataclass 20 | class CheckpointingConfig: 21 | """ Documents backends used in the checkpoint. 22 | 23 | Checkpoint config keeps track of formats used for storing the sharded tensors 24 | (sharded_backend) and other objects (common_backend). 25 | 26 | Note that versioning is not for the checkpoint content (which is application specific), 27 | but for the checkpoint format itself. 28 | """ 29 | 30 | sharded_backend: str 31 | sharded_backend_version: int = 1 32 | common_backend: str = 'torch' 33 | common_backend_version: int = 1 34 | 35 | 36 | def check_is_distributed_checkpoint(checkpoint_dir): 37 | """ Checks if `metadata.json` exists in the checkpoint and is a valid config. 38 | 39 | Args: 40 | checkpoint_dir: checkpoint directory 41 | 42 | Returns: 43 | bool: True if `metadata.json` exists in the checkpoint and is a valid config. 44 | """ 45 | return maybe_load_config(checkpoint_dir) is not None 46 | 47 | 48 | def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConfig]: 49 | """ Returns checkpoint config if `checkpoint_dir` is a distributed checkpoint and None otherwise 50 | 51 | Args: 52 | checkpoint_dir: checkpoint directory 53 | 54 | Returns: 55 | CheckpointingConfig (optional): None if checkpoint is not a valid distributed checkpoint 56 | """ 57 | config_path = Path(checkpoint_dir, CONFIG_FNAME) 58 | if not config_path.exists(): 59 | return None 60 | with config_path.open() as f: 61 | config_dict = json.load(f) 62 | return CheckpointingConfig(**config_dict) 63 | 64 | 65 | def save_config(config: CheckpointingConfig, checkpoint_dir: str): 66 | """ Save given config to checkpoint directory. 67 | 68 | Args: 69 | config: checkpoint config 70 | checkpoint_dir: checkpoint directory 71 | 72 | Returns: 73 | None 74 | """ 75 | config_path = Path(checkpoint_dir, CONFIG_FNAME) 76 | with config_path.open('w') as f: 77 | json.dump(asdict(config), f) -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | try: 10 | import tensorstore 11 | import zarr 12 | 13 | from .tensorstore import _import_trigger 14 | from .zarr import _import_trigger 15 | except ImportError: 16 | # Only print warning on first rank. 
17 | import os 18 | 19 | if int(os.getenv('RANK', '0')) == 0: 20 | logger.warning('Zarr-based strategies will not be registered because of missing packages') 21 | -------------------------------------------------------------------------------- /megatron/core/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .distributed_data_parallel import DistributedDataParallel 4 | from .finalize_model_grads import finalize_model_grads 5 | from .grad_buffer import shard_buffer 6 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | encoder_or_decoder = 1 8 | encoder_and_decoder = 2 9 | retro_encoder = 3 10 | retro_decoder = 4 11 | -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | from typing import Optional, Tuple 3 | 4 | import torch 5 | 6 | from megatron.core.jit import jit_fuser 7 | 8 | 9 | def _bias_dropout_add_func(x_with_bias, residual, prob, training): 10 | # type: (Tuple[Tensor, Optional[Tensor]], Tensor, float, bool) -> Tensor 11 | # NOTE: Previously, the argument `bias` used to be passed as 12 | # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the 13 | # transformer layer but broadcasting should automatically take care of that. 14 | # Also, looking at broadcasting semantics, `expand_as` and broadcasting 15 | # seem to be identical performance-wise (both just change the view). 16 | 17 | x, bias = x_with_bias # unpack 18 | 19 | # If we want to train mixed precision, then the output of this function 20 | # should be half precision. However, in AMP O1, the input (residual) is 21 | # in fp32, and it will up-cast the result to fp32, causing pipeline parallel 22 | # GPU communication to hang. Therefore, we need to cast residual to the same 23 | # dtype as x. 24 | residual = residual if residual.dtype == x.dtype else residual.to(x.dtype) 25 | 26 | # The Dropout operation, Residual Addition and the tensor returning can be 27 | # done generically outside the if statement, but that stops fusing of Bias 28 | # Addition-Dropout-Residual Addition operation. 
So doing it together inside 29 | # the conditional branch to improve performance 30 | if bias is not None: 31 | x = x + bias 32 | out = torch.nn.functional.dropout(x, p=prob, training=training) 33 | out = residual + out 34 | return out 35 | else: 36 | out = torch.nn.functional.dropout(x, p=prob, training=training) 37 | out = residual + out 38 | return out 39 | 40 | 41 | def bias_dropout_add_unfused(training): 42 | def _bias_dropout_add(x_with_bias, residual, prob): 43 | return _bias_dropout_add_func(x_with_bias, residual, prob, training) 44 | 45 | return _bias_dropout_add 46 | 47 | 48 | @jit_fuser 49 | def bias_dropout_add_fused_train( 50 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float, 51 | ) -> torch.Tensor: 52 | return _bias_dropout_add_func(x_with_bias, residual, prob, True) 53 | 54 | 55 | @jit_fuser 56 | def bias_dropout_add_fused_inference( 57 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float, 58 | ) -> torch.Tensor: 59 | return _bias_dropout_add_func(x_with_bias, residual, prob, False) 60 | 61 | 62 | def get_bias_dropout_add(training, fused): 63 | if fused: 64 | # jit scripting for a nn.module (with dropout) is not 65 | # triggering the fusion kernel. For now, we use two 66 | # different nn.functional routines to account for varying 67 | # dropout semantics during training and inference phases. 68 | if training: 69 | return bias_dropout_add_fused_train 70 | else: 71 | return bias_dropout_add_fused_inference 72 | else: 73 | return bias_dropout_add_unfused(training) 74 | -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from megatron.core.jit import jit_fuser 6 | 7 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 8 | # 1/sqrt(2*pi)-> 0.3989423 9 | # 1/sqrt(2) -> 0.70710678 10 | # sqrt(2/pi) -> 0.79788456 11 | # this function is tanh approximation of gelu 12 | # actual gelu is: 13 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 14 | 15 | 16 | @jit_fuser 17 | def bias_gelu(bias, y): 18 | x = bias + y 19 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 20 | 21 | 22 | # gradient of tanh approximation of gelu 23 | # gradient of actual gelu is: 24 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 25 | @jit_fuser 26 | def bias_gelu_back(g, bias, y): 27 | x = bias + y 28 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 29 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 30 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( 31 | 1 + tanh_out 32 | ) 33 | return ff * g 34 | 35 | 36 | class GeLUFunction(torch.autograd.Function): 37 | @staticmethod 38 | # bias is an optional argument 39 | def forward(ctx, input, bias): 40 | ctx.save_for_backward(input, bias) 41 | return bias_gelu(bias, input) 42 | 43 | @staticmethod 44 | def backward(ctx, grad_output): 45 | input, bias = ctx.saved_tensors 46 | tmp = bias_gelu_back(grad_output, bias, input) 47 | return tmp, tmp 48 | 49 | 50 | bias_gelu_impl = GeLUFunction.apply 51 | -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_swiglu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from megatron.core.jit import jit_fuser 7 | 8 | ###### BIAS SWIGLU FUSION/ NO AUTOGRAD ################ 9 | 10 | 11 | @jit_fuser 12 | def swiglu(y): 13 | y_1, y_2 = torch.chunk(y, 2, -1) 14 | return F.silu(y_1) * y_2 15 | 16 | 17 | @jit_fuser 18 | def bias_swiglu(y, bias): 19 | y = y + bias 20 | return swiglu(y) 21 | 22 | 23 | # gradient of tanh approximation of gelu 24 | # gradient of actual gelu is: 25 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 26 | @jit_fuser 27 | def swiglu_back(g, y): 28 | y_1, y_2 = torch.chunk(y, 2, -1) 29 | return torch.cat( 30 | (g * torch.sigmoid(y_1) * (1 + y_1 * (1 - torch.sigmoid(y_1))) * y_2, g * F.silu(y_1)), -1 31 | ) 32 | 33 | 34 | @jit_fuser 35 | def bias_swiglu_back(g, y, bias): 36 | y = y + bias 37 | return swiglu_back(g, y) 38 | 39 | 40 | class BiasSwiGLUFunction(torch.autograd.Function): 41 | @staticmethod 42 | # bias is an optional argument 43 | def forward(ctx, input, bias): 44 | ctx.save_for_backward(input, bias) 45 | return bias_swiglu(input, bias) 46 | 47 | @staticmethod 48 | def backward(ctx, grad_output): 49 | input, bias = ctx.saved_tensors 50 | tmp = bias_swiglu_back(grad_output, input, bias) 51 | return tmp, tmp 52 | 53 | 54 | class SwiGLUFunction(torch.autograd.Function): 55 | @staticmethod 56 | # bias is an optional argument 57 | def forward(ctx, input): 58 | ctx.save_for_backward(input) 59 | return swiglu(input) 60 | 61 | @staticmethod 62 | def backward(ctx, grad_output): 63 | input = ctx.saved_tensors 64 | tmp = swiglu_back(grad_output, input[0]) 65 | return tmp 66 | 67 | 68 | def bias_swiglu_impl(input, bias): 69 | ori_shape = input.shape 70 | assert len(ori_shape) in [2, 3] 71 | input = input.view(-1, ori_shape[-1]) 72 | if bias is not None: 73 | output = BiasSwiGLUFunction.apply(input, bias) 74 | else: 75 | output = SwiGLUFunction.apply(input) 76 | 77 | return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1) 78 | 79 | 80 | # bias_swiglu_impl = BiasSwiGLUFunction.apply 81 | # swiglu_impl = SwiGLUFunction.apply 82 | -------------------------------------------------------------------------------- /megatron/core/inference_params.py: -------------------------------------------------------------------------------- 1 | class InferenceParams: 2 | """Inference parameters that are passed to the main model 
in order 3 | to efficienly calculate and store the context during inference.""" 4 | 5 | def __init__(self, max_batch_size, max_sequence_length): 6 | self.max_sequence_length = max_sequence_length 7 | self.max_batch_size = max_batch_size 8 | self.sequence_len_offset = 0 9 | self.batch_size_offset = 0 10 | self.key_value_memory_dict = {} 11 | 12 | def swap_key_value_dict(self, batch_idx): 13 | "swap between batches" 14 | if len(self.key_value_memory_dict) == 0: 15 | raise ValueError("should not swap when dict in empty") 16 | 17 | for layer_number in self.key_value_memory_dict.keys(): 18 | inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] 19 | assert ( 20 | len(batch_idx) == inference_key_memory.shape[1] 21 | ) # make sure batch size is the same 22 | new_inference_key_memory = inference_key_memory[:, batch_idx] 23 | new_inference_value_memory = inference_value_memory[:, batch_idx] 24 | self.key_value_memory_dict[layer_number] = ( 25 | new_inference_key_memory, 26 | new_inference_value_memory, 27 | ) 28 | -------------------------------------------------------------------------------- /megatron/core/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | TORCH_MAJOR = int(torch.__version__.split(".")[0]) 6 | TORCH_MINOR = int(torch.__version__.split(".")[1]) 7 | 8 | jit_fuser = torch.jit.script 9 | # nvFuser is deprecated in PyTorch JIT starting from 2.2 10 | if (TORCH_MAJOR > 2) or (TORCH_MAJOR == 2 and TORCH_MINOR >= 2): 11 | jit_fuser = torch.compile 12 | -------------------------------------------------------------------------------- /megatron/core/models/T5/__init__.py: -------------------------------------------------------------------------------- 1 | from .t5_model import T5Model 2 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/bert/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/bert_layer_specs.py: -------------------------------------------------------------------------------- 1 | from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add 2 | from megatron.core.fusions.fused_layer_norm import FusedLayerNorm 3 | from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear 4 | from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules 5 | from megatron.core.transformer.custom_layers.transformer_engine import ( 6 | TEDotProductAttention, 7 | TELayerNormColumnParallelLinear, 8 | TERowParallelLinear, 9 | ) 10 | from megatron.core.transformer.dot_product_attention import DotProductAttention 11 | from megatron.core.transformer.enums import AttnMaskType 12 | from megatron.core.transformer.mlp import MLP, MLPSubmodules 13 | from megatron.core.transformer.spec_utils import ModuleSpec 14 | from 
megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules 15 | 16 | # Use this spec to use lower level Transformer Engine modules (required for fp8 training) 17 | bert_layer_with_transformer_engine_spec = ModuleSpec( 18 | module=TransformerLayer, 19 | submodules=TransformerLayerSubmodules( 20 | self_attention=ModuleSpec( 21 | module=SelfAttention, 22 | params={"attn_mask_type": AttnMaskType.padding}, 23 | submodules=SelfAttentionSubmodules( 24 | linear_qkv=TELayerNormColumnParallelLinear, 25 | core_attention=TEDotProductAttention, 26 | linear_proj=TERowParallelLinear, 27 | ), 28 | ), 29 | self_attn_bda=get_bias_dropout_add, 30 | mlp=ModuleSpec( 31 | module=MLP, 32 | submodules=MLPSubmodules( 33 | linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear, 34 | ), 35 | ), 36 | mlp_bda=get_bias_dropout_add, 37 | ), 38 | ) 39 | 40 | # Use this spec for an implementation using only modules in megatron core 41 | bert_layer_local_spec = ModuleSpec( 42 | module=TransformerLayer, 43 | submodules=TransformerLayerSubmodules( 44 | input_layernorm=FusedLayerNorm, 45 | self_attention=ModuleSpec( 46 | module=SelfAttention, 47 | params={"attn_mask_type": AttnMaskType.padding}, 48 | submodules=SelfAttentionSubmodules( 49 | linear_qkv=ColumnParallelLinear, 50 | core_attention=DotProductAttention, 51 | linear_proj=RowParallelLinear, 52 | ), 53 | ), 54 | self_attn_bda=get_bias_dropout_add, 55 | pre_mlp_layernorm=FusedLayerNorm, 56 | mlp=ModuleSpec( 57 | module=MLP, 58 | submodules=MLPSubmodules( 59 | linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear, 60 | ), 61 | ), 62 | mlp_bda=get_bias_dropout_add, 63 | ), 64 | ) 65 | -------------------------------------------------------------------------------- /megatron/core/models/bert/bert_lm_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from megatron.core import tensor_parallel 5 | from megatron.core.fusions.fused_layer_norm import FusedLayerNorm 6 | from megatron.core.transformer.module import MegatronModule 7 | from megatron.core.transformer.transformer_config import TransformerConfig 8 | from megatron.core.transformer.utils import erf_gelu, get_linear_layer, openai_gelu 9 | 10 | 11 | class BertLMHead(MegatronModule): 12 | """Masked LM head for Bert 13 | 14 | Args: 15 | hidden_size: hidden size 16 | config (TransformerConfig): TransformerConfig object 17 | parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks 18 | vocab_size(int): The vocabulary size 19 | share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are shared. Defaults to False 20 | pre_process (bool): Include embedding layer (used with pipeline parallelism) 21 | """ 22 | 23 | def __init__( 24 | self, 25 | hidden_size: int, 26 | config: TransformerConfig, 27 | parallel_output: bool, 28 | vocab_size: int, 29 | pre_process: bool, 30 | share_embeddings_and_output_weights: bool = False, 31 | ): 32 | super().__init__(config=config) 33 | 34 | self.vocab_size = vocab_size 35 | self.parallel_output = parallel_output 36 | 37 | # TODO: Shoudl switch this to TE ? 
38 | self.dense = get_linear_layer( 39 | hidden_size, hidden_size, config.init_method, config.perform_initialization 40 | ) 41 | 42 | setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) 43 | setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) 44 | 45 | self.layernorm = FusedLayerNorm( 46 | config=config, 47 | hidden_size=hidden_size, 48 | eps=config.layernorm_epsilon, 49 | sequence_parallel=config.sequence_parallel, 50 | ) 51 | 52 | self.gelu = torch.nn.functional.gelu 53 | # TODO Use activation_func in config to determine what to use 54 | # if config.openai_gelu: # Don't have these configs in transformer config yet 55 | # self.gelu = openai_gelu 56 | # elif config.onnx_safe: # Don't have these configs in transformer config yet 57 | # self.gelu = erf_gelu 58 | 59 | self.output_layer = tensor_parallel.ColumnParallelLinear( 60 | config.hidden_size, 61 | self.vocab_size, 62 | config=config, 63 | init_method=config.init_method, 64 | bias=True, 65 | skip_bias_add=False, 66 | gather_output=not self.parallel_output, 67 | skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights, 68 | ) 69 | 70 | def forward(self, hidden_states: Tensor, word_embeddings_weight: Tensor) -> Tensor: 71 | hidden_states = self.dense(hidden_states) 72 | hidden_states = self.gelu(hidden_states) 73 | hidden_states = self.layernorm(hidden_states) 74 | logits, _ = self.output_layer(hidden_states, weight=word_embeddings_weight) 75 | return logits 76 | -------------------------------------------------------------------------------- /megatron/core/models/bert/pooler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from megatron.core import tensor_parallel 5 | from megatron.core.transformer.module import MegatronModule 6 | from megatron.core.transformer.transformer_config import TransformerConfig 7 | from megatron.core.transformer.utils import get_linear_layer 8 | 9 | 10 | class Pooler(MegatronModule): 11 | """Pooler layer. 12 | 13 | Pool hidden states of a specific token (for example start of the 14 | sequence) and add a linear transformation followed by a tanh. 15 | 16 | Args: 17 | hidden_size (int): The hidden size 18 | init_method (callable): weight initialization method for the linear layer. bias is set to zero. 19 | config (TransformerConfig): The transformer configuration 20 | sequence_parallel (bool): Using sequence parallel? Defaults to False 21 | """ 22 | 23 | def __init__( 24 | self, 25 | hidden_size: int, 26 | init_method: callable, 27 | config: TransformerConfig, 28 | sequence_parallel: bool = False, 29 | ): 30 | super(Pooler, self).__init__(config) 31 | # TODO: Should switch this to TE? 32 | self.dense = get_linear_layer( 33 | hidden_size, hidden_size, init_method, config.perform_initialization 34 | ) 35 | self.sequence_parallel = sequence_parallel 36 | 37 | def forward(self, hidden_states: Tensor, sequence_index=0): 38 | # hidden_states: [s, b, h] 39 | # sequence_index: index of the token to pool.
40 | 41 | # gather data along sequence dimensions 42 | # same pooler is run on all tensor parallel nodes 43 | if self.sequence_parallel: 44 | hidden_states = tensor_parallel.gather_from_sequence_parallel_region( 45 | hidden_states, tensor_parallel_output_grad=False 46 | ) 47 | 48 | pooled = hidden_states[sequence_index, :, :] 49 | pooled = self.dense(pooled) 50 | pooled = torch.tanh(pooled) 51 | return pooled 52 | -------------------------------------------------------------------------------- /megatron/core/models/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/common/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/common/embeddings/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/language_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/common/language_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/common/vision_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/vision_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Megatron Vision Module.""" 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | 7 | 8 | # Note: This is only a stub at the moment. This will be expanded in follow-up changes. 9 | class VisionModule(MegatronModule): 10 | """Base vision module that has common helper functions used across CLIP, ViT, etc. 11 | 12 | Args: 13 | config (TransformerConfig): Input transformer config for the model 14 | """ 15 | 16 | def __init__(self, config: TransformerConfig) -> None: 17 | super().__init__(config=config) 18 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /megatron/core/models/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from .config import RetroConfig 4 | from .decoder_spec import get_retro_decoder_block_spec 5 | from .model import RetroModel 6 | -------------------------------------------------------------------------------- /megatron/core/models/retro/base_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.models.retro.config import RetroConfig 4 | from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules 5 | from megatron.core.transformer.enums import AttnMaskType 6 | from megatron.core.transformer.module import MegatronModule 7 | 8 | 9 | class BaseRetroCrossAttention(MegatronModule): 10 | 11 | """Base class for Retro cross attention, for both encoder & decoder layers. 12 | 13 | This class collects the retro arguments below (i.e., num neighbors, chunk 14 | length, and retrieve length) for use in Retro's custom cross attention 15 | operators. 16 | 17 | Arguments: 18 | config (RetroConfig): Retro config. 19 | 20 | submodules (CrossAttentionSubmodules): Cross attention submodules. 21 | 22 | layer_number (int): Layer number within transformer block. 23 | 24 | attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). 25 | """ 26 | 27 | def __init__( 28 | self, 29 | config: RetroConfig, 30 | submodules: CrossAttentionSubmodules, 31 | layer_number: int = 1, 32 | attn_mask_type: AttnMaskType = AttnMaskType.padding, 33 | ): 34 | super().__init__(config=config) 35 | 36 | self.attn = CrossAttention( 37 | config=config, 38 | submodules=submodules, 39 | layer_number=layer_number, 40 | attn_mask_type=attn_mask_type, 41 | ) 42 | 43 | self.retro_num_neighbors = config.retro_num_neighbors 44 | self.retro_chunk_length = config.retro_preprocess.retro_gpt_chunk_length 45 | self.retro_retrieved_length = config.retro_preprocess.retro_gpt_retrieved_length 46 | -------------------------------------------------------------------------------- /megatron/core/models/retro/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import types 4 | from dataclasses import dataclass 5 | 6 | from megatron.core.transformer import TransformerConfig 7 | 8 | 9 | @dataclass 10 | class RetroConfig(TransformerConfig): 11 | 12 | """Configuration object for Retro models. 13 | 14 | Attributes: 15 | 16 | retro_preprocess (SimpleNamespace): Retro preprocess arguments. 17 | retro_workdir (str): Retro working directory, which contains the 18 | preprocessed data for pretraining. This directory is built during 19 | preprocessing (see tools/retro/README.md), and contains subdirectories 20 | for the chunk database and pretraining neighbors. 21 | retro_encoder_num_layers (int): Number of layers to use for the retrieval 22 | encoder. 23 | retro_encoder_hidden_dropout (float): Hidden dropout for retrieval 24 | encoder. 25 | retro_encoder_attention_dropout (float): Attention dropout for retrieval 26 | encoder. 27 | retro_num_neighbors (int): Number of neighbors to retrieve during 28 | pretraining. 29 | retro_num_retrieved_chunks (int): Number of chunks to retrieve from the 30 | retrieval database. 31 | retro_verify_neighbor_count (bool): Verify that len(GPT dataset) == 32 | len(saved neighbors). 33 | """ 34 | 35 | # Retro.
36 | retro_preprocess: types.SimpleNamespace = None 37 | retro_workdir: str = None 38 | retro_encoder_num_layers: int = 2 39 | retro_encoder_hidden_dropout: float = 0.1 40 | retro_encoder_attention_dropout: float = 0.1 41 | retro_num_neighbors: int = 2 42 | retro_num_retrieved_chunks: int = 2 43 | retro_verify_neighbor_count: bool = True 44 | -------------------------------------------------------------------------------- /megatron/core/models/retro/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Retro Model.""" 4 | 5 | from torch import Tensor 6 | 7 | from megatron.core import InferenceParams 8 | from megatron.core.models.gpt import GPTModel 9 | 10 | 11 | class RetroModel(GPTModel): 12 | 13 | """Retro Model. 14 | 15 | A Retro model mostly re-uses the GPTModel interface, with the only difference 16 | being the embedding of the 'context' that is used by Retro for processing 17 | neighbor tokens. This embedded context is then forwarded to the Transformer 18 | Block. 19 | """ 20 | 21 | def forward( 22 | self, 23 | input_ids: Tensor, 24 | position_ids: Tensor, 25 | attention_mask: Tensor, 26 | context_input_ids: Tensor = None, 27 | context_position_ids: Tensor = None, 28 | context_mask: Tensor = None, 29 | decoder_input: Tensor = None, 30 | labels: Tensor = None, 31 | inference_params: InferenceParams = None, 32 | ) -> Tensor: 33 | """RetroModel forward method. 34 | 35 | Forward input tokens & mask, along with neighbor tokens & mask, through 36 | the Retro model. 37 | 38 | Arguments: 39 | input_ids (Tensor): Input token IDs. 40 | 41 | position_ids (Tensor): Input position IDs. 42 | 43 | attention_mask (Tensor): Input attention mask. 44 | 45 | context_input_ids (Tensor): Context (i.e., neighbor) token IDs. 46 | 47 | context_position_ids (Tensor): Context (i.e., neighbor) position IDs. 48 | 49 | context_mask (Tensor): Context (i.e., neighbor) attention mask. 50 | 51 | decoder_input (Tensor): When using pipeline parallelism, input_ids and 52 | position_ids will only be used on the first stage, and for all other 53 | stages decoder_input will be provided via communication from the 54 | previous stage. 55 | 56 | labels (Tensor): The labels of dimension [batch size, seq length]. 57 | 58 | inference_params (InferenceParams): Parameters for inference. 59 | """ 60 | 61 | # Argument shapes: 62 | # Notation: 63 | # ns : Sequence length. 64 | # bs : Batch size. 65 | # d : Hidden size. 66 | # l : Number of chunks per sample (i.e., seq_length/chunk_length). 67 | # k : Number of neighbors. 68 | # r : Number of retrieved tokens (neighbors + continuation). 69 | # - input_ids: [ bs, ns ] 70 | # - context_ids: [ k*bs*l, r ] 71 | # - context: [ r, k*bs*l, d ] 72 | # - output: [ ns, bs, d ] 73 | 74 | # Context embedding (e.g., for Retro neighbor tokens). 75 | if context_input_ids is not None: 76 | context = self.embedding(context_input_ids, context_position_ids) 77 | else: 78 | context = None 79 | 80 | # Call GPTModel.forward, and pass in embedded context.
81 | return super().forward( 82 | input_ids=input_ids, 83 | position_ids=position_ids, 84 | attention_mask=attention_mask, 85 | decoder_input=decoder_input, 86 | labels=labels, 87 | inference_params=inference_params, 88 | extra_block_kwargs={"context": context, "context_mask": context_mask,}, 89 | ) 90 | -------------------------------------------------------------------------------- /megatron/core/models/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/vision/__init__.py -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | MAJOR = 0 5 | MINOR = 5 6 | PATCH = 0 7 | PRE_RELEASE = 'rc0' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = ( 19 | 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 20 | ) 21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 23 | __description__ = ( 24 | 'Megatron Core - a library for efficient and scalable training of transformer based models' 25 | ) 26 | __license__ = 'BSD-3' 27 | __keywords__ = ( 28 | 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 29 | ) 30 | -------------------------------------------------------------------------------- /megatron/core/packed_seq_params.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from torch import Tensor 4 | 5 | 6 | @dataclass 7 | class PackedSeqParams: 8 | # parameters to TEDotProductAttention and fused rope kernels for the `thd` (packed) sequence format, 9 | qkv_format: str = None 10 | cu_seqlens_q: Tensor = None 11 | cu_seqlens_kv: Tensor = None 12 | max_seqlen_q: Tensor = None 13 | max_seqlen_kv: Tensor = None 14 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_parallel_cross_entropy 2 | from .data import broadcast_data 3 | from .layers import ( 4 | ColumnParallelLinear, 5 | RowParallelLinear, 6 | VocabParallelEmbedding, 7 | copy_tensor_model_parallel_attributes, 8 | linear_with_grad_accumulation_and_async_allreduce, 9 | 
param_is_not_tensor_parallel_duplicate, 10 | set_defaults_if_not_set_tensor_model_parallel_attributes, 11 | set_tensor_model_parallel_attributes, 12 | ) 13 | from .mappings import ( 14 | copy_to_tensor_model_parallel_region, 15 | gather_from_sequence_parallel_region, 16 | gather_from_sequence_parallel_region_to_moe, 17 | gather_from_tensor_model_parallel_region, 18 | reduce_scatter_to_sequence_parallel_region_from_moe, 19 | scatter_to_sequence_parallel_region, 20 | scatter_to_tensor_model_parallel_region, 21 | ) 22 | from .random import ( 23 | checkpoint, 24 | get_cuda_rng_tracker, 25 | get_data_parallel_rng_tracker_name, 26 | model_parallel_cuda_manual_seed, 27 | ) 28 | from .utils import ( 29 | gather_split_1d_tensor, 30 | split_tensor_along_last_dim, 31 | split_tensor_into_1d_equal_chunks, 32 | ) 33 | 34 | __all__ = [ 35 | # cross_entropy.py 36 | "vocab_parallel_cross_entropy", 37 | # data.py 38 | "broadcast_data", 39 | # layers.py 40 | "ColumnParallelLinear", 41 | "RowParallelLinear", 42 | "VocabParallelEmbedding", 43 | "set_tensor_model_parallel_attributes", 44 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 45 | "copy_tensor_model_parallel_attributes", 46 | "param_is_not_tensor_parallel_duplicate", 47 | "linear_with_grad_accumulation_and_async_allreduce", 48 | # mappings.py 49 | "copy_to_tensor_model_parallel_region", 50 | "gather_from_tensor_model_parallel_region", 51 | "gather_from_sequence_parallel_region", 52 | # "reduce_from_tensor_model_parallel_region", 53 | "scatter_to_tensor_model_parallel_region", 54 | "scatter_to_sequence_parallel_region", 55 | # random.py 56 | "checkpoint", 57 | "get_cuda_rng_tracker", 58 | "model_parallel_cuda_manual_seed", 59 | # utils.py 60 | "split_tensor_along_last_dim", 61 | "split_tensor_into_1d_equal_chunks", 62 | "gather_split_1d_tensor", 63 | "gather_from_sequence_parallel_region_to_moe", 64 | "reduce_scatter_to_sequence_parallel_region_from_moe", 65 | ] 66 | -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .module import MegatronModule 4 | from .spec_utils import ModuleSpec, build_module 5 | from .transformer_config import TransformerConfig 6 | from .transformer_layer import TransformerLayer, TransformerLayerSubmodules 7 | -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/transformer/custom_layers/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 
7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | no_mask = 3 # only used for TE 27 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp(x) -> x 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | 17 | class IdentityFuncOp(IdentityOp): 18 | """ 19 | This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x. 20 | Such a func is handy for ops like `bias_dropout_fusion` which themselves 21 | return a function at runtime based on passed arguments 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__() 26 | 27 | def forward(self, *args, **kwargs): 28 | return super().forward 29 | -------------------------------------------------------------------------------- /megatron/core/transformer/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/transformer/moe/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/moe/grouped_gemm_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | try: 4 | import grouped_gemm 5 | except ImportError: 6 | grouped_gemm = None 7 | 8 | 9 | def grouped_gemm_is_available(): 10 | return grouped_gemm is not None 11 | 12 | 13 | def assert_grouped_gemm_is_available(): 14 | assert grouped_gemm_is_available(), ( 15 | "Grouped GEMM is not available. Please run " 16 | "`pip install git+https://github.com/fanshiqing/grouped_gemm@main`." 17 | ) 18 | 19 | 20 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None 21 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/data/__init__.py -------------------------------------------------------------------------------- /megatron/data/multimodal_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
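# NOTE: this dataset expects paired records in the indexed dataset: the document at
# doc_idx[idx] holds the tokenized text (mode 0) and the following document holds the raw
# image bytes (mode 1), whose first element stores the number of padding bytes to strip
# before decoding. The Normalize() statistics below are the standard CLIP image mean/std.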
2 | 3 | from PIL import Image, UnidentifiedImageError 4 | import numpy as np 5 | import io 6 | import torch 7 | 8 | try: 9 | from torchvision.transforms import InterpolationMode 10 | BICUBIC = InterpolationMode.BICUBIC 11 | except ImportError: 12 | BICUBIC = Image.BICUBIC 13 | 14 | from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomResizedCrop, Resize 15 | 16 | def _convert_image_to_rgb(image): 17 | return image.convert("RGB") 18 | 19 | def _transform(img_h, img_w): 20 | return Compose([ 21 | ToPILImage(), 22 | RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0), interpolation=BICUBIC), 23 | _convert_image_to_rgb, 24 | ToTensor(), 25 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 26 | ]) 27 | 28 | class MultiModalDataset(torch.utils.data.Dataset): 29 | 30 | def __init__(self, name, data_prefix, indexed_dataset, 31 | num_samples, seq_length, seed, img_h, img_w): 32 | 33 | self.name = name 34 | self.indexed_dataset = indexed_dataset 35 | self.doc_idx = indexed_dataset.get_document_indices() 36 | self.visual_transform = _transform(img_h, img_w) 37 | 38 | def __len__(self): 39 | return self.indexed_dataset.sequence_lengths.shape[0] 40 | 41 | def __getitem__(self, idx): 42 | text_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]) 43 | assert mode == 0 44 | img_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]+1) 45 | assert mode == 1 46 | img_pad = img_sample[0].item() 47 | xs = img_sample[1:].tobytes(order='C') 48 | xs = xs[:len(xs)-img_pad] 49 | 50 | img_sample = np.array(Image.open(io.BytesIO(xs))) 51 | img_sample = self.visual_transform(img_sample).reshape(-1) 52 | 53 | return {'text': np.array(text_sample, dtype=np.int64), 54 | 'img': np.array(img_sample, dtype=np.float32)} 55 | -------------------------------------------------------------------------------- /megatron/dist_signal_handler.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 8 | world_size = torch.distributed.get_world_size() 9 | else: 10 | world_size = 1 11 | return world_size 12 | 13 | 14 | def get_device(local_rank=None): 15 | backend = torch.distributed.get_backend() 16 | if backend == 'nccl': 17 | if local_rank is None: 18 | device = torch.device('cuda') 19 | else: 20 | device = torch.device(f'cuda:{local_rank}') 21 | elif backend == 'gloo': 22 | device = torch.device('cpu') 23 | else: 24 | raise RuntimeError 25 | return device 26 | 27 | 28 | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): 29 | if not torch.distributed.is_available() or \ 30 | not torch.distributed.is_initialized(): 31 | return [item] 32 | 33 | device = get_device(local_rank) 34 | 35 | if group is not None: 36 | group_size = group.size() 37 | else: 38 | group_size = get_world_size() 39 | 40 | tensor = torch.tensor([item], device=device, dtype=dtype) 41 | output_tensors = [ 42 | torch.zeros(1, dtype=tensor.dtype, device=tensor.device) 43 | for _ in range(group_size) 44 | ] 45 | torch.distributed.all_gather(output_tensors, tensor, group, async_op) 46 | output = [elem.item() for elem in output_tensors] 47 | return output 48 | 49 | 50 | class DistributedSignalHandler: 51 | def __init__(self, sig=signal.SIGTERM): 52 | self.sig = sig 53 | 54 | def signals_received(self): 55 | all_received = all_gather_item( 56 | self._signal_received, dtype=torch.int32 57 | ) 
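        # One flag per rank; callers can combine them as needed (e.g. exit once any rank reports SIGTERM).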
58 | return all_received 59 | 60 | def __enter__(self): 61 | self._signal_received = False 62 | self.released = False 63 | self.original_handler = signal.getsignal(self.sig) 64 | 65 | def handler(signum, frame): 66 | self._signal_received = True 67 | 68 | signal.signal(self.sig, handler) 69 | 70 | return self 71 | 72 | def __exit__(self, type, value, tb): 73 | self.release() 74 | 75 | def release(self): 76 | if self.released: 77 | return False 78 | 79 | signal.signal(self.sig, self.original_handler) 80 | self.released = True 81 | return True 82 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | import pathlib 5 | import subprocess 6 | 7 | from torch.utils import cpp_extension 8 | 9 | # Setting this param to a list has a problem of generating different 10 | # compilation commands (with diferent order of architectures) and 11 | # leading to recompilation of fused kernels. Set it to empty string 12 | # to avoid recompilation and assign arch flags explicity in 13 | # extra_cuda_cflags below 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(args): 18 | 19 | # Check if cuda 11 is installed for compute capability 8.0 20 | cc_flag = [] 21 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( 22 | cpp_extension.CUDA_HOME 23 | ) 24 | if int(bare_metal_major) >= 11: 25 | cc_flag.append('-gencode') 26 | cc_flag.append('arch=compute_80,code=sm_80') 27 | if int(bare_metal_minor) >= 8: 28 | cc_flag.append('-gencode') 29 | cc_flag.append('arch=compute_90,code=sm_90') 30 | 31 | # Build path 32 | srcpath = pathlib.Path(__file__).parent.absolute() 33 | buildpath = srcpath / "build" 34 | _create_build_dir(buildpath) 35 | 36 | # Helper function to build the kernels. 
37 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 38 | return cpp_extension.load( 39 | name=name, 40 | sources=sources, 41 | build_directory=buildpath, 42 | extra_cflags=[ 43 | "-O3", 44 | ], 45 | extra_cuda_cflags=[ 46 | "-O3", 47 | "-gencode", 48 | "arch=compute_70,code=sm_70", 49 | "--use_fast_math", 50 | ] 51 | + extra_cuda_flags 52 | + cc_flag, 53 | verbose=(args.rank == 0), 54 | ) 55 | 56 | 57 | def _get_cuda_bare_metal_version(cuda_dir): 58 | raw_output = subprocess.check_output( 59 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 60 | ) 61 | output = raw_output.split() 62 | release_idx = output.index("release") + 1 63 | release = output[release_idx].split(".") 64 | bare_metal_major = release[0] 65 | bare_metal_minor = release[1][0] 66 | 67 | return raw_output, bare_metal_major, bare_metal_minor 68 | 69 | 70 | def _create_build_dir(buildpath): 71 | try: 72 | os.mkdir(buildpath) 73 | except OSError: 74 | if not os.path.isdir(buildpath): 75 | print(f"Creation of the build directory {buildpath} failed") 76 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /*This code is copied fron NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/fused_kernels/type_shim.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | 4 | #include 5 | #include "compat.h" 6 | 7 | 8 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ 9 | switch(TYPE) \ 10 | { \ 11 | case at::ScalarType::Half: \ 12 | { \ 13 | using scalar_t = at::Half; \ 14 | __VA_ARGS__; \ 15 | break; \ 16 | } \ 17 | case at::ScalarType::BFloat16: \ 18 | { \ 19 | using scalar_t = at::BFloat16; \ 20 | __VA_ARGS__; \ 21 | break; \ 22 | } \ 23 | default: \ 24 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 25 | } 26 | 27 | 28 | #define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...) \ 29 | switch(TYPE) \ 30 | { \ 31 | case at::ScalarType::Half: \ 32 | { \ 33 | using scalar_t = at::Half; \ 34 | __VA_ARGS__; \ 35 | break; \ 36 | } \ 37 | case at::ScalarType::BFloat16: \ 38 | { \ 39 | using scalar_t = at::BFloat16; \ 40 | __VA_ARGS__; \ 41 | break; \ 42 | } \ 43 | case at::ScalarType::Float: \ 44 | { \ 45 | using scalar_t = float; \ 46 | __VA_ARGS__; \ 47 | break; \ 48 | } \ 49 | default: \ 50 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 51 | } 52 | 53 | 54 | 55 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) 
\ 56 | switch(TYPEIN) \ 57 | { \ 58 | case at::ScalarType::Float: \ 59 | { \ 60 | using scalar_t_in = float; \ 61 | switch(TYPEOUT) \ 62 | { \ 63 | case at::ScalarType::Float: \ 64 | { \ 65 | using scalar_t_out = float; \ 66 | __VA_ARGS__; \ 67 | break; \ 68 | } \ 69 | case at::ScalarType::Half: \ 70 | { \ 71 | using scalar_t_out = at::Half; \ 72 | __VA_ARGS__; \ 73 | break; \ 74 | } \ 75 | case at::ScalarType::BFloat16: \ 76 | { \ 77 | using scalar_t_out = at::BFloat16; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | default: \ 82 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 83 | } \ 84 | break; \ 85 | } \ 86 | case at::ScalarType::Half: \ 87 | { \ 88 | using scalar_t_in = at::Half; \ 89 | using scalar_t_out = at::Half; \ 90 | __VA_ARGS__; \ 91 | break; \ 92 | } \ 93 | case at::ScalarType::BFloat16: \ 94 | { \ 95 | using scalar_t_in = at::BFloat16; \ 96 | using scalar_t_out = at::BFloat16; \ 97 | __VA_ARGS__; \ 98 | break; \ 99 | } \ 100 | default: \ 101 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 102 | } 103 | 104 | -------------------------------------------------------------------------------- /megatron/log_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import sys 4 | from logging import LogRecord, StreamHandler 5 | 6 | BLACKLISTED_MODULES = ["torch.distributed"] 7 | 8 | 9 | class CustomHandler(StreamHandler): 10 | """ 11 | Custom handler to filter out logging from code outside of 12 | Megatron Core, and dump to stdout. 13 | """ 14 | 15 | def __init__(self): 16 | super().__init__(stream=sys.stdout) 17 | 18 | def filter(self, record: LogRecord) -> bool: 19 | # Prevent log entries that come from the blacklisted modules 20 | # through (e.g., PyTorch Distributed). 21 | for blacklisted_module in BLACKLISTED_MODULES: 22 | if record.name.startswith(blacklisted_module): 23 | return False 24 | return True 25 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | from .rms_norm import RMSNorm 5 | 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module 11 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
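# NOTE: GeLUFunction.backward below returns the same tensor for both the input and the bias
# gradient: since forward computes gelu(bias + y), d(bias + y)/dbias == d(bias + y)/dy == 1,
# so a single bias_gelu_back pass covers both inputs.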
2 | 3 | import torch 4 | from megatron.core.jit import jit_fuser 5 | 6 | 7 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 8 | # 1/sqrt(2*pi)-> 0.3989423 9 | # 1/sqrt(2) -> 0.70710678 10 | # sqrt(2/pi) -> 0.79788456 11 | # this function is tanh approximation of gelu 12 | # actual gelu is: 13 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 14 | 15 | @jit_fuser 16 | def bias_gelu(bias, y): 17 | x = bias + y 18 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 19 | 20 | # gradient of tanh approximation of gelu 21 | # gradient of actual gelu is: 22 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 23 | @jit_fuser 24 | def bias_gelu_back(g, bias, y): 25 | x = bias + y 26 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 27 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 28 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 29 | return ff*g 30 | 31 | class GeLUFunction(torch.autograd.Function): 32 | @staticmethod 33 | # bias is an optional argument 34 | def forward(ctx, input, bias): 35 | ctx.save_for_backward(input, bias) 36 | return bias_gelu(bias, input) 37 | 38 | @staticmethod 39 | def backward(ctx, grad_output): 40 | input, bias = ctx.saved_tensors 41 | tmp = bias_gelu_back(grad_output, bias, input) 42 | return tmp, tmp 43 | 44 | bias_gelu_impl = GeLUFunction.apply 45 | -------------------------------------------------------------------------------- /megatron/model/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from torch import nn 5 | 6 | class RMSNorm(torch.nn.Module): 7 | 8 | def __init__(self, 9 | dim: int, 10 | eps: float = 1e-6, 11 | sequence_parallel: bool = False): 12 | """RMS Normaliation module 13 | 14 | Arguments: 15 | dim (int): The width of input, i.e. hidden size 16 | eps (float): epsilon to use for the norm, default to 1e-6 17 | sequence_parallel (bool): Set to true if sequence parallelism is being used, 18 | this marks the weights as needing to be allreduced. 19 | """ 20 | super().__init__() 21 | self.eps = eps 22 | self.weight = nn.Parameter(torch.ones(dim)) 23 | 24 | setattr(self.weight, 'sequence_parallel', sequence_parallel) 25 | 26 | def _norm(self, x): 27 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 28 | 29 | def forward(self, x): 30 | output = self._norm(x.float()).type_as(x) 31 | return output * self.weight 32 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Utilities for models.""" 4 | 5 | import math 6 | 7 | import torch 8 | 9 | from megatron import get_args 10 | from megatron.model import LayerNorm, RMSNorm 11 | from megatron.core.jit import jit_fuser 12 | 13 | def init_method_normal(sigma): 14 | """Init method based on N(0, sigma).""" 15 | def init_(tensor): 16 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 17 | 18 | return init_ 19 | 20 | 21 | def scaled_init_method_normal(sigma, num_layers): 22 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 23 | std = sigma / math.sqrt(2.0 * num_layers) 24 | 25 | def init_(tensor): 26 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 27 | 28 | return init_ 29 | 30 | 31 | def attention_mask_func(attention_scores, attention_mask): 32 | attention_scores.masked_fill_(attention_mask, -10000.0) 33 | return attention_scores 34 | 35 | 36 | def get_linear_layer(rows, columns, init_method): 37 | """Simple linear layer with weight initialization.""" 38 | layer = torch.nn.Linear(rows, columns) 39 | if get_args().perform_initialization: 40 | init_method(layer.weight) 41 | with torch.no_grad(): 42 | layer.bias.zero_() 43 | return layer 44 | 45 | 46 | @jit_fuser 47 | def gelu_impl(x): 48 | """OpenAI's gelu implementation.""" 49 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 50 | 51 | (1.0 + 0.044715 * x * x))) 52 | def openai_gelu(x): 53 | return gelu_impl(x) 54 | 55 | 56 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 57 | @jit_fuser 58 | def erf_gelu(x): 59 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 60 | 61 | 62 | def get_norm(config): 63 | args = get_args() 64 | if args.normalization == "LayerNorm": 65 | return LayerNorm( 66 | config.hidden_size, 67 | eps=config.layernorm_epsilon, 68 | no_persist_layer_norm=not config.persist_layer_norm, 69 | sequence_parallel=config.sequence_parallel, 70 | apply_layernorm_1p=args.apply_layernorm_1p) 71 | elif args.normalization == "RMSNorm": 72 | if args.apply_layernorm_1p: 73 | raise NotImplementedError('RMSNorm does not currently support the layernorm_1p formulation.') 74 | 75 | return RMSNorm(dim=config.hidden_size, 76 | eps=config.layernorm_epsilon, 77 | sequence_parallel=config.sequence_parallel) 78 | else: 79 | raise Exception(f"unsupported norm type '{args.normalization}'.") 80 | -------------------------------------------------------------------------------- /megatron/model/vision/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Vision Transformer(VIT) model.""" 4 | 5 | import torch 6 | from torch.nn.init import trunc_normal_ 7 | from megatron import get_args 8 | from megatron.model.utils import get_linear_layer 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3_avg 11 | from megatron.model.module import MegatronModule 12 | 13 | class VitClassificationModel(MegatronModule): 14 | """Vision Transformer Model.""" 15 | 16 | def __init__(self, config, num_classes, finetune=False, 17 | pre_process=True, post_process=True): 18 | super(VitClassificationModel, self).__init__() 19 | args = get_args() 20 | self.config = config 21 | 22 | self.hidden_size = args.hidden_size 23 | self.num_classes = num_classes 24 | self.finetune = finetune 25 | self.pre_process = pre_process 26 | self.post_process = post_process 27 | self.backbone = VitBackbone( 28 | config=config, 29 | pre_process=self.pre_process, 30 | post_process=self.post_process, 31 | single_token_output=True 32 | ) 33 | 34 | if self.post_process: 35 | if not self.finetune: 36 | self.head = VitMlpHead(config, self.hidden_size, self.num_classes) 37 | else: 38 | self.head = get_linear_layer( 39 | self.hidden_size, 40 | self.num_classes, 41 | torch.nn.init.zeros_ 42 | ) 43 | 44 | def set_input_tensor(self, input_tensor): 45 | """See megatron.model.transformer.set_input_tensor()""" 46 | self.backbone.set_input_tensor(input_tensor) 47 | 48 | def forward(self, input): 49 | hidden_states = self.backbone(input) 50 | 51 | if self.post_process: 52 | hidden_states = self.head(hidden_states) 53 | 54 | return hidden_states 55 | 56 | 57 | class MitClassificationModel(MegatronModule): 58 | """Mix vision Transformer Model.""" 59 | 60 | def __init__(self, num_classes, 61 | pre_process=True, post_process=True): 62 | super(MitClassificationModel, self).__init__() 63 | args = get_args() 64 | 65 | self.hidden_size = args.hidden_size 66 | self.num_classes = num_classes 67 | 68 | self.backbone = mit_b3_avg() 69 | self.head = torch.nn.Linear(512, num_classes) 70 | self.apply(self._init_weights) 71 | 72 | def _init_weights(self, m): 73 | if isinstance(m, torch.nn.Linear): 74 | trunc_normal_(m.weight, std=.02) 75 | if isinstance(m, torch.nn.Linear) and m.bias is not None: 76 | torch.nn.init.constant_(m.bias, 0) 77 | 78 | def set_input_tensor(self, input_tensor): 79 | """See megatron.model.transformer.set_input_tensor()""" 80 | pass 81 | 82 | def forward(self, input): 83 | hidden_states = self.backbone(input) 84 | hidden_states = self.head(hidden_states) 85 | 86 | return hidden_states 87 | -------------------------------------------------------------------------------- /megatron/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > output_h: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would more aligned if ' 23 | f'input size {(input_h, input_w)} is 
`x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import os 5 | import random 6 | import numpy 7 | import torch 8 | 9 | import mpu 10 | 11 | 12 | class IdentityLayer(torch.nn.Module): 13 | def __init__(self, size, scale=1.0): 14 | super(IdentityLayer, self).__init__() 15 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 16 | 17 | def forward(self): 18 | return self.weight 19 | 20 | 21 | def set_random_seed(seed): 22 | """Set random seed for reproducability.""" 23 | random.seed(seed) 24 | numpy.random.seed(seed) 25 | torch.manual_seed(seed) 26 | mpu.model_parallel_cuda_manual_seed(seed) 27 | 28 | 29 | def initialize_distributed(backend='nccl'): 30 | """Initialize torch.distributed.""" 31 | # Get local rank in case it is provided. 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--local_rank', type=int, default=None, 34 | help='local rank passed from distributed launcher') 35 | args = parser.parse_args() 36 | local_rank = args.local_rank 37 | 38 | # Get rank and world size. 39 | rank = int(os.getenv('RANK', '0')) 40 | world_size = int(os.getenv("WORLD_SIZE", '1')) 41 | 42 | print('> initializing torch.distributed with local rank: {}, ' 43 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 44 | 45 | # Set the device id. 46 | device = rank % torch.cuda.device_count() 47 | if local_rank is not None: 48 | device = local_rank 49 | torch.cuda.set_device(device) 50 | 51 | # Call the init process. 52 | init_method = 'tcp://' 53 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 54 | master_port = os.getenv('MASTER_PORT', '6000') 55 | init_method += master_ip + ':' + master_port 56 | torch.distributed.init_process_group( 57 | backend=backend, 58 | world_size=world_size, 59 | rank=rank, 60 | init_method=init_method) 61 | 62 | 63 | def print_separator(message): 64 | torch.distributed.barrier() 65 | filler_len = (78 - len(message)) // 2 66 | filler = '-' * filler_len 67 | string = '\n' + filler + ' {} '.format(message) + filler 68 | if torch.distributed.get_rank() == 0: 69 | print(string, flush=True) 70 | torch.distributed.barrier() 71 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | from mpu import data as data_utils 6 | import mpu 7 | import torch 8 | import functools 9 | import operator 10 | import sys 11 | sys.path.append("../..") 12 | 13 | 14 | def test_broadcast_data(tensor_model_parallel_size): 15 | 16 | if torch.distributed.get_rank() == 0: 17 | print('> testing broadcast_data with model parallel size {} ...'. 18 | format(tensor_model_parallel_size)) 19 | 20 | mpu.initialize_model_parallel(tensor_model_parallel_size) 21 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 22 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 23 | 24 | key_size_t = {'key1': [7, 11], 25 | 'key2': [8, 2, 1], 26 | 'key3': [13], 27 | 'key4': [5, 1, 2], 28 | 'key5': [5, 12]} 29 | keys = list(key_size_t.keys()) 30 | 31 | data = {} 32 | data_t = {} 33 | for key in key_size_t: 34 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 35 | data_t[key] = data[key].clone() 36 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 37 | data_t['keyX'] = data['keyX'].clone() 38 | if mpu.get_tensor_model_parallel_rank() != 0: 39 | data = None 40 | 41 | data_utils._check_data_types(keys, data_t, torch.int64) 42 | key_size, key_numel, \ 43 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 44 | for key in keys: 45 | assert key_size[key] == key_size_t[key] 46 | total_numel_t = 0 47 | for key in keys: 48 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 49 | assert key_numel[key] == target_size 50 | total_numel_t += target_size 51 | assert total_numel == total_numel_t 52 | 53 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 54 | for key in keys: 55 | tensor = data_t[key].cuda() 56 | assert data_b[key].sub(tensor).abs().max() == 0 57 | 58 | # Reset groups 59 | mpu.destroy_tensor_model_parallel() 60 | 61 | torch.distributed.barrier() 62 | if torch.distributed.get_rank() == 0: 63 | print('>> passed the test :-)') 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | initialize_distributed() 69 | world_size = torch.distributed.get_world_size() 70 | 71 | tensor_model_parallel_size = 1 72 | while tensor_model_parallel_size <= world_size: 73 | print_separator('test test broadcast data') 74 | test_broadcast_data(tensor_model_parallel_size) 75 | tensor_model_parallel_size *= 2 76 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | import mpu 6 | import torch 7 | import sys 8 | sys.path.append("../..") 9 | 10 | 11 | def test_initialize_model_parallel(tensor_model_parallel_size): 12 | 13 | if torch.distributed.get_rank() == 0: 14 | print('> testing initialize_model_parallel with size {} ...'.format( 15 | tensor_model_parallel_size)) 16 | tensor_model_parallel_size_ = min(tensor_model_parallel_size, 17 | torch.distributed.get_world_size()) 18 | assert not mpu.model_parallel_is_initialized() 19 | mpu.initialize_model_parallel(tensor_model_parallel_size_) 20 | assert mpu.model_parallel_is_initialized() 21 | 22 | # Checks. 
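    # Expected layout: tensor-parallel groups span consecutive ranks, so
    # tp_rank = global_rank % tp_size, while data-parallel peers are strided by tp_size,
    # giving dp_world_size = world_size // tp_size and dp_rank = global_rank // tp_size.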
23 | def check(group, world_size, rank): 24 | assert world_size == torch.distributed.get_world_size(group=group) 25 | assert rank == torch.distributed.get_rank(group=group) 26 | 27 | # Model parallel. 28 | world_size = tensor_model_parallel_size_ 29 | rank = torch.distributed.get_rank() % tensor_model_parallel_size_ 30 | assert world_size == mpu.get_tensor_model_parallel_world_size() 31 | assert rank == mpu.get_tensor_model_parallel_rank() 32 | check(mpu.get_tensor_model_parallel_group(), world_size, rank) 33 | 34 | # Data parallel. 35 | world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_ 36 | rank = torch.distributed.get_rank() // tensor_model_parallel_size 37 | assert world_size == mpu.get_data_parallel_world_size() 38 | assert rank == mpu.get_data_parallel_rank() 39 | check(mpu.get_data_parallel_group(), world_size, rank) 40 | 41 | # Reset groups 42 | mpu.destroy_model_parallel() 43 | 44 | torch.distributed.barrier() 45 | if torch.distributed.get_rank() == 0: 46 | print('>> passed the test :-)') 47 | 48 | 49 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_): 50 | 51 | if torch.distributed.get_rank() == 0: 52 | print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format( 53 | tensor_model_parallel_size_)) 54 | tensor_model_parallel_size = min(tensor_model_parallel_size_, 55 | torch.distributed.get_world_size()) 56 | assert not mpu.model_parallel_is_initialized() 57 | mpu.initialize_model_parallel(tensor_model_parallel_size) 58 | assert mpu.model_parallel_is_initialized() 59 | 60 | # Checks 61 | src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank() 62 | assert mpu.get_tensor_model_parallel_src_rank() == src_rank 63 | 64 | # Reset groups 65 | mpu.destroy_model_parallel() 66 | 67 | torch.distributed.barrier() 68 | if torch.distributed.get_rank() == 0: 69 | print('>> passed the test :-)') 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | initialize_distributed() 75 | world_size = torch.distributed.get_world_size() 76 | tensor_model_parallel_size = 1 77 | while tensor_model_parallel_size <= world_size: 78 | print_separator('test initialize model parallel') 79 | test_initialize_model_parallel(tensor_model_parallel_size) 80 | print_separator('test model parallel source rank') 81 | test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size) 82 | tensor_model_parallel_size *= 2 83 | -------------------------------------------------------------------------------- /megatron/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Megatron 9 | 71 | 72 | 73 |
74 | Prompt Megatron 81 | 0 82 | / 1000
86 | 87 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/text_generation/beam_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | ## from huggingface beam search 19 | class BeamHypotheses(object): 20 | def __init__(self, num_beams, length_penalty=1.0, early_stopping=False): 21 | """ 22 | Initialize n-best list of hypotheses. 23 | """ 24 | self.length_penalty = length_penalty 25 | self.early_stopping = early_stopping 26 | self.num_beams = num_beams 27 | self.beams = [] 28 | self.worst_score = 1e9 29 | 30 | def __len__(self): 31 | """ 32 | Number of hypotheses in the list. 33 | """ 34 | return len(self.beams) 35 | 36 | def add(self, hyp, sum_logprobs, length): 37 | """ 38 | Add a new hypothesis to the list. 39 | """ 40 | score = sum_logprobs / length ** self.length_penalty 41 | if len(self) < self.num_beams or score > self.worst_score: 42 | self.beams.append((score, hyp)) 43 | if len(self) > self.num_beams: 44 | sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) 45 | del self.beams[sorted_scores[0][1]] 46 | self.worst_score = sorted_scores[1][0] 47 | else: 48 | self.worst_score = min(score, self.worst_score) 49 | 50 | def is_done(self, best_sum_logprobs, cur_len): 51 | """ 52 | If there are enough hypotheses and that none of the hypotheses being generated 53 | can become better than the worst one in the heap, then we are done with this sentence. 54 | """ 55 | 56 | if len(self) < self.num_beams: 57 | return False 58 | elif self.early_stopping: 59 | return True 60 | else: 61 | cur_score = best_sum_logprobs / cur_len ** self.length_penalty 62 | ret = self.worst_score >= cur_score 63 | return ret 64 | 65 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /run_docker.sh: -------------------------------------------------------------------------------- 1 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v $HOME:$HOME -it --rm nvcr.io/nvidia/pytorch:24.01-py3 -------------------------------------------------------------------------------- /scripts/data/download_c4.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | import os 3 | 4 | os.makedirs('./assets/data/c4', exist_ok=True) 5 | # English only 6 | for i in range(20): 7 | en = load_dataset("allenai/c4", data_files={'train': f'en/c4-train.{str(i).zfill(5)}-of-01024.json.gz'}, cache_dir='./assets/data/c4', split='train') 8 | print(len(en)) 9 | 10 | # save as json files 11 | en.to_json(f'./assets/data/c4/en/c4-train.{str(i).zfill(5)}-of-01024.json', orient='records', lines=True) -------------------------------------------------------------------------------- /scripts/data/prepare_c4_megatron_llama2.sh: -------------------------------------------------------------------------------- 1 | # for i from 00000 to 00100 2 | mkdir -p assets/data/c4_llama2_pretokenized 3 | for i in {00000..00019}; do 4 | echo "Processing ./assets/data/c4/en/c4-train.${i}-of-01024.json" 5 | python tools/preprocess_data.py \ 6 | --input "./assets/data/c4/en/c4-train.${i}-of-01024.json" \ 7 | --output-prefix assets/data/c4_llama2_pretokenized/c4_llama2_${i} \ 8 | --vocab-file ./assets/checkpoints/llama2_7b_hf/tokenizer.json \ 9 | --tokenizer-type Llama2Tokenizer \ 10 | --tokenizer-model ./assets/checkpoints/llama2_7b_hf/tokenizer.model \ 11 | --append-eod \ 12 | --workers 8 13 | done -------------------------------------------------------------------------------- /scripts/data/prepare_c4_megatron_llama3.1.sh: -------------------------------------------------------------------------------- 1 | # for i from 00000 to 00100 2 | mkdir -p assets/data/c4_llama3.1_pretokenized 3 | for i in {00000..00019}; do 4 | echo "Processing ./assets/data/c4/en/c4-train.${i}-of-01024.json" 5 | python tools/preprocess_data.py \ 6 | --input "./assets/data/c4/en/c4-train.${i}-of-01024.json" \ 7 | --output-prefix assets/data/c4_llama3.1_pretokenized/c4_llama3.1_${i} \ 8 | --vocab-file ./assets/checkpoints/llama3.1_8b_hf/tokenizer.json \ 9 | --tokenizer-type AutoTokenizer \ 10 | --tokenizer-model ./assets/checkpoints/llama3.1_8b_hf \ 11 | --append-eod \ 12 | --workers 8 13 | done -------------------------------------------------------------------------------- /scripts/data/prepare_c4_megatron_llama3.sh: -------------------------------------------------------------------------------- 1 | # for i from 00000 to 00100 2 | mkdir -p assets/data/c4_llama3_pretokenized 3 | for i in {00000..00019}; do 4 | echo "Processing ./assets/data/c4/en/c4-train.${i}-of-01024.json" 5 | python tools/preprocess_data.py \ 6 | --input "./assets/data/c4/en/c4-train.${i}-of-01024.json" \ 7 | --output-prefix assets/data/c4_llama3_pretokenized/c4_llama3_${i} \ 8 | --vocab-file ./assets/checkpoints/llama3_8b_hf/tokenizer.json \ 9 | --tokenizer-type AutoTokenizer \ 10 | --tokenizer-model ./assets/checkpoints/llama3_8b_hf \ 11 | --append-eod \ 12 | --workers 8 13 | done -------------------------------------------------------------------------------- /scripts/oneshot/run_llama2_13b_prune_tp8.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_ADDR="127.0.0.1" 4 | export MASTER_PORT="45530" # select the port 5 | NNODES=1 # number of nodes 6 | NPROC_PER_NODE=8 # number of gpus (processes) per node 7 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 8 | 9 | export NCCL_IB_TIMEOUT=19 10 | export NCCL_IB_SL=1 11 | export CUDA_DEVICE_MAX_CONNECTIONS=1 12 | 13 | SPARSEMETHOD=$1 # SparseGPT, Magnitude, Wanda 14 | EXTRA_CMD=$2 15 | 16 | export TASK='wikitext' 17 | export SPARSITY=0.5 18 | export PATTERN='nmprune' 19 | export EXCLUDE=0 20 | 21 | NSAMPLES=128 22 | BASE_NAME="llama2-13b-tp8" 23 | NAME="${BASE_NAME}.sparse.${PATTERN}.sp${SPARSITY}${SPARSEMETHOD}.ex${EXCLUDE}" 24 | 25 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 26 | PROJECT_DIR=$(pwd) 27 | LOG_DIR=$PROJECT_DIR/output/oneshot_pruning 28 | mkdir -p $LOG_DIR 29 | 30 | CHECKPOINT_LOAD_DIR="$PROJECT_DIR/assets/checkpoints/llama2_13b_megatron_tp8" 31 | CHECKPOINT_SAVE_DIR="$PROJECT_DIR/output/oneshot_pruning/checkpoint/${NAME}" 32 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama2_13b_hf/tokenizer.model" 33 | 34 | TASK_NAME="PRUNE-WIKITEXT2" 35 | options=" \ 36 | ${mag_options} \ 37 | --task ${TASK_NAME} \ 38 | --valid-data ${VALID_DATA} \ 39 | --use-flash-attn \ 40 | --untie-embeddings-and-output-weights \ 41 | --disable-bias-linear \ 42 | --no-position-embedding \ 43 | --no-masked-softmax-fusion \ 44 | --use-rotary-position-embeddings \ 45 | --swiglu \ 46 | --attention-dropout 0.0 \ 47 | --hidden-dropout 0.0 \ 48 | --tensor-model-parallel-size 8 \ 49 | --pipeline-model-parallel-size 1 \ 50 | --num-layers 40 \ 51 | --hidden-size 5120 \ 52 | --num-attention-heads 40 \ 53 | --seq-length 4096 \ 54 | --max-position-embeddings 4096 \ 55 | --micro-batch-size 1 \ 56 | --global-batch-size 256 \ 57 | --train-iters 1000 \ 58 | --log-interval 10 \ 59 | --overlapping-eval 4096 \ 60 | --eval-iters 10 \ 61 | --eval-interval 500 \ 62 | --tokenizer-type Llama2Tokenizer \ 63 | --tokenizer-model ${TOKENIZER_MODEL} \ 64 | --make-vocab-size-divisible-by 1 \ 65 | --ffn-hidden-size 13824 --normalization RMSNorm \ 66 | --split 99,1,0 \ 67 | --clip-grad 1.0 \ 68 | --weight-decay 0.1 \ 69 | --adam-beta1 0.9 \ 70 | --adam-beta2 0.95 \ 71 | --init-method-std 0.014 \ 72 | --exit-on-missing-checkpoint \ 73 | --no-load-optim \ 74 | --no-load-rng \ 75 | --bf16 \ 76 | --log-interval 100 \ 77 | --eval-iters 32 \ 78 | --eval-interval 2000 \ 79 | --data-path "None" \ 80 | --save-interval 20000 \ 81 | --save ${CHECKPOINT_SAVE_DIR} \ 82 | --load ${CHECKPOINT_LOAD_DIR} \ 83 | --hessian-compute \ 84 | --sparse-pattern ${PATTERN} \ 85 | --sparse-method ${SPARSEMETHOD} \ 86 | --sparsity ${SPARSITY} \ 87 | --row-b -1 \ 88 | --col-b 128 \ 89 | --prunen 2 \ 90 | --prunem 4 \ 91 | --hessian-samples $NSAMPLES \ 92 | --exclude-layers-from-prune ${EXCLUDE} ${EXTRA_CMD} " 93 | 94 | cd $PROJECT_DIR; export CUDA_DEVICE_MAX_CONNECTIONS=1; 95 | 96 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py ${options} -------------------------------------------------------------------------------- /scripts/oneshot/run_llama2_7b_prune_tp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_ADDR="127.0.0.1" 4 | export MASTER_PORT="45530" # select the port 5 | NNODES=1 # number of nodes 6 | NPROC_PER_NODE=8 # number of gpus (processes) 
per node 7 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 8 | 9 | export NCCL_IB_TIMEOUT=19 10 | export NCCL_IB_SL=1 11 | export CUDA_DEVICE_MAX_CONNECTIONS=1 12 | 13 | SPARSEMETHOD=$1 # SparseGPT, Magnitude, Wanda 14 | EXTRA_CMD=$2 15 | 16 | export TASK='wikitext' 17 | export SPARSITY=0.5 18 | export PATTERN='nmprune' 19 | export EXCLUDE=0 20 | 21 | NSAMPLES=128 22 | BASE_NAME="llama2-7b-tp8" 23 | NAME="${BASE_NAME}.sparse.${PATTERN}.sp${SPARSITY}${SPARSEMETHOD}.ex${EXCLUDE}" 24 | 25 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 26 | PROJECT_DIR=$(pwd) 27 | LOG_DIR=$PROJECT_DIR/output/oneshot_pruning 28 | mkdir -p $LOG_DIR 29 | 30 | CHECKPOINT_LOAD_DIR="$PROJECT_DIR/assets/checkpoints/llama2_7b_megatron_tp8" 31 | CHECKPOINT_SAVE_DIR="$PROJECT_DIR/output/oneshot_pruning/checkpoint/${NAME}" 32 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama2_7b_hf/tokenizer.model" 33 | 34 | TASK_NAME="PRUNE-WIKITEXT2" 35 | options=" \ 36 | ${mag_options} \ 37 | --task ${TASK_NAME} \ 38 | --valid-data ${VALID_DATA} \ 39 | --use-flash-attn \ 40 | --untie-embeddings-and-output-weights \ 41 | --disable-bias-linear \ 42 | --no-position-embedding \ 43 | --no-masked-softmax-fusion \ 44 | --use-rotary-position-embeddings \ 45 | --swiglu \ 46 | --attention-dropout 0.0 \ 47 | --hidden-dropout 0.0 \ 48 | --tensor-model-parallel-size 8 \ 49 | --pipeline-model-parallel-size 1 \ 50 | --num-layers 32 \ 51 | --hidden-size 4096 \ 52 | --num-attention-heads 32 \ 53 | --seq-length 4096 \ 54 | --max-position-embeddings 4096 \ 55 | --micro-batch-size 1 \ 56 | --global-batch-size 256 \ 57 | --train-iters 1000 \ 58 | --log-interval 10 \ 59 | --overlapping-eval 4096 \ 60 | --eval-iters 10 \ 61 | --eval-interval 500 \ 62 | --tokenizer-type Llama2Tokenizer \ 63 | --tokenizer-model ${TOKENIZER_MODEL} \ 64 | --make-vocab-size-divisible-by 1 \ 65 | --ffn-hidden-size 11008 --normalization RMSNorm \ 66 | --split 99,1,0 \ 67 | --clip-grad 1.0 \ 68 | --weight-decay 0.1 \ 69 | --num-query-groups 32 \ 70 | --group-query-attention \ 71 | --adam-beta1 0.9 \ 72 | --adam-beta2 0.95 \ 73 | --init-method-std 0.014 \ 74 | --exit-on-missing-checkpoint \ 75 | --no-load-optim \ 76 | --no-load-rng \ 77 | --bf16 \ 78 | --log-interval 100 \ 79 | --eval-iters 32 \ 80 | --eval-interval 2000 \ 81 | --data-path "None" \ 82 | --save-interval 20000 \ 83 | --save ${CHECKPOINT_SAVE_DIR} \ 84 | --load ${CHECKPOINT_LOAD_DIR} \ 85 | --hessian-compute \ 86 | --sparse-pattern ${PATTERN} \ 87 | --sparse-method ${SPARSEMETHOD} \ 88 | --sparsity ${SPARSITY} \ 89 | --row-b -1 \ 90 | --col-b 128 \ 91 | --prunen 2 \ 92 | --prunem 4 \ 93 | --hessian-samples $NSAMPLES \ 94 | --exclude-layers-from-prune ${EXCLUDE} ${EXTRA_CMD} " 95 | 96 | cd $PROJECT_DIR; export CUDA_DEVICE_MAX_CONNECTIONS=1; 97 | 98 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py $options -------------------------------------------------------------------------------- /scripts/oneshot/run_llama3.1_8b_prune_tp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_ADDR="127.0.0.1" 4 | export MASTER_PORT="45530" # select the port 5 | NNODES=1 # number of nodes 6 | NPROC_PER_NODE=8 # number of gpus (processes) per node 7 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 8 | 9 | export NCCL_IB_TIMEOUT=19 10 | export NCCL_IB_SL=1 11 | export CUDA_DEVICE_MAX_CONNECTIONS=1 12 | 13 | 
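# Positional arguments (same convention as the other oneshot scripts): $1 picks the one-shot
# pruning criterion (SparseGPT, Magnitude or Wanda) and $2, if given, is appended verbatim to
# the Megatron options string, e.g.:
#   bash scripts/oneshot/run_llama3.1_8b_prune_tp8.sh SparseGPT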
SPARSEMETHOD=$1 # SparseGPT, Magnitude, Wanda 14 | EXTRA_CMD=$2 15 | 16 | export TASK='wikitext' 17 | export SPARSITY=0.5 18 | export PATTERN='nmprune' 19 | export EXCLUDE=0 20 | 21 | NSAMPLES=128 22 | BASE_NAME="llama3.1-8b-tp8" 23 | NAME="${BASE_NAME}.sparse.${PATTERN}.sp${SPARSITY}${SPARSEMETHOD}.ex${EXCLUDE}" 24 | 25 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 26 | PROJECT_DIR=$(pwd) 27 | LOG_DIR=$PROJECT_DIR/output/oneshot_pruning 28 | mkdir -p $LOG_DIR 29 | 30 | CHECKPOINT_LOAD_DIR="$PROJECT_DIR/assets/checkpoints/llama3.1_8b_megatron_tp8" 31 | CHECKPOINT_SAVE_DIR="$PROJECT_DIR/output/oneshot_pruning/checkpoint/${NAME}" 32 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama3.1_8b_hf" 33 | 34 | TASK_NAME="PRUNE-WIKITEXT2" 35 | options=" \ 36 | ${mag_options} \ 37 | --task ${TASK_NAME} \ 38 | --valid-data ${VALID_DATA} \ 39 | --use-flash-attn \ 40 | --untie-embeddings-and-output-weights \ 41 | --disable-bias-linear \ 42 | --no-position-embedding \ 43 | --no-masked-softmax-fusion \ 44 | --use-rotary-position-embeddings \ 45 | --swiglu \ 46 | --rotary-base 500000 \ 47 | --attention-dropout 0.0 \ 48 | --hidden-dropout 0.0 \ 49 | --tensor-model-parallel-size 8 \ 50 | --pipeline-model-parallel-size 1 \ 51 | --num-layers 32 \ 52 | --hidden-size 4096 \ 53 | --num-attention-heads 32 \ 54 | --seq-length 4096 \ 55 | --max-position-embeddings 4096 \ 56 | --group-query-attention \ 57 | --num-query-groups 8 \ 58 | --micro-batch-size 1 \ 59 | --global-batch-size 256 \ 60 | --train-iters 1000 \ 61 | --log-interval 10 \ 62 | --overlapping-eval 4096 \ 63 | --eval-iters 10 \ 64 | --eval-interval 500 \ 65 | --tokenizer-type AutoTokenizer \ 66 | --tokenizer-model ${TOKENIZER_MODEL} \ 67 | --make-vocab-size-divisible-by 1 \ 68 | --ffn-hidden-size 14336 --normalization RMSNorm \ 69 | --split 99,1,0 \ 70 | --clip-grad 1.0 \ 71 | --weight-decay 0.1 \ 72 | --group-query-attention \ 73 | --adam-beta1 0.9 \ 74 | --adam-beta2 0.95 \ 75 | --init-method-std 0.014 \ 76 | --exit-on-missing-checkpoint \ 77 | --no-load-optim \ 78 | --no-load-rng \ 79 | --bf16 \ 80 | --log-interval 100 \ 81 | --eval-iters 32 \ 82 | --eval-interval 2000 \ 83 | --data-path "None" \ 84 | --save-interval 20000 \ 85 | --save ${CHECKPOINT_SAVE_DIR} \ 86 | --load ${CHECKPOINT_LOAD_DIR} \ 87 | --hessian-compute \ 88 | --sparse-pattern ${PATTERN} \ 89 | --sparse-method ${SPARSEMETHOD} \ 90 | --sparsity ${SPARSITY} \ 91 | --row-b -1 \ 92 | --col-b 128 \ 93 | --prunen 2 \ 94 | --prunem 4 \ 95 | --hessian-samples $NSAMPLES \ 96 | --exclude-layers-from-prune ${EXCLUDE} ${EXTRA_CMD} " 97 | 98 | cd $PROJECT_DIR; export CUDA_DEVICE_MAX_CONNECTIONS=1; 99 | 100 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py $options -------------------------------------------------------------------------------- /scripts/oneshot/run_llama3_8b_prune_tp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_ADDR="127.0.0.1" 4 | export MASTER_PORT="45530" # select the port 5 | NNODES=1 # number of nodes 6 | NPROC_PER_NODE=8 # number of gpus (processes) per node 7 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 8 | 9 | export NCCL_IB_TIMEOUT=19 10 | export NCCL_IB_SL=1 11 | export CUDA_DEVICE_MAX_CONNECTIONS=1 12 | 13 | SPARSEMETHOD=$1 # SparseGPT, Magnitude, Wanda 14 | EXTRA_CMD=$2 15 | 16 | export TASK='wikitext' 17 | export SPARSITY=0.5 18 | export PATTERN='nmprune' 19 | export 
EXCLUDE=0 20 | 21 | NSAMPLES=128 22 | BASE_NAME="llama3-8b-tp8" 23 | NAME="${BASE_NAME}.sparse.${PATTERN}.sp${SPARSITY}${SPARSEMETHOD}.ex${EXCLUDE}" 24 | 25 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 26 | PROJECT_DIR=$(pwd) 27 | LOG_DIR=$PROJECT_DIR/output/oneshot_pruning 28 | mkdir -p $LOG_DIR 29 | 30 | CHECKPOINT_LOAD_DIR="$PROJECT_DIR/assets/checkpoints/llama3_8b_megatron_tp8" 31 | CHECKPOINT_SAVE_DIR="$PROJECT_DIR/output/oneshot_pruning/checkpoint/${NAME}" 32 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama3_8b_hf" 33 | 34 | TASK_NAME="PRUNE-WIKITEXT2" 35 | options=" \ 36 | ${mag_options} \ 37 | --task ${TASK_NAME} \ 38 | --valid-data ${VALID_DATA} \ 39 | --use-flash-attn \ 40 | --untie-embeddings-and-output-weights \ 41 | --disable-bias-linear \ 42 | --no-position-embedding \ 43 | --no-masked-softmax-fusion \ 44 | --use-rotary-position-embeddings \ 45 | --swiglu \ 46 | --rotary-base 500000 \ 47 | --attention-dropout 0.0 \ 48 | --hidden-dropout 0.0 \ 49 | --tensor-model-parallel-size 8 \ 50 | --pipeline-model-parallel-size 1 \ 51 | --num-layers 32 \ 52 | --hidden-size 4096 \ 53 | --num-attention-heads 32 \ 54 | --seq-length 4096 \ 55 | --max-position-embeddings 4096 \ 56 | --group-query-attention \ 57 | --num-query-groups 8 \ 58 | --micro-batch-size 1 \ 59 | --global-batch-size 256 \ 60 | --train-iters 1000 \ 61 | --log-interval 10 \ 62 | --overlapping-eval 4096 \ 63 | --eval-iters 10 \ 64 | --eval-interval 500 \ 65 | --tokenizer-type AutoTokenizer \ 66 | --tokenizer-model ${TOKENIZER_MODEL} \ 67 | --make-vocab-size-divisible-by 1 \ 68 | --ffn-hidden-size 14336 --normalization RMSNorm \ 69 | --split 99,1,0 \ 70 | --clip-grad 1.0 \ 71 | --weight-decay 0.1 \ 72 | --group-query-attention \ 73 | --adam-beta1 0.9 \ 74 | --adam-beta2 0.95 \ 75 | --init-method-std 0.014 \ 76 | --exit-on-missing-checkpoint \ 77 | --no-load-optim \ 78 | --no-load-rng \ 79 | --bf16 \ 80 | --log-interval 100 \ 81 | --eval-iters 32 \ 82 | --eval-interval 2000 \ 83 | --data-path "None" \ 84 | --save-interval 20000 \ 85 | --save ${CHECKPOINT_SAVE_DIR} \ 86 | --load ${CHECKPOINT_LOAD_DIR} \ 87 | --hessian-compute \ 88 | --sparse-pattern ${PATTERN} \ 89 | --sparse-method ${SPARSEMETHOD} \ 90 | --sparsity ${SPARSITY} \ 91 | --row-b -1 \ 92 | --col-b 128 \ 93 | --prunen 2 \ 94 | --prunem 4 \ 95 | --hessian-samples $NSAMPLES \ 96 | --exclude-layers-from-prune ${EXCLUDE} ${EXTRA_CMD} " 97 | 98 | cd $PROJECT_DIR; export CUDA_DEVICE_MAX_CONNECTIONS=1; 99 | 100 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py $options -------------------------------------------------------------------------------- /scripts/ppl/evaluate_llama2_wikitext2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # export NCCL_IB_SL=1 4 | export CUDA_DEVICE_MAX_CONNECTIONS=1 5 | LOAD=$1 # path to the model 6 | MODEL=$2 # 7b, 13b 7 | TP=$3 8 | MODE=$4 9 | 10 | echo $LOAD 11 | 12 | PROJECT_DIR=$(pwd) # change this to the path of your maskllm project 13 | OUTPUT="$PROJECT_DIR/output" 14 | 15 | # If model==2b 16 | if [ "$MODEL" == "7b" ]; then 17 | HIDDEN_SIZE=4096 # hidden size 18 | NUM_LAYERS=32 # number of layers 19 | NUM_ATTN_HEADS=32 # number of attention heads 20 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama2_7b_megatron_tp8/tokenizer.model" 21 | FFN_HIDDEN_SIZE=11008 22 | elif [ "$MODEL" == "13b" ]; then 23 | HIDDEN_SIZE=5120 # hidden size 24 | NUM_LAYERS=40 # number of layers 25 | 
NUM_ATTN_HEADS=40 # number of attention heads 26 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama2_13b_megatron_tp8/tokenizer.model" 27 | FFN_HIDDEN_SIZE=13824 28 | fi 29 | SEQ_LENGTH=4096 # sequence length 30 | 31 | if [ "$MODE" == "dense" ]; then 32 | MASK_OPTIONS=" " 33 | elif [ "$MODE" == "sparse" ]; then 34 | MASK_OPTIONS="--enable-sparsity " 35 | else 36 | MASK_OPTIONS=" " 37 | fi 38 | 39 | export CUDA_DEVICE_MAX_CONNECTIONS=1; 40 | 41 | OPTIONS=" \ 42 | --task WIKITEXT2 \ 43 | --use-flash-attn \ 44 | --untie-embeddings-and-output-weights \ 45 | --disable-bias-linear \ 46 | --no-position-embedding \ 47 | --no-masked-softmax-fusion \ 48 | --use-rotary-position-embeddings \ 49 | --swiglu \ 50 | --attention-dropout 0.0 \ 51 | --hidden-dropout 0.0 \ 52 | --tensor-model-parallel-size $TP \ 53 | --pipeline-model-parallel-size 1 \ 54 | --overlapping-eval $SEQ_LENGTH \ 55 | --num-layers $NUM_LAYERS \ 56 | --hidden-size $HIDDEN_SIZE \ 57 | --num-attention-heads $NUM_ATTN_HEADS \ 58 | --seq-length $SEQ_LENGTH \ 59 | --max-position-embeddings $SEQ_LENGTH \ 60 | --micro-batch-size 1 \ 61 | --global-batch-size 256 \ 62 | --train-iters 1 \ 63 | --lr-decay-iters 1 \ 64 | --lr 1.0e-4 \ 65 | --min-lr 1.0e-5 \ 66 | --lr-decay-style cosine \ 67 | --log-interval 100 \ 68 | --tokenizer-type Llama2Tokenizer \ 69 | --tokenizer-model ${TOKENIZER_MODEL} \ 70 | --make-vocab-size-divisible-by 1 \ 71 | --ffn-hidden-size $FFN_HIDDEN_SIZE --normalization RMSNorm \ 72 | --data-path None \ 73 | --bf16 \ 74 | --no-save-optim --no-save-rng \ 75 | --no-load-optim --no-load-rng \ 76 | --exit-on-missing-checkpoint \ 77 | --load ${LOAD} \ 78 | --hidden-dropout 0.0 --attention-dropout 0.0 \ 79 | $MASK_OPTIONS" 80 | 81 | cd $PROJECT_DIR; 82 | 83 | export MASTER_ADDR="127.0.0.1" 84 | export MASTER_PORT="45530" # select the port 85 | NNODES=1 # number of nodes 86 | NPROC_PER_NODE=${TP} # number of gpus (processes) per node 87 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 88 | 89 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py ${OPTIONS} 90 | 91 | -------------------------------------------------------------------------------- /scripts/ppl/evaluate_llama3.1_wikitext2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # export NCCL_IB_SL=1 4 | export CUDA_DEVICE_MAX_CONNECTIONS=1 5 | LOAD=$1 # path to the model 6 | MODEL=$2 # 7b, 13b 7 | TP=$3 8 | MODE=$4 9 | 10 | echo $LOAD 11 | 12 | PROJECT_DIR=$(pwd) # change this to the path of your maskllm project 13 | OUTPUT="$PROJECT_DIR/output" 14 | 15 | # If model==2b 16 | if [ "$MODEL" == "8b" ]; then 17 | HIDDEN_SIZE=4096 # hidden size 18 | NUM_LAYERS=32 # number of layers 19 | NUM_ATTN_HEADS=32 # number of attention heads 20 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama3.1_8b_hf" 21 | FFN_HIDDEN_SIZE=14336 22 | fi 23 | SEQ_LENGTH=4096 # sequence length 24 | 25 | if [ "$MODE" == "dense" ]; then 26 | MASK_OPTIONS=" " 27 | elif [ "$MODE" == "sparse" ]; then 28 | MASK_OPTIONS="--enable-sparsity " 29 | else 30 | MASK_OPTIONS=" " 31 | fi 32 | 33 | export CUDA_DEVICE_MAX_CONNECTIONS=1; 34 | 35 | OPTIONS=" \ 36 | --task WIKITEXT2 \ 37 | --use-flash-attn \ 38 | --untie-embeddings-and-output-weights \ 39 | --disable-bias-linear \ 40 | --no-position-embedding \ 41 | --no-masked-softmax-fusion \ 42 | --use-rotary-position-embeddings \ 43 | --rotary-base 500000 \ 44 | --swiglu \ 45 | 
--attention-dropout 0.0 \ 46 | --hidden-dropout 0.0 \ 47 | --tensor-model-parallel-size $TP \ 48 | --pipeline-model-parallel-size 1 \ 49 | --overlapping-eval $SEQ_LENGTH \ 50 | --num-layers $NUM_LAYERS \ 51 | --hidden-size $HIDDEN_SIZE \ 52 | --num-attention-heads $NUM_ATTN_HEADS \ 53 | --seq-length $SEQ_LENGTH \ 54 | --max-position-embeddings $SEQ_LENGTH \ 55 | --group-query-attention \ 56 | --num-query-groups 8 \ 57 | --micro-batch-size 1 \ 58 | --global-batch-size 256 \ 59 | --train-iters 1 \ 60 | --lr-decay-iters 1 \ 61 | --lr 1.0e-4 \ 62 | --min-lr 1.0e-5 \ 63 | --lr-decay-style cosine \ 64 | --log-interval 100 \ 65 | --tokenizer-type AutoTokenizer \ 66 | --tokenizer-model ${TOKENIZER_MODEL} \ 67 | --make-vocab-size-divisible-by 1 \ 68 | --ffn-hidden-size $FFN_HIDDEN_SIZE --normalization RMSNorm \ 69 | --data-path None \ 70 | --bf16 \ 71 | --no-save-optim --no-save-rng \ 72 | --no-load-optim --no-load-rng \ 73 | --exit-on-missing-checkpoint \ 74 | --load ${LOAD} \ 75 | --hidden-dropout 0.0 --attention-dropout 0.0 \ 76 | $MASK_OPTIONS" 77 | 78 | cd $PROJECT_DIR; 79 | 80 | export MASTER_ADDR="127.0.0.1" 81 | export MASTER_PORT="45530" # select the port 82 | NNODES=1 # number of nodes 83 | NPROC_PER_NODE=${TP} # number of gpus (processes) per node 84 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 85 | 86 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py ${OPTIONS} 87 | 88 | -------------------------------------------------------------------------------- /scripts/ppl/evaluate_llama3_wikitext2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # export NCCL_IB_SL=1 4 | export CUDA_DEVICE_MAX_CONNECTIONS=1 5 | LOAD=$1 # path to the model 6 | MODEL=$2 # 7b, 13b 7 | TP=$3 8 | MODE=$4 9 | 10 | echo $LOAD 11 | 12 | PROJECT_DIR=$(pwd) # change this to the path of your maskllm project 13 | OUTPUT="$PROJECT_DIR/output" 14 | 15 | # If model==2b 16 | if [ "$MODEL" == "8b" ]; then 17 | HIDDEN_SIZE=4096 # hidden size 18 | NUM_LAYERS=32 # number of layers 19 | NUM_ATTN_HEADS=32 # number of attention heads 20 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama3_8b_hf" 21 | FFN_HIDDEN_SIZE=14336 22 | fi 23 | SEQ_LENGTH=4096 # sequence length 24 | 25 | if [ "$MODE" == "dense" ]; then 26 | MASK_OPTIONS=" " 27 | elif [ "$MODE" == "sparse" ]; then 28 | MASK_OPTIONS="--enable-sparsity " 29 | else 30 | MASK_OPTIONS=" " 31 | fi 32 | 33 | export CUDA_DEVICE_MAX_CONNECTIONS=1; 34 | 35 | OPTIONS=" \ 36 | --task WIKITEXT2 \ 37 | --use-flash-attn \ 38 | --untie-embeddings-and-output-weights \ 39 | --disable-bias-linear \ 40 | --no-position-embedding \ 41 | --no-masked-softmax-fusion \ 42 | --use-rotary-position-embeddings \ 43 | --rotary-base 500000 \ 44 | --swiglu \ 45 | --attention-dropout 0.0 \ 46 | --hidden-dropout 0.0 \ 47 | --tensor-model-parallel-size $TP \ 48 | --pipeline-model-parallel-size 1 \ 49 | --overlapping-eval $SEQ_LENGTH \ 50 | --num-layers $NUM_LAYERS \ 51 | --hidden-size $HIDDEN_SIZE \ 52 | --num-attention-heads $NUM_ATTN_HEADS \ 53 | --seq-length $SEQ_LENGTH \ 54 | --max-position-embeddings $SEQ_LENGTH \ 55 | --group-query-attention \ 56 | --num-query-groups 8 \ 57 | --micro-batch-size 1 \ 58 | --global-batch-size 256 \ 59 | --train-iters 1 \ 60 | --lr-decay-iters 1 \ 61 | --lr 1.0e-4 \ 62 | --min-lr 1.0e-5 \ 63 | --lr-decay-style cosine \ 64 | --log-interval 100 \ 65 | --tokenizer-type AutoTokenizer \ 66 | 
--tokenizer-model ${TOKENIZER_MODEL} \ 67 | --make-vocab-size-divisible-by 1 \ 68 | --ffn-hidden-size $FFN_HIDDEN_SIZE --normalization RMSNorm \ 69 | --data-path None \ 70 | --bf16 \ 71 | --no-save-optim --no-save-rng \ 72 | --no-load-optim --no-load-rng \ 73 | --exit-on-missing-checkpoint \ 74 | --load ${LOAD} \ 75 | --hidden-dropout 0.0 --attention-dropout 0.0 \ 76 | $MASK_OPTIONS" 77 | 78 | cd $PROJECT_DIR; 79 | 80 | export MASTER_ADDR="127.0.0.1" 81 | export MASTER_PORT="45530" # select the port 82 | NNODES=1 # number of nodes 83 | NPROC_PER_NODE=${TP} # number of gpus (processes) per node 84 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 85 | 86 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py ${OPTIONS} 87 | 88 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama2_13b_hf_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_DIR=$(pwd) 4 | 5 | TP=8 6 | HF_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama2_13b_hf 7 | MEGATRON_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama2_13b_megatron_tp$TP 8 | TOKENIZER_MODEL=$HF_FORMAT_DIR/tokenizer.model 9 | 10 | OPTIONS=" \ 11 | --model-type GPT \ 12 | --loader llama2_hf \ 13 | --saver megatron \ 14 | --target-tensor-parallel-size ${TP} \ 15 | --load-dir ${HF_FORMAT_DIR} \ 16 | --save-dir ${MEGATRON_FORMAT_DIR} \ 17 | --tokenizer-model ${TOKENIZER_MODEL}" 18 | 19 | cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 20 | cp -r $TOKENIZER_MODEL $MEGATRON_FORMAT_DIR -------------------------------------------------------------------------------- /scripts/tools/convert_llama2_7b_hf_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=8 5 | HF_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama2_7b_hf 6 | MEGATRON_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama2_7b_megatron_tp$TP 7 | TOKENIZER_MODEL=$HF_FORMAT_DIR/tokenizer.model 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader llama2_hf \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${HF_FORMAT_DIR} \ 15 | --save-dir ${MEGATRON_FORMAT_DIR} \ 16 | --tokenizer-model ${TOKENIZER_MODEL}" 17 | 18 | echo $HF_FORMAT_DIR 19 | echo $MEGATRON_FORMAT_DIR 20 | echo $TOKENIZER_MODEL 21 | 22 | cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 23 | cp -r $TOKENIZER_MODEL $MEGATRON_FORMAT_DIR 24 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama2_7b_tp8_to_tp1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=1 5 | TP8_DIR=$PROJECT_DIR/output/checkpoints/llama2-7b-tp8-mask-only-c4-singlenode/train_iters_2000/ckpt 6 | TP1_DIR=$PROJECT_DIR/output/checkpoints/llama2-7b-tp1-mask-only-c4-singlenode/train_iters_2000/ckpt 7 | TOKENIZER_MODEL=assets/checkpoints/llama2_7b_hf/tokenizer.model 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader megatron \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${TP8_DIR} \ 15 | --save-dir ${TP1_DIR} \ 16 | --megatron-path ${PROJECT_DIR}" 17 | 18 | echo $TP8_DIR 19 | echo $TP1_DIR 20 | 21 | pip install transformers wandb accelerate tqdm; cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 22 | 
-------------------------------------------------------------------------------- /scripts/tools/convert_llama3.1_8b_hf_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=8 5 | HF_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama3.1_8b_hf 6 | MEGATRON_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama3.1_8b_megatron_tp$TP 7 | TOKENIZER_MODEL=$HF_FORMAT_DIR/tokenizer.json 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader llama2_hf \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${HF_FORMAT_DIR} \ 15 | --save-dir ${MEGATRON_FORMAT_DIR} \ 16 | --tokenizer-model ${TOKENIZER_MODEL}" 17 | 18 | echo $HF_FORMAT_DIR 19 | echo $MEGATRON_FORMAT_DIR 20 | echo $TOKENIZER_MODEL 21 | 22 | cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 23 | cp -r $HF_FORMAT_DIR/*token* $MEGATRON_FORMAT_DIR 24 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama3.1_8b_tp8_to_tp1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=1 5 | TP8_DIR=$PROJECT_DIR/output/checkpoints/llama3.1-8b-tp8-mask-only-c4-singlenode/train_iters_2000/ckpt 6 | TP1_DIR=$PROJECT_DIR/output/checkpoints/llama3.1-8b-tp1-mask-only-c4-singlenode/train_iters_2000/ckpt 7 | TOKENIZER_MODEL=assets/checkpoints/llama3.1_8b_hf/tokenizer.model 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader megatron \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${TP8_DIR} \ 15 | --save-dir ${TP1_DIR} \ 16 | --megatron-path ${PROJECT_DIR}" 17 | 18 | echo $TP8_DIR 19 | echo $TP1_DIR 20 | 21 | pip install transformers wandb accelerate tqdm; cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 22 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama3_8b_hf_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=8 5 | HF_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama3_8b_hf 6 | MEGATRON_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama3_8b_megatron_tp$TP 7 | TOKENIZER_MODEL=$HF_FORMAT_DIR/tokenizer.json 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader llama2_hf \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${HF_FORMAT_DIR} \ 15 | --save-dir ${MEGATRON_FORMAT_DIR} \ 16 | --tokenizer-model ${TOKENIZER_MODEL}" 17 | 18 | echo $HF_FORMAT_DIR 19 | echo $MEGATRON_FORMAT_DIR 20 | echo $TOKENIZER_MODEL 21 | 22 | cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 23 | cp -r $HF_FORMAT_DIR/tokenizer* $MEGATRON_FORMAT_DIR 24 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama3_8b_tp8_to_tp1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=1 5 | TP8_DIR=$PROJECT_DIR/output/checkpoints/llama3-8b-tp8-mask-only-c4-singlenode/train_iters_2000/ckpt 6 | TP1_DIR=$PROJECT_DIR/output/checkpoints/llama3-8b-tp1-mask-only-c4-singlenode/train_iters_2000/ckpt 7 | TOKENIZER_MODEL=assets/checkpoints/llama3_8b_hf/tokenizer.model 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader megatron \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${TP8_DIR} \ 15 | --save-dir ${TP1_DIR} \ 16 | 
--megatron-path ${PROJECT_DIR}" 17 | 18 | echo $TP8_DIR 19 | echo $TP1_DIR 20 | 21 | pip install transformers wandb accelerate tqdm; cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 22 | -------------------------------------------------------------------------------- /scripts/tools/download_llama2_13b_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM 3 | import os 4 | 5 | HF_TOKEN = os.environ.get("HF_TOKEN") 6 | os.makedirs(f"./assets/cache", exist_ok=True) 7 | 8 | def save_llama_model(model_id, save_directory="assets/checkpoints/llama"): 9 | # Load model and tokenizer 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_id, 12 | torch_dtype=torch.float16, 13 | cache_dir=f"./assets/cache", 14 | use_auth_token=HF_TOKEN, 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN) 17 | # Save model and tokenizer to the specified directory 18 | model.save_pretrained(save_directory) 19 | tokenizer.save_pretrained(save_directory) 20 | print(f"Model and tokenizer saved to {save_directory}") 21 | 22 | # Replace 'LLaMA-2-model-id' with the actual model ID from Hugging Face's model hub 23 | dense_dir = "assets/checkpoints" 24 | os.makedirs(dense_dir, exist_ok=True) 25 | model_id = "meta-llama/Llama-2-13b-hf" 26 | save_directory = f"{dense_dir}/llama2_13b_hf" 27 | save_llama_model(model_id, save_directory) -------------------------------------------------------------------------------- /scripts/tools/download_llama2_7b_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM 3 | import os 4 | 5 | HF_TOKEN = os.environ.get("HF_TOKEN") 6 | os.makedirs(f"./assets/cache", exist_ok=True) 7 | 8 | def save_llama_model(model_id, save_directory="assets/checkpoints/llama"): 9 | # Load model and tokenizer 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_id, 12 | torch_dtype=torch.float16, 13 | cache_dir=f"./assets/cache", 14 | use_auth_token=HF_TOKEN, 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN) 17 | # Save model and tokenizer to the specified directory 18 | model.save_pretrained(save_directory) 19 | tokenizer.save_pretrained(save_directory) 20 | print(f"Model and tokenizer saved to {save_directory}") 21 | 22 | # Replace 'LLaMA-2-model-id' with the actual model ID from Hugging Face's model hub 23 | dense_dir = "assets/checkpoints" 24 | os.makedirs(dense_dir, exist_ok=True) 25 | model_id = "meta-llama/Llama-2-7b-hf" 26 | save_directory = f"{dense_dir}/llama2_7b_hf" 27 | save_llama_model(model_id, save_directory) -------------------------------------------------------------------------------- /scripts/tools/download_llama3.1_8b_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM 3 | import os 4 | 5 | HF_TOKEN = os.environ.get("HF_TOKEN") 6 | os.makedirs(f"./assets/cache", exist_ok=True) 7 | 8 | def save_llama_model(model_id, save_directory="assets/checkpoints/llama"): 9 | # Load model and tokenizer 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_id, 12 | torch_dtype=torch.float16, 13 | cache_dir=f"./assets/cache", 14 | use_auth_token=HF_TOKEN, 15 | ) 16 | tokenizer = 
AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN) 17 | # Save model and tokenizer to the specified directory 18 | model.save_pretrained(save_directory) 19 | tokenizer.save_pretrained(save_directory) 20 | print(f"Model and tokenizer saved to {save_directory}") 21 | 22 | # Replace 'LLaMA-2-model-id' with the actual model ID from Hugging Face's model hub 23 | dense_dir = "assets/checkpoints" 24 | os.makedirs(dense_dir, exist_ok=True) 25 | model_id = "meta-llama/Llama-3.1-8B" 26 | save_directory = f"{dense_dir}/llama3.1_8b_hf" 27 | save_llama_model(model_id, save_directory) -------------------------------------------------------------------------------- /scripts/tools/download_llama3_8b_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM 3 | import os 4 | 5 | HF_TOKEN = os.environ.get("HF_TOKEN") 6 | os.makedirs(f"./assets/cache", exist_ok=True) 7 | 8 | def save_llama_model(model_id, save_directory="assets/checkpoints/llama"): 9 | # Load model and tokenizer 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_id, 12 | torch_dtype=torch.float16, 13 | cache_dir=f"./assets/cache", 14 | use_auth_token=HF_TOKEN, 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN) 17 | # Save model and tokenizer to the specified directory 18 | model.save_pretrained(save_directory) 19 | tokenizer.save_pretrained(save_directory) 20 | print(f"Model and tokenizer saved to {save_directory}") 21 | 22 | # Replace 'LLaMA-2-model-id' with the actual model ID from Hugging Face's model hub 23 | dense_dir = "assets/checkpoints" 24 | os.makedirs(dense_dir, exist_ok=True) 25 | model_id = "meta-llama/Meta-Llama-3-8B" 26 | save_directory = f"{dense_dir}/llama3_8b_hf" 27 | save_llama_model(model_id, save_directory) -------------------------------------------------------------------------------- /tasks/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Tasks data utility.""" 4 | 5 | import re 6 | import numpy as np 7 | 8 | 9 | def clean_text(text): 10 | """Remove new lines and multiple spaces and adjust end of sentence dot.""" 11 | 12 | text = text.replace("\n", " ") 13 | text = re.sub(r'\s+', ' ', text) 14 | for _ in range(3): 15 | text = text.replace(' . ', '. 
') 16 | 17 | return text 18 | 19 | 20 | def build_sample(ids, types, paddings, label, unique_id): 21 | """Convert to numpy and return a sample consumed by the batch producer.""" 22 | 23 | ids_np = np.array(ids, dtype=np.int64) 24 | types_np = np.array(types, dtype=np.int64) 25 | paddings_np = np.array(paddings, dtype=np.int64) 26 | sample = ({'text': ids_np, 27 | 'types': types_np, 28 | 'padding_mask': paddings_np, 29 | 'label': int(label), 30 | 'uid': int(unique_id)}) 31 | 32 | return sample 33 | 34 | 35 | def build_tokens_types_paddings_from_text(text_a, text_b, 36 | tokenizer, max_seq_length): 37 | """Build token types and paddings, trim if needed, and pad if needed.""" 38 | 39 | text_a_ids = tokenizer.tokenize(text_a) 40 | text_b_ids = None 41 | if text_b is not None: 42 | text_b_ids = tokenizer.tokenize(text_b) 43 | 44 | return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 45 | max_seq_length, tokenizer.cls, 46 | tokenizer.sep, tokenizer.pad) 47 | 48 | 49 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length, 50 | cls_id, sep_id, pad_id): 51 | """Build token types and paddings, trim if needed, and pad if needed.""" 52 | 53 | ids = [] 54 | types = [] 55 | paddings = [] 56 | 57 | # [CLS]. 58 | ids.append(cls_id) 59 | types.append(0) 60 | paddings.append(1) 61 | 62 | # A. 63 | len_text_a = len(text_a_ids) 64 | ids.extend(text_a_ids) 65 | types.extend([0] * len_text_a) 66 | paddings.extend([1] * len_text_a) 67 | 68 | # [SEP]. 69 | ids.append(sep_id) 70 | types.append(0) 71 | paddings.append(1) 72 | 73 | # B. 74 | if text_b_ids is not None: 75 | len_text_b = len(text_b_ids) 76 | ids.extend(text_b_ids) 77 | types.extend([1] * len_text_b) 78 | paddings.extend([1] * len_text_b) 79 | 80 | # Cap the size. 81 | trimmed = False 82 | if len(ids) >= max_seq_length: 83 | max_seq_length_m1 = max_seq_length - 1 84 | ids = ids[0:max_seq_length_m1] 85 | types = types[0:max_seq_length_m1] 86 | paddings = paddings[0:max_seq_length_m1] 87 | trimmed = True 88 | 89 | # [SEP]. 90 | if (text_b_ids is not None) or trimmed: 91 | ids.append(sep_id) 92 | if text_b_ids is None: 93 | types.append(0) 94 | else: 95 | types.append(1) 96 | paddings.append(1) 97 | 98 | # Padding. 99 | padding_length = max_seq_length - len(ids) 100 | if padding_length > 0: 101 | ids.extend([pad_id] * padding_length) 102 | types.extend([pad_id] * padding_length) 103 | paddings.extend([0] * padding_length) 104 | 105 | return ids, types, paddings 106 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE dataset.""" 4 | 5 | from abc import ABC 6 | from abc import abstractmethod 7 | 8 | from torch.utils.data import Dataset 9 | 10 | from megatron import print_rank_0 11 | from tasks.data_utils import build_sample 12 | from tasks.data_utils import build_tokens_types_paddings_from_text 13 | 14 | 15 | class GLUEAbstractDataset(ABC, Dataset): 16 | """GLUE base dataset class.""" 17 | 18 | def __init__(self, task_name, dataset_name, datapaths, 19 | tokenizer, max_seq_length): 20 | # Store inputs. 21 | self.task_name = task_name 22 | self.dataset_name = dataset_name 23 | self.tokenizer = tokenizer 24 | self.max_seq_length = max_seq_length 25 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 26 | self.dataset_name)) 27 | # Process the files. 
28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | self.samples = [] 33 | for datapath in datapaths: 34 | self.samples.extend(self.process_samples_from_single_path(datapath)) 35 | print_rank_0(' >> total number of samples: {}'.format( 36 | len(self.samples))) 37 | 38 | def __len__(self): 39 | return len(self.samples) 40 | 41 | def __getitem__(self, idx): 42 | raw_sample = self.samples[idx] 43 | ids, types, paddings = build_tokens_types_paddings_from_text( 44 | raw_sample['text_a'], raw_sample['text_b'], 45 | self.tokenizer, self.max_seq_length) 46 | sample = build_sample(ids, types, paddings, 47 | raw_sample['label'], raw_sample['uid']) 48 | return sample 49 | 50 | @abstractmethod 51 | def process_samples_from_single_path(self, datapath): 52 | """Abstract method that takes a single path / filename and 53 | returns a list of dataset samples, each sample being a dict of 54 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 55 | """ 56 | pass 57 | -------------------------------------------------------------------------------- /tasks/glue/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE finetuning/evaluation.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.classification import Classification 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from megatron.arguments import core_transformer_config_from_args 12 | 13 | 14 | def glue_classification(num_classes, Dataset, 15 | name_from_datapath_func): 16 | 17 | def train_valid_datasets_provider(): 18 | """Build train and validation dataset.""" 19 | args = get_args() 20 | tokenizer = get_tokenizer() 21 | 22 | train_dataset = Dataset('training', args.train_data, 23 | tokenizer, args.seq_length) 24 | valid_dataset = Dataset('validation', args.valid_data, 25 | tokenizer, args.seq_length) 26 | 27 | return train_dataset, valid_dataset 28 | 29 | def model_provider(pre_process=True, post_process=True): 30 | """Build the model.""" 31 | args = get_args() 32 | config = core_transformer_config_from_args() 33 | 34 | print_rank_0('building classification model for {} ...'.format( 35 | args.task)) 36 | model = Classification(config=config, num_classes=num_classes, num_tokentypes=2, 37 | pre_process=pre_process, post_process=post_process) 38 | 39 | return model 40 | 41 | def metrics_func_provider(): 42 | """Privde metrics callback function.""" 43 | def single_dataset_provider(datapath): 44 | args = get_args() 45 | tokenizer = get_tokenizer() 46 | 47 | name = name_from_datapath_func(datapath) 48 | return Dataset(name, [datapath], tokenizer, args.seq_length) 49 | return accuracy_func_provider(single_dataset_provider) 50 | 51 | """Finetune/evaluate.""" 52 | finetune(train_valid_datasets_provider, model_provider, 53 | end_of_epoch_callback_provider=metrics_func_provider) 54 | 55 | 56 | def main(): 57 | args = get_args() 58 | 59 | if args.task == 'MNLI': 60 | 61 | num_classes = 3 62 | from tasks.glue.mnli import MNLIDataset as Dataset 63 | 64 | def name_from_datapath(datapath): 65 | return datapath.split('MNLI')[-1].strip( 66 | '.tsv').strip('/').replace('_', '-') 67 | 68 | elif args.task == 'QQP': 69 | 70 | num_classes = 2 71 | from tasks.glue.qqp import QQPDataset as Dataset 72 | 73 | def name_from_datapath(datapath): 
74 | return datapath.split('QQP')[-1].strip( 75 | '.tsv').strip('/').replace('_', '-') 76 | 77 | else: 78 | raise NotImplementedError('GLUE task {} is not implemented.'.format( 79 | args.task)) 80 | 81 | glue_classification(num_classes, Dataset, name_from_datapath) 82 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """MNLI dataset.""" 4 | 5 | from megatron import print_rank_0 6 | from tasks.data_utils import clean_text 7 | from .data import GLUEAbstractDataset 8 | 9 | 10 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 11 | 12 | 13 | class MNLIDataset(GLUEAbstractDataset): 14 | 15 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 16 | test_label='contradiction'): 17 | self.test_label = test_label 18 | super().__init__('MNLI', name, datapaths, 19 | tokenizer, max_seq_length) 20 | 21 | def process_samples_from_single_path(self, filename): 22 | """"Implement abstract method.""" 23 | print_rank_0(' > Processing {} ...'.format(filename)) 24 | 25 | samples = [] 26 | total = 0 27 | first = True 28 | is_test = False 29 | with open(filename, 'r') as f: 30 | for line in f: 31 | row = line.strip().split('\t') 32 | if first: 33 | first = False 34 | if len(row) == 10: 35 | is_test = True 36 | print_rank_0( 37 | ' reading {}, {} and {} columns and setting ' 38 | 'labels to {}'.format( 39 | row[0].strip(), row[8].strip(), 40 | row[9].strip(), self.test_label)) 41 | else: 42 | print_rank_0(' reading {} , {}, {}, and {} columns ' 43 | '...'.format( 44 | row[0].strip(), row[8].strip(), 45 | row[9].strip(), row[-1].strip())) 46 | continue 47 | 48 | text_a = clean_text(row[8].strip()) 49 | text_b = clean_text(row[9].strip()) 50 | unique_id = int(row[0].strip()) 51 | label = row[-1].strip() 52 | if is_test: 53 | label = self.test_label 54 | 55 | assert len(text_a) > 0 56 | assert len(text_b) > 0 57 | assert label in LABELS 58 | assert unique_id >= 0 59 | 60 | sample = {'text_a': text_a, 61 | 'text_b': text_b, 62 | 'label': LABELS[label], 63 | 'uid': unique_id} 64 | total += 1 65 | samples.append(sample) 66 | 67 | if total % 50000 == 0: 68 | print_rank_0(' > processed {} so far ...'.format(total)) 69 | 70 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 71 | return samples 72 | -------------------------------------------------------------------------------- /tasks/latency/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
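# Worked example (illustrative) for wikitext_detokenizer() defined below:
#   input : "the robot @-@ like cat , however , was fast"
#   output: "the robot-like cat, however, was fast"
# These helpers undo the extra spaces and "@" separators found in the
# tokenized PTB / WikiText / LAMBADA releases before the text is scored.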
2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? ") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. 
We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Model evaluation""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/msdp/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
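# Illustrative invocation of the F1 evaluation task (hypothetical paths; the
# usual Megatron model/tokenizer arguments are omitted for brevity, although
# initialize_megatron() below still requires them):
#   torchrun --nproc_per_node=1 tasks/msdp/main.py \
#       --task MSDP-EVAL-F1 \
#       --guess-file output/generated_responses.txt \
#       --answer-file data/golden_responses.txt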
2 | 3 | """Run multi-stage dialogue prompting (MSDP).""" 4 | 5 | import os 6 | import sys 7 | sys.path.append(os.path.abspath(os.path.join( 8 | os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir))) 9 | from megatron import get_args 10 | from megatron.initialize import initialize_megatron 11 | 12 | 13 | def get_tasks_args(parser): 14 | """Provide extra arguments required for tasks.""" 15 | group = parser.add_argument_group(title='tasks') 16 | 17 | # parameters for the knowledgeable dialogue generation 18 | group.add_argument('--task', type=str, required=True, 19 | help='Task name.') 20 | group.add_argument("--sample-input-file", type=str, default=None, 21 | help='Get input from file instead of interactive mode, ' 22 | 'each line is an input.') 23 | group.add_argument("--sample-output-file", type=str, default=None, 24 | help='Output file got from --sample-input-file') 25 | group.add_argument('--prompt-file', type=str, default=None, 26 | help='prompting file') 27 | group.add_argument('--prompt-type', type=str, default=None, 28 | choices=['knowledge', 'response'], 29 | help='prompt type (knowledge or response)') 30 | group.add_argument('--num-prompt-examples', type=int, default=10, 31 | help='number of prompt examples') 32 | group.add_argument('--guess-file', type=str, default=None, 33 | help='datapath for generated sentences') 34 | group.add_argument('--answer-file', type=str, default=None, 35 | help='datapath for golden sentences') 36 | group.add_argument('--out-seq-length', type=int, default=100, 37 | help='output sequence length') 38 | group.add_argument('--api-prompt', default=False, action="store_true", 39 | help='setup model api for prompting') 40 | group.add_argument('--megatron-api-url', type=str, default=None, 41 | help='url of the megatron api') 42 | 43 | return parser 44 | 45 | 46 | if __name__ == '__main__': 47 | 48 | initialize_megatron(extra_args_provider=get_tasks_args) 49 | 50 | args = get_args() 51 | 52 | if args.num_layers_per_virtual_pipeline_stage is not None: 53 | print("Interleaved pipeline schedule is not yet supported for downstream tasks.") 54 | exit() 55 | 56 | if args.task == 'MSDP-PROMPT': 57 | from tasks.msdp.prompt import main 58 | 59 | elif args.task == 'MSDP-EVAL-F1': 60 | from tasks.msdp.evaluate import main 61 | 62 | else: 63 | raise NotImplementedError('Task {} is not implemented.'.format( 64 | args.task)) 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /tasks/msdp/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | # The following code is adapted from 3 | # https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 4 | # which is licensed under the MIT license. More details on the license can be 5 | # found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. 6 | 7 | """Provides standard metric evaluations for dialog.""" 8 | 9 | from collections import Counter 10 | from typing import List 11 | import numpy as np 12 | import re 13 | 14 | re_art = re.compile(r'\b(a|an|the)\b') 15 | re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') 16 | 17 | 18 | def normalize_answer(s): 19 | """ 20 | Lower text and remove punctuation, articles and extra whitespace. 21 | """ 22 | s = s.lower() 23 | s = re_punc.sub(' ', s) 24 | s = re_art.sub(' ', s) 25 | s = ' '.join(s.split()) 26 | return s 27 | 28 | 29 | class F1Metric: 30 | """ 31 | Helper class which computes token-level F1. 
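    Worked example (illustrative): with guess "the cat sat" and answer
    "a cat sat down", normalize_answer() drops the articles, giving token
    bags {cat, sat} and {cat, sat, down}; precision = 2/2 = 1.0,
    recall = 2/3, so F1 = 2 * 1.0 * (2/3) / (1.0 + 2/3) = 0.8.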
32 | """ 33 | 34 | @staticmethod 35 | def _prec_recall_f1_score(pred_items, gold_items): 36 | """ 37 | Compute precision, recall and f1 given a set of gold and prediction items. 38 | :param pred_items: iterable of predicted values 39 | :param gold_items: iterable of gold values 40 | :return: tuple (p, r, f1) for precision, recall, f1 41 | """ 42 | common = Counter(gold_items) & Counter(pred_items) 43 | num_same = sum(common.values()) 44 | if num_same == 0: 45 | return 0, 0, 0 46 | precision = 1.0 * num_same / len(pred_items) 47 | recall = 1.0 * num_same / len(gold_items) 48 | f1 = (2 * precision * recall) / (precision + recall) 49 | return precision, recall, f1 50 | 51 | @staticmethod 52 | def compute_each_pair(guess: str, answer: str): 53 | if answer == "": 54 | return None, None, None 55 | if guess == "": 56 | return 0, 0, 0 57 | g_tokens = normalize_answer(guess).split() 58 | a_tokens = normalize_answer(answer).split() 59 | 60 | precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) 61 | return precision, recall, f1 62 | 63 | @staticmethod 64 | def compute_all_pairs(guesses: List[str], answers: List[str]): 65 | # additional augment: 66 | assert len(guesses) == len(answers) 67 | 68 | precision_list, recall_list, f1_list = [], [], [] 69 | for guess, answer in zip(guesses, answers): 70 | precision, recall, f1 = F1Metric.compute_each_pair(guess, answer) 71 | if precision is None or recall is None or f1 is None: 72 | continue 73 | precision_list.append(precision) 74 | recall_list.append(recall) 75 | f1_list.append(f1) 76 | 77 | return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) 78 | -------------------------------------------------------------------------------- /tasks/orqa/README.md: -------------------------------------------------------------------------------- 1 | ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering 2 | 3 | Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408). 4 | 5 | ## Retriever Training 6 | 7 | #### Unsupervised pretraining 8 | 1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. We construct two datasets, one with the title of every document and another with the body. 9 | 10 |
11 | python tools/preprocess_data.py \
12 |     --input /path/to/corpus.json \
13 |     --json-keys text title \
14 |     --split-sentences \
15 |     --tokenizer-type BertWordPieceLowerCase \
16 |     --vocab-file /path/to/vocab.txt \
17 |     --output-prefix corpus_indexed \
18 |     --workers 10
19 | 
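
For reference, `--json-keys text title` means each line of the loose-JSON corpus is a JSON object carrying those two keys. A minimal sketch of producing such an input (the documents themselves are made-up placeholders):

```python
# Writes a tiny two-document corpus in the loose-JSON (one object per line)
# layout that tools/preprocess_data.py consumes.
import json

docs = [
    {"title": "Sample article A", "text": "First sentence of A. Second sentence of A."},
    {"title": "Sample article B", "text": "Only sentence of B."},
]
with open("corpus.json", "w") as f:
    for doc in docs:
        f.write(json.dumps(doc) + "\n")
```
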
20 | 21 | 2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs a single GPU 217M parameter biencoder model for ICT retriever training. Single GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script uses a pretrained BERT model and we use a total of batch size of 4096 for the ICT training. 22 | 23 | 3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf). 24 | 25 | #### Supervised finetuning 26 | 27 | 1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example for how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top [DPR training](https://arxiv.org/abs/2004.04906). 28 | 29 | 2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model. 30 | 31 | More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408). 32 | 33 | ## Reader Training 34 | 35 | The reader component will be available soon. 36 | 37 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron import get_args, print_rank_0 6 | from megatron.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tasks/pruning/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? ") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tasks/pruning/exclude_layers.py: -------------------------------------------------------------------------------- 1 | # 843m 2 | # 96 prunable layers 3 | exclude_layers_12 = [ 4 | "module.language_model.encoder.layers.23.mlp.dense_h_to_4h", 5 | "module.language_model.encoder.layers.23.mlp.dense_4h_to_h", 6 | "module.language_model.encoder.layers.7.mlp.dense_h_to_4h", 7 | "module.language_model.encoder.layers.0.mlp.dense_h_to_4h", 8 | "module.language_model.encoder.layers.22.mlp.dense_h_to_4h", 9 | "module.language_model.encoder.layers.8.self_attention.query_key_value", 10 | "module.language_model.encoder.layers.7.mlp.dense_4h_to_h", 11 | "module.language_model.encoder.layers.19.mlp.dense_h_to_4h", 12 | "module.language_model.encoder.layers.23.self_attention.query_key_value", 13 | "module.language_model.encoder.layers.21.mlp.dense_h_to_4h", 14 | "module.language_model.encoder.layers.20.mlp.dense_h_to_4h", 15 | "module.language_model.encoder.layers.0.mlp.dense_4h_to_h", 16 | ] 17 | 18 | # 8b 19 | # 128 prunable layers 20 | exclude_layers_13 = [ 21 | "module.language_model.encoder.layers.31.mlp.dense_h_to_4h", 22 | "module.language_model.encoder.layers.31.mlp.dense_4h_to_h", 23 | "module.language_model.encoder.layers.30.self_attention.query_key_value", 24 | "module.language_model.encoder.layers.30.mlp.dense_h_to_4h", 25 | 
"module.language_model.encoder.layers.29.mlp.dense_h_to_4h", 26 | "module.language_model.encoder.layers.28.mlp.dense_h_to_4h", 27 | "module.language_model.encoder.layers.30.mlp.dense_4h_to_h", 28 | "module.language_model.encoder.layers.27.mlp.dense_h_to_4h", 29 | "module.language_model.encoder.layers.22.self_attention.query_key_value", 30 | "module.language_model.encoder.layers.25.mlp.dense_h_to_4h", 31 | "module.language_model.encoder.layers.26.self_attention.query_key_value", 32 | "module.language_model.encoder.layers.26.mlp.dense_h_to_4h", 33 | "module.language_model.encoder.layers.31.self_attention.query_key_value", 34 | ] 35 | 36 | exclude_layers_1000 = ["mlp.dense_4h_to_h"] -------------------------------------------------------------------------------- /tasks/pruning/layerwrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | # Define WrappedGPT class 6 | class WrappedGPT: 7 | """ 8 | This class wraps a GPT layer for specific operations. 9 | """ 10 | 11 | def __init__(self, layer, layer_id=0, layer_name="none"): 12 | self.layer = layer 13 | self.dev = self.layer.weight.device 14 | self.rows = layer.weight.data.shape[0] 15 | self.columns = layer.weight.data.shape[1] 16 | 17 | self.scaler_row = torch.zeros((self.columns), device=self.dev) 18 | self.nsamples = 0 19 | 20 | self.layer_id = layer_id 21 | self.layer_name = layer_name 22 | 23 | def add_batch(self, inp, out): 24 | if len(inp.shape) == 3: 25 | inp = inp.reshape((-1, inp.shape[-1])) 26 | if len(inp.shape) == 2: 27 | inp = inp.unsqueeze(0) 28 | tmp = inp.shape[0] 29 | if isinstance(self.layer, nn.Linear): 30 | if len(inp.shape) == 3: 31 | inp = inp.reshape((-1, inp.shape[-1])) 32 | inp = inp.t() 33 | 34 | self.scaler_row *= self.nsamples / (self.nsamples+tmp) 35 | self.nsamples += tmp 36 | 37 | inp = inp.type(torch.float32) 38 | self.scaler_row += torch.norm(inp, p=2, dim=1).squeeze(0) ** 2 / self.nsamples -------------------------------------------------------------------------------- /tasks/pruning/sparsity/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/tasks/pruning/sparsity/__init__.py -------------------------------------------------------------------------------- /tasks/pruning/sparsity/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/tasks/pruning/sparsity/utils/__init__.py -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Race.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.multiple_choice import MultipleChoice 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from tasks.race.data import RaceDataset 12 | from megatron.arguments import core_transformer_config_from_args 13 | 14 | 15 | def train_valid_datasets_provider(): 16 | """Provide train and validation datasets.""" 17 | args = get_args() 18 | tokenizer = get_tokenizer() 19 | 20 | train_dataset = RaceDataset('training', args.train_data, 21 | tokenizer, args.seq_length) 22 | valid_dataset = RaceDataset('validation', args.valid_data, 23 | tokenizer, args.seq_length) 24 | 25 | return train_dataset, valid_dataset 26 | 27 | 28 | def model_provider(pre_process=True, post_process=True): 29 | """Build the model.""" 30 | config = core_transformer_config_from_args(get_args()) 31 | print_rank_0('building multichoice model for RACE ...') 32 | model = MultipleChoice(config=config, 33 | num_tokentypes=2, 34 | pre_process=pre_process, 35 | post_process=post_process) 36 | 37 | return model 38 | 39 | 40 | def metrics_func_provider(): 41 | """Privde metrics callback function.""" 42 | args = get_args() 43 | tokenizer = get_tokenizer() 44 | 45 | def single_dataset_provider(datapath): 46 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 47 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 48 | 49 | return accuracy_func_provider(single_dataset_provider) 50 | 51 | 52 | def main(): 53 | 54 | finetune(train_valid_datasets_provider, model_provider, 55 | end_of_epoch_callback_provider=metrics_func_provider) 56 | -------------------------------------------------------------------------------- /tasks/vision/classification/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Vision-classification finetuning/evaluation.""" 4 | 5 | import torch.nn.functional as F 6 | from functools import partial 7 | from megatron import get_args, get_timers 8 | from megatron import print_rank_0 9 | from megatron.model.vision.classification import VitClassificationModel 10 | from megatron.data.vit_dataset import build_train_valid_datasets 11 | from tasks.vision.classification.eval_utils import accuracy_func_provider 12 | from tasks.vision.finetune_utils import finetune 13 | from megatron.utils import average_losses_across_data_parallel_group 14 | 15 | 16 | def classification(): 17 | def train_valid_datasets_provider(): 18 | """Build train and validation dataset.""" 19 | args = get_args() 20 | 21 | train_ds, valid_ds = build_train_valid_datasets( 22 | data_path=args.data_path, 23 | image_size=(args.img_h, args.img_w), 24 | ) 25 | return train_ds, valid_ds 26 | 27 | def model_provider(pre_process=True, post_process=True): 28 | """Build the model.""" 29 | args = get_args() 30 | 31 | print_rank_0("building classification model for ImageNet ...") 32 | 33 | return VitClassificationModel(num_classes=args.num_classes, finetune=True, 34 | pre_process=pre_process, post_process=post_process) 35 | 36 | def process_batch(batch): 37 | """Process batch and produce inputs for the model.""" 38 | images = batch[0].cuda().contiguous() 39 | labels = batch[1].cuda().contiguous() 40 | return images, labels 41 | 42 | def cross_entropy_loss_func(labels, output_tensor): 43 | logits = output_tensor 44 | 45 | # Cross-entropy loss. 46 | loss = F.cross_entropy(logits.contiguous().float(), labels) 47 | 48 | # Reduce loss for logging. 49 | averaged_loss = average_losses_across_data_parallel_group([loss]) 50 | 51 | return loss, {'lm loss': averaged_loss[0]} 52 | 53 | def _cross_entropy_forward_step(batch, model): 54 | """Simple forward step with cross-entropy loss.""" 55 | timers = get_timers() 56 | 57 | # Get the batch. 58 | timers("batch generator", log_level=2).start() 59 | try: 60 | batch_ = next(batch) 61 | except BaseException: 62 | batch_ = batch 63 | images, labels = process_batch(batch_) 64 | timers("batch generator").stop() 65 | 66 | # Forward model. 67 | output_tensor = model(images) 68 | 69 | return output_tensor, partial(cross_entropy_loss_func, labels) 70 | 71 | """Finetune/evaluate.""" 72 | finetune( 73 | train_valid_datasets_provider, 74 | model_provider, 75 | forward_step=_cross_entropy_forward_step, 76 | end_of_epoch_callback_provider=accuracy_func_provider, 77 | ) 78 | 79 | def main(): 80 | classification() 81 | 82 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Main tasks functionality.""" 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.append( 9 | os.path.abspath( 10 | os.path.join( 11 | os.path.join(os.path.dirname(__file__), os.path.pardir), 12 | os.path.pardir, 13 | ) 14 | ) 15 | ) 16 | from megatron import get_args 17 | from megatron.initialize import initialize_megatron 18 | 19 | def get_tasks_args(parser): 20 | """Provide extra arguments required for tasks.""" 21 | group = parser.add_argument_group(title="tasks") 22 | 23 | group.add_argument('--task', type=str, default='segment', 24 | choices=['classify', 'segment_setr', 'segment_segformer'], 25 | help='task name.') 26 | group.add_argument("--epochs", type=int, default=None, 27 | help="Number of finetunning epochs. Zero results in " 28 | "evaluation only.") 29 | group.add_argument('--pretrained-checkpoint-type', type=str, default='default', 30 | choices=['default', 'external', 'constrastive'], 31 | help='Type of pretrained checkpoint') 32 | group.add_argument("--pretrained-checkpoint", type=str, default=None, 33 | help="Pretrained checkpoint used for finetunning.") 34 | group.add_argument('--seg-stride', type=int, default=None, 35 | help='sliding window stride during evaluation') 36 | return parser 37 | 38 | 39 | if __name__ == "__main__": 40 | 41 | initialize_megatron(extra_args_provider=get_tasks_args) 42 | args = get_args() 43 | 44 | if args.task == 'classify': 45 | from tasks.vision.classification.classification import main 46 | main() 47 | elif args.task == 'segment_setr': 48 | from tasks.vision.segmentation.finetune_setr import main 49 | main() 50 | elif args.task == 'segment_segformer': 51 | from tasks.vision.segmentation.finetune_segformer import main 52 | main() 53 | 54 | -------------------------------------------------------------------------------- /tasks/vision/segmentation/seg_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | import math 3 | import einops 4 | import torch 5 | import apex 6 | import torch.nn.functional as F 7 | from megatron import get_args 8 | from megatron.model.module import MegatronModule 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3, mit_b5 11 | from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead 12 | 13 | 14 | class SetrSegmentationModel(MegatronModule): 15 | 16 | def __init__(self, 17 | num_classes, 18 | pre_process=True, 19 | post_process=True): 20 | super(SetrSegmentationModel, self).__init__() 21 | args = get_args() 22 | assert post_process & pre_process 23 | self.hidden_size = args.hidden_size 24 | self.num_classes = num_classes 25 | self.backbone = VitBackbone( 26 | pre_process=pre_process, 27 | post_process=post_process, 28 | class_token=False, 29 | post_layer_norm=False, 30 | drop_path_rate=0.1 31 | ) 32 | 33 | self.head = SetrSegmentationHead( 34 | self.hidden_size, 35 | self.num_classes 36 | ) 37 | 38 | def set_input_tensor(self, input_tensor): 39 | """See megatron.model.transformer.set_input_tensor()""" 40 | pass 41 | 42 | def forward(self, input): 43 | # [b hw c] 44 | hidden_states = self.backbone(input) 45 | result_final = self.head(hidden_states) 46 | return result_final 47 | 48 | 49 | class SegformerSegmentationModel(MegatronModule): 50 | 51 | def __init__(self, 52 | num_classes, 53 | pre_process=True, 54 | post_process=True): 55 | super(SegformerSegmentationModel, self).__init__() 56 | args = get_args() 57 | self.hidden_size = args.hidden_size 58 | self.num_classes = num_classes 59 | self.pre_process = pre_process 60 | self.post_process = post_process 61 | 62 | self.backbone = mit_b5() 63 | self.head = SegformerSegmentationHead( 64 | feature_strides=[4, 8, 16, 32], 65 | in_channels=[64, 128, 320, 512], 66 | embedding_dim=768, 67 | dropout_ratio=0.1 68 | ) 69 | 70 | def set_input_tensor(self, input_tensor): 71 | """See megatron.model.transformer.set_input_tensor()""" 72 | pass 73 | 74 | def forward(self, input): 75 | # [b hw c] 76 | hidden_states = self.backbone(input) 77 | hidden_states = self.head(hidden_states) 78 | return hidden_states 79 | 80 | -------------------------------------------------------------------------------- /tasks/vision/segmentation/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import numpy as np 4 | from megatron import get_args 5 | 6 | def slidingcrops(img, mask): 7 | # img: [b c h w] 8 | # mask: [b h w] 9 | args = get_args() 10 | assert args.img_h == args.img_w 11 | crop_size = args.img_h 12 | stride = args.seg_stride 13 | ignore_index = args.ignore_index 14 | n, c, h, w = img.shape 15 | assert h >= crop_size 16 | assert w >= crop_size 17 | long_size = max(h, w) 18 | 19 | img_slices, mask_slices, slices_info = [], [], [] 20 | if long_size > crop_size: 21 | assert stride <= crop_size 22 | h_step_num = int(math.ceil((h - crop_size) / float(stride))) + 1 23 | w_step_num = int(math.ceil((w - crop_size) / float(stride))) + 1 24 | for yy in range(h_step_num): 25 | for xx in range(w_step_num): 26 | sy, sx = yy * stride, xx * stride 27 | ey, ex = sy + crop_size, sx + crop_size 28 | img_sub = img[:, :, sy: ey, sx: ex] 29 | mask_sub = mask[:, sy: ey, sx: ex] 30 | 31 | # padding 32 | sub_h, sub_w = img_sub.shape[2:] 33 | pad_h = max(crop_size - sub_h, 0) 34 | pad_w = max(crop_size - sub_w, 0) 35 | img_sub = torch.nn.functional.pad(img_sub, 
pad=(0, pad_w, 0, pad_h), value=ignore_index) 36 | mask_sub = torch.nn.functional.pad(mask_sub, pad=(0, pad_w, 0, pad_h)) 37 | 38 | img_slices.append(img_sub) 39 | mask_slices.append(mask_sub) 40 | slices_info.append([sy, ey, sx, ex, sub_h, sub_w]) 41 | 42 | return torch.cat(img_slices), torch.cat(mask_slices), slices_info, (h, w) 43 | else: 44 | return img, mask, [[0, h, 0, w, h, w]], (h, w) 45 | 46 | 47 | def slidingjoins(preds, probs, labels, slices_info, img_size): 48 | args = get_args() 49 | num_slices = len(slices_info) 50 | 51 | if num_slices == 1: 52 | return preds, labels 53 | 54 | h, w = img_size 55 | split_size = args.micro_batch_size 56 | 57 | preds_split = torch.split(preds, split_size) 58 | probs_split = torch.split(probs, split_size) 59 | labels_split = torch.split(labels, split_size) 60 | 61 | assert(len(preds_split) == num_slices) 62 | 63 | total_max_probs = torch.zeros((split_size, h, w), dtype=torch.float, device='cuda') 64 | total_preds = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda') 65 | total_labels = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda') 66 | 67 | for i in range(num_slices): 68 | sy, ey, sx, ex, sub_h, sub_w = slices_info[i] 69 | assert sy + sub_h <= h 70 | assert sx + sub_w <= w 71 | curr_max_probs = total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] 72 | curr_preds = total_preds[:, sy:sy + sub_h, sx:sx + sub_w] 73 | 74 | local_max_probs = probs_split[i][:, :sub_h, : sub_w] 75 | local_preds = preds_split[i][:, :sub_h, :sub_w] 76 | 77 | result_max_probs = torch.maximum(curr_max_probs, local_max_probs) 78 | result_preds = torch.where(curr_max_probs >= local_max_probs, curr_preds, local_preds) 79 | 80 | total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] = result_max_probs 81 | total_preds[:, sy:sy + sub_h, sx:sx + sub_w] = result_preds 82 | total_labels[:, sy:sy + sub_h, sx:sx + sub_w] = labels_split[i][0, :sub_h, :sub_w] 83 | 84 | return total_preds, total_labels 85 | 86 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? 
") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tool_apply_sparsity.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description='Trim Lana checkpoint') 5 | parser.add_argument('--ckpt_dir', type=str, default='output/checkpoints/gpt3-843m-mask-only-simple-no-async-grad/train_iters_2000/ckpt/iter_0002000', help='Input checkpoint') 6 | args = parser.parse_args() 7 | 8 | def apply_sparsity(input, output): 9 | ckpt = torch.load(input, map_location='cpu') 10 | new_encoder_state_dict = {} 11 | mask_options = torch.zeros(1, 6, 4, dtype=torch.float32) 12 | mask_options[:, 0, :].data += torch.tensor([1, 1, 0, 0], dtype=torch.float32) 13 | mask_options[:, 1, :].data += torch.tensor([1, 0, 1, 0], dtype=torch.float32) 14 | mask_options[:, 2, :].data += torch.tensor([1, 0, 0, 1], dtype=torch.float32) 15 | mask_options[:, 3, :].data += torch.tensor([0, 1, 1, 0], dtype=torch.float32) 16 | mask_options[:, 4, :].data += torch.tensor([0, 1, 0, 1], dtype=torch.float32) 17 | mask_options[:, 5, :].data += torch.tensor([0, 0, 1, 1], dtype=torch.float32) 18 | 19 | for k,v in ckpt['model']['language_model']['encoder'].items(): 20 | if 'mask' not in k: 21 | new_encoder_state_dict[k] = v 22 | print("Save weights:", k) 23 | 24 | for k,v in ckpt['model']['language_model']['encoder'].items(): 25 | if '.diff_mask.gate' in k: 26 | gate = ckpt['model']['language_model']['encoder'][k].float() 27 | runtime_mask = ckpt['model']['language_model']['encoder'][k.replace('diff_mask.gate', 'mask')].float() 28 | winner_mask = mask_options[torch.arange(mask_options.shape[0]), gate.argmax(dim=-1)].view(*runtime_mask.shape) 29 | # set the type of winner mask the same as runtime_mask 30 | winner_mask = winner_mask.type_as(runtime_mask) 31 | new_encoder_state_dict[k.replace('diff_mask.gate', 'weight')] *= winner_mask 32 | print("freeze mask:", k.replace('diff_mask.gate', 'mask')) 33 | 34 | ckpt['model']['language_model']['encoder'] = new_encoder_state_dict 35 | print(ckpt['model']['language_model']['encoder'].keys()) 36 | torch.save(ckpt, output) 37 | 38 | import os 39 | import glob 40 | 41 | if args.ckpt_dir.endswith('/'): 42 | args.ckpt_dir = args.ckpt_dir[:-1] 43 | splited_dir = args.ckpt_dir.split('/') 44 | output_dir = os.path.join('/'.join(splited_dir[:-1]), 'iter_0000001') 45 | 
print(f"output_dir: {output_dir}") 46 | os.makedirs(output_dir, exist_ok=True) 47 | mp_rank_dirs = glob.glob(os.path.join(args.ckpt_dir, "mp_rank_*")) 48 | for mp_rank_dir in mp_rank_dirs: 49 | ckpt_file = os.path.join(mp_rank_dir, "model_optim_rng.pt") 50 | output_file = ckpt_file.replace(args.ckpt_dir, output_dir) 51 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 52 | apply_sparsity(ckpt_file, output_file) 53 | 54 | iteration_file = os.path.join( *splited_dir[:-1], 'latest_checkpointed_iteration.txt') 55 | print(iteration_file) 56 | with open(iteration_file, 'w') as f: 57 | f.write("1") 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /tool_compress_mask.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='Mask Compression') 7 | parser.add_argument('--mask_ckpt', type=str, help='path to the mask checkpoint') 8 | parser.add_argument('--output', type=str, help='output path') 9 | 10 | args = parser.parse_args() 11 | 12 | if __name__=='__main__': 13 | mask_ckpt = torch.load(args.mask_ckpt, map_location='cpu') 14 | compressed_mask = {} 15 | for k, mask in mask_ckpt.items(): 16 | # Compress with np.packbits 17 | print(f"Compressing {k}...") 18 | mask = mask.cpu().numpy().astype(bool) 19 | mask = np.packbits(mask) 20 | compressed_mask[k] = mask 21 | np.savez_compressed(args.output, **compressed_mask) 22 | 23 | -------------------------------------------------------------------------------- /tool_compute_mask_hf.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import torch 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | from importlib.metadata import version 7 | 8 | import time 9 | import torch 10 | import torch.nn as nn 11 | 12 | # Import get_loaders function from data module within the same directory 13 | 14 | from collections import defaultdict 15 | import fnmatch 16 | 17 | # Code adapted from https://github.com/IST-DASLab/sparsegpt/blob/master/datautils.py 18 | 19 | import numpy as np 20 | import random 21 | import torch 22 | from datasets import load_dataset 23 | 24 | print('torch', version('torch')) 25 | print('transformers', version('transformers')) 26 | print('accelerate', version('accelerate')) 27 | print('# of gpus: ', torch.cuda.device_count()) 28 | 29 | def get_llm(model_name, cache_dir="llm_weights"): 30 | model = AutoModelForCausalLM.from_pretrained( 31 | model_name, 32 | torch_dtype=torch.float16, 33 | cache_dir=cache_dir, 34 | device_map="cpu" 35 | ) 36 | model.seqlen = model.config.max_position_embeddings 37 | return model 38 | 39 | def main(): 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--dense', type=str, help='Dense model') 42 | parser.add_argument('--sparse', type=str, help='Sparse model') 43 | parser.add_argument('--save', type=str, help='Save as') 44 | parser.add_argument("--cache_dir", default="llm_weights", type=str ) 45 | args = parser.parse_args() 46 | 47 | # Setting seeds for reproducibilit 48 | with torch.no_grad(): 49 | dense = get_llm(args.dense, args.cache_dir) 50 | sparse = get_llm(args.sparse, args.cache_dir) 51 | 52 | mask_ckpt = {} 53 | for (name_dense, param_dense), (name_sparse, param_sparse) in zip(dense.named_parameters(), sparse.named_parameters()): 54 | sparsity = 
(param_sparse==0).float().mean().item() 55 | print(f"{name_sparse} - sparsity {sparsity:.4f}") 56 | # Check 2:4 57 | if abs(sparsity-0.5)<0.0001: 58 | mask = (param_sparse!=0).float() 59 | assert torch.equal(mask * param_dense, param_sparse) 60 | mask_ckpt[name_sparse+'.mask'] = mask 61 | else: 62 | # assert equal of dense and sparse_weight 63 | assert torch.equal(param_dense, param_sparse) 64 | 65 | torch.save(mask_ckpt, args.save) 66 | print(mask_ckpt.keys()) 67 | print(f"Mask saved as {args.save}") 68 | 69 | 70 | 71 | if __name__ == '__main__': 72 | main() -------------------------------------------------------------------------------- /tool_trim_learnable_sparsity.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description='Trim Lana checkpoint') 5 | parser.add_argument('--ckpt_dir', type=str, default='output/checkpoints/llama-mask-only/train_iters_2000/ckpt/iter_0002000', help='Input checkpoint') 6 | args = parser.parse_args() 7 | 8 | def trim_ckpt(input, output): 9 | ckpt = torch.load(input, map_location='cpu') 10 | new_encoder_state_dict = {} 11 | mask_options = torch.zeros(1, 6, 4, dtype=torch.float32) 12 | mask_options[:, 0, :].data += torch.tensor([1, 1, 0, 0], dtype=torch.float32) 13 | mask_options[:, 1, :].data += torch.tensor([1, 0, 1, 0], dtype=torch.float32) 14 | mask_options[:, 2, :].data += torch.tensor([1, 0, 0, 1], dtype=torch.float32) 15 | mask_options[:, 3, :].data += torch.tensor([0, 1, 1, 0], dtype=torch.float32) 16 | mask_options[:, 4, :].data += torch.tensor([0, 1, 0, 1], dtype=torch.float32) 17 | mask_options[:, 5, :].data += torch.tensor([0, 0, 1, 1], dtype=torch.float32) 18 | 19 | for k,v in ckpt['model']['language_model']['encoder'].items(): 20 | if '.diff_mask.gate' in k: 21 | gate = ckpt['model']['language_model']['encoder'][k].float() 22 | runtime_mask = ckpt['model']['language_model']['encoder'][k.replace('diff_mask.gate', 'mask')].float() 23 | winner_mask = mask_options[torch.arange(mask_options.shape[0]), gate.argmax(dim=-1)].view(*runtime_mask.shape) 24 | # set the type of winner mask the same as runtime_mask 25 | winner_mask = winner_mask.type_as(runtime_mask) 26 | new_encoder_state_dict[k.replace('diff_mask.gate', 'mask')] = winner_mask 27 | print("save winner mask:", k.replace('diff_mask.gate', 'mask')) 28 | continue 29 | 30 | if '.mask' in k: continue 31 | if '.mask_options' in k: continue 32 | 33 | new_encoder_state_dict[k] = v 34 | 35 | ckpt['model']['language_model']['encoder'] = new_encoder_state_dict 36 | print(ckpt['model']['language_model']['encoder'].keys()) 37 | torch.save(ckpt, output) 38 | 39 | 40 | import os 41 | import glob 42 | 43 | # Create output directory 44 | splited_dir = args.ckpt_dir.split('/') 45 | output_dir = os.path.join('/'.join(splited_dir[:-1]), 'release') 46 | print(f"output_dir: {output_dir}") 47 | os.makedirs(output_dir, exist_ok=True) 48 | 49 | # Trim the checkpoints 50 | mp_rank_dirs = glob.glob(os.path.join(args.ckpt_dir, "mp_rank_*")) 51 | for mp_rank_dir in mp_rank_dirs: 52 | ckpt_file = os.path.join(mp_rank_dir, "model_optim_rng.pt") 53 | output_file = ckpt_file.replace(args.ckpt_dir, output_dir) 54 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 55 | print(f"Trim {ckpt_file} to {output_file}") 56 | trim_ckpt(ckpt_file, output_file) 57 | 58 | # update the latest iteration to "release" 59 | iteration_file = os.path.join( *splited_dir[:-1], 'latest_checkpointed_iteration.txt') 60 | 
print(iteration_file) 61 | with open(iteration_file, 'w') as f: 62 | f.write("release") 63 | 64 | 65 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | import os,sys 4 | import pathlib 5 | import logging 6 | 7 | sys.path.append(os.path.join(pathlib.Path(__file__).parent.resolve(),'umct')) -------------------------------------------------------------------------------- /tools/autoformat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 4 | 5 | # for now we just format core 6 | 7 | black ${SCRIPT_DIR}/../megatron/core 8 | isort ${SCRIPT_DIR}/../megatron/core 9 | -------------------------------------------------------------------------------- /tools/bert_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder 4 | -------------------------------------------------------------------------------- /tools/bert_embedding/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from megatron import get_args, get_tokenizer 7 | from megatron.data.bert_dataset import build_training_sample 8 | 9 | 10 | class BertEmbeddingDataset(torch.utils.data.Dataset): 11 | '''Dataset to convert a text dataset to Bert tokens.''' 12 | 13 | def __init__(self, text_dataset, max_seq_length): 14 | 15 | super().__init__() 16 | 17 | args = get_args() 18 | 19 | # Dataset, tokenizer. 20 | self.text_dataset = text_dataset 21 | self.bert_tokenizer = get_tokenizer() 22 | 23 | # Params to store. 24 | self.max_seq_length = max_seq_length 25 | self.seed = args.seed 26 | self.masked_lm_prob = args.mask_prob 27 | 28 | # Vocab stuff. 29 | self.vocab_id_list = list(self.bert_tokenizer.inv_vocab.keys()) 30 | self.vocab_id_to_token_dict = self.bert_tokenizer.inv_vocab 31 | self.cls_id = self.bert_tokenizer.cls 32 | self.sep_id = self.bert_tokenizer.sep 33 | self.mask_id = self.bert_tokenizer.mask 34 | self.pad_id = self.bert_tokenizer.pad 35 | 36 | def __len__(self): 37 | return len(self.text_dataset) 38 | 39 | def __getitem__(self, idx): 40 | 41 | # Text. 42 | text_sample = self.text_dataset[idx] 43 | text = text_sample["text"] 44 | text = text.replace("<|endoftext|>", "") 45 | 46 | # Bert/Wordpiece tokens (+truncate). 47 | bert_token_ids = self.bert_tokenizer.tokenize(text) 48 | bert_token_ids = bert_token_ids[:self.max_seq_length - 2] # cls+sep. 49 | if not bert_token_ids: 50 | bert_token_ids = [ self.bert_tokenizer.pad_id ] # hack when empty seq 51 | 52 | # Note that this rng state should be numpy and not python since 53 | # python randint is inclusive whereas the numpy one is exclusive. 54 | # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 55 | np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) 56 | 57 | # Build sample. 
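# build_training_sample (from megatron.data.bert_dataset) applies Bert masked-LM masking using the mask/pad ids, masking probability and numpy RNG set up above; the returned sample dict's "text" entry holds the resulting token ids.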
58 | sample = build_training_sample([bert_token_ids], 59 | len(bert_token_ids), 60 | len(bert_token_ids) + 2, # for cls+sep 61 | self.vocab_id_list, 62 | self.vocab_id_to_token_dict, 63 | self.cls_id, self.sep_id, 64 | self.mask_id, self.pad_id, 65 | self.masked_lm_prob, np_rng, 66 | binary_head=False) 67 | sample["seq_length"] = len(sample["text"]) 68 | return sample 69 | -------------------------------------------------------------------------------- /tools/bert_embedding/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "h5py", 7 | "transformers", # for huggingface bert 8 | ] 9 | 10 | for lib in required_libs: 11 | try: 12 | globals()[lib] = importlib.import_module(lib) 13 | except ImportError as e: 14 | raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") 15 | -------------------------------------------------------------------------------- /tools/download_c4.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | # English only 4 | for i in range(20): 5 | en = load_dataset("allenai/c4", data_files={'train': f'en/c4-train.{str(i).zfill(5)}-of-01024.json.gz'}, cache_dir='./assets/data', split='train') 6 | print(len(en)) 7 | 8 | # save as json files 9 | en.to_json(f'./assets/data/en/c4-train.{str(i).zfill(5)}-of-01024.json', orient='records', lines=True) -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/merge_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | 6 | sys.path.append( 7 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 8 | ) 9 | 10 | from megatron.core.datasets.indexed_dataset import ( 11 | MMapIndexedDataset, 12 | MMapIndexedDatasetBuilder, 13 | get_bin_path, 14 
| get_idx_path, 15 | ) 16 | 17 | 18 | def get_args(): 19 | parser = argparse.ArgumentParser() 20 | 21 | group = parser.add_argument_group(title="input data") 22 | group.add_argument( 23 | "--input", 24 | type=str, 25 | required=True, 26 | help="Path to directory containing all document files to merge", 27 | ) 28 | 29 | group = parser.add_argument_group(title="output data") 30 | group.add_argument( 31 | "--output-prefix", 32 | type=str, 33 | required=True, 34 | help="Path to binary output file without suffix", 35 | ) 36 | 37 | group = parser.add_argument_group(title="miscellaneous") 38 | group.add_argument( 39 | "--multimodal", 40 | action="store_true", 41 | help="Whether the datasets are assumed to be multimodal" 42 | ) 43 | 44 | args = parser.parse_args() 45 | 46 | assert os.path.isdir( 47 | args.input 48 | ), f"ERROR: {args.input} is not a directory or does not exist" 49 | 50 | assert os.path.isdir( 51 | os.path.dirname(args.output_prefix) 52 | ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" 53 | 54 | return args 55 | 56 | 57 | def main(): 58 | args = get_args() 59 | 60 | prefixes = set() 61 | for basename in os.listdir(args.input): 62 | prefix, ext = os.path.splitext(basename) 63 | 64 | if prefix in prefixes: 65 | continue 66 | 67 | if not os.path.isfile(os.path.join(args.input, basename)): 68 | continue 69 | 70 | ext_pair = ".bin" if ext == ".idx" else ".idx" 71 | assert os.path.isfile( 72 | os.path.join(args.input, prefix) + ext_pair 73 | ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" 74 | 75 | prefixes.add(prefix) 76 | 77 | builder = None 78 | for prefix in sorted(prefixes): 79 | if builder is None: 80 | dataset = MMapIndexedDataset(os.path.join(args.input, prefix), multimodal=args.multimodal) 81 | builder = MMapIndexedDatasetBuilder( 82 | get_bin_path(args.output_prefix), dtype=dataset.index.dtype, multimodal=args.multimodal 83 | ) 84 | del dataset 85 | 86 | builder.add_index(os.path.join(args.input, prefix)) 87 | 88 | builder.finalize(get_idx_path(args.output_prefix)) 89 | 90 | 91 | if __name__ == '__main__': 92 | 93 | main() 94 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | 8 | """ 9 | This code adds id to each json object in a json file. User can add prefix 10 | to the ids. 
11 | """ 12 | 13 | if __name__ == '__main__': 14 | 15 | print('parsing the arguments ...') 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 19 | ' json file where id needs to be added') 20 | parser.add_argument('--output-file', type=str, default=None, help=\ 21 | 'Output file name with id') 22 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 23 | 'Id prefix') 24 | parser.add_argument('--log-interval', type=int, default=100, 25 | help='Log interval') 26 | args = parser.parse_args() 27 | 28 | print('Adding ids to dataset ...') 29 | 30 | f_input = open(args.input_file, 'r', encoding='utf-8') 31 | f_output = open(args.output_file, 'wb') 32 | 33 | unique_ids = 1 34 | start_time = time.time() 35 | for row in f_input: 36 | each_row = json.loads(row) 37 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 38 | each_row['adlr_id'] = adlr_id_string 39 | myjson = json.dumps(each_row, ensure_ascii=False) 40 | 41 | f_output.write(myjson.encode('utf-8')) 42 | f_output.write('\n'.encode('utf-8')) 43 | 44 | if unique_ids % args.log_interval == 0: 45 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 46 | unique_ids, time.time() - start_time), flush=True) 47 | 48 | unique_ids += 1 49 | 50 | # Close the file. 51 | f_input.close() 52 | f_output.close() 53 | 54 | print('done :-)', flush=True) 55 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import json 4 | import time 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | 10 | 11 | print('grouping duplicate urls ...') 12 | 13 | input = sys.argv[1] 14 | output = sys.argv[2] 15 | if len(sys.argv) > 3: 16 | jaccard_similarity_threshold = float(sys.argv[3]) 17 | else: 18 | jaccard_similarity_threshold = 0.7 19 | 20 | url_to_index = {} 21 | index_to_urls = [] 22 | counter = 0 23 | start_time = time.time() 24 | with open(input, 'r') as f: 25 | for line in f: 26 | counter += 1 27 | myjson = json.loads(line) 28 | urls = [] 29 | for main_url in myjson.keys(): 30 | urls.append(main_url) 31 | for value in myjson[main_url]: 32 | for other_url, js in value.items(): 33 | if js >= jaccard_similarity_threshold: 34 | urls.append(other_url) 35 | current_index = -1 36 | other_indices = set() 37 | for url in urls: 38 | if url in url_to_index: 39 | if current_index == -1: 40 | current_index = url_to_index[url] 41 | elif current_index != url_to_index[url]: 42 | other_indices.add(url_to_index[url]) 43 | if current_index == -1: 44 | current_index = len(index_to_urls) 45 | index_to_urls.append(set()) 46 | for url in urls: 47 | url_to_index[url] = current_index 48 | index_to_urls[current_index].add(url) 49 | for index in other_indices: 50 | for url in index_to_urls[index]: 51 | index_to_urls[current_index].add(url) 52 | url_to_index[url] = current_index 53 | index_to_urls[index] = None 54 | 55 | if counter % 100000 == 0: 56 | print(' > processed {} lines in {} seconds ...'.format( 57 | counter, time.time() - start_time)) 58 | 59 | 60 | total_remove = 0 61 | total_remain = 0 62 | for urls in index_to_urls: 63 | if urls is not None: 64 | if len(urls) > 1: 65 | total_remove += (len(urls) - 1) 66 | total_remain += 1 67 | print('out of {} urls, only {} are unique and {} should be removed'.format( 68 | total_remove+total_remain, 
total_remain, total_remove)) 69 | 70 | with open(output, 'wb') as f: 71 | for i, urls in enumerate(index_to_urls): 72 | if urls is not None: 73 | if len(urls) > 1: 74 | myjson = json.dumps({str(i): list(urls)}, 75 | ensure_ascii=False) 76 | f.write(myjson.encode('utf-8')) 77 | f.write('\n'.encode('utf-8')) 78 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import glob 5 | import sys 6 | import json 7 | import argparse 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--json_path", type=str, default=".", 13 | help="path where all the json files are located") 14 | 15 | parser.add_argument("--output_file", type=str, default="merged_output.json", 16 | help="filename where the merged json should go") 17 | 18 | args = parser.parse_args() 19 | 20 | json_path = args.json_path 21 | out_file = args.output_file 22 | 23 | json_files = glob.glob(json_path + '/*.json') 24 | 25 | counter = 0 26 | 27 | with open(out_file, 'w') as outfile: 28 | for fname in json_files: 29 | counter += 1 30 | 31 | if counter % 1024 == 0: 32 | print("Merging at ", counter, flush=True) 33 | 34 | with open(fname, 'r') as infile: 35 | for row in infile: 36 | each_row = json.loads(row) 37 | outfile.write(row) 38 | 39 | 40 | print("Merged file", out_file, flush=True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
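# remove_group_duplicates.py keeps the first URL of each duplicate group emitted by group_duplicate_url.py and drops the documents belonging to every other URL in that group.
# Usage: python remove_group_duplicates.py <grouped_urls_file> <data_file> <output_file>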
2 | 3 | 4 | import json 5 | import time 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | url_filename = sys.argv[1] 12 | data_filename = sys.argv[2] 13 | output_filename = sys.argv[3] 14 | 15 | urls = set() 16 | with open(url_filename, 'r') as f: 17 | for line in f: 18 | myjson = json.loads(line) 19 | for key in myjson: 20 | this_urls = myjson[key] 21 | for i in range(1, len(this_urls)): 22 | urls.add(this_urls[i]) 23 | print('will be removing {} urls'.format(len(urls)), flush=True) 24 | 25 | written_docs = 0 26 | removed_docs = 0 27 | removed_chars = 0 28 | start_time = time.time() 29 | with open(output_filename, 'wb') as fout: 30 | with open(data_filename, 'r') as fin: 31 | for line in fin: 32 | try: 33 | myjson = json.loads(line) 34 | url = myjson['url'] 35 | if url in urls: 36 | print('removing', myjson) 37 | removed_docs += 1 38 | removed_chars += len(myjson['text']) 39 | continue 40 | myjson = json.dumps(myjson, ensure_ascii=False) 41 | fout.write(myjson.encode('utf-8')) 42 | fout.write('\n'.encode('utf-8')) 43 | written_docs += 1 44 | if written_docs % 10000 == 0: 45 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 46 | '| removed: {} (char: {})'.format( 47 | time.time() - start_time, 48 | written_docs, removed_docs, removed_chars)) 49 | except Exception as e: 50 | print('[SKIPPING]', line, e) 51 | 52 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 53 | '| removed: {} (char: {})'.format( 54 | time.time() - start_time, 55 | written_docs, removed_docs, removed_chars)) 56 | print('done :-)') 57 | -------------------------------------------------------------------------------- /tools/prepare_c4_megatron.sh: -------------------------------------------------------------------------------- 1 | # for i from 00000 to 00100 2 | mkdir -p assets/data/preprocessed 3 | for i in {00014..00020}; do 4 | echo "Processing ./assets/data/en/c4-train.${i}-of-01024.json" 5 | python tools/preprocess_data.py \ 6 | --input "./assets/data/en/c4-train.${i}-of-01024.json" \ 7 | --output-prefix assets/data/preprocessed/llama2_${i} \ 8 | --vocab-file ./assets/checkpoints/llama2_7b_hf/tokenizer.json \ 9 | --tokenizer-type Llama2Tokenizer \ 10 | --tokenizer-model ./assets/checkpoints/llama2_7b_hf/tokenizer.model \ 11 | --append-eod \ 12 | --workers 8 13 | done -------------------------------------------------------------------------------- /tools/retro/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .cli import retro 4 | -------------------------------------------------------------------------------- /tools/retro/cli/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | from . import retro 6 | 7 | 8 | if __name__ == "__main__": 9 | retro.init(os.environ["RETRO_WORKDIR"]) 10 | -------------------------------------------------------------------------------- /tools/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import build_db 4 | -------------------------------------------------------------------------------- /tools/retro/db/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import json 4 | import numpy as np 5 | import torch 6 | from tqdm import tqdm 7 | 8 | from megatron import get_args, print_rank_0 9 | from tools.retro.external_libs import h5py 10 | from tools.retro.utils import get_gpt_tokenizer 11 | 12 | 13 | class DBDataset(torch.utils.data.Dataset): 14 | '''Dataset for iterating chunks. 15 | 16 | Requires: 17 | - List of indexed datasets 18 | - Chunk index array, with format: 19 | [dataset_idx, doc_id, start_idx, end_idx, bert_length]) 20 | ''' 21 | 22 | def __init__(self, db_path, indexed_datasets, chunks, max_chunk_length): 23 | 24 | assert chunks.shape[1] == 5, "expected 5 columns (dataset_idx, " \ 25 | "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); " \ 26 | "found %d columns." % chunks.shape[1] 27 | 28 | self.db_path = db_path 29 | self.indexed_datasets = indexed_datasets 30 | self.chunks = chunks 31 | self.doc_chunk_map = None 32 | 33 | self.max_chunk_length = max_chunk_length 34 | self.eod_token_id = get_gpt_tokenizer().eod 35 | 36 | def __len__(self): 37 | return self.chunks.shape[0] 38 | 39 | def __getitem__(self, chunk_id): 40 | 41 | # Chunk start/end indexes. 42 | indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = \ 43 | [ value.item() for value in self.chunks[chunk_id] ] 44 | chunk_length = token_end_idx - token_start_idx 45 | indexed_dataset = self.indexed_datasets[indexed_dataset_id] 46 | 47 | # Chunk token ids. 48 | token_ids = indexed_dataset.get(doc_id, 49 | offset=token_start_idx, 50 | length=chunk_length) 51 | 52 | # Extend chunks to max_chunk_length by padding with EOD tokens. 53 | if chunk_length != self.max_chunk_length: 54 | assert chunk_length < self.max_chunk_length, "invalid chunk len." 55 | token_ids = token_ids.tolist() 56 | token_ids += [self.eod_token_id] * \ 57 | (self.max_chunk_length - chunk_length) 58 | 59 | return { 60 | "doc_id" : doc_id, 61 | "text" : np.array(token_ids, dtype=np.int64), 62 | } 63 | 64 | def load_doc_tuples(self): 65 | '''Load the dataset & document ids. 66 | 67 | Load the dataset id & document id of each chunk in the database, to 68 | be used for causality filtering during querying. 69 | ''' 70 | self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32") 71 | block_size = int(1e6) 72 | for start_idx in tqdm(range(0, len(self), block_size)): 73 | end_idx = min(len(self), start_idx + block_size) 74 | self.doc_tuples[start_idx:end_idx]=self.chunks[start_idx:end_idx,:2] 75 | -------------------------------------------------------------------------------- /tools/retro/examples/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:23.09-py3 2 | 3 | RUN pip install -U faiss-gpu 4 | 5 | RUN apt update 6 | 7 | RUN apt install -qy htop 8 | 9 | RUN pip install -U transformers 10 | 11 | RUN pip install --upgrade google-api-python-client 12 | 13 | RUN pip install sentencepiece 14 | 15 | RUN pip install h5py 16 | 17 | RUN pip install nltk 18 | 19 | RUN pip install einops 20 | -------------------------------------------------------------------------------- /tools/retro/examples/pretrain_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | 5 | unset NCCL_DEBUG 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | 8 | ######## GPT or Retro?. ######## 9 | 10 | # 0 : GPT. 11 | # 1 : Retro 12 | 13 | ADD_RETRIEVER=1 14 | 15 | ######## Megatron, Retro dirs. ######## 16 | 17 | REPO_DIR="" 18 | RETRO_WORKDIR="" 19 | 20 | ######## Data. 
######## 21 | 22 | DATA_BLEND="" 23 | 24 | ######## Args. ######## 25 | 26 | ARGS=" \ 27 | --log-interval 1 \ 28 | --use-flash-attn \ 29 | --apply-layernorm-1p \ 30 | --untie-embeddings-and-output-weights \ 31 | --disable-bias-linear \ 32 | --no-position-embedding \ 33 | --use-rotary-position-embeddings \ 34 | --rotary-percent 0.5 \ 35 | --swiglu \ 36 | --attention-dropout 0.0 \ 37 | --hidden-dropout 0.0 \ 38 | --exit-duration-in-mins 220 \ 39 | --tensor-model-parallel-size 1 \ 40 | --pipeline-model-parallel-size 1 \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 512 \ 45 | --max-position-embeddings 512 \ 46 | --micro-batch-size 16 \ 47 | --global-batch-size 256 \ 48 | --train-samples 200000 \ 49 | --lr-decay-samples 175000 \ 50 | --lr-warmup-samples 10000 \ 51 | --lr 2.5e-5 \ 52 | --min-lr 2.5e-6 \ 53 | --lr-decay-style cosine \ 54 | --eval-iters 50 \ 55 | --eval-interval 2000 \ 56 | --tokenizer-type GPTSentencePieceTokenizer \ 57 | --tokenizer-model \ 58 | --data-path ${DATA_BLEND} \ 59 | --split 98,2,0 \ 60 | --clip-grad 1.0 \ 61 | --weight-decay 0.1 \ 62 | --adam-beta1 0.9 \ 63 | --adam-beta2 0.95 \ 64 | --init-method-std 0.007 \ 65 | --log-params-norm \ 66 | --log-num-zeros-in-grad \ 67 | --bf16 \ 68 | " 69 | 70 | ######## Retro. ######## 71 | 72 | if [ "$ADD_RETRIEVER" = "0" ]; then 73 | SCRIPT=pretrain_gpt.py 74 | else 75 | ARGS="${ARGS} \ 76 | --retro-workdir ${RETRO_WORKDIR} \ 77 | --retro-add-retriever \ 78 | " 79 | SCRIPT=pretrain_retro.py 80 | fi 81 | 82 | ######## Command. ######## 83 | 84 | NPROCS=8 85 | CMD="\ 86 | pwd && cd ${REPO_DIR} && pwd && \ 87 | export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \ 88 | python -m torch.distributed.run \ 89 | --nproc_per_node ${NPROCS} \ 90 | --nnodes 1 \ 91 | --node_rank ${NODE_RANK} \ 92 | --master_addr ${MASTER_ADDR} \ 93 | --master_port 6000 \ 94 | ${SCRIPT} ${ARGS} \ 95 | " 96 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 97 | echo "CMD = '$CMD'." 98 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 99 | eval $CMD 100 | -------------------------------------------------------------------------------- /tools/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "faiss", 7 | "h5py", 8 | "transformers", # for huggingface bert 9 | ] 10 | 11 | for lib in required_libs: 12 | try: 13 | globals()[lib] = importlib.import_module(lib) 14 | except ImportError as e: 15 | raise Exception(f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'.") 16 | -------------------------------------------------------------------------------- /tools/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import add_to_index, build_index, train_index 4 | # from .index import Index 5 | -------------------------------------------------------------------------------- /tools/retro/index/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .indexes import FaissBaseIndex, FaissParallelAddIndex 4 | 5 | 6 | class IndexFactory: 7 | '''Get index. 8 | 9 | Index type generally read from argument '--retro-index-ty'. 
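Supported values are "faiss-base" (FaissBaseIndex) and "faiss-par-add" (FaissParallelAddIndex); see get_index_class below.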
10 | ''' 11 | 12 | @classmethod 13 | def get_index_class(cls, index_type): 14 | return { 15 | "faiss-base" : FaissBaseIndex, 16 | "faiss-par-add" : FaissParallelAddIndex, 17 | }[index_type] 18 | 19 | @classmethod 20 | def get_index(cls, index_type): 21 | index_class = cls.get_index_class(index_type) 22 | index = index_class() 23 | return index 24 | -------------------------------------------------------------------------------- /tools/retro/index/index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import abc 4 | import numpy as np 5 | import os 6 | import torch 7 | 8 | from megatron import get_retro_args 9 | from tools.retro.external_libs import faiss 10 | 11 | from .utils import get_index_dir 12 | 13 | 14 | class Index(abc.ABC): 15 | 16 | '''Abstract base class for indexes. 17 | 18 | *Note* : While currently only Faiss-based classes are implemented, in the 19 | future, this class will be extended with other types of indexes that have 20 | different performance-accuracy trade-offs. 21 | 22 | The primary methods to override are: 23 | - train() : Train index on the sampled training chunks. 24 | - add() : Add all training chunks to index. 25 | ''' 26 | 27 | @classmethod 28 | def c_verbose(cls, index, v): 29 | '''Make index object verbose.''' 30 | assert isinstance(v, bool) 31 | faiss.ParameterSpace().set_index_parameter(index, "verbose", v) 32 | 33 | def get_empty_index_path(self): 34 | args = get_retro_args() 35 | return os.path.join( 36 | get_index_dir(), 37 | "empty_%.3f.faissindex" % args.retro_index_train_load_fraction, 38 | ) 39 | 40 | def get_empty_index(self): 41 | return faiss.read_index(self.get_empty_index_path()) 42 | 43 | def get_added_index_path(self): 44 | args = get_retro_args() 45 | return os.path.join( 46 | get_index_dir(), 47 | "added_%.3f_%.3f.faissindex" % ( 48 | args.retro_index_train_load_fraction, 49 | args.retro_index_add_load_fraction, 50 | ), 51 | ) 52 | 53 | def get_added_index(self): 54 | return faiss.read_index(self.get_added_index_path()) 55 | 56 | @abc.abstractmethod 57 | def train(self, *args): 58 | pass 59 | 60 | @abc.abstractmethod 61 | def add(self, *args): 62 | pass 63 | 64 | def embed_text_dataset_block(self, embedder, text_dataset, _range): 65 | '''Embed a range of a text dataset.''' 66 | sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range)) 67 | return embedder.embed_text_dataset(sub_dataset) 68 | -------------------------------------------------------------------------------- /tools/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .faiss_base import FaissBaseIndex 4 | from .faiss_par_add import FaissParallelAddIndex 5 | -------------------------------------------------------------------------------- /tools/retro/index/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
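# utils.py groups the path helpers for the Retro index working directory: the per-index directory, the training-embedding block files and their merged binary, and the added-codes directory.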
2 | 3 | import concurrent 4 | import gc 5 | import glob 6 | import numpy as np 7 | import os 8 | import psutil 9 | import time 10 | import torch 11 | from tqdm import tqdm 12 | 13 | from megatron import get_retro_args, print_rank_0 14 | from tools.retro.db.utils import get_indexed_dataset_infos 15 | from tools.retro.external_libs import h5py 16 | 17 | 18 | def get_index_dir(): 19 | """Create sub-directory for this index.""" 20 | 21 | args = get_retro_args() 22 | 23 | # Directory path. 24 | index_dir_path = os.path.join( 25 | args.retro_workdir, 26 | "index", 27 | args.retro_index_type, 28 | args.retro_index_str, 29 | ) 30 | 31 | # Make directory. 32 | os.makedirs(index_dir_path, exist_ok=True) 33 | 34 | return index_dir_path 35 | 36 | 37 | def num_samples_to_block_ranges(num_samples): 38 | '''Split a range (length num_samples) into sequence of block ranges 39 | of size block_size.''' 40 | args = get_retro_args() 41 | block_size = args.retro_block_size 42 | start_idxs = list(range(0, num_samples, block_size)) 43 | end_idxs = [min(num_samples, s + block_size) for s in start_idxs] 44 | ranges = list(zip(start_idxs, end_idxs)) 45 | return ranges 46 | 47 | 48 | def get_training_data_root_dir(): 49 | args = get_retro_args() 50 | return os.path.join(args.retro_workdir, "index", "train_emb") 51 | 52 | 53 | def get_training_data_block_dir(): 54 | return os.path.join(get_training_data_root_dir(), "blocks") 55 | 56 | 57 | def get_training_data_block_paths(): 58 | return sorted(glob.glob(get_training_data_block_dir() + "/*.hdf5")) 59 | 60 | 61 | def get_training_data_merged_path(): 62 | args = get_retro_args() 63 | return os.path.join(get_training_data_root_dir(), 64 | "train_%.3f.bin" % args.retro_index_train_load_fraction) 65 | 66 | 67 | def get_added_codes_dir(): 68 | return os.path.join(get_index_dir(), "add_codes") 69 | 70 | 71 | def get_added_code_paths(): 72 | return sorted(glob.glob(get_added_codes_dir() + "/*.hdf5")) 73 | -------------------------------------------------------------------------------- /tools/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .query import query_pretraining_neighbors 4 | -------------------------------------------------------------------------------- /tools/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import hashlib 4 | import os 5 | 6 | from megatron import get_retro_args 7 | 8 | 9 | def get_query_workdir(): 10 | args = get_retro_args() 11 | return os.path.join(args.retro_workdir, "query") 12 | 13 | 14 | def get_neighbor_dirname(key, dataset): 15 | return os.path.join(get_query_workdir(), os.path.basename(f"{key}_{dataset.unique_description_hash}")) 16 | -------------------------------------------------------------------------------- /tools/retro/sft/README.md: -------------------------------------------------------------------------------- 1 | ## Note 2 | 3 | The content within this `sft` directory is still under active development and will be updated soon. 
-------------------------------------------------------------------------------- /tools/retro/sft/open_inst.sh: -------------------------------------------------------------------------------- 1 | DATA_BLEND="1.0 open_inst" 2 | -------------------------------------------------------------------------------- /tools/retro/text_generation/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | # The following code is adapted from 3 | # https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 4 | # which is licensed under the MIT license. More details on the license can be 5 | # found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. 6 | 7 | """Provides standard metric evaluations for dialog.""" 8 | 9 | from collections import Counter 10 | from typing import List 11 | import numpy as np 12 | import re 13 | from nltk import ngrams 14 | 15 | re_art = re.compile(r'\b(a|an|the)\b') 16 | re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') 17 | 18 | 19 | def normalize_answer(s): 20 | """ 21 | Lower text and remove punctuation, articles and extra whitespace. 22 | """ 23 | s = s.lower() 24 | s = re_punc.sub(' ', s) 25 | s = re_art.sub(' ', s) 26 | s = ' '.join(s.split()) 27 | return s 28 | 29 | 30 | class F1Metric: 31 | """ 32 | Helper class which computes token-level F1. 33 | """ 34 | 35 | @staticmethod 36 | def _prec_recall_f1_score(pred_items, gold_items): 37 | """ 38 | Compute precision, recall and f1 given a set of gold and prediction items. 39 | :param pred_items: iterable of predicted values 40 | :param gold_items: iterable of gold values 41 | :return: tuple (p, r, f1) for precision, recall, f1 42 | """ 43 | common = Counter(gold_items) & Counter(pred_items) 44 | num_same = sum(common.values()) 45 | if num_same == 0: 46 | return 0, 0, 0 47 | precision = 1.0 * num_same / len(pred_items) 48 | recall = 1.0 * num_same / len(gold_items) 49 | f1 = (2 * precision * recall) / (precision + recall) 50 | return precision, recall, f1 51 | 52 | @staticmethod 53 | def compute_each_pair(guess: str, answer: str, n=1): 54 | if answer == "": 55 | return None, None, None 56 | if guess == "": 57 | return 0, 0, 0 58 | g_tokens = normalize_answer(guess).split() 59 | a_tokens = normalize_answer(answer).split() 60 | g_tokens = list(ngrams(g_tokens, n)) 61 | a_tokens = list(ngrams(a_tokens, n)) 62 | precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) 63 | return precision, recall, f1 64 | 65 | @staticmethod 66 | def compute_all_pairs(guesses: List[str], answers: List[str], n=1): 67 | # additional augment: 68 | print("guess:", len(guesses), ", answers:", len(answers)) 69 | assert len(guesses) == len(answers) 70 | 71 | precision_list, recall_list, f1_list = [], [], [] 72 | for guess, answer in zip(guesses, answers): 73 | precision, recall, f1 = F1Metric.compute_each_pair(guess, answer, n) 74 | if precision is None or recall is None or f1 is None: 75 | continue 76 | precision_list.append(precision) 77 | recall_list.append(recall) 78 | f1_list.append(f1) 79 | 80 | return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) 81 | -------------------------------------------------------------------------------- /tools/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
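# utils.py collects shared helpers for Retro preprocessing: the stored-args path, chunks-per-sample computation, GPT and Bert tokenizer construction, and GPTToTextDataset for detokenizing GPT token chunks back to text.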
2 | 3 | import os 4 | import torch 5 | import types 6 | 7 | from megatron import get_retro_args 8 | from megatron.tokenizer.tokenizer import ( 9 | _BertWordPieceTokenizer, 10 | _GPT2BPETokenizer, 11 | _GPTSentencePieceTokenizer, 12 | ) 13 | 14 | 15 | def get_args_path(workdir): 16 | '''Argument copy stored within retro workdir.''' 17 | return os.path.join(workdir, "args.json") 18 | 19 | 20 | def get_num_chunks_per_sample(): 21 | '''Compute seq_length // chunk_length.''' 22 | args = get_retro_args() 23 | sample_length = args.retro_gpt_seq_length 24 | chunk_length = args.retro_gpt_chunk_length 25 | assert sample_length % chunk_length == 0 26 | return sample_length // chunk_length 27 | 28 | 29 | def get_gpt_tokenizer(): 30 | '''GPT (BPE) tokenizer.''' 31 | args = get_retro_args() 32 | tokenizer_type = args.retro_gpt_tokenizer_type 33 | if tokenizer_type == "GPT2BPETokenizer": 34 | assert args.retro_gpt_vocab_file and args.retro_gpt_merge_file 35 | return _GPT2BPETokenizer( 36 | vocab_file=args.retro_gpt_vocab_file, 37 | merge_file=args.retro_gpt_merge_file, 38 | ) 39 | elif tokenizer_type == 'GPTSentencePieceTokenizer': 40 | assert args.retro_gpt_tokenizer_model is not None 41 | return _GPTSentencePieceTokenizer(args.retro_gpt_tokenizer_model) 42 | else: 43 | raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type) 44 | 45 | 46 | def get_bert_tokenizer(): 47 | '''Bert (Wordpiece) tokenizer.''' 48 | args = get_retro_args() 49 | lower_case = { 50 | "BertWordPieceLowerCase" : True, 51 | "BertWordPieceCase" : False, 52 | }[args.retro_bert_tokenizer_type] 53 | return _BertWordPieceTokenizer( 54 | vocab_file=args.retro_bert_vocab_file, 55 | lower_case=lower_case, 56 | ) 57 | 58 | 59 | class GPTToTextDataset(torch.utils.data.Dataset): 60 | '''Dataset to convert GPT tokens to text.''' 61 | 62 | def __init__(self, gpt_dataset): 63 | 64 | super().__init__() 65 | 66 | self.gpt_dataset = gpt_dataset 67 | self.gpt_tokenizer = get_gpt_tokenizer() 68 | 69 | def __len__(self): 70 | return len(self.gpt_dataset) 71 | 72 | def __getitem__(self, idx): 73 | gpt_token_ids = self.gpt_dataset[idx]["text"].tolist() 74 | text = self.gpt_tokenizer.detokenize(gpt_token_ids) 75 | return {"text": text} 76 | -------------------------------------------------------------------------------- /tools/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import sys 3 | import json 4 | import requests 5 | 6 | 7 | if __name__ == "__main__": 8 | url = sys.argv[1] 9 | url = 'http://' + url + '/api' 10 | headers = {'Content-Type': 'application/json'} 11 | 12 | while True: 13 | sentence = input("Enter prompt: ") 14 | tokens_to_generate = int(eval(input("Enter number of tokens to generate: "))) 15 | 16 | data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate} 17 | response = requests.put(url, data=json.dumps(data), headers=headers) 18 | 19 | if response.status_code != 200: 20 | print(f"Error {response.status_code}: {response.json()['message']}") 21 | else: 22 | print("Megatron Response: ") 23 | print(response.json()['text'][0]) 24 | --------------------------------------------------------------------------------
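Example usage of tools/text_generation_cli.py (a sketch, assuming a Megatron text-generation REST server is already running and reachable; "localhost:5000" below is a placeholder for whatever host:port that server actually listens on):

    python tools/text_generation_cli.py localhost:5000
    Enter prompt: Megatron-LM is
    Enter number of tokens to generate: 32

For each prompt the script sends an HTTP PUT to http://<host:port>/api with a JSON body {"prompts": [...], "tokens_to_generate": N} and prints the first completion from response.json()['text'][0].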