├── .gitignore ├── LICENSE ├── README.md ├── assets ├── animation-LQ.gif ├── animation.gif ├── c4-blend-llama2.sh ├── c4-blend-llama3.1.sh ├── c4-blend-llama3.sh ├── exp_results.png ├── mask-learning.gif ├── mask-sampling.png └── teaser.png ├── docs ├── export_hf.md └── preprocess_c4.md ├── eval_llama_ppl.py ├── learnable_sparsity ├── __init__.py ├── differentiable_mask.py └── ste.py ├── megatron ├── __init__.py ├── arguments.py ├── checkpointing.py ├── core │ ├── README.md │ ├── __init__.py │ ├── datasets │ │ ├── Makefile │ │ ├── __init__.py │ │ ├── bert_dataset.py │ │ ├── blended_dataset.py │ │ ├── blended_megatron_dataset_builder.py │ │ ├── blended_megatron_dataset_config.py │ │ ├── gpt_dataset.py │ │ ├── helpers.cpp │ │ ├── indexed_dataset.py │ │ ├── masked_dataset.py │ │ ├── megatron_dataset.py │ │ ├── megatron_tokenizer.py │ │ ├── multimodal_dataset.py │ │ ├── readme.md │ │ ├── t5_dataset.py │ │ └── utils.py │ ├── dist_checkpointing │ │ ├── __init__.py │ │ ├── core.py │ │ ├── dict_utils.py │ │ ├── mapping.py │ │ ├── optimizer.py │ │ ├── serialization.py │ │ ├── strategies │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── tensorstore.py │ │ │ ├── two_stage.py │ │ │ └── zarr.py │ │ └── utils.py │ ├── distributed │ │ ├── __init__.py │ │ ├── distributed_data_parallel.py │ │ ├── finalize_model_grads.py │ │ └── grad_buffer.py │ ├── enums.py │ ├── fusions │ │ ├── __init__.py │ │ ├── fused_bias_dropout.py │ │ ├── fused_bias_gelu.py │ │ ├── fused_bias_swiglu.py │ │ ├── fused_layer_norm.py │ │ └── fused_softmax.py │ ├── inference_params.py │ ├── jit.py │ ├── model_parallel_config.py │ ├── models │ │ ├── T5 │ │ │ ├── __init__.py │ │ │ ├── t5_model.py │ │ │ └── t5_spec.py │ │ ├── __init__.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── bert_layer_specs.py │ │ │ ├── bert_lm_head.py │ │ │ ├── bert_model.py │ │ │ └── pooler.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── embeddings │ │ │ │ ├── __init__.py │ │ │ │ ├── language_model_embedding.py │ │ │ │ └── rotary_pos_embedding.py │ │ │ ├── language_module │ │ │ │ ├── __init__.py │ │ │ │ └── language_module.py │ │ │ └── vision_module │ │ │ │ ├── __init__.py │ │ │ │ └── vision_module.py │ │ ├── gpt │ │ │ ├── __init__.py │ │ │ ├── gpt_layer_specs.py │ │ │ └── gpt_model.py │ │ ├── retro │ │ │ ├── __init__.py │ │ │ ├── base_attention.py │ │ │ ├── config.py │ │ │ ├── decoder_attention.py │ │ │ ├── decoder_spec.py │ │ │ ├── encoder_attention.py │ │ │ ├── encoder_spec.py │ │ │ └── model.py │ │ └── vision │ │ │ ├── __init__.py │ │ │ └── clip_vit_model.py │ ├── optimizer │ │ ├── __init__.py │ │ ├── clip_grads.py │ │ ├── distrib_optimizer.py │ │ ├── grad_scaler.py │ │ ├── optimizer.py │ │ └── optimizer_config.py │ ├── package_info.py │ ├── packed_seq_params.py │ ├── parallel_state.py │ ├── pipeline_parallel │ │ ├── __init__.py │ │ ├── p2p_communication.py │ │ └── schedules.py │ ├── requirements.txt │ ├── tensor_parallel │ │ ├── __init__.py │ │ ├── cross_entropy.py │ │ ├── data.py │ │ ├── layers.py │ │ ├── mappings.py │ │ ├── random.py │ │ └── utils.py │ ├── timers.py │ ├── transformer │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── custom_layers │ │ │ ├── __init__.py │ │ │ └── transformer_engine.py │ │ ├── dot_product_attention.py │ │ ├── enums.py │ │ ├── identity_op.py │ │ ├── mlp.py │ │ ├── module.py │ │ ├── moe │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── grouped_gemm_util.py │ │ │ ├── moe_layer.py │ │ │ ├── moe_utils.py │ │ │ ├── router.py │ │ │ └── token_dispatcher.py │ │ ├── spec_utils.py │ │ ├── transformer_block.py │ │ 
├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── utils.py │ └── utils.py ├── data │ ├── __init__.py │ ├── autoaugment.py │ ├── biencoder_dataset_utils.py │ ├── data_samplers.py │ ├── dataset_utils.py │ ├── ict_dataset.py │ ├── image_folder.py │ ├── multimodal_dataset.py │ ├── orqa_wiki_dataset.py │ ├── realm_dataset_utils.py │ ├── realm_index.py │ └── vit_dataset.py ├── dist_signal_handler.py ├── fp16_deprecated │ └── loss_scaler.py ├── fused_kernels │ ├── __init__.py │ ├── compat.h │ ├── tests │ │ ├── __init__.py │ │ └── test_fused_kernels.py │ └── type_shim.h ├── global_vars.py ├── indexer.py ├── initialize.py ├── log_handler.py ├── memory.py ├── microbatches.py ├── model │ ├── __init__.py │ ├── bert_model.py │ ├── biencoder_model.py │ ├── classification.py │ ├── enums.py │ ├── fused_bias_gelu.py │ ├── fused_layer_norm.py │ ├── fused_softmax.py │ ├── gpt_model.py │ ├── language_model.py │ ├── module.py │ ├── multiple_choice.py │ ├── realm_model.py │ ├── rms_norm.py │ ├── t5_model.py │ ├── transformer.py │ ├── utils.py │ └── vision │ │ ├── classification.py │ │ ├── dino.py │ │ ├── esvit_swin_backbone.py │ │ ├── inpainting.py │ │ ├── knn_monitor.py │ │ ├── mit_backbone.py │ │ ├── swin_backbone.py │ │ ├── utils.py │ │ └── vit_backbone.py ├── mpu │ └── tests │ │ ├── __init__.py │ │ ├── commons.py │ │ ├── test_cross_entropy.py │ │ ├── test_data.py │ │ ├── test_initialize.py │ │ ├── test_layers.py │ │ └── test_random.py ├── optimizer_param_scheduler.py ├── static │ └── index.html ├── text_generation │ ├── __init__.py │ ├── api.py │ ├── beam_utils.py │ ├── communication.py │ ├── forward_step.py │ ├── generation.py │ ├── sampling.py │ └── tokenization.py ├── text_generation_server.py ├── theoretical_memory_usage.py ├── tokenizer │ ├── __init__.py │ ├── auto_tokenization.py │ ├── bert_tokenization.py │ ├── gpt2_tokenization.py │ └── tokenizer.py ├── training.py └── utils.py ├── pretrain_gpt.py ├── pretrain_maskllm.py ├── run_docker.sh ├── scripts ├── data │ ├── download_c4.py │ ├── prepare_c4_megatron_llama2.sh │ ├── prepare_c4_megatron_llama3.1.sh │ └── prepare_c4_megatron_llama3.sh ├── learnable_sparsity │ ├── llama2_13b_mask_only_tp8_c4.sh │ ├── llama2_7b_mask_only_tp8_c4.sh │ ├── llama3.1_8b_mask_only_tp8_c4.sh │ └── llama3_8b_mask_only_tp8_c4.sh ├── oneshot │ ├── run_llama2_13b_prune_tp8.sh │ ├── run_llama2_7b_prune_tp8.sh │ ├── run_llama3.1_8b_prune_tp8.sh │ └── run_llama3_8b_prune_tp8.sh ├── ppl │ ├── evaluate_llama2_wikitext2.sh │ ├── evaluate_llama3.1_wikitext2.sh │ └── evaluate_llama3_wikitext2.sh └── tools │ ├── convert_llama2_13b_hf_to_megatron.sh │ ├── convert_llama2_7b_hf_to_megatron.sh │ ├── convert_llama2_7b_tp8_to_tp1.sh │ ├── convert_llama3.1_8b_hf_to_megatron.sh │ ├── convert_llama3.1_8b_tp8_to_tp1.sh │ ├── convert_llama3_8b_hf_to_megatron.sh │ ├── convert_llama3_8b_tp8_to_tp1.sh │ ├── download_llama2_13b_hf.py │ ├── download_llama2_7b_hf.py │ ├── download_llama3.1_8b_hf.py │ └── download_llama3_8b_hf.py ├── tasks ├── data_utils.py ├── ensemble_classifier.py ├── eval_utils.py ├── finetune_utils.py ├── glue │ ├── data.py │ ├── finetune.py │ ├── mnli.py │ └── qqp.py ├── latency │ ├── datasets.py │ ├── detokenizer.py │ └── test_latency.py ├── main.py ├── msdp │ ├── README.md │ ├── evaluate.py │ ├── main.py │ ├── metrics.py │ ├── preprocessing.py │ └── prompt.py ├── orqa │ ├── README.md │ ├── evaluate_orqa.py │ ├── evaluate_utils.py │ ├── supervised │ │ ├── data.py │ │ ├── eval_utils.py │ │ └── finetune.py │ └── unsupervised │ │ ├── nq.py │ │ ├── qa_utils.py │ │ └── 
tokenizers.py ├── pruning │ ├── datasets.py │ ├── detokenizer.py │ ├── exclude_layers.py │ ├── layerwrapper.py │ ├── optimizer.py │ ├── prune_main_llama.py │ ├── prune_main_subdomain.py │ ├── run_sensitivity.py │ └── sparsity │ │ ├── __init__.py │ │ ├── core.py │ │ └── utils │ │ ├── __init__.py │ │ ├── datautils.py │ │ └── modelutils.py ├── race │ ├── data.py │ └── finetune.py ├── vision │ ├── classification │ │ ├── classification.py │ │ └── eval_utils.py │ ├── finetune_utils.py │ ├── main.py │ └── segmentation │ │ ├── cityscapes.py │ │ ├── data.py │ │ ├── finetune_segformer.py │ │ ├── finetune_setr.py │ │ ├── metrics.py │ │ ├── seg_heads.py │ │ ├── seg_models.py │ │ ├── transforms.py │ │ └── utils.py └── zeroshot_gpt │ ├── datasets.py │ ├── detokenizer.py │ └── evaluate.py ├── tool_apply_sparsity.py ├── tool_compress_mask.py ├── tool_compute_mask_hf.py ├── tool_export_to_hf.py ├── tool_trim_learnable_sparsity.py └── tools ├── __init__.py ├── autoformat.sh ├── bert_embedding ├── __init__.py ├── dataset.py ├── embed.py ├── external_libs.py ├── huggingface.py └── utils.py ├── checkpoint ├── loader_llama2_hf.py ├── loader_megatron.py ├── saver_megatron.py └── util.py ├── download_c4.py ├── linter.py ├── merge_datasets.py ├── openwebtext ├── README.md ├── add_id.py ├── blacklist_urls.py ├── cleanup_dataset.py ├── cleanup_fix_dataset.py ├── filter_ngrams.py ├── find_duplicates.py ├── group_duplicate_url.py ├── merge_jsons.py └── remove_group_duplicates.py ├── prepare_c4_megatron.sh ├── preprocess_data.py ├── preprocess_data_nmt.py ├── preprocess_mmdata.py ├── retro ├── README.md ├── build_db.md ├── cli │ ├── __init__.py │ ├── __main__.py │ └── cli.py ├── db │ ├── __init__.py │ ├── build.py │ ├── dataset.py │ └── utils.py ├── examples │ ├── Dockerfile │ ├── preprocess_data.sh │ └── pretrain_model.sh ├── external_libs.py ├── index │ ├── __init__.py │ ├── build.py │ ├── factory.py │ ├── index.py │ ├── indexes │ │ ├── __init__.py │ │ ├── faiss_base.py │ │ └── faiss_par_add.py │ └── utils.py ├── main.py ├── query │ ├── __init__.py │ ├── chunk_dataset.py │ ├── multi_split_gpt_dataset.py │ ├── query.py │ ├── retro_dataset.py │ └── utils.py ├── sft │ ├── README.md │ ├── dataset_conv.py │ ├── open_inst.sh │ ├── sft_retro.py │ └── sft_retro_lm.sh ├── text_generation │ ├── evaluate.py │ ├── metrics.py │ ├── retro_api.py │ ├── retro_generate.sh │ ├── retro_generation.py │ └── retro_text_generation.py └── utils.py ├── run_text_generation_server.py └── text_generation_cli.py /.gitignore: -------------------------------------------------------------------------------- 1 | assets/cache 2 | assets/checkpoints 3 | assets/data 4 | CACHE 5 | output 6 | __pycache__ 7 | megatron/fused_kernels/build 8 | megatron/core/datasets/helpers.cpython-310-x86_64-linux-gnu.so 9 | wandb -------------------------------------------------------------------------------- /assets/animation-LQ.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/animation-LQ.gif -------------------------------------------------------------------------------- /assets/animation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/animation.gif -------------------------------------------------------------------------------- /assets/c4-blend-llama2.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # multilingual datasets 4 | C4_HOME=assets/data/c4_llama2_pretokenized 5 | DATA_BLEND="" 6 | for i in {00000..00019}; do # 1/20 7 | DATA_BLEND="${DATA_BLEND} 0.05 ${C4_HOME}/c4_llama2_${i}_text_document" 8 | done 9 | -------------------------------------------------------------------------------- /assets/c4-blend-llama3.1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # multilingual datasets 4 | C4_HOME=assets/data/c4_llama3.1_pretokenized 5 | DATA_BLEND="" 6 | for i in {00000..00019}; do # 1/20 7 | DATA_BLEND="${DATA_BLEND} 0.05 ${C4_HOME}/c4_llama3.1_${i}_text_document" 8 | done 9 | -------------------------------------------------------------------------------- /assets/c4-blend-llama3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # multilingual datasets 4 | C4_HOME=assets/data/c4_llama3_pretokenized 5 | DATA_BLEND="" 6 | for i in {00000..00019}; do # 1/20 7 | DATA_BLEND="${DATA_BLEND} 0.05 ${C4_HOME}/c4_llama3_${i}_text_document" 8 | done 9 | -------------------------------------------------------------------------------- /assets/exp_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/exp_results.png -------------------------------------------------------------------------------- /assets/mask-learning.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/mask-learning.gif -------------------------------------------------------------------------------- /assets/mask-sampling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/mask-sampling.png -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/assets/teaser.png -------------------------------------------------------------------------------- /learnable_sparsity/__init__.py: -------------------------------------------------------------------------------- 1 | from .differentiable_mask import ColumnParallelLinearSparse, RowParallelLinearSparse, convert_to_sparse_model, DifferentiableMask 2 | from . import ste -------------------------------------------------------------------------------- /megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_wandb_writer 13 | from .global_vars import get_one_logger 14 | from .global_vars import get_adlr_autoresume 15 | from .global_vars import get_timers 16 | from .initialize import initialize_megatron 17 | 18 | from .utils import (print_rank_0, 19 | is_last_rank, 20 | print_rank_last) 21 | -------------------------------------------------------------------------------- /megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models. 2 | -------------------------------------------------------------------------------- /megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.tensor_parallel 2 | import megatron.core.utils 3 | from megatron.core import parallel_state 4 | from megatron.core.distributed import DistributedDataParallel 5 | from megatron.core.inference_params import InferenceParams 6 | from megatron.core.model_parallel_config import ModelParallelConfig 7 | from megatron.core.timers import Timers 8 | 9 | # Alias parallel_state as mpu, its legacy name 10 | mpu = parallel_state 11 | 12 | __all__ = [ 13 | "parallel_state", 14 | "tensor_parallel", 15 | "utils", 16 | "DistributedDataParallel", 17 | "InferenceParams", 18 | "ModelParallelConfig", 19 | "Timers", 20 | ] 21 | -------------------------------------------------------------------------------- /megatron/core/datasets/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /megatron/core/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/datasets/__init__.py -------------------------------------------------------------------------------- /megatron/core/datasets/multimodal_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from dataclasses import dataclass 4 | from typing import Dict 5 | 6 | import numpy 7 | import torch 8 | 9 | from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset 10 | 11 | 12 | @dataclass 13 | class MultimodalDatasetConfig(GPTDatasetConfig): 14 | """Configuration object for Megatron Core Multimodal datasets. 15 | 16 | 17 | Note: This is unused at the moment and may be missing features. Follow-up changes will use this. 18 | 19 | Attributes: 20 | image_h (int): Image height. 21 | image_w (int): Image width. 
22 | """ 23 | 24 | image_h: int = None 25 | image_w: int = None 26 | 27 | def __post_init__(self) -> None: 28 | super().__post_init__() 29 | 30 | assert self.image_h is not None 31 | assert self.image_w is not None 32 | 33 | 34 | class MockMultimodalDataset(MockGPTDataset): 35 | """Mock multimodal dataset. 36 | 37 | 38 | This is unused at the moment and may be missing features. Follow-up changes will use this. 39 | """ 40 | 41 | def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: 42 | """Return a sample that contains a dummy image, text sequence and the associated labels and cost and attention masks. 43 | 44 | Args: 45 | idx (int): The integer seed for mock data generation. 46 | 47 | Returns: 48 | Dict[str, numpy.ndarray]: The mock data. 49 | """ 50 | # Get a text sample. 51 | sample = super().__getitem__(idx) 52 | 53 | # Add mock input image. 54 | sample["image"] = torch.zeros( 55 | (3, self.config.image_h, self.config.image_w), dtype=torch.float32 56 | ) 57 | 58 | return sample 59 | -------------------------------------------------------------------------------- /megatron/core/datasets/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import logging 4 | from enum import Enum 5 | from typing import Any, List 6 | 7 | import numpy 8 | import torch 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Split(Enum): 14 | train = 0 15 | valid = 1 16 | test = 2 17 | 18 | 19 | def compile_helpers(): 20 | """Compile C++ helper functions at runtime. Make sure this is invoked on a single process. 21 | """ 22 | import os 23 | import subprocess 24 | 25 | command = ["make", "-C", os.path.abspath(os.path.dirname(__file__))] 26 | if subprocess.run(command).returncode != 0: 27 | import sys 28 | 29 | log_single_rank(logger, logging.ERROR, "Failed to compile the C++ dataset helper functions") 30 | sys.exit(1) 31 | 32 | 33 | def log_single_rank(logger: logging.Logger, *args: Any, rank: int = 0, **kwargs: Any): 34 | """If torch distributed is initialized, log only on rank 35 | 36 | Args: 37 | logger (logging.Logger): The logger to write the logs 38 | 39 | args (Tuple[Any]): All logging.Logger.log positional arguments 40 | 41 | rank (int, optional): The rank to write on. Defaults to 0. 42 | 43 | kwargs (Dict[str, Any]): All logging.Logger.log keyword arguments 44 | """ 45 | if torch.distributed.is_initialized(): 46 | if torch.distributed.get_rank() == rank: 47 | logger.log(*args, **kwargs) 48 | else: 49 | logger.log(*args, **kwargs) 50 | 51 | 52 | def normalize(weights: List[float]) -> List[float]: 53 | """Do non-exponentiated normalization 54 | 55 | Args: 56 | weights (List[float]): The weights 57 | 58 | Returns: 59 | List[float]: The normalized weights 60 | """ 61 | w = numpy.array(weights, dtype=numpy.float64) 62 | w_sum = numpy.sum(w) 63 | w = (w / w_sum).tolist() 64 | return w 65 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import LocalNonpersitentObject, ShardedTensor 5 | from .serialization import ( 6 | load, 7 | load_common_state_dict, 8 | load_plain_tensors, 9 | load_tensors_metadata, 10 | save, 11 | ) 12 | -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/core.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Module for managing distributed checkpoints metadata. """ 4 | 5 | import json 6 | from dataclasses import asdict, dataclass 7 | from pathlib import Path 8 | from typing import Optional 9 | 10 | CONFIG_FNAME = 'metadata.json' 11 | 12 | 13 | class CheckpointingException(Exception): 14 | """ Base checkpointing related exception """ 15 | 16 | pass 17 | 18 | 19 | @dataclass 20 | class CheckpointingConfig: 21 | """ Documents backends used in the checkpoint. 22 | 23 | Checkpoint config keeps track of formats used for storing the sharded tensors 24 | (sharded_backend) and other objects (common_backend). 25 | 26 | Note that versioning is not for the checkpoint content (which is application specific), 27 | but for the checkpoint format itself. 28 | """ 29 | 30 | sharded_backend: str 31 | sharded_backend_version: int = 1 32 | common_backend: str = 'torch' 33 | common_backend_version: int = 1 34 | 35 | 36 | def check_is_distributed_checkpoint(checkpoint_dir): 37 | """ Checks if `metadata.json` exists in the checkpoint and is a valid config. 38 | 39 | Args: 40 | checkpoint_dir: checkpoint directory 41 | 42 | Returns: 43 | bool: True if `metadata.json` exists in the checkpoint and is a valid config. 44 | """ 45 | return maybe_load_config(checkpoint_dir) is not None 46 | 47 | 48 | def maybe_load_config(checkpoint_dir: str) -> Optional[CheckpointingConfig]: 49 | """ Returns checkpoint config if `checkpoint_dir` is a distributed checkpoint and None otherwise 50 | 51 | Args: 52 | checkpoint_dir: checkpoint directory 53 | 54 | Returns: 55 | CheckpointingConfig (optional): None if checkpoint is not a valid distributed checkpoint 56 | """ 57 | config_path = Path(checkpoint_dir, CONFIG_FNAME) 58 | if not config_path.exists(): 59 | return None 60 | with config_path.open() as f: 61 | config_dict = json.load(f) 62 | return CheckpointingConfig(**config_dict) 63 | 64 | 65 | def save_config(config: CheckpointingConfig, checkpoint_dir: str): 66 | """ Save given config to checkpoint directory. 67 | 68 | Args: 69 | config: checkpoint config 70 | checkpoint_dir: checkpoint directory 71 | 72 | Returns: 73 | None 74 | """ 75 | config_path = Path(checkpoint_dir, CONFIG_FNAME) 76 | with config_path.open('w') as f: 77 | json.dump(asdict(config), f) -------------------------------------------------------------------------------- /megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Various loading and saving strategies """ 4 | 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | try: 10 | import tensorstore 11 | import zarr 12 | 13 | from .tensorstore import _import_trigger 14 | from .zarr import _import_trigger 15 | except ImportError: 16 | # Only print warning on first rank. 
17 | import os 18 | 19 | if int(os.getenv('RANK', '0')) == 0: 20 | logger.warning('Zarr-based strategies will not be registered because of missing packages') 21 | -------------------------------------------------------------------------------- /megatron/core/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .distributed_data_parallel import DistributedDataParallel 4 | from .finalize_model_grads import finalize_model_grads 5 | from .grad_buffer import shard_buffer 6 | -------------------------------------------------------------------------------- /megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | class ModelType(enum.Enum): 7 | encoder_or_decoder = 1 8 | encoder_and_decoder = 2 9 | retro_encoder = 3 10 | retro_decoder = 4 11 | -------------------------------------------------------------------------------- /megatron/core/fusions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/fusions/__init__.py -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_dropout.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | from typing import Optional, Tuple 3 | 4 | import torch 5 | 6 | from megatron.core.jit import jit_fuser 7 | 8 | 9 | def _bias_dropout_add_func(x_with_bias, residual, prob, training): 10 | # type: (Tuple[Tensor, Optional[Tensor]], Tensor, float, bool) -> Tensor 11 | # NOTE: Previously, the argument `bias` used to be passed as 12 | # `bias.expand_as(residual)` when the `bias_dropout_func` is called from the 13 | # transformer layer but broadcasting should automatically take care of that. 14 | # Also, looking at broadcasting semantics, `expand_as` and broadcasting 15 | # seem to be identical performance-wise (both just change the view). 16 | 17 | x, bias = x_with_bias # unpack 18 | 19 | # If we want to train mixed precision, then the output of this function 20 | # should be half precision. However, in AMP O1, the input (residual) is 21 | # in fp32, and it will up-cast the result to fp32, causing pipeline parallel 22 | # GPU communication to hang. Therefore, we need to cast residual to the same 23 | # dtype as x. 24 | residual = residual if residual.dtype == x.dtype else residual.to(x.dtype) 25 | 26 | # The Dropout operation, Residual Addition and the tensor returning can be 27 | # done generically outside the if statement, but that stops fusing of Bias 28 | # Addition-Dropout-Residual Addition operation. 
So doing it together inside 29 | # the conditional branch to improve performance 30 | if bias is not None: 31 | x = x + bias 32 | out = torch.nn.functional.dropout(x, p=prob, training=training) 33 | out = residual + out 34 | return out 35 | else: 36 | out = torch.nn.functional.dropout(x, p=prob, training=training) 37 | out = residual + out 38 | return out 39 | 40 | 41 | def bias_dropout_add_unfused(training): 42 | def _bias_dropout_add(x_with_bias, residual, prob): 43 | return _bias_dropout_add_func(x_with_bias, residual, prob, training) 44 | 45 | return _bias_dropout_add 46 | 47 | 48 | @jit_fuser 49 | def bias_dropout_add_fused_train( 50 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float, 51 | ) -> torch.Tensor: 52 | return _bias_dropout_add_func(x_with_bias, residual, prob, True) 53 | 54 | 55 | @jit_fuser 56 | def bias_dropout_add_fused_inference( 57 | x_with_bias: Tuple[torch.Tensor, Optional[torch.Tensor]], residual: torch.Tensor, prob: float, 58 | ) -> torch.Tensor: 59 | return _bias_dropout_add_func(x_with_bias, residual, prob, False) 60 | 61 | 62 | def get_bias_dropout_add(training, fused): 63 | if fused: 64 | # jit scripting for a nn.module (with dropout) is not 65 | # triggering the fusion kernel. For now, we use two 66 | # different nn.functional routines to account for varying 67 | # dropout semantics during training and inference phases. 68 | if training: 69 | return bias_dropout_add_fused_train 70 | else: 71 | return bias_dropout_add_fused_inference 72 | else: 73 | return bias_dropout_add_unfused(training) 74 | -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from megatron.core.jit import jit_fuser 6 | 7 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 8 | # 1/sqrt(2*pi)-> 0.3989423 9 | # 1/sqrt(2) -> 0.70710678 10 | # sqrt(2/pi) -> 0.79788456 11 | # this function is tanh approximation of gelu 12 | # actual gelu is: 13 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 14 | 15 | 16 | @jit_fuser 17 | def bias_gelu(bias, y): 18 | x = bias + y 19 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 20 | 21 | 22 | # gradient of tanh approximation of gelu 23 | # gradient of actual gelu is: 24 | # 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 25 | @jit_fuser 26 | def bias_gelu_back(g, bias, y): 27 | x = bias + y 28 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 29 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 30 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * ( 31 | 1 + tanh_out 32 | ) 33 | return ff * g 34 | 35 | 36 | class GeLUFunction(torch.autograd.Function): 37 | @staticmethod 38 | # bias is an optional argument 39 | def forward(ctx, input, bias): 40 | ctx.save_for_backward(input, bias) 41 | return bias_gelu(bias, input) 42 | 43 | @staticmethod 44 | def backward(ctx, grad_output): 45 | input, bias = ctx.saved_tensors 46 | tmp = bias_gelu_back(grad_output, bias, input) 47 | return tmp, tmp 48 | 49 | 50 | bias_gelu_impl = GeLUFunction.apply 51 | -------------------------------------------------------------------------------- /megatron/core/fusions/fused_bias_swiglu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from megatron.core.jit import jit_fuser 7 | 8 | ###### BIAS SWIGLU FUSION/ NO AUTOGRAD ################ 9 | 10 | 11 | @jit_fuser 12 | def swiglu(y): 13 | y_1, y_2 = torch.chunk(y, 2, -1) 14 | return F.silu(y_1) * y_2 15 | 16 | 17 | @jit_fuser 18 | def bias_swiglu(y, bias): 19 | y = y + bias 20 | return swiglu(y) 21 | 22 | 23 | # gradient of tanh approximation of gelu 24 | # gradient of actual gelu is: 25 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 26 | @jit_fuser 27 | def swiglu_back(g, y): 28 | y_1, y_2 = torch.chunk(y, 2, -1) 29 | return torch.cat( 30 | (g * torch.sigmoid(y_1) * (1 + y_1 * (1 - torch.sigmoid(y_1))) * y_2, g * F.silu(y_1)), -1 31 | ) 32 | 33 | 34 | @jit_fuser 35 | def bias_swiglu_back(g, y, bias): 36 | y = y + bias 37 | return swiglu_back(g, y) 38 | 39 | 40 | class BiasSwiGLUFunction(torch.autograd.Function): 41 | @staticmethod 42 | # bias is an optional argument 43 | def forward(ctx, input, bias): 44 | ctx.save_for_backward(input, bias) 45 | return bias_swiglu(input, bias) 46 | 47 | @staticmethod 48 | def backward(ctx, grad_output): 49 | input, bias = ctx.saved_tensors 50 | tmp = bias_swiglu_back(grad_output, input, bias) 51 | return tmp, tmp 52 | 53 | 54 | class SwiGLUFunction(torch.autograd.Function): 55 | @staticmethod 56 | # bias is an optional argument 57 | def forward(ctx, input): 58 | ctx.save_for_backward(input) 59 | return swiglu(input) 60 | 61 | @staticmethod 62 | def backward(ctx, grad_output): 63 | input = ctx.saved_tensors 64 | tmp = swiglu_back(grad_output, input[0]) 65 | return tmp 66 | 67 | 68 | def bias_swiglu_impl(input, bias): 69 | ori_shape = input.shape 70 | assert len(ori_shape) in [2, 3] 71 | input = input.view(-1, ori_shape[-1]) 72 | if bias is not None: 73 | output = BiasSwiGLUFunction.apply(input, bias) 74 | else: 75 | output = SwiGLUFunction.apply(input) 76 | 77 | return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1) 78 | 79 | 80 | # bias_swiglu_impl = BiasSwiGLUFunction.apply 81 | # swiglu_impl = SwiGLUFunction.apply 82 | -------------------------------------------------------------------------------- /megatron/core/inference_params.py: -------------------------------------------------------------------------------- 1 | class InferenceParams: 2 | """Inference parameters that are passed to the main model 
in order 3 | to efficienly calculate and store the context during inference.""" 4 | 5 | def __init__(self, max_batch_size, max_sequence_length): 6 | self.max_sequence_length = max_sequence_length 7 | self.max_batch_size = max_batch_size 8 | self.sequence_len_offset = 0 9 | self.batch_size_offset = 0 10 | self.key_value_memory_dict = {} 11 | 12 | def swap_key_value_dict(self, batch_idx): 13 | "swap between batches" 14 | if len(self.key_value_memory_dict) == 0: 15 | raise ValueError("should not swap when dict in empty") 16 | 17 | for layer_number in self.key_value_memory_dict.keys(): 18 | inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] 19 | assert ( 20 | len(batch_idx) == inference_key_memory.shape[1] 21 | ) # make sure batch size is the same 22 | new_inference_key_memory = inference_key_memory[:, batch_idx] 23 | new_inference_value_memory = inference_value_memory[:, batch_idx] 24 | self.key_value_memory_dict[layer_number] = ( 25 | new_inference_key_memory, 26 | new_inference_value_memory, 27 | ) 28 | -------------------------------------------------------------------------------- /megatron/core/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | TORCH_MAJOR = int(torch.__version__.split(".")[0]) 6 | TORCH_MINOR = int(torch.__version__.split(".")[1]) 7 | 8 | jit_fuser = torch.jit.script 9 | # nvFuser is deprecated in PyTorch JIT starting from 2.2 10 | if (TORCH_MAJOR > 2) or (TORCH_MAJOR == 2 and TORCH_MINOR >= 2): 11 | jit_fuser = torch.compile 12 | -------------------------------------------------------------------------------- /megatron/core/models/T5/__init__.py: -------------------------------------------------------------------------------- 1 | from .t5_model import T5Model 2 | -------------------------------------------------------------------------------- /megatron/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/bert/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/bert/bert_layer_specs.py: -------------------------------------------------------------------------------- 1 | from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add 2 | from megatron.core.fusions.fused_layer_norm import FusedLayerNorm 3 | from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear 4 | from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules 5 | from megatron.core.transformer.custom_layers.transformer_engine import ( 6 | TEDotProductAttention, 7 | TELayerNormColumnParallelLinear, 8 | TERowParallelLinear, 9 | ) 10 | from megatron.core.transformer.dot_product_attention import DotProductAttention 11 | from megatron.core.transformer.enums import AttnMaskType 12 | from megatron.core.transformer.mlp import MLP, MLPSubmodules 13 | from megatron.core.transformer.spec_utils import ModuleSpec 14 | from 
megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules 15 | 16 | # Use this spec to use lower level Transformer Engine modules (required for fp8 training) 17 | bert_layer_with_transformer_engine_spec = ModuleSpec( 18 | module=TransformerLayer, 19 | submodules=TransformerLayerSubmodules( 20 | self_attention=ModuleSpec( 21 | module=SelfAttention, 22 | params={"attn_mask_type": AttnMaskType.padding}, 23 | submodules=SelfAttentionSubmodules( 24 | linear_qkv=TELayerNormColumnParallelLinear, 25 | core_attention=TEDotProductAttention, 26 | linear_proj=TERowParallelLinear, 27 | ), 28 | ), 29 | self_attn_bda=get_bias_dropout_add, 30 | mlp=ModuleSpec( 31 | module=MLP, 32 | submodules=MLPSubmodules( 33 | linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear, 34 | ), 35 | ), 36 | mlp_bda=get_bias_dropout_add, 37 | ), 38 | ) 39 | 40 | # Use this spec for an implementation using only modules in megatron core 41 | bert_layer_local_spec = ModuleSpec( 42 | module=TransformerLayer, 43 | submodules=TransformerLayerSubmodules( 44 | input_layernorm=FusedLayerNorm, 45 | self_attention=ModuleSpec( 46 | module=SelfAttention, 47 | params={"attn_mask_type": AttnMaskType.padding}, 48 | submodules=SelfAttentionSubmodules( 49 | linear_qkv=ColumnParallelLinear, 50 | core_attention=DotProductAttention, 51 | linear_proj=RowParallelLinear, 52 | ), 53 | ), 54 | self_attn_bda=get_bias_dropout_add, 55 | pre_mlp_layernorm=FusedLayerNorm, 56 | mlp=ModuleSpec( 57 | module=MLP, 58 | submodules=MLPSubmodules( 59 | linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear, 60 | ), 61 | ), 62 | mlp_bda=get_bias_dropout_add, 63 | ), 64 | ) 65 | -------------------------------------------------------------------------------- /megatron/core/models/bert/bert_lm_head.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from megatron.core import tensor_parallel 5 | from megatron.core.fusions.fused_layer_norm import FusedLayerNorm 6 | from megatron.core.transformer.module import MegatronModule 7 | from megatron.core.transformer.transformer_config import TransformerConfig 8 | from megatron.core.transformer.utils import erf_gelu, get_linear_layer, openai_gelu 9 | 10 | 11 | class BertLMHead(MegatronModule): 12 | """Masked LM head for Bert 13 | 14 | Args: 15 | hidden_size: hidden size 16 | config (TransformerConfig): TransformerConfig object 17 | parallel_output (bool): Do not gather the outputs, keep them split across tensor parallel ranks 18 | vocab_size(int): The vocabulary size 19 | share_embeddings_and_output_weights (bool): When True, input embeddings and output logit weights are shared. Defaults to False 20 | pre_process (bool): Include embedding layer (used with pipeline parallelism) 21 | """ 22 | 23 | def __init__( 24 | self, 25 | hidden_size: int, 26 | config: TransformerConfig, 27 | parallel_output: bool, 28 | vocab_size: int, 29 | pre_process: bool, 30 | share_embeddings_and_output_weights: bool = False, 31 | ): 32 | super().__init__(config=config) 33 | 34 | self.vocab_size = vocab_size 35 | self.parallel_output = parallel_output 36 | 37 | # TODO: Shoudl switch this to TE ? 
38 | self.dense = get_linear_layer( 39 | hidden_size, hidden_size, config.init_method, config.perform_initialization 40 | ) 41 | 42 | setattr(self.dense.weight, 'sequence_parallel', config.sequence_parallel) 43 | setattr(self.dense.bias, 'sequence_parallel', config.sequence_parallel) 44 | 45 | self.layernorm = FusedLayerNorm( 46 | config=config, 47 | hidden_size=hidden_size, 48 | eps=config.layernorm_epsilon, 49 | sequence_parallel=config.sequence_parallel, 50 | ) 51 | 52 | self.gelu = torch.nn.functional.gelu 53 | # TODO Use activation_func in config to determine what to use 54 | # if config.openai_gelu: # Don't have these configs in transformer config yet 55 | # self.gelu = openai_gelu 56 | # elif config.onnx_safe: # Don't have these configs in transformer config yet 57 | # self.gelu = erf_gelu 58 | 59 | self.output_layer = tensor_parallel.ColumnParallelLinear( 60 | config.hidden_size, 61 | self.vocab_size, 62 | config=config, 63 | init_method=config.init_method, 64 | bias=True, 65 | skip_bias_add=False, 66 | gather_output=not self.parallel_output, 67 | skip_weight_param_allocation=pre_process and share_embeddings_and_output_weights, 68 | ) 69 | 70 | def forward(self, hidden_states: Tensor, word_embeddings_weight: Tensor) -> Tensor: 71 | hidden_states = self.dense(hidden_states) 72 | hidden_states = self.gelu(hidden_states) 73 | hidden_states = self.layernorm(hidden_states) 74 | logits, _ = self.output_layer(hidden_states, weight=word_embeddings_weight) 75 | return logits 76 | -------------------------------------------------------------------------------- /megatron/core/models/bert/pooler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | 4 | from megatron.core import tensor_parallel 5 | from megatron.core.transformer.module import MegatronModule 6 | from megatron.core.transformer.transformer_config import TransformerConfig 7 | from megatron.core.transformer.utils import get_linear_layer 8 | 9 | 10 | class Pooler(MegatronModule): 11 | """Pooler layer. 12 | 13 | Pool hidden states of a specific token (for example start of the 14 | sequence) and add a linear transformation followed by a tanh. 15 | 16 | Args: 17 | hidden_size (int): The hidden size 18 | init_method (callable): weight initialization method for the linear layer. bias is set to zero. 19 | config (TransformerConfig): The transformer configuration 20 | sequence_parallel (bool): Using sequence parallel? Defaults to False 21 | """ 22 | 23 | def __init__( 24 | self, 25 | hidden_size: int, 26 | init_method: callable, 27 | config: TransformerConfig, 28 | sequence_parallel: bool = False, 29 | ): 30 | super(Pooler, self).__init__(config) 31 | # TODO: Should switch this to TE? 32 | self.dense = get_linear_layer( 33 | hidden_size, hidden_size, init_method, config.perform_initialization 34 | ) 35 | self.sequence_parallel = sequence_parallel 36 | 37 | def forward(self, hidden_states: Tensor, sequence_index=0): 38 | # hidden_states: [s, b, h] 39 | # sequence_index: index of the token to pool.
40 | 41 | # gather data along sequence dimensions 42 | # same pooler is run on all tensor parallel nodes 43 | if self.sequence_parallel: 44 | hidden_states = tensor_parallel.gather_from_sequence_parallel_region( 45 | hidden_states, tensor_parallel_output_grad=False 46 | ) 47 | 48 | pooled = hidden_states[sequence_index, :, :] 49 | pooled = self.dense(pooled) 50 | pooled = torch.tanh(pooled) 51 | return pooled 52 | -------------------------------------------------------------------------------- /megatron/core/models/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/common/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/embeddings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/common/embeddings/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/language_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/common/language_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/common/vision_module/__init__.py -------------------------------------------------------------------------------- /megatron/core/models/common/vision_module/vision_module.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | """Megatron Vision Module.""" 3 | 4 | from megatron.core.transformer.module import MegatronModule 5 | from megatron.core.transformer.transformer_config import TransformerConfig 6 | 7 | 8 | # Note: This is only a stub at the moment. This will be expanded in follow-up changes. 9 | class VisionModule(MegatronModule): 10 | """Base vision module that has common helper functions used across CLIP, ViT, etc. 11 | 12 | Args: 13 | config (TransformerConfig): Input transformer config for the model 14 | """ 15 | 16 | def __init__(self, config: TransformerConfig) -> None: 17 | super().__init__(config=config) 18 | -------------------------------------------------------------------------------- /megatron/core/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_model import GPTModel 2 | -------------------------------------------------------------------------------- /megatron/core/models/retro/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from .config import RetroConfig 4 | from .decoder_spec import get_retro_decoder_block_spec 5 | from .model import RetroModel 6 | -------------------------------------------------------------------------------- /megatron/core/models/retro/base_attention.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from megatron.core.models.retro.config import RetroConfig 4 | from megatron.core.transformer.attention import CrossAttention, CrossAttentionSubmodules 5 | from megatron.core.transformer.enums import AttnMaskType 6 | from megatron.core.transformer.module import MegatronModule 7 | 8 | 9 | class BaseRetroCrossAttention(MegatronModule): 10 | 11 | """Base class for Retro cross attention, for both encoder & decoder layers. 12 | 13 | This class collects the retro arguments below (i.e., num neighbors, chunk 14 | length, and retrieve length) for use in Retro's custom cross attention 15 | operators. 16 | 17 | Arguments: 18 | config (RetroConfig): Retro config. 19 | 20 | submodules (CrossAttentionSubmodules): Cross attention submodules. 21 | 22 | layer_number (int): Layer number within transformer block. 23 | 24 | attn_mask_type (AttnMaskType): Mask type ('causal' or 'padding'). 25 | """ 26 | 27 | def __init__( 28 | self, 29 | config: RetroConfig, 30 | submodules: CrossAttentionSubmodules, 31 | layer_number: int = 1, 32 | attn_mask_type: AttnMaskType = AttnMaskType.padding, 33 | ): 34 | super().__init__(config=config) 35 | 36 | self.attn = CrossAttention( 37 | config=config, 38 | submodules=submodules, 39 | layer_number=layer_number, 40 | attn_mask_type=attn_mask_type, 41 | ) 42 | 43 | self.retro_num_neighbors = config.retro_num_neighbors 44 | self.retro_chunk_length = config.retro_preprocess.retro_gpt_chunk_length 45 | self.retro_retrieved_length = config.retro_preprocess.retro_gpt_retrieved_length 46 | -------------------------------------------------------------------------------- /megatron/core/models/retro/config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import types 4 | from dataclasses import dataclass 5 | 6 | from megatron.core.transformer import TransformerConfig 7 | 8 | 9 | @dataclass 10 | class RetroConfig(TransformerConfig): 11 | 12 | """Configuration object for Retro models. 13 | 14 | Attributes: 15 | 16 | retro_preprocess (SimpleNamespace): Retro preprocess arguments. 17 | retro_workdir (str): Retro working directory, which contains the 18 | preprocessed data for pretraining. This directory is built during 19 | preprocessing (see tools/retro/README.md), and contains subdirectories 20 | for the chunk database and pretraining neighbors. 21 | retro_encoder_num_layers (int): Number of layers to use for the retrieval 22 | encoder. 23 | retro_encoder_hidden_dropout (float): Hidden dropout for retrieval 24 | encoder. 25 | retro_encoder_attention_dropout (float): Attention dropout for retrieval 26 | encoder. 27 | retro_num_neighbors (int): Number of neighbors to retrieve during 28 | pretraining. 29 | retro_num_retrieved_chunks (int): Number of chunks to retrieve from the 30 | retrieval database. 31 | retro_verify_neighbor_count (bool): Verify that len(GPT dataset) == 32 | len(saved neighbors). 33 | """ 34 | 35 | # Retro.
36 | retro_preprocess: types.SimpleNamespace = None 37 | retro_workdir: str = None 38 | retro_encoder_num_layers: int = 2 39 | retro_encoder_hidden_dropout: float = 0.1 40 | retro_encoder_attention_dropout: float = 0.1 41 | retro_num_neighbors: int = 2 42 | retro_num_retrieved_chunks: int = 2 43 | retro_verify_neighbor_count: bool = True 44 | -------------------------------------------------------------------------------- /megatron/core/models/retro/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Retro Model.""" 4 | 5 | from torch import Tensor 6 | 7 | from megatron.core import InferenceParams 8 | from megatron.core.models.gpt import GPTModel 9 | 10 | 11 | class RetroModel(GPTModel): 12 | 13 | """Retro Model. 14 | 15 | A Retro model mostly re-uses the GPTModel interface, with the only difference 16 | being the embedding of the 'context' that is used by Retro for processing 17 | neighbor tokens. This embedded context is then forwarded to the Transformer 18 | Block. 19 | """ 20 | 21 | def forward( 22 | self, 23 | input_ids: Tensor, 24 | position_ids: Tensor, 25 | attention_mask: Tensor, 26 | context_input_ids: Tensor = None, 27 | context_position_ids: Tensor = None, 28 | context_mask: Tensor = None, 29 | decoder_input: Tensor = None, 30 | labels: Tensor = None, 31 | inference_params: InferenceParams = None, 32 | ) -> Tensor: 33 | """RetroModel forward method. 34 | 35 | Forward input tokens & mask, along with neighbor tokens & mask, through 36 | the Retro model. 37 | 38 | Arguments: 39 | input_ids (Tensor): Input token IDs. 40 | 41 | position_ids (Tensor): Input position IDs. 42 | 43 | attention_mask (Tensor): Input attention mask. 44 | 45 | context_input_ids (Tensor): Context (i.e., neighbor) token IDs. 46 | 47 | context_position_ids (Tensor): Context (i.e., neighbor) position IDs. 48 | 49 | context_mask (Tensor): Context (i.e., neighbor) attention mask. 50 | 51 | decoder_input (Tensor): When using pipeline parallelism, input_ids and 52 | position_ids will only be used on the first stage, and for all other 53 | stages decoder_input will be provided via communication from the 54 | previous stage. 55 | 56 | labels (Tensor): The labels of dimension [batch size, seq length]. 57 | 58 | inference_params (InferenceParams): Parameters for inference. 59 | """ 60 | 61 | # Argument shapes: 62 | # Notation: 63 | # ns : Sequence length. 64 | # bs : Batch size. 65 | # d : Hidden size. 66 | # l : Number of chunks per sample (i.e., seq_length/chunk_length). 67 | # k : Number of neighbors. 68 | # r : Number of retrieved tokens (neighbors + continuation). 69 | # - input_ids: [ bs, ns ] 70 | # - context_ids: [ k*bs*l, r ] 71 | # - context: [ r, k*bs*l, d ] 72 | # - output: [ ns, bs, d ] 73 | 74 | # Context embedding (e.g., for Retro neighbor tokens). 75 | if context_input_ids is not None: 76 | context = self.embedding(context_input_ids, context_position_ids) 77 | else: 78 | context = None 79 | 80 | # Call GPTModel.forward, and pass in embedded context.
81 | return super().forward( 82 | input_ids=input_ids, 83 | position_ids=position_ids, 84 | attention_mask=attention_mask, 85 | decoder_input=decoder_input, 86 | labels=labels, 87 | inference_params=inference_params, 88 | extra_block_kwargs={"context": context, "context_mask": context_mask,}, 89 | ) 90 | -------------------------------------------------------------------------------- /megatron/core/models/vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/models/vision/__init__.py -------------------------------------------------------------------------------- /megatron/core/package_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | MAJOR = 0 5 | MINOR = 5 6 | PATCH = 0 7 | PRE_RELEASE = 'rc0' 8 | 9 | # Use the following formatting: (major, minor, patch, pre-release) 10 | VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) 11 | 12 | __shortversion__ = '.'.join(map(str, VERSION[:3])) 13 | __version__ = '.'.join(map(str, VERSION[:3])) + ''.join(VERSION[3:]) 14 | 15 | __package_name__ = 'megatron_core' 16 | __contact_names__ = 'NVIDIA' 17 | __contact_emails__ = 'nemo-toolkit@nvidia.com' # use NeMo Email 18 | __homepage__ = ( 19 | 'https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/' # use NeMo homepage 20 | ) 21 | __repository_url__ = 'https://github.com/NVIDIA/Megatron-LM/megatron/core' 22 | __download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases' 23 | __description__ = ( 24 | 'Megatron Core - a library for efficient and scalable training of transformer based models' 25 | ) 26 | __license__ = 'BSD-3' 27 | __keywords__ = ( 28 | 'deep learning, machine learning, gpu, NLP, NLU, language, transformer, nvidia, pytorch, torch' 29 | ) 30 | -------------------------------------------------------------------------------- /megatron/core/packed_seq_params.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from torch import Tensor 4 | 5 | 6 | @dataclass 7 | class PackedSeqParams: 8 | # parameters to TEDotProductAttention and fused rope kernels for the `thd` (packed) sequence format, 9 | qkv_format: str = None 10 | cu_seqlens_q: Tensor = None 11 | cu_seqlens_kv: Tensor = None 12 | max_seqlen_q: Tensor = None 13 | max_seqlen_kv: Tensor = None 14 | -------------------------------------------------------------------------------- /megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch -------------------------------------------------------------------------------- /megatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .cross_entropy import vocab_parallel_cross_entropy 2 | from .data import broadcast_data 3 | from .layers import ( 4 | ColumnParallelLinear, 5 | RowParallelLinear, 6 | VocabParallelEmbedding, 7 | copy_tensor_model_parallel_attributes, 8 | linear_with_grad_accumulation_and_async_allreduce, 9 | 
param_is_not_tensor_parallel_duplicate, 10 | set_defaults_if_not_set_tensor_model_parallel_attributes, 11 | set_tensor_model_parallel_attributes, 12 | ) 13 | from .mappings import ( 14 | copy_to_tensor_model_parallel_region, 15 | gather_from_sequence_parallel_region, 16 | gather_from_sequence_parallel_region_to_moe, 17 | gather_from_tensor_model_parallel_region, 18 | reduce_scatter_to_sequence_parallel_region_from_moe, 19 | scatter_to_sequence_parallel_region, 20 | scatter_to_tensor_model_parallel_region, 21 | ) 22 | from .random import ( 23 | checkpoint, 24 | get_cuda_rng_tracker, 25 | get_data_parallel_rng_tracker_name, 26 | model_parallel_cuda_manual_seed, 27 | ) 28 | from .utils import ( 29 | gather_split_1d_tensor, 30 | split_tensor_along_last_dim, 31 | split_tensor_into_1d_equal_chunks, 32 | ) 33 | 34 | __all__ = [ 35 | # cross_entropy.py 36 | "vocab_parallel_cross_entropy", 37 | # data.py 38 | "broadcast_data", 39 | # layers.py 40 | "ColumnParallelLinear", 41 | "RowParallelLinear", 42 | "VocabParallelEmbedding", 43 | "set_tensor_model_parallel_attributes", 44 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 45 | "copy_tensor_model_parallel_attributes", 46 | "param_is_not_tensor_parallel_duplicate", 47 | "linear_with_grad_accumulation_and_async_allreduce", 48 | # mappings.py 49 | "copy_to_tensor_model_parallel_region", 50 | "gather_from_tensor_model_parallel_region", 51 | "gather_from_sequence_parallel_region", 52 | # "reduce_from_tensor_model_parallel_region", 53 | "scatter_to_tensor_model_parallel_region", 54 | "scatter_to_sequence_parallel_region", 55 | # random.py 56 | "checkpoint", 57 | "get_cuda_rng_tracker", 58 | "model_parallel_cuda_manual_seed", 59 | # utils.py 60 | "split_tensor_along_last_dim", 61 | "split_tensor_into_1d_equal_chunks", 62 | "gather_split_1d_tensor", 63 | "gather_from_sequence_parallel_region_to_moe", 64 | "reduce_scatter_to_sequence_parallel_region_from_moe", 65 | ] 66 | -------------------------------------------------------------------------------- /megatron/core/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .module import MegatronModule 4 | from .spec_utils import ModuleSpec, build_module 5 | from .transformer_config import TransformerConfig 6 | from .transformer_layer import TransformerLayer, TransformerLayerSubmodules 7 | -------------------------------------------------------------------------------- /megatron/core/transformer/custom_layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/transformer/custom_layers/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | 6 | # can we get rid of this? 
7 | # it's being used in pipeline schedules 8 | class ModelType(enum.Enum): 9 | encoder_or_decoder = 1 10 | encoder_and_decoder = 2 11 | 12 | 13 | # class LayerType(enum.Enum): 14 | # encoder = 1 15 | # decoder = 2 16 | 17 | 18 | class AttnType(enum.Enum): 19 | self_attn = 1 20 | cross_attn = 2 21 | 22 | 23 | class AttnMaskType(enum.Enum): 24 | padding = 1 25 | causal = 2 26 | no_mask = 3 # only used for TE 27 | -------------------------------------------------------------------------------- /megatron/core/transformer/identity_op.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | 4 | 5 | class IdentityOp(torch.nn.Module): 6 | """ 7 | This is a placeholder for IdentityOp(x) -> x 8 | """ 9 | 10 | def __init__(self, *args, **kwargs): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | 17 | class IdentityFuncOp(IdentityOp): 18 | """ 19 | This is a placeholder for IdentityFuncOp(...)(x) -> IdentityOp(x) -> x. 20 | Such a func is handy for ops like `bias_dropout_fusion` which themselves 21 | return a function at runtime based on passed arguments 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__() 26 | 27 | def forward(self, *args, **kwargs): 28 | return super().forward 29 | -------------------------------------------------------------------------------- /megatron/core/transformer/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/core/transformer/moe/__init__.py -------------------------------------------------------------------------------- /megatron/core/transformer/moe/grouped_gemm_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | try: 4 | import grouped_gemm 5 | except ImportError: 6 | grouped_gemm = None 7 | 8 | 9 | def grouped_gemm_is_available(): 10 | return grouped_gemm is not None 11 | 12 | 13 | def assert_grouped_gemm_is_available(): 14 | assert grouped_gemm_is_available(), ( 15 | "Grouped GEMM is not available. Please run " 16 | "`pip install git+https://github.com/fanshiqing/grouped_gemm@main`." 17 | ) 18 | 19 | 20 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None 21 | -------------------------------------------------------------------------------- /megatron/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/data/__init__.py -------------------------------------------------------------------------------- /megatron/data/multimodal_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
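# NOTE: this dataset expects paired records in the indexed dataset: the document at
# doc_idx[idx] holds the tokenized text (mode 0) and the following document holds the raw
# image bytes (mode 1), whose first element stores the number of padding bytes to strip
# before decoding. The Normalize() statistics below are the standard CLIP image mean/std.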
2 | 3 | from PIL import Image, UnidentifiedImageError 4 | import numpy as np 5 | import io 6 | import torch 7 | 8 | try: 9 | from torchvision.transforms import InterpolationMode 10 | BICUBIC = InterpolationMode.BICUBIC 11 | except ImportError: 12 | BICUBIC = Image.BICUBIC 13 | 14 | from torchvision.transforms import Compose, ToTensor, Normalize, ToPILImage, RandomResizedCrop, Resize 15 | 16 | def _convert_image_to_rgb(image): 17 | return image.convert("RGB") 18 | 19 | def _transform(img_h, img_w): 20 | return Compose([ 21 | ToPILImage(), 22 | RandomResizedCrop((img_h, img_w), scale=(0.5, 1.0), interpolation=BICUBIC), 23 | _convert_image_to_rgb, 24 | ToTensor(), 25 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 26 | ]) 27 | 28 | class MultiModalDataset(torch.utils.data.Dataset): 29 | 30 | def __init__(self, name, data_prefix, indexed_dataset, 31 | num_samples, seq_length, seed, img_h, img_w): 32 | 33 | self.name = name 34 | self.indexed_dataset = indexed_dataset 35 | self.doc_idx = indexed_dataset.get_document_indices() 36 | self.visual_transform = _transform(img_h, img_w) 37 | 38 | def __len__(self): 39 | return self.indexed_dataset.sequence_lengths.shape[0] 40 | 41 | def __getitem__(self, idx): 42 | text_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]) 43 | assert mode == 0 44 | img_sample, mode = self.indexed_dataset.get(self.doc_idx[idx]+1) 45 | assert mode == 1 46 | img_pad = img_sample[0].item() 47 | xs = img_sample[1:].tobytes(order='C') 48 | xs = xs[:len(xs)-img_pad] 49 | 50 | img_sample = np.array(Image.open(io.BytesIO(xs))) 51 | img_sample = self.visual_transform(img_sample).reshape(-1) 52 | 53 | return {'text': np.array(text_sample, dtype=np.int64), 54 | 'img': np.array(img_sample, dtype=np.float32)} 55 | -------------------------------------------------------------------------------- /megatron/dist_signal_handler.py: -------------------------------------------------------------------------------- 1 | import signal 2 | 3 | import torch 4 | 5 | 6 | def get_world_size(): 7 | if torch.distributed.is_available() and torch.distributed.is_initialized(): 8 | world_size = torch.distributed.get_world_size() 9 | else: 10 | world_size = 1 11 | return world_size 12 | 13 | 14 | def get_device(local_rank=None): 15 | backend = torch.distributed.get_backend() 16 | if backend == 'nccl': 17 | if local_rank is None: 18 | device = torch.device('cuda') 19 | else: 20 | device = torch.device(f'cuda:{local_rank}') 21 | elif backend == 'gloo': 22 | device = torch.device('cpu') 23 | else: 24 | raise RuntimeError 25 | return device 26 | 27 | 28 | def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None): 29 | if not torch.distributed.is_available() or \ 30 | not torch.distributed.is_initialized(): 31 | return [item] 32 | 33 | device = get_device(local_rank) 34 | 35 | if group is not None: 36 | group_size = group.size() 37 | else: 38 | group_size = get_world_size() 39 | 40 | tensor = torch.tensor([item], device=device, dtype=dtype) 41 | output_tensors = [ 42 | torch.zeros(1, dtype=tensor.dtype, device=tensor.device) 43 | for _ in range(group_size) 44 | ] 45 | torch.distributed.all_gather(output_tensors, tensor, group, async_op) 46 | output = [elem.item() for elem in output_tensors] 47 | return output 48 | 49 | 50 | class DistributedSignalHandler: 51 | def __init__(self, sig=signal.SIGTERM): 52 | self.sig = sig 53 | 54 | def signals_received(self): 55 | all_received = all_gather_item( 56 | self._signal_received, dtype=torch.int32 57 | ) 
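        # One flag per rank; callers can combine them as needed (e.g. exit once any rank reports SIGTERM).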
58 | return all_received 59 | 60 | def __enter__(self): 61 | self._signal_received = False 62 | self.released = False 63 | self.original_handler = signal.getsignal(self.sig) 64 | 65 | def handler(signum, frame): 66 | self._signal_received = True 67 | 68 | signal.signal(self.sig, handler) 69 | 70 | return self 71 | 72 | def __exit__(self, type, value, tb): 73 | self.release() 74 | 75 | def release(self): 76 | if self.released: 77 | return False 78 | 79 | signal.signal(self.sig, self.original_handler) 80 | self.released = True 81 | return True 82 | -------------------------------------------------------------------------------- /megatron/fp16_deprecated/loss_scaler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """For backward compatibility, we need the class definitions to deserialize.""" 4 | 5 | class LossScaler: 6 | def __init__(self, scale=1): 7 | self.cur_scale = scale 8 | 9 | class DynamicLossScaler: 10 | def __init__(self, 11 | init_scale=2**32, 12 | scale_factor=2., 13 | scale_window=1000, 14 | min_scale=1, 15 | delayed_shift=1, 16 | consecutive_hysteresis=False): 17 | self.cur_scale = init_scale 18 | self.cur_iter = 0 19 | self.last_overflow_iter = -1 20 | self.scale_factor = scale_factor 21 | self.scale_window = scale_window 22 | self.min_scale = min_scale 23 | self.delayed_shift = delayed_shift 24 | self.cur_hysteresis = delayed_shift 25 | self.consecutive_hysteresis = consecutive_hysteresis 26 | 27 | -------------------------------------------------------------------------------- /megatron/fused_kernels/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | import pathlib 5 | import subprocess 6 | 7 | from torch.utils import cpp_extension 8 | 9 | # Setting this param to a list has a problem of generating different 10 | # compilation commands (with diferent order of architectures) and 11 | # leading to recompilation of fused kernels. Set it to empty string 12 | # to avoid recompilation and assign arch flags explicity in 13 | # extra_cuda_cflags below 14 | os.environ["TORCH_CUDA_ARCH_LIST"] = "" 15 | 16 | 17 | def load(args): 18 | 19 | # Check if cuda 11 is installed for compute capability 8.0 20 | cc_flag = [] 21 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version( 22 | cpp_extension.CUDA_HOME 23 | ) 24 | if int(bare_metal_major) >= 11: 25 | cc_flag.append('-gencode') 26 | cc_flag.append('arch=compute_80,code=sm_80') 27 | if int(bare_metal_minor) >= 8: 28 | cc_flag.append('-gencode') 29 | cc_flag.append('arch=compute_90,code=sm_90') 30 | 31 | # Build path 32 | srcpath = pathlib.Path(__file__).parent.absolute() 33 | buildpath = srcpath / "build" 34 | _create_build_dir(buildpath) 35 | 36 | # Helper function to build the kernels. 
37 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 38 | return cpp_extension.load( 39 | name=name, 40 | sources=sources, 41 | build_directory=buildpath, 42 | extra_cflags=[ 43 | "-O3", 44 | ], 45 | extra_cuda_cflags=[ 46 | "-O3", 47 | "-gencode", 48 | "arch=compute_70,code=sm_70", 49 | "--use_fast_math", 50 | ] 51 | + extra_cuda_flags 52 | + cc_flag, 53 | verbose=(args.rank == 0), 54 | ) 55 | 56 | 57 | def _get_cuda_bare_metal_version(cuda_dir): 58 | raw_output = subprocess.check_output( 59 | [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True 60 | ) 61 | output = raw_output.split() 62 | release_idx = output.index("release") + 1 63 | release = output[release_idx].split(".") 64 | bare_metal_major = release[0] 65 | bare_metal_minor = release[1][0] 66 | 67 | return raw_output, bare_metal_major, bare_metal_minor 68 | 69 | 70 | def _create_build_dir(buildpath): 71 | try: 72 | os.mkdir(buildpath) 73 | except OSError: 74 | if not os.path.isdir(buildpath): 75 | print(f"Creation of the build directory {buildpath} failed") 76 | -------------------------------------------------------------------------------- /megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /*This code is copied fron NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /megatron/fused_kernels/type_shim.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | 4 | #include 5 | #include "compat.h" 6 | 7 | 8 | #define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...) \ 9 | switch(TYPE) \ 10 | { \ 11 | case at::ScalarType::Half: \ 12 | { \ 13 | using scalar_t = at::Half; \ 14 | __VA_ARGS__; \ 15 | break; \ 16 | } \ 17 | case at::ScalarType::BFloat16: \ 18 | { \ 19 | using scalar_t = at::BFloat16; \ 20 | __VA_ARGS__; \ 21 | break; \ 22 | } \ 23 | default: \ 24 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 25 | } 26 | 27 | 28 | #define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...) \ 29 | switch(TYPE) \ 30 | { \ 31 | case at::ScalarType::Half: \ 32 | { \ 33 | using scalar_t = at::Half; \ 34 | __VA_ARGS__; \ 35 | break; \ 36 | } \ 37 | case at::ScalarType::BFloat16: \ 38 | { \ 39 | using scalar_t = at::BFloat16; \ 40 | __VA_ARGS__; \ 41 | break; \ 42 | } \ 43 | case at::ScalarType::Float: \ 44 | { \ 45 | using scalar_t = float; \ 46 | __VA_ARGS__; \ 47 | break; \ 48 | } \ 49 | default: \ 50 | AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ 51 | } 52 | 53 | 54 | 55 | #define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) 
\ 56 | switch(TYPEIN) \ 57 | { \ 58 | case at::ScalarType::Float: \ 59 | { \ 60 | using scalar_t_in = float; \ 61 | switch(TYPEOUT) \ 62 | { \ 63 | case at::ScalarType::Float: \ 64 | { \ 65 | using scalar_t_out = float; \ 66 | __VA_ARGS__; \ 67 | break; \ 68 | } \ 69 | case at::ScalarType::Half: \ 70 | { \ 71 | using scalar_t_out = at::Half; \ 72 | __VA_ARGS__; \ 73 | break; \ 74 | } \ 75 | case at::ScalarType::BFloat16: \ 76 | { \ 77 | using scalar_t_out = at::BFloat16; \ 78 | __VA_ARGS__; \ 79 | break; \ 80 | } \ 81 | default: \ 82 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \ 83 | } \ 84 | break; \ 85 | } \ 86 | case at::ScalarType::Half: \ 87 | { \ 88 | using scalar_t_in = at::Half; \ 89 | using scalar_t_out = at::Half; \ 90 | __VA_ARGS__; \ 91 | break; \ 92 | } \ 93 | case at::ScalarType::BFloat16: \ 94 | { \ 95 | using scalar_t_in = at::BFloat16; \ 96 | using scalar_t_out = at::BFloat16; \ 97 | __VA_ARGS__; \ 98 | break; \ 99 | } \ 100 | default: \ 101 | AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \ 102 | } 103 | 104 | -------------------------------------------------------------------------------- /megatron/log_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import sys 4 | from logging import LogRecord, StreamHandler 5 | 6 | BLACKLISTED_MODULES = ["torch.distributed"] 7 | 8 | 9 | class CustomHandler(StreamHandler): 10 | """ 11 | Custom handler to filter out logging from code outside of 12 | Megatron Core, and dump to stdout. 13 | """ 14 | 15 | def __init__(self): 16 | super().__init__(stream=sys.stdout) 17 | 18 | def filter(self, record: LogRecord) -> bool: 19 | # Prevent log entries that come from the blacklisted modules 20 | # through (e.g., PyTorch Distributed). 21 | for blacklisted_module in BLACKLISTED_MODULES: 22 | if record.name.startswith(blacklisted_module): 23 | return False 24 | return True 25 | -------------------------------------------------------------------------------- /megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | from .rms_norm import RMSNorm 5 | 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module 11 | -------------------------------------------------------------------------------- /megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | retro_encoder = 3 9 | retro_decoder = 4 10 | retro_decoder_with_retriever = 5 11 | 12 | class AttnType(enum.Enum): 13 | self_attn = 1 14 | cross_attn = 2 15 | 16 | class AttnMaskType(enum.Enum): 17 | padding = 1 18 | causal = 2 19 | 20 | # For backward compatibility with old model checkpoints 21 | from megatron.core.enums import ModelType 22 | -------------------------------------------------------------------------------- /megatron/model/fused_bias_gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
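# NOTE: GeLUFunction.backward below returns the same tensor for both the input and the bias
# gradient: since forward computes gelu(bias + y), d(bias + y)/dbias == d(bias + y)/dy == 1,
# so a single bias_gelu_back pass covers both inputs.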
2 | 3 | import torch 4 | from megatron.core.jit import jit_fuser 5 | 6 | 7 | ###### BIAS GELU FUSION/ NO AUTOGRAD ################ 8 | # 1/sqrt(2*pi)-> 0.3989423 9 | # 1/sqrt(2) -> 0.70710678 10 | # sqrt(2/pi) -> 0.79788456 11 | # this function is tanh approximation of gelu 12 | # actual gelu is: 13 | # x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) 14 | 15 | @jit_fuser 16 | def bias_gelu(bias, y): 17 | x = bias + y 18 | return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) 19 | 20 | # gradient of tanh approximation of gelu 21 | # gradient of actual gelu is: 22 | # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) 23 | @jit_fuser 24 | def bias_gelu_back(g, bias, y): 25 | x = bias + y 26 | tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) 27 | # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 28 | ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) 29 | return ff*g 30 | 31 | class GeLUFunction(torch.autograd.Function): 32 | @staticmethod 33 | # bias is an optional argument 34 | def forward(ctx, input, bias): 35 | ctx.save_for_backward(input, bias) 36 | return bias_gelu(bias, input) 37 | 38 | @staticmethod 39 | def backward(ctx, grad_output): 40 | input, bias = ctx.saved_tensors 41 | tmp = bias_gelu_back(grad_output, bias, input) 42 | return tmp, tmp 43 | 44 | bias_gelu_impl = GeLUFunction.apply 45 | -------------------------------------------------------------------------------- /megatron/model/rms_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | from torch import nn 5 | 6 | class RMSNorm(torch.nn.Module): 7 | 8 | def __init__(self, 9 | dim: int, 10 | eps: float = 1e-6, 11 | sequence_parallel: bool = False): 12 | """RMS Normaliation module 13 | 14 | Arguments: 15 | dim (int): The width of input, i.e. hidden size 16 | eps (float): epsilon to use for the norm, default to 1e-6 17 | sequence_parallel (bool): Set to true if sequence parallelism is being used, 18 | this marks the weights as needing to be allreduced. 19 | """ 20 | super().__init__() 21 | self.eps = eps 22 | self.weight = nn.Parameter(torch.ones(dim)) 23 | 24 | setattr(self.weight, 'sequence_parallel', sequence_parallel) 25 | 26 | def _norm(self, x): 27 | return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) 28 | 29 | def forward(self, x): 30 | output = self._norm(x.float()).type_as(x) 31 | return output * self.weight 32 | -------------------------------------------------------------------------------- /megatron/model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Utilities for models.""" 4 | 5 | import math 6 | 7 | import torch 8 | 9 | from megatron import get_args 10 | from megatron.model import LayerNorm, RMSNorm 11 | from megatron.core.jit import jit_fuser 12 | 13 | def init_method_normal(sigma): 14 | """Init method based on N(0, sigma).""" 15 | def init_(tensor): 16 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 17 | 18 | return init_ 19 | 20 | 21 | def scaled_init_method_normal(sigma, num_layers): 22 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 23 | std = sigma / math.sqrt(2.0 * num_layers) 24 | 25 | def init_(tensor): 26 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 27 | 28 | return init_ 29 | 30 | 31 | def attention_mask_func(attention_scores, attention_mask): 32 | attention_scores.masked_fill_(attention_mask, -10000.0) 33 | return attention_scores 34 | 35 | 36 | def get_linear_layer(rows, columns, init_method): 37 | """Simple linear layer with weight initialization.""" 38 | layer = torch.nn.Linear(rows, columns) 39 | if get_args().perform_initialization: 40 | init_method(layer.weight) 41 | with torch.no_grad(): 42 | layer.bias.zero_() 43 | return layer 44 | 45 | 46 | @jit_fuser 47 | def gelu_impl(x): 48 | """OpenAI's gelu implementation.""" 49 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 50 | 51 | (1.0 + 0.044715 * x * x))) 52 | def openai_gelu(x): 53 | return gelu_impl(x) 54 | 55 | 56 | #This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter 57 | @jit_fuser 58 | def erf_gelu(x): 59 | return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype)) 60 | 61 | 62 | def get_norm(config): 63 | args = get_args() 64 | if args.normalization == "LayerNorm": 65 | return LayerNorm( 66 | config.hidden_size, 67 | eps=config.layernorm_epsilon, 68 | no_persist_layer_norm=not config.persist_layer_norm, 69 | sequence_parallel=config.sequence_parallel, 70 | apply_layernorm_1p=args.apply_layernorm_1p) 71 | elif args.normalization == "RMSNorm": 72 | if args.apply_layernorm_1p: 73 | raise NotImplementedError('RMSNorm does not currently support the layernorm_1p formulation.') 74 | 75 | return RMSNorm(dim=config.hidden_size, 76 | eps=config.layernorm_epsilon, 77 | sequence_parallel=config.sequence_parallel) 78 | else: 79 | raise Exception(f"unsupported norm type '{args.normalization}'.") 80 | -------------------------------------------------------------------------------- /megatron/model/vision/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Vision Transformer(VIT) model.""" 4 | 5 | import torch 6 | from torch.nn.init import trunc_normal_ 7 | from megatron import get_args 8 | from megatron.model.utils import get_linear_layer 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3_avg 11 | from megatron.model.module import MegatronModule 12 | 13 | class VitClassificationModel(MegatronModule): 14 | """Vision Transformer Model.""" 15 | 16 | def __init__(self, config, num_classes, finetune=False, 17 | pre_process=True, post_process=True): 18 | super(VitClassificationModel, self).__init__() 19 | args = get_args() 20 | self.config = config 21 | 22 | self.hidden_size = args.hidden_size 23 | self.num_classes = num_classes 24 | self.finetune = finetune 25 | self.pre_process = pre_process 26 | self.post_process = post_process 27 | self.backbone = VitBackbone( 28 | config=config, 29 | pre_process=self.pre_process, 30 | post_process=self.post_process, 31 | single_token_output=True 32 | ) 33 | 34 | if self.post_process: 35 | if not self.finetune: 36 | self.head = VitMlpHead(config, self.hidden_size, self.num_classes) 37 | else: 38 | self.head = get_linear_layer( 39 | self.hidden_size, 40 | self.num_classes, 41 | torch.nn.init.zeros_ 42 | ) 43 | 44 | def set_input_tensor(self, input_tensor): 45 | """See megatron.model.transformer.set_input_tensor()""" 46 | self.backbone.set_input_tensor(input_tensor) 47 | 48 | def forward(self, input): 49 | hidden_states = self.backbone(input) 50 | 51 | if self.post_process: 52 | hidden_states = self.head(hidden_states) 53 | 54 | return hidden_states 55 | 56 | 57 | class MitClassificationModel(MegatronModule): 58 | """Mix vision Transformer Model.""" 59 | 60 | def __init__(self, num_classes, 61 | pre_process=True, post_process=True): 62 | super(MitClassificationModel, self).__init__() 63 | args = get_args() 64 | 65 | self.hidden_size = args.hidden_size 66 | self.num_classes = num_classes 67 | 68 | self.backbone = mit_b3_avg() 69 | self.head = torch.nn.Linear(512, num_classes) 70 | self.apply(self._init_weights) 71 | 72 | def _init_weights(self, m): 73 | if isinstance(m, torch.nn.Linear): 74 | trunc_normal_(m.weight, std=.02) 75 | if isinstance(m, torch.nn.Linear) and m.bias is not None: 76 | torch.nn.init.constant_(m.bias, 0) 77 | 78 | def set_input_tensor(self, input_tensor): 79 | """See megatron.model.transformer.set_input_tensor()""" 80 | pass 81 | 82 | def forward(self, input): 83 | hidden_states = self.backbone(input) 84 | hidden_states = self.head(hidden_states) 85 | 86 | return hidden_states 87 | -------------------------------------------------------------------------------- /megatron/model/vision/utils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | 6 | def resize(input, 7 | size=None, 8 | scale_factor=None, 9 | mode='nearest', 10 | align_corners=None, 11 | warning=True): 12 | if warning: 13 | if size is not None and align_corners: 14 | input_h, input_w = tuple(int(x) for x in input.shape[2:]) 15 | output_h, output_w = tuple(int(x) for x in size) 16 | if output_h > input_h or output_w > output_h: 17 | if ((output_h > 1 and output_w > 1 and input_h > 1 18 | and input_w > 1) and (output_h - 1) % (input_h - 1) 19 | and (output_w - 1) % (input_w - 1)): 20 | warnings.warn( 21 | f'When align_corners={align_corners}, ' 22 | 'the output would more aligned if ' 23 | f'input size {(input_h, input_w)} is 
`x+1` and ' 24 | f'out size {(output_h, output_w)} is `nx+1`') 25 | if isinstance(size, torch.Size): 26 | size = tuple(int(x) for x in size) 27 | return F.interpolate(input, size, scale_factor, mode, align_corners) 28 | -------------------------------------------------------------------------------- /megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /megatron/mpu/tests/commons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import os 5 | import random 6 | import numpy 7 | import torch 8 | 9 | import mpu 10 | 11 | 12 | class IdentityLayer(torch.nn.Module): 13 | def __init__(self, size, scale=1.0): 14 | super(IdentityLayer, self).__init__() 15 | self.weight = torch.nn.Parameter(scale * torch.randn(size)) 16 | 17 | def forward(self): 18 | return self.weight 19 | 20 | 21 | def set_random_seed(seed): 22 | """Set random seed for reproducability.""" 23 | random.seed(seed) 24 | numpy.random.seed(seed) 25 | torch.manual_seed(seed) 26 | mpu.model_parallel_cuda_manual_seed(seed) 27 | 28 | 29 | def initialize_distributed(backend='nccl'): 30 | """Initialize torch.distributed.""" 31 | # Get local rank in case it is provided. 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--local_rank', type=int, default=None, 34 | help='local rank passed from distributed launcher') 35 | args = parser.parse_args() 36 | local_rank = args.local_rank 37 | 38 | # Get rank and world size. 39 | rank = int(os.getenv('RANK', '0')) 40 | world_size = int(os.getenv("WORLD_SIZE", '1')) 41 | 42 | print('> initializing torch.distributed with local rank: {}, ' 43 | 'rank: {}, world size: {}'.format(local_rank, rank, world_size)) 44 | 45 | # Set the device id. 46 | device = rank % torch.cuda.device_count() 47 | if local_rank is not None: 48 | device = local_rank 49 | torch.cuda.set_device(device) 50 | 51 | # Call the init process. 52 | init_method = 'tcp://' 53 | master_ip = os.getenv('MASTER_ADDR', 'localhost') 54 | master_port = os.getenv('MASTER_PORT', '6000') 55 | init_method += master_ip + ':' + master_port 56 | torch.distributed.init_process_group( 57 | backend=backend, 58 | world_size=world_size, 59 | rank=rank, 60 | init_method=init_method) 61 | 62 | 63 | def print_separator(message): 64 | torch.distributed.barrier() 65 | filler_len = (78 - len(message)) // 2 66 | filler = '-' * filler_len 67 | string = '\n' + filler + ' {} '.format(message) + filler 68 | if torch.distributed.get_rank() == 0: 69 | print(string, flush=True) 70 | torch.distributed.barrier() 71 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | from mpu import data as data_utils 6 | import mpu 7 | import torch 8 | import functools 9 | import operator 10 | import sys 11 | sys.path.append("../..") 12 | 13 | 14 | def test_broadcast_data(tensor_model_parallel_size): 15 | 16 | if torch.distributed.get_rank() == 0: 17 | print('> testing broadcast_data with model parallel size {} ...'. 18 | format(tensor_model_parallel_size)) 19 | 20 | mpu.initialize_model_parallel(tensor_model_parallel_size) 21 | torch.manual_seed(1234 + mpu.get_data_parallel_rank()) 22 | tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size() 23 | 24 | key_size_t = {'key1': [7, 11], 25 | 'key2': [8, 2, 1], 26 | 'key3': [13], 27 | 'key4': [5, 1, 2], 28 | 'key5': [5, 12]} 29 | keys = list(key_size_t.keys()) 30 | 31 | data = {} 32 | data_t = {} 33 | for key in key_size_t: 34 | data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000) 35 | data_t[key] = data[key].clone() 36 | data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000) 37 | data_t['keyX'] = data['keyX'].clone() 38 | if mpu.get_tensor_model_parallel_rank() != 0: 39 | data = None 40 | 41 | data_utils._check_data_types(keys, data_t, torch.int64) 42 | key_size, key_numel, \ 43 | total_numel = data_utils._build_key_size_numel_dictionaries(keys, data) 44 | for key in keys: 45 | assert key_size[key] == key_size_t[key] 46 | total_numel_t = 0 47 | for key in keys: 48 | target_size = functools.reduce(operator.mul, key_size_t[key], 1) 49 | assert key_numel[key] == target_size 50 | total_numel_t += target_size 51 | assert total_numel == total_numel_t 52 | 53 | data_b = data_utils.broadcast_data(keys, data, torch.int64) 54 | for key in keys: 55 | tensor = data_t[key].cuda() 56 | assert data_b[key].sub(tensor).abs().max() == 0 57 | 58 | # Reset groups 59 | mpu.destroy_tensor_model_parallel() 60 | 61 | torch.distributed.barrier() 62 | if torch.distributed.get_rank() == 0: 63 | print('>> passed the test :-)') 64 | 65 | 66 | if __name__ == '__main__': 67 | 68 | initialize_distributed() 69 | world_size = torch.distributed.get_world_size() 70 | 71 | tensor_model_parallel_size = 1 72 | while tensor_model_parallel_size <= world_size: 73 | print_separator('test test broadcast data') 74 | test_broadcast_data(tensor_model_parallel_size) 75 | tensor_model_parallel_size *= 2 76 | -------------------------------------------------------------------------------- /megatron/mpu/tests/test_initialize.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from commons import print_separator 4 | from commons import initialize_distributed 5 | import mpu 6 | import torch 7 | import sys 8 | sys.path.append("../..") 9 | 10 | 11 | def test_initialize_model_parallel(tensor_model_parallel_size): 12 | 13 | if torch.distributed.get_rank() == 0: 14 | print('> testing initialize_model_parallel with size {} ...'.format( 15 | tensor_model_parallel_size)) 16 | tensor_model_parallel_size_ = min(tensor_model_parallel_size, 17 | torch.distributed.get_world_size()) 18 | assert not mpu.model_parallel_is_initialized() 19 | mpu.initialize_model_parallel(tensor_model_parallel_size_) 20 | assert mpu.model_parallel_is_initialized() 21 | 22 | # Checks. 
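    # Expected layout: tensor-parallel groups span consecutive ranks, so
    # tp_rank = global_rank % tp_size, while data-parallel peers are strided by tp_size,
    # giving dp_world_size = world_size // tp_size and dp_rank = global_rank // tp_size.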
23 | def check(group, world_size, rank): 24 | assert world_size == torch.distributed.get_world_size(group=group) 25 | assert rank == torch.distributed.get_rank(group=group) 26 | 27 | # Model parallel. 28 | world_size = tensor_model_parallel_size_ 29 | rank = torch.distributed.get_rank() % tensor_model_parallel_size_ 30 | assert world_size == mpu.get_tensor_model_parallel_world_size() 31 | assert rank == mpu.get_tensor_model_parallel_rank() 32 | check(mpu.get_tensor_model_parallel_group(), world_size, rank) 33 | 34 | # Data parallel. 35 | world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_ 36 | rank = torch.distributed.get_rank() // tensor_model_parallel_size 37 | assert world_size == mpu.get_data_parallel_world_size() 38 | assert rank == mpu.get_data_parallel_rank() 39 | check(mpu.get_data_parallel_group(), world_size, rank) 40 | 41 | # Reset groups 42 | mpu.destroy_model_parallel() 43 | 44 | torch.distributed.barrier() 45 | if torch.distributed.get_rank() == 0: 46 | print('>> passed the test :-)') 47 | 48 | 49 | def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_): 50 | 51 | if torch.distributed.get_rank() == 0: 52 | print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format( 53 | tensor_model_parallel_size_)) 54 | tensor_model_parallel_size = min(tensor_model_parallel_size_, 55 | torch.distributed.get_world_size()) 56 | assert not mpu.model_parallel_is_initialized() 57 | mpu.initialize_model_parallel(tensor_model_parallel_size) 58 | assert mpu.model_parallel_is_initialized() 59 | 60 | # Checks 61 | src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank() 62 | assert mpu.get_tensor_model_parallel_src_rank() == src_rank 63 | 64 | # Reset groups 65 | mpu.destroy_model_parallel() 66 | 67 | torch.distributed.barrier() 68 | if torch.distributed.get_rank() == 0: 69 | print('>> passed the test :-)') 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | initialize_distributed() 75 | world_size = torch.distributed.get_world_size() 76 | tensor_model_parallel_size = 1 77 | while tensor_model_parallel_size <= world_size: 78 | print_separator('test initialize model parallel') 79 | test_initialize_model_parallel(tensor_model_parallel_size) 80 | print_separator('test model parallel source rank') 81 | test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size) 82 | tensor_model_parallel_size *= 2 83 | -------------------------------------------------------------------------------- /megatron/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Megatron 9 | 71 | 72 | 73 |
74 | Prompt Megatron 81 | 0 82 | / 1000
86 | 87 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /megatron/text_generation/beam_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | ## from huggingface beam search 19 | class BeamHypotheses(object): 20 | def __init__(self, num_beams, length_penalty=1.0, early_stopping=False): 21 | """ 22 | Initialize n-best list of hypotheses. 23 | """ 24 | self.length_penalty = length_penalty 25 | self.early_stopping = early_stopping 26 | self.num_beams = num_beams 27 | self.beams = [] 28 | self.worst_score = 1e9 29 | 30 | def __len__(self): 31 | """ 32 | Number of hypotheses in the list. 33 | """ 34 | return len(self.beams) 35 | 36 | def add(self, hyp, sum_logprobs, length): 37 | """ 38 | Add a new hypothesis to the list. 39 | """ 40 | score = sum_logprobs / length ** self.length_penalty 41 | if len(self) < self.num_beams or score > self.worst_score: 42 | self.beams.append((score, hyp)) 43 | if len(self) > self.num_beams: 44 | sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) 45 | del self.beams[sorted_scores[0][1]] 46 | self.worst_score = sorted_scores[1][0] 47 | else: 48 | self.worst_score = min(score, self.worst_score) 49 | 50 | def is_done(self, best_sum_logprobs, cur_len): 51 | """ 52 | If there are enough hypotheses and that none of the hypotheses being generated 53 | can become better than the worst one in the heap, then we are done with this sentence. 54 | """ 55 | 56 | if len(self) < self.num_beams: 57 | return False 58 | elif self.early_stopping: 59 | return True 60 | else: 61 | cur_score = best_sum_logprobs / cur_len ** self.length_penalty 62 | ret = self.worst_score >= cur_score 63 | return ret 64 | 65 | -------------------------------------------------------------------------------- /megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /run_docker.sh: -------------------------------------------------------------------------------- 1 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -v $HOME:$HOME -it --rm nvcr.io/nvidia/pytorch:24.01-py3 -------------------------------------------------------------------------------- /scripts/data/download_c4.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | import os 3 | 4 | os.makedirs('./assets/data/c4', exist_ok=True) 5 | # English only 6 | for i in range(20): 7 | en = load_dataset("allenai/c4", data_files={'train': f'en/c4-train.{str(i).zfill(5)}-of-01024.json.gz'}, cache_dir='./assets/data/c4', split='train') 8 | print(len(en)) 9 | 10 | # save as json files 11 | en.to_json(f'./assets/data/c4/en/c4-train.{str(i).zfill(5)}-of-01024.json', orient='records', lines=True) -------------------------------------------------------------------------------- /scripts/data/prepare_c4_megatron_llama2.sh: -------------------------------------------------------------------------------- 1 | # for i from 00000 to 00100 2 | mkdir -p assets/data/c4_llama2_pretokenized 3 | for i in {00000..00019}; do 4 | echo "Processing ./assets/data/c4/en/c4-train.${i}-of-01024.json" 5 | python tools/preprocess_data.py \ 6 | --input "./assets/data/c4/en/c4-train.${i}-of-01024.json" \ 7 | --output-prefix assets/data/c4_llama2_pretokenized/c4_llama2_${i} \ 8 | --vocab-file ./assets/checkpoints/llama2_7b_hf/tokenizer.json \ 9 | --tokenizer-type Llama2Tokenizer \ 10 | --tokenizer-model ./assets/checkpoints/llama2_7b_hf/tokenizer.model \ 11 | --append-eod \ 12 | --workers 8 13 | done -------------------------------------------------------------------------------- /scripts/data/prepare_c4_megatron_llama3.1.sh: -------------------------------------------------------------------------------- 1 | # for i from 00000 to 00100 2 | mkdir -p assets/data/c4_llama3.1_pretokenized 3 | for i in {00000..00019}; do 4 | echo "Processing ./assets/data/c4/en/c4-train.${i}-of-01024.json" 5 | python tools/preprocess_data.py \ 6 | --input "./assets/data/c4/en/c4-train.${i}-of-01024.json" \ 7 | --output-prefix assets/data/c4_llama3.1_pretokenized/c4_llama3.1_${i} \ 8 | --vocab-file ./assets/checkpoints/llama3.1_8b_hf/tokenizer.json \ 9 | --tokenizer-type AutoTokenizer \ 10 | --tokenizer-model ./assets/checkpoints/llama3.1_8b_hf \ 11 | --append-eod \ 12 | --workers 8 13 | done -------------------------------------------------------------------------------- /scripts/data/prepare_c4_megatron_llama3.sh: -------------------------------------------------------------------------------- 1 | # for i from 00000 to 00100 2 | mkdir -p assets/data/c4_llama3_pretokenized 3 | for i in {00000..00019}; do 4 | echo "Processing ./assets/data/c4/en/c4-train.${i}-of-01024.json" 5 | python tools/preprocess_data.py \ 6 | --input "./assets/data/c4/en/c4-train.${i}-of-01024.json" \ 7 | --output-prefix assets/data/c4_llama3_pretokenized/c4_llama3_${i} \ 8 | --vocab-file ./assets/checkpoints/llama3_8b_hf/tokenizer.json \ 9 | --tokenizer-type AutoTokenizer \ 10 | --tokenizer-model ./assets/checkpoints/llama3_8b_hf \ 11 | --append-eod \ 12 | --workers 8 13 | done -------------------------------------------------------------------------------- /scripts/oneshot/run_llama2_13b_prune_tp8.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_ADDR="127.0.0.1" 4 | export MASTER_PORT="45530" # select the port 5 | NNODES=1 # number of nodes 6 | NPROC_PER_NODE=8 # number of gpus (processes) per node 7 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 8 | 9 | export NCCL_IB_TIMEOUT=19 10 | export NCCL_IB_SL=1 11 | export CUDA_DEVICE_MAX_CONNECTIONS=1 12 | 13 | SPARSEMETHOD=$1 # SparseGPT, Magnitude, Wanda 14 | EXTRA_CMD=$2 15 | 16 | export TASK='wikitext' 17 | export SPARSITY=0.5 18 | export PATTERN='nmprune' 19 | export EXCLUDE=0 20 | 21 | NSAMPLES=128 22 | BASE_NAME="llama2-13b-tp8" 23 | NAME="${BASE_NAME}.sparse.${PATTERN}.sp${SPARSITY}${SPARSEMETHOD}.ex${EXCLUDE}" 24 | 25 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 26 | PROJECT_DIR=$(pwd) 27 | LOG_DIR=$PROJECT_DIR/output/oneshot_pruning 28 | mkdir -p $LOG_DIR 29 | 30 | CHECKPOINT_LOAD_DIR="$PROJECT_DIR/assets/checkpoints/llama2_13b_megatron_tp8" 31 | CHECKPOINT_SAVE_DIR="$PROJECT_DIR/output/oneshot_pruning/checkpoint/${NAME}" 32 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama2_13b_hf/tokenizer.model" 33 | 34 | TASK_NAME="PRUNE-WIKITEXT2" 35 | options=" \ 36 | ${mag_options} \ 37 | --task ${TASK_NAME} \ 38 | --valid-data ${VALID_DATA} \ 39 | --use-flash-attn \ 40 | --untie-embeddings-and-output-weights \ 41 | --disable-bias-linear \ 42 | --no-position-embedding \ 43 | --no-masked-softmax-fusion \ 44 | --use-rotary-position-embeddings \ 45 | --swiglu \ 46 | --attention-dropout 0.0 \ 47 | --hidden-dropout 0.0 \ 48 | --tensor-model-parallel-size 8 \ 49 | --pipeline-model-parallel-size 1 \ 50 | --num-layers 40 \ 51 | --hidden-size 5120 \ 52 | --num-attention-heads 40 \ 53 | --seq-length 4096 \ 54 | --max-position-embeddings 4096 \ 55 | --micro-batch-size 1 \ 56 | --global-batch-size 256 \ 57 | --train-iters 1000 \ 58 | --log-interval 10 \ 59 | --overlapping-eval 4096 \ 60 | --eval-iters 10 \ 61 | --eval-interval 500 \ 62 | --tokenizer-type Llama2Tokenizer \ 63 | --tokenizer-model ${TOKENIZER_MODEL} \ 64 | --make-vocab-size-divisible-by 1 \ 65 | --ffn-hidden-size 13824 --normalization RMSNorm \ 66 | --split 99,1,0 \ 67 | --clip-grad 1.0 \ 68 | --weight-decay 0.1 \ 69 | --adam-beta1 0.9 \ 70 | --adam-beta2 0.95 \ 71 | --init-method-std 0.014 \ 72 | --exit-on-missing-checkpoint \ 73 | --no-load-optim \ 74 | --no-load-rng \ 75 | --bf16 \ 76 | --log-interval 100 \ 77 | --eval-iters 32 \ 78 | --eval-interval 2000 \ 79 | --data-path "None" \ 80 | --save-interval 20000 \ 81 | --save ${CHECKPOINT_SAVE_DIR} \ 82 | --load ${CHECKPOINT_LOAD_DIR} \ 83 | --hessian-compute \ 84 | --sparse-pattern ${PATTERN} \ 85 | --sparse-method ${SPARSEMETHOD} \ 86 | --sparsity ${SPARSITY} \ 87 | --row-b -1 \ 88 | --col-b 128 \ 89 | --prunen 2 \ 90 | --prunem 4 \ 91 | --hessian-samples $NSAMPLES \ 92 | --exclude-layers-from-prune ${EXCLUDE} ${EXTRA_CMD} " 93 | 94 | cd $PROJECT_DIR; export CUDA_DEVICE_MAX_CONNECTIONS=1; 95 | 96 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py ${options} -------------------------------------------------------------------------------- /scripts/oneshot/run_llama2_7b_prune_tp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_ADDR="127.0.0.1" 4 | export MASTER_PORT="45530" # select the port 5 | NNODES=1 # number of nodes 6 | NPROC_PER_NODE=8 # number of gpus (processes) 
per node 7 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 8 | 9 | export NCCL_IB_TIMEOUT=19 10 | export NCCL_IB_SL=1 11 | export CUDA_DEVICE_MAX_CONNECTIONS=1 12 | 13 | SPARSEMETHOD=$1 # SparseGPT, Magnitude, Wanda 14 | EXTRA_CMD=$2 15 | 16 | export TASK='wikitext' 17 | export SPARSITY=0.5 18 | export PATTERN='nmprune' 19 | export EXCLUDE=0 20 | 21 | NSAMPLES=128 22 | BASE_NAME="llama2-7b-tp8" 23 | NAME="${BASE_NAME}.sparse.${PATTERN}.sp${SPARSITY}${SPARSEMETHOD}.ex${EXCLUDE}" 24 | 25 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 26 | PROJECT_DIR=$(pwd) 27 | LOG_DIR=$PROJECT_DIR/output/oneshot_pruning 28 | mkdir -p $LOG_DIR 29 | 30 | CHECKPOINT_LOAD_DIR="$PROJECT_DIR/assets/checkpoints/llama2_7b_megatron_tp8" 31 | CHECKPOINT_SAVE_DIR="$PROJECT_DIR/output/oneshot_pruning/checkpoint/${NAME}" 32 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama2_7b_hf/tokenizer.model" 33 | 34 | TASK_NAME="PRUNE-WIKITEXT2" 35 | options=" \ 36 | ${mag_options} \ 37 | --task ${TASK_NAME} \ 38 | --valid-data ${VALID_DATA} \ 39 | --use-flash-attn \ 40 | --untie-embeddings-and-output-weights \ 41 | --disable-bias-linear \ 42 | --no-position-embedding \ 43 | --no-masked-softmax-fusion \ 44 | --use-rotary-position-embeddings \ 45 | --swiglu \ 46 | --attention-dropout 0.0 \ 47 | --hidden-dropout 0.0 \ 48 | --tensor-model-parallel-size 8 \ 49 | --pipeline-model-parallel-size 1 \ 50 | --num-layers 32 \ 51 | --hidden-size 4096 \ 52 | --num-attention-heads 32 \ 53 | --seq-length 4096 \ 54 | --max-position-embeddings 4096 \ 55 | --micro-batch-size 1 \ 56 | --global-batch-size 256 \ 57 | --train-iters 1000 \ 58 | --log-interval 10 \ 59 | --overlapping-eval 4096 \ 60 | --eval-iters 10 \ 61 | --eval-interval 500 \ 62 | --tokenizer-type Llama2Tokenizer \ 63 | --tokenizer-model ${TOKENIZER_MODEL} \ 64 | --make-vocab-size-divisible-by 1 \ 65 | --ffn-hidden-size 11008 --normalization RMSNorm \ 66 | --split 99,1,0 \ 67 | --clip-grad 1.0 \ 68 | --weight-decay 0.1 \ 69 | --num-query-groups 32 \ 70 | --group-query-attention \ 71 | --adam-beta1 0.9 \ 72 | --adam-beta2 0.95 \ 73 | --init-method-std 0.014 \ 74 | --exit-on-missing-checkpoint \ 75 | --no-load-optim \ 76 | --no-load-rng \ 77 | --bf16 \ 78 | --log-interval 100 \ 79 | --eval-iters 32 \ 80 | --eval-interval 2000 \ 81 | --data-path "None" \ 82 | --save-interval 20000 \ 83 | --save ${CHECKPOINT_SAVE_DIR} \ 84 | --load ${CHECKPOINT_LOAD_DIR} \ 85 | --hessian-compute \ 86 | --sparse-pattern ${PATTERN} \ 87 | --sparse-method ${SPARSEMETHOD} \ 88 | --sparsity ${SPARSITY} \ 89 | --row-b -1 \ 90 | --col-b 128 \ 91 | --prunen 2 \ 92 | --prunem 4 \ 93 | --hessian-samples $NSAMPLES \ 94 | --exclude-layers-from-prune ${EXCLUDE} ${EXTRA_CMD} " 95 | 96 | cd $PROJECT_DIR; export CUDA_DEVICE_MAX_CONNECTIONS=1; 97 | 98 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py $options -------------------------------------------------------------------------------- /scripts/oneshot/run_llama3.1_8b_prune_tp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_ADDR="127.0.0.1" 4 | export MASTER_PORT="45530" # select the port 5 | NNODES=1 # number of nodes 6 | NPROC_PER_NODE=8 # number of gpus (processes) per node 7 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 8 | 9 | export NCCL_IB_TIMEOUT=19 10 | export NCCL_IB_SL=1 11 | export CUDA_DEVICE_MAX_CONNECTIONS=1 12 | 13 | 
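# Positional arguments (same convention as the other oneshot scripts): $1 picks the one-shot
# pruning criterion (SparseGPT, Magnitude or Wanda) and $2, if given, is appended verbatim to
# the Megatron options string, e.g.:
#   bash scripts/oneshot/run_llama3.1_8b_prune_tp8.sh SparseGPT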
SPARSEMETHOD=$1 # SparseGPT, Magnitude, Wanda 14 | EXTRA_CMD=$2 15 | 16 | export TASK='wikitext' 17 | export SPARSITY=0.5 18 | export PATTERN='nmprune' 19 | export EXCLUDE=0 20 | 21 | NSAMPLES=128 22 | BASE_NAME="llama3.1-8b-tp8" 23 | NAME="${BASE_NAME}.sparse.${PATTERN}.sp${SPARSITY}${SPARSEMETHOD}.ex${EXCLUDE}" 24 | 25 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 26 | PROJECT_DIR=$(pwd) 27 | LOG_DIR=$PROJECT_DIR/output/oneshot_pruning 28 | mkdir -p $LOG_DIR 29 | 30 | CHECKPOINT_LOAD_DIR="$PROJECT_DIR/assets/checkpoints/llama3.1_8b_megatron_tp8" 31 | CHECKPOINT_SAVE_DIR="$PROJECT_DIR/output/oneshot_pruning/checkpoint/${NAME}" 32 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama3.1_8b_hf" 33 | 34 | TASK_NAME="PRUNE-WIKITEXT2" 35 | options=" \ 36 | ${mag_options} \ 37 | --task ${TASK_NAME} \ 38 | --valid-data ${VALID_DATA} \ 39 | --use-flash-attn \ 40 | --untie-embeddings-and-output-weights \ 41 | --disable-bias-linear \ 42 | --no-position-embedding \ 43 | --no-masked-softmax-fusion \ 44 | --use-rotary-position-embeddings \ 45 | --swiglu \ 46 | --rotary-base 500000 \ 47 | --attention-dropout 0.0 \ 48 | --hidden-dropout 0.0 \ 49 | --tensor-model-parallel-size 8 \ 50 | --pipeline-model-parallel-size 1 \ 51 | --num-layers 32 \ 52 | --hidden-size 4096 \ 53 | --num-attention-heads 32 \ 54 | --seq-length 4096 \ 55 | --max-position-embeddings 4096 \ 56 | --group-query-attention \ 57 | --num-query-groups 8 \ 58 | --micro-batch-size 1 \ 59 | --global-batch-size 256 \ 60 | --train-iters 1000 \ 61 | --log-interval 10 \ 62 | --overlapping-eval 4096 \ 63 | --eval-iters 10 \ 64 | --eval-interval 500 \ 65 | --tokenizer-type AutoTokenizer \ 66 | --tokenizer-model ${TOKENIZER_MODEL} \ 67 | --make-vocab-size-divisible-by 1 \ 68 | --ffn-hidden-size 14336 --normalization RMSNorm \ 69 | --split 99,1,0 \ 70 | --clip-grad 1.0 \ 71 | --weight-decay 0.1 \ 72 | --group-query-attention \ 73 | --adam-beta1 0.9 \ 74 | --adam-beta2 0.95 \ 75 | --init-method-std 0.014 \ 76 | --exit-on-missing-checkpoint \ 77 | --no-load-optim \ 78 | --no-load-rng \ 79 | --bf16 \ 80 | --log-interval 100 \ 81 | --eval-iters 32 \ 82 | --eval-interval 2000 \ 83 | --data-path "None" \ 84 | --save-interval 20000 \ 85 | --save ${CHECKPOINT_SAVE_DIR} \ 86 | --load ${CHECKPOINT_LOAD_DIR} \ 87 | --hessian-compute \ 88 | --sparse-pattern ${PATTERN} \ 89 | --sparse-method ${SPARSEMETHOD} \ 90 | --sparsity ${SPARSITY} \ 91 | --row-b -1 \ 92 | --col-b 128 \ 93 | --prunen 2 \ 94 | --prunem 4 \ 95 | --hessian-samples $NSAMPLES \ 96 | --exclude-layers-from-prune ${EXCLUDE} ${EXTRA_CMD} " 97 | 98 | cd $PROJECT_DIR; export CUDA_DEVICE_MAX_CONNECTIONS=1; 99 | 100 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py $options -------------------------------------------------------------------------------- /scripts/oneshot/run_llama3_8b_prune_tp8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export MASTER_ADDR="127.0.0.1" 4 | export MASTER_PORT="45530" # select the port 5 | NNODES=1 # number of nodes 6 | NPROC_PER_NODE=8 # number of gpus (processes) per node 7 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 8 | 9 | export NCCL_IB_TIMEOUT=19 10 | export NCCL_IB_SL=1 11 | export CUDA_DEVICE_MAX_CONNECTIONS=1 12 | 13 | SPARSEMETHOD=$1 # SparseGPT, Magnitude, Wanda 14 | EXTRA_CMD=$2 15 | 16 | export TASK='wikitext' 17 | export SPARSITY=0.5 18 | export PATTERN='nmprune' 19 | export 
EXCLUDE=0 20 | 21 | NSAMPLES=128 22 | BASE_NAME="llama3-8b-tp8" 23 | NAME="${BASE_NAME}.sparse.${PATTERN}.sp${SPARSITY}${SPARSEMETHOD}.ex${EXCLUDE}" 24 | 25 | DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` 26 | PROJECT_DIR=$(pwd) 27 | LOG_DIR=$PROJECT_DIR/output/oneshot_pruning 28 | mkdir -p $LOG_DIR 29 | 30 | CHECKPOINT_LOAD_DIR="$PROJECT_DIR/assets/checkpoints/llama3_8b_megatron_tp8" 31 | CHECKPOINT_SAVE_DIR="$PROJECT_DIR/output/oneshot_pruning/checkpoint/${NAME}" 32 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama3_8b_hf" 33 | 34 | TASK_NAME="PRUNE-WIKITEXT2" 35 | options=" \ 36 | ${mag_options} \ 37 | --task ${TASK_NAME} \ 38 | --valid-data ${VALID_DATA} \ 39 | --use-flash-attn \ 40 | --untie-embeddings-and-output-weights \ 41 | --disable-bias-linear \ 42 | --no-position-embedding \ 43 | --no-masked-softmax-fusion \ 44 | --use-rotary-position-embeddings \ 45 | --swiglu \ 46 | --rotary-base 500000 \ 47 | --attention-dropout 0.0 \ 48 | --hidden-dropout 0.0 \ 49 | --tensor-model-parallel-size 8 \ 50 | --pipeline-model-parallel-size 1 \ 51 | --num-layers 32 \ 52 | --hidden-size 4096 \ 53 | --num-attention-heads 32 \ 54 | --seq-length 4096 \ 55 | --max-position-embeddings 4096 \ 56 | --group-query-attention \ 57 | --num-query-groups 8 \ 58 | --micro-batch-size 1 \ 59 | --global-batch-size 256 \ 60 | --train-iters 1000 \ 61 | --log-interval 10 \ 62 | --overlapping-eval 4096 \ 63 | --eval-iters 10 \ 64 | --eval-interval 500 \ 65 | --tokenizer-type AutoTokenizer \ 66 | --tokenizer-model ${TOKENIZER_MODEL} \ 67 | --make-vocab-size-divisible-by 1 \ 68 | --ffn-hidden-size 14336 --normalization RMSNorm \ 69 | --split 99,1,0 \ 70 | --clip-grad 1.0 \ 71 | --weight-decay 0.1 \ 72 | --group-query-attention \ 73 | --adam-beta1 0.9 \ 74 | --adam-beta2 0.95 \ 75 | --init-method-std 0.014 \ 76 | --exit-on-missing-checkpoint \ 77 | --no-load-optim \ 78 | --no-load-rng \ 79 | --bf16 \ 80 | --log-interval 100 \ 81 | --eval-iters 32 \ 82 | --eval-interval 2000 \ 83 | --data-path "None" \ 84 | --save-interval 20000 \ 85 | --save ${CHECKPOINT_SAVE_DIR} \ 86 | --load ${CHECKPOINT_LOAD_DIR} \ 87 | --hessian-compute \ 88 | --sparse-pattern ${PATTERN} \ 89 | --sparse-method ${SPARSEMETHOD} \ 90 | --sparsity ${SPARSITY} \ 91 | --row-b -1 \ 92 | --col-b 128 \ 93 | --prunen 2 \ 94 | --prunem 4 \ 95 | --hessian-samples $NSAMPLES \ 96 | --exclude-layers-from-prune ${EXCLUDE} ${EXTRA_CMD} " 97 | 98 | cd $PROJECT_DIR; export CUDA_DEVICE_MAX_CONNECTIONS=1; 99 | 100 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py $options -------------------------------------------------------------------------------- /scripts/ppl/evaluate_llama2_wikitext2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # export NCCL_IB_SL=1 4 | export CUDA_DEVICE_MAX_CONNECTIONS=1 5 | LOAD=$1 # path to the model 6 | MODEL=$2 # 7b, 13b 7 | TP=$3 8 | MODE=$4 9 | 10 | echo $LOAD 11 | 12 | PROJECT_DIR=$(pwd) # change this to the path of your maskllm project 13 | OUTPUT="$PROJECT_DIR/output" 14 | 15 | # If model==2b 16 | if [ "$MODEL" == "7b" ]; then 17 | HIDDEN_SIZE=4096 # hidden size 18 | NUM_LAYERS=32 # number of layers 19 | NUM_ATTN_HEADS=32 # number of attention heads 20 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama2_7b_megatron_tp8/tokenizer.model" 21 | FFN_HIDDEN_SIZE=11008 22 | elif [ "$MODEL" == "13b" ]; then 23 | HIDDEN_SIZE=5120 # hidden size 24 | NUM_LAYERS=40 # number of layers 25 | 
NUM_ATTN_HEADS=40 # number of attention heads 26 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama2_13b_megatron_tp8/tokenizer.model" 27 | FFN_HIDDEN_SIZE=13824 28 | fi 29 | SEQ_LENGTH=4096 # sequence length 30 | 31 | if [ "$MODE" == "dense" ]; then 32 | MASK_OPTIONS=" " 33 | elif [ "$MODE" == "sparse" ]; then 34 | MASK_OPTIONS="--enable-sparsity " 35 | else 36 | MASK_OPTIONS=" " 37 | fi 38 | 39 | export CUDA_DEVICE_MAX_CONNECTIONS=1; 40 | 41 | OPTIONS=" \ 42 | --task WIKITEXT2 \ 43 | --use-flash-attn \ 44 | --untie-embeddings-and-output-weights \ 45 | --disable-bias-linear \ 46 | --no-position-embedding \ 47 | --no-masked-softmax-fusion \ 48 | --use-rotary-position-embeddings \ 49 | --swiglu \ 50 | --attention-dropout 0.0 \ 51 | --hidden-dropout 0.0 \ 52 | --tensor-model-parallel-size $TP \ 53 | --pipeline-model-parallel-size 1 \ 54 | --overlapping-eval $SEQ_LENGTH \ 55 | --num-layers $NUM_LAYERS \ 56 | --hidden-size $HIDDEN_SIZE \ 57 | --num-attention-heads $NUM_ATTN_HEADS \ 58 | --seq-length $SEQ_LENGTH \ 59 | --max-position-embeddings $SEQ_LENGTH \ 60 | --micro-batch-size 1 \ 61 | --global-batch-size 256 \ 62 | --train-iters 1 \ 63 | --lr-decay-iters 1 \ 64 | --lr 1.0e-4 \ 65 | --min-lr 1.0e-5 \ 66 | --lr-decay-style cosine \ 67 | --log-interval 100 \ 68 | --tokenizer-type Llama2Tokenizer \ 69 | --tokenizer-model ${TOKENIZER_MODEL} \ 70 | --make-vocab-size-divisible-by 1 \ 71 | --ffn-hidden-size $FFN_HIDDEN_SIZE --normalization RMSNorm \ 72 | --data-path None \ 73 | --bf16 \ 74 | --no-save-optim --no-save-rng \ 75 | --no-load-optim --no-load-rng \ 76 | --exit-on-missing-checkpoint \ 77 | --load ${LOAD} \ 78 | --hidden-dropout 0.0 --attention-dropout 0.0 \ 79 | $MASK_OPTIONS" 80 | 81 | cd $PROJECT_DIR; 82 | 83 | export MASTER_ADDR="127.0.0.1" 84 | export MASTER_PORT="45530" # select the port 85 | NNODES=1 # number of nodes 86 | NPROC_PER_NODE=${TP} # number of gpus (processes) per node 87 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 88 | 89 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py ${OPTIONS} 90 | 91 | -------------------------------------------------------------------------------- /scripts/ppl/evaluate_llama3.1_wikitext2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # export NCCL_IB_SL=1 4 | export CUDA_DEVICE_MAX_CONNECTIONS=1 5 | LOAD=$1 # path to the model 6 | MODEL=$2 # 7b, 13b 7 | TP=$3 8 | MODE=$4 9 | 10 | echo $LOAD 11 | 12 | PROJECT_DIR=$(pwd) # change this to the path of your maskllm project 13 | OUTPUT="$PROJECT_DIR/output" 14 | 15 | # If model==2b 16 | if [ "$MODEL" == "8b" ]; then 17 | HIDDEN_SIZE=4096 # hidden size 18 | NUM_LAYERS=32 # number of layers 19 | NUM_ATTN_HEADS=32 # number of attention heads 20 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama3.1_8b_hf" 21 | FFN_HIDDEN_SIZE=14336 22 | fi 23 | SEQ_LENGTH=4096 # sequence length 24 | 25 | if [ "$MODE" == "dense" ]; then 26 | MASK_OPTIONS=" " 27 | elif [ "$MODE" == "sparse" ]; then 28 | MASK_OPTIONS="--enable-sparsity " 29 | else 30 | MASK_OPTIONS=" " 31 | fi 32 | 33 | export CUDA_DEVICE_MAX_CONNECTIONS=1; 34 | 35 | OPTIONS=" \ 36 | --task WIKITEXT2 \ 37 | --use-flash-attn \ 38 | --untie-embeddings-and-output-weights \ 39 | --disable-bias-linear \ 40 | --no-position-embedding \ 41 | --no-masked-softmax-fusion \ 42 | --use-rotary-position-embeddings \ 43 | --rotary-base 500000 \ 44 | --swiglu \ 45 | 
--attention-dropout 0.0 \ 46 | --hidden-dropout 0.0 \ 47 | --tensor-model-parallel-size $TP \ 48 | --pipeline-model-parallel-size 1 \ 49 | --overlapping-eval $SEQ_LENGTH \ 50 | --num-layers $NUM_LAYERS \ 51 | --hidden-size $HIDDEN_SIZE \ 52 | --num-attention-heads $NUM_ATTN_HEADS \ 53 | --seq-length $SEQ_LENGTH \ 54 | --max-position-embeddings $SEQ_LENGTH \ 55 | --group-query-attention \ 56 | --num-query-groups 8 \ 57 | --micro-batch-size 1 \ 58 | --global-batch-size 256 \ 59 | --train-iters 1 \ 60 | --lr-decay-iters 1 \ 61 | --lr 1.0e-4 \ 62 | --min-lr 1.0e-5 \ 63 | --lr-decay-style cosine \ 64 | --log-interval 100 \ 65 | --tokenizer-type AutoTokenizer \ 66 | --tokenizer-model ${TOKENIZER_MODEL} \ 67 | --make-vocab-size-divisible-by 1 \ 68 | --ffn-hidden-size $FFN_HIDDEN_SIZE --normalization RMSNorm \ 69 | --data-path None \ 70 | --bf16 \ 71 | --no-save-optim --no-save-rng \ 72 | --no-load-optim --no-load-rng \ 73 | --exit-on-missing-checkpoint \ 74 | --load ${LOAD} \ 75 | --hidden-dropout 0.0 --attention-dropout 0.0 \ 76 | $MASK_OPTIONS" 77 | 78 | cd $PROJECT_DIR; 79 | 80 | export MASTER_ADDR="127.0.0.1" 81 | export MASTER_PORT="45530" # select the port 82 | NNODES=1 # number of nodes 83 | NPROC_PER_NODE=${TP} # number of gpus (processes) per node 84 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 85 | 86 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py ${OPTIONS} 87 | 88 | -------------------------------------------------------------------------------- /scripts/ppl/evaluate_llama3_wikitext2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # export NCCL_IB_SL=1 4 | export CUDA_DEVICE_MAX_CONNECTIONS=1 5 | LOAD=$1 # path to the model 6 | MODEL=$2 # 7b, 13b 7 | TP=$3 8 | MODE=$4 9 | 10 | echo $LOAD 11 | 12 | PROJECT_DIR=$(pwd) # change this to the path of your maskllm project 13 | OUTPUT="$PROJECT_DIR/output" 14 | 15 | # If model==2b 16 | if [ "$MODEL" == "8b" ]; then 17 | HIDDEN_SIZE=4096 # hidden size 18 | NUM_LAYERS=32 # number of layers 19 | NUM_ATTN_HEADS=32 # number of attention heads 20 | TOKENIZER_MODEL="$PROJECT_DIR/assets/checkpoints/llama3_8b_hf" 21 | FFN_HIDDEN_SIZE=14336 22 | fi 23 | SEQ_LENGTH=4096 # sequence length 24 | 25 | if [ "$MODE" == "dense" ]; then 26 | MASK_OPTIONS=" " 27 | elif [ "$MODE" == "sparse" ]; then 28 | MASK_OPTIONS="--enable-sparsity " 29 | else 30 | MASK_OPTIONS=" " 31 | fi 32 | 33 | export CUDA_DEVICE_MAX_CONNECTIONS=1; 34 | 35 | OPTIONS=" \ 36 | --task WIKITEXT2 \ 37 | --use-flash-attn \ 38 | --untie-embeddings-and-output-weights \ 39 | --disable-bias-linear \ 40 | --no-position-embedding \ 41 | --no-masked-softmax-fusion \ 42 | --use-rotary-position-embeddings \ 43 | --rotary-base 500000 \ 44 | --swiglu \ 45 | --attention-dropout 0.0 \ 46 | --hidden-dropout 0.0 \ 47 | --tensor-model-parallel-size $TP \ 48 | --pipeline-model-parallel-size 1 \ 49 | --overlapping-eval $SEQ_LENGTH \ 50 | --num-layers $NUM_LAYERS \ 51 | --hidden-size $HIDDEN_SIZE \ 52 | --num-attention-heads $NUM_ATTN_HEADS \ 53 | --seq-length $SEQ_LENGTH \ 54 | --max-position-embeddings $SEQ_LENGTH \ 55 | --group-query-attention \ 56 | --num-query-groups 8 \ 57 | --micro-batch-size 1 \ 58 | --global-batch-size 256 \ 59 | --train-iters 1 \ 60 | --lr-decay-iters 1 \ 61 | --lr 1.0e-4 \ 62 | --min-lr 1.0e-5 \ 63 | --lr-decay-style cosine \ 64 | --log-interval 100 \ 65 | --tokenizer-type AutoTokenizer \ 66 | 
--tokenizer-model ${TOKENIZER_MODEL} \ 67 | --make-vocab-size-divisible-by 1 \ 68 | --ffn-hidden-size $FFN_HIDDEN_SIZE --normalization RMSNorm \ 69 | --data-path None \ 70 | --bf16 \ 71 | --no-save-optim --no-save-rng \ 72 | --no-load-optim --no-load-rng \ 73 | --exit-on-missing-checkpoint \ 74 | --load ${LOAD} \ 75 | --hidden-dropout 0.0 --attention-dropout 0.0 \ 76 | $MASK_OPTIONS" 77 | 78 | cd $PROJECT_DIR; 79 | 80 | export MASTER_ADDR="127.0.0.1" 81 | export MASTER_PORT="45530" # select the port 82 | NNODES=1 # number of nodes 83 | NPROC_PER_NODE=${TP} # number of gpus (processes) per node 84 | export WORLD_SIZE=$(($NNODES * $NPROC_PER_NODE)) # number of gpus we have in total 85 | 86 | torchrun --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODES --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT tasks/main.py ${OPTIONS} 87 | 88 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama2_13b_hf_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROJECT_DIR=$(pwd) 4 | 5 | TP=8 6 | HF_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama2_13b_hf 7 | MEGATRON_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama2_13b_megatron_tp$TP 8 | TOKENIZER_MODEL=$HF_FORMAT_DIR/tokenizer.model 9 | 10 | OPTIONS=" \ 11 | --model-type GPT \ 12 | --loader llama2_hf \ 13 | --saver megatron \ 14 | --target-tensor-parallel-size ${TP} \ 15 | --load-dir ${HF_FORMAT_DIR} \ 16 | --save-dir ${MEGATRON_FORMAT_DIR} \ 17 | --tokenizer-model ${TOKENIZER_MODEL}" 18 | 19 | cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 20 | cp -r $TOKENIZER_MODEL $MEGATRON_FORMAT_DIR -------------------------------------------------------------------------------- /scripts/tools/convert_llama2_7b_hf_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=8 5 | HF_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama2_7b_hf 6 | MEGATRON_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama2_7b_megatron_tp$TP 7 | TOKENIZER_MODEL=$HF_FORMAT_DIR/tokenizer.model 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader llama2_hf \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${HF_FORMAT_DIR} \ 15 | --save-dir ${MEGATRON_FORMAT_DIR} \ 16 | --tokenizer-model ${TOKENIZER_MODEL}" 17 | 18 | echo $HF_FORMAT_DIR 19 | echo $MEGATRON_FORMAT_DIR 20 | echo $TOKENIZER_MODEL 21 | 22 | cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 23 | cp -r $TOKENIZER_MODEL $MEGATRON_FORMAT_DIR 24 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama2_7b_tp8_to_tp1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=1 5 | TP8_DIR=$PROJECT_DIR/output/checkpoints/llama2-7b-tp8-mask-only-c4-singlenode/train_iters_2000/ckpt 6 | TP1_DIR=$PROJECT_DIR/output/checkpoints/llama2-7b-tp1-mask-only-c4-singlenode/train_iters_2000/ckpt 7 | TOKENIZER_MODEL=assets/checkpoints/llama2_7b_hf/tokenizer.model 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader megatron \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${TP8_DIR} \ 15 | --save-dir ${TP1_DIR} \ 16 | --megatron-path ${PROJECT_DIR}" 17 | 18 | echo $TP8_DIR 19 | echo $TP1_DIR 20 | 21 | pip install transformers wandb accelerate tqdm; cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 22 | 
-------------------------------------------------------------------------------- /scripts/tools/convert_llama3.1_8b_hf_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=8 5 | HF_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama3.1_8b_hf 6 | MEGATRON_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama3.1_8b_megatron_tp$TP 7 | TOKENIZER_MODEL=$HF_FORMAT_DIR/tokenizer.json 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader llama2_hf \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${HF_FORMAT_DIR} \ 15 | --save-dir ${MEGATRON_FORMAT_DIR} \ 16 | --tokenizer-model ${TOKENIZER_MODEL}" 17 | 18 | echo $HF_FORMAT_DIR 19 | echo $MEGATRON_FORMAT_DIR 20 | echo $TOKENIZER_MODEL 21 | 22 | cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 23 | cp -r $HF_FORMAT_DIR/*token* $MEGATRON_FORMAT_DIR 24 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama3.1_8b_tp8_to_tp1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=1 5 | TP8_DIR=$PROJECT_DIR/output/checkpoints/llama3.1-8b-tp8-mask-only-c4-singlenode/train_iters_2000/ckpt 6 | TP1_DIR=$PROJECT_DIR/output/checkpoints/llama3.1-8b-tp1-mask-only-c4-singlenode/train_iters_2000/ckpt 7 | TOKENIZER_MODEL=assets/checkpoints/llama3.1_8b_hf/tokenizer.model 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader megatron \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${TP8_DIR} \ 15 | --save-dir ${TP1_DIR} \ 16 | --megatron-path ${PROJECT_DIR}" 17 | 18 | echo $TP8_DIR 19 | echo $TP1_DIR 20 | 21 | pip install transformers wandb accelerate tqdm; cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 22 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama3_8b_hf_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=8 5 | HF_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama3_8b_hf 6 | MEGATRON_FORMAT_DIR=$PROJECT_DIR/assets/checkpoints/llama3_8b_megatron_tp$TP 7 | TOKENIZER_MODEL=$HF_FORMAT_DIR/tokenizer.json 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader llama2_hf \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${HF_FORMAT_DIR} \ 15 | --save-dir ${MEGATRON_FORMAT_DIR} \ 16 | --tokenizer-model ${TOKENIZER_MODEL}" 17 | 18 | echo $HF_FORMAT_DIR 19 | echo $MEGATRON_FORMAT_DIR 20 | echo $TOKENIZER_MODEL 21 | 22 | cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 23 | cp -r $HF_FORMAT_DIR/tokenizer* $MEGATRON_FORMAT_DIR 24 | -------------------------------------------------------------------------------- /scripts/tools/convert_llama3_8b_tp8_to_tp1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_DIR=$(pwd) 3 | 4 | TP=1 5 | TP8_DIR=$PROJECT_DIR/output/checkpoints/llama3-8b-tp8-mask-only-c4-singlenode/train_iters_2000/ckpt 6 | TP1_DIR=$PROJECT_DIR/output/checkpoints/llama3-8b-tp1-mask-only-c4-singlenode/train_iters_2000/ckpt 7 | TOKENIZER_MODEL=assets/checkpoints/llama3_8b_hf/tokenizer.model 8 | 9 | OPTIONS=" \ 10 | --model-type GPT \ 11 | --loader megatron \ 12 | --saver megatron \ 13 | --target-tensor-parallel-size ${TP} \ 14 | --load-dir ${TP8_DIR} \ 15 | --save-dir ${TP1_DIR} \ 16 | 
--megatron-path ${PROJECT_DIR}" 17 | 18 | echo $TP8_DIR 19 | echo $TP1_DIR 20 | 21 | pip install transformers wandb accelerate tqdm; cd $PROJECT_DIR/tools/checkpoint; python util.py $OPTIONS 22 | -------------------------------------------------------------------------------- /scripts/tools/download_llama2_13b_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM 3 | import os 4 | 5 | HF_TOKEN = os.environ.get("HF_TOKEN") 6 | os.makedirs(f"./assets/cache", exist_ok=True) 7 | 8 | def save_llama_model(model_id, save_directory="assets/checkpoints/llama"): 9 | # Load model and tokenizer 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_id, 12 | torch_dtype=torch.float16, 13 | cache_dir=f"./assets/cache", 14 | use_auth_token=HF_TOKEN, 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN) 17 | # Save model and tokenizer to the specified directory 18 | model.save_pretrained(save_directory) 19 | tokenizer.save_pretrained(save_directory) 20 | print(f"Model and tokenizer saved to {save_directory}") 21 | 22 | # Replace 'LLaMA-2-model-id' with the actual model ID from Hugging Face's model hub 23 | dense_dir = "assets/checkpoints" 24 | os.makedirs(dense_dir, exist_ok=True) 25 | model_id = "meta-llama/Llama-2-13b-hf" 26 | save_directory = f"{dense_dir}/llama2_13b_hf" 27 | save_llama_model(model_id, save_directory) -------------------------------------------------------------------------------- /scripts/tools/download_llama2_7b_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM 3 | import os 4 | 5 | HF_TOKEN = os.environ.get("HF_TOKEN") 6 | os.makedirs(f"./assets/cache", exist_ok=True) 7 | 8 | def save_llama_model(model_id, save_directory="assets/checkpoints/llama"): 9 | # Load model and tokenizer 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_id, 12 | torch_dtype=torch.float16, 13 | cache_dir=f"./assets/cache", 14 | use_auth_token=HF_TOKEN, 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN) 17 | # Save model and tokenizer to the specified directory 18 | model.save_pretrained(save_directory) 19 | tokenizer.save_pretrained(save_directory) 20 | print(f"Model and tokenizer saved to {save_directory}") 21 | 22 | # Replace 'LLaMA-2-model-id' with the actual model ID from Hugging Face's model hub 23 | dense_dir = "assets/checkpoints" 24 | os.makedirs(dense_dir, exist_ok=True) 25 | model_id = "meta-llama/Llama-2-7b-hf" 26 | save_directory = f"{dense_dir}/llama2_7b_hf" 27 | save_llama_model(model_id, save_directory) -------------------------------------------------------------------------------- /scripts/tools/download_llama3.1_8b_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM 3 | import os 4 | 5 | HF_TOKEN = os.environ.get("HF_TOKEN") 6 | os.makedirs(f"./assets/cache", exist_ok=True) 7 | 8 | def save_llama_model(model_id, save_directory="assets/checkpoints/llama"): 9 | # Load model and tokenizer 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_id, 12 | torch_dtype=torch.float16, 13 | cache_dir=f"./assets/cache", 14 | use_auth_token=HF_TOKEN, 15 | ) 16 | tokenizer = 
AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN) 17 | # Save model and tokenizer to the specified directory 18 | model.save_pretrained(save_directory) 19 | tokenizer.save_pretrained(save_directory) 20 | print(f"Model and tokenizer saved to {save_directory}") 21 | 22 | # Replace 'LLaMA-2-model-id' with the actual model ID from Hugging Face's model hub 23 | dense_dir = "assets/checkpoints" 24 | os.makedirs(dense_dir, exist_ok=True) 25 | model_id = "meta-llama/Llama-3.1-8B" 26 | save_directory = f"{dense_dir}/llama3.1_8b_hf" 27 | save_llama_model(model_id, save_directory) -------------------------------------------------------------------------------- /scripts/tools/download_llama3_8b_hf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM 3 | import os 4 | 5 | HF_TOKEN = os.environ.get("HF_TOKEN") 6 | os.makedirs(f"./assets/cache", exist_ok=True) 7 | 8 | def save_llama_model(model_id, save_directory="assets/checkpoints/llama"): 9 | # Load model and tokenizer 10 | model = AutoModelForCausalLM.from_pretrained( 11 | model_id, 12 | torch_dtype=torch.float16, 13 | cache_dir=f"./assets/cache", 14 | use_auth_token=HF_TOKEN, 15 | ) 16 | tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=HF_TOKEN) 17 | # Save model and tokenizer to the specified directory 18 | model.save_pretrained(save_directory) 19 | tokenizer.save_pretrained(save_directory) 20 | print(f"Model and tokenizer saved to {save_directory}") 21 | 22 | # Replace 'LLaMA-2-model-id' with the actual model ID from Hugging Face's model hub 23 | dense_dir = "assets/checkpoints" 24 | os.makedirs(dense_dir, exist_ok=True) 25 | model_id = "meta-llama/Meta-Llama-3-8B" 26 | save_directory = f"{dense_dir}/llama3_8b_hf" 27 | save_llama_model(model_id, save_directory) -------------------------------------------------------------------------------- /tasks/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """ Tasks data utility.""" 4 | 5 | import re 6 | import numpy as np 7 | 8 | 9 | def clean_text(text): 10 | """Remove new lines and multiple spaces and adjust end of sentence dot.""" 11 | 12 | text = text.replace("\n", " ") 13 | text = re.sub(r'\s+', ' ', text) 14 | for _ in range(3): 15 | text = text.replace(' . ', '. 
') 16 | 17 | return text 18 | 19 | 20 | def build_sample(ids, types, paddings, label, unique_id): 21 | """Convert to numpy and return a sample consumed by the batch producer.""" 22 | 23 | ids_np = np.array(ids, dtype=np.int64) 24 | types_np = np.array(types, dtype=np.int64) 25 | paddings_np = np.array(paddings, dtype=np.int64) 26 | sample = ({'text': ids_np, 27 | 'types': types_np, 28 | 'padding_mask': paddings_np, 29 | 'label': int(label), 30 | 'uid': int(unique_id)}) 31 | 32 | return sample 33 | 34 | 35 | def build_tokens_types_paddings_from_text(text_a, text_b, 36 | tokenizer, max_seq_length): 37 | """Build token types and paddings, trim if needed, and pad if needed.""" 38 | 39 | text_a_ids = tokenizer.tokenize(text_a) 40 | text_b_ids = None 41 | if text_b is not None: 42 | text_b_ids = tokenizer.tokenize(text_b) 43 | 44 | return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, 45 | max_seq_length, tokenizer.cls, 46 | tokenizer.sep, tokenizer.pad) 47 | 48 | 49 | def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length, 50 | cls_id, sep_id, pad_id): 51 | """Build token types and paddings, trim if needed, and pad if needed.""" 52 | 53 | ids = [] 54 | types = [] 55 | paddings = [] 56 | 57 | # [CLS]. 58 | ids.append(cls_id) 59 | types.append(0) 60 | paddings.append(1) 61 | 62 | # A. 63 | len_text_a = len(text_a_ids) 64 | ids.extend(text_a_ids) 65 | types.extend([0] * len_text_a) 66 | paddings.extend([1] * len_text_a) 67 | 68 | # [SEP]. 69 | ids.append(sep_id) 70 | types.append(0) 71 | paddings.append(1) 72 | 73 | # B. 74 | if text_b_ids is not None: 75 | len_text_b = len(text_b_ids) 76 | ids.extend(text_b_ids) 77 | types.extend([1] * len_text_b) 78 | paddings.extend([1] * len_text_b) 79 | 80 | # Cap the size. 81 | trimmed = False 82 | if len(ids) >= max_seq_length: 83 | max_seq_length_m1 = max_seq_length - 1 84 | ids = ids[0:max_seq_length_m1] 85 | types = types[0:max_seq_length_m1] 86 | paddings = paddings[0:max_seq_length_m1] 87 | trimmed = True 88 | 89 | # [SEP]. 90 | if (text_b_ids is not None) or trimmed: 91 | ids.append(sep_id) 92 | if text_b_ids is None: 93 | types.append(0) 94 | else: 95 | types.append(1) 96 | paddings.append(1) 97 | 98 | # Padding. 99 | padding_length = max_seq_length - len(ids) 100 | if padding_length > 0: 101 | ids.extend([pad_id] * padding_length) 102 | types.extend([pad_id] * padding_length) 103 | paddings.extend([0] * padding_length) 104 | 105 | return ids, types, paddings 106 | -------------------------------------------------------------------------------- /tasks/glue/data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE dataset.""" 4 | 5 | from abc import ABC 6 | from abc import abstractmethod 7 | 8 | from torch.utils.data import Dataset 9 | 10 | from megatron import print_rank_0 11 | from tasks.data_utils import build_sample 12 | from tasks.data_utils import build_tokens_types_paddings_from_text 13 | 14 | 15 | class GLUEAbstractDataset(ABC, Dataset): 16 | """GLUE base dataset class.""" 17 | 18 | def __init__(self, task_name, dataset_name, datapaths, 19 | tokenizer, max_seq_length): 20 | # Store inputs. 21 | self.task_name = task_name 22 | self.dataset_name = dataset_name 23 | self.tokenizer = tokenizer 24 | self.max_seq_length = max_seq_length 25 | print_rank_0(' > building {} dataset for {}:'.format(self.task_name, 26 | self.dataset_name)) 27 | # Process the files. 
28 | string = ' > paths:' 29 | for path in datapaths: 30 | string += ' ' + path 31 | print_rank_0(string) 32 | self.samples = [] 33 | for datapath in datapaths: 34 | self.samples.extend(self.process_samples_from_single_path(datapath)) 35 | print_rank_0(' >> total number of samples: {}'.format( 36 | len(self.samples))) 37 | 38 | def __len__(self): 39 | return len(self.samples) 40 | 41 | def __getitem__(self, idx): 42 | raw_sample = self.samples[idx] 43 | ids, types, paddings = build_tokens_types_paddings_from_text( 44 | raw_sample['text_a'], raw_sample['text_b'], 45 | self.tokenizer, self.max_seq_length) 46 | sample = build_sample(ids, types, paddings, 47 | raw_sample['label'], raw_sample['uid']) 48 | return sample 49 | 50 | @abstractmethod 51 | def process_samples_from_single_path(self, datapath): 52 | """Abstract method that takes a single path / filename and 53 | returns a list of dataset samples, each sample being a dict of 54 | {'text_a': string, 'text_b': string, 'label': int, 'uid': int} 55 | """ 56 | pass 57 | -------------------------------------------------------------------------------- /tasks/glue/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """GLUE finetuning/evaluation.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.classification import Classification 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from megatron.arguments import core_transformer_config_from_args 12 | 13 | 14 | def glue_classification(num_classes, Dataset, 15 | name_from_datapath_func): 16 | 17 | def train_valid_datasets_provider(): 18 | """Build train and validation dataset.""" 19 | args = get_args() 20 | tokenizer = get_tokenizer() 21 | 22 | train_dataset = Dataset('training', args.train_data, 23 | tokenizer, args.seq_length) 24 | valid_dataset = Dataset('validation', args.valid_data, 25 | tokenizer, args.seq_length) 26 | 27 | return train_dataset, valid_dataset 28 | 29 | def model_provider(pre_process=True, post_process=True): 30 | """Build the model.""" 31 | args = get_args() 32 | config = core_transformer_config_from_args() 33 | 34 | print_rank_0('building classification model for {} ...'.format( 35 | args.task)) 36 | model = Classification(config=config, num_classes=num_classes, num_tokentypes=2, 37 | pre_process=pre_process, post_process=post_process) 38 | 39 | return model 40 | 41 | def metrics_func_provider(): 42 | """Privde metrics callback function.""" 43 | def single_dataset_provider(datapath): 44 | args = get_args() 45 | tokenizer = get_tokenizer() 46 | 47 | name = name_from_datapath_func(datapath) 48 | return Dataset(name, [datapath], tokenizer, args.seq_length) 49 | return accuracy_func_provider(single_dataset_provider) 50 | 51 | """Finetune/evaluate.""" 52 | finetune(train_valid_datasets_provider, model_provider, 53 | end_of_epoch_callback_provider=metrics_func_provider) 54 | 55 | 56 | def main(): 57 | args = get_args() 58 | 59 | if args.task == 'MNLI': 60 | 61 | num_classes = 3 62 | from tasks.glue.mnli import MNLIDataset as Dataset 63 | 64 | def name_from_datapath(datapath): 65 | return datapath.split('MNLI')[-1].strip( 66 | '.tsv').strip('/').replace('_', '-') 67 | 68 | elif args.task == 'QQP': 69 | 70 | num_classes = 2 71 | from tasks.glue.qqp import QQPDataset as Dataset 72 | 73 | def name_from_datapath(datapath): 
74 | return datapath.split('QQP')[-1].strip( 75 | '.tsv').strip('/').replace('_', '-') 76 | 77 | else: 78 | raise NotImplementedError('GLUE task {} is not implemented.'.format( 79 | args.task)) 80 | 81 | glue_classification(num_classes, Dataset, name_from_datapath) 82 | -------------------------------------------------------------------------------- /tasks/glue/mnli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """MNLI dataset.""" 4 | 5 | from megatron import print_rank_0 6 | from tasks.data_utils import clean_text 7 | from .data import GLUEAbstractDataset 8 | 9 | 10 | LABELS = {'contradiction': 0, 'entailment': 1, 'neutral': 2} 11 | 12 | 13 | class MNLIDataset(GLUEAbstractDataset): 14 | 15 | def __init__(self, name, datapaths, tokenizer, max_seq_length, 16 | test_label='contradiction'): 17 | self.test_label = test_label 18 | super().__init__('MNLI', name, datapaths, 19 | tokenizer, max_seq_length) 20 | 21 | def process_samples_from_single_path(self, filename): 22 | """"Implement abstract method.""" 23 | print_rank_0(' > Processing {} ...'.format(filename)) 24 | 25 | samples = [] 26 | total = 0 27 | first = True 28 | is_test = False 29 | with open(filename, 'r') as f: 30 | for line in f: 31 | row = line.strip().split('\t') 32 | if first: 33 | first = False 34 | if len(row) == 10: 35 | is_test = True 36 | print_rank_0( 37 | ' reading {}, {} and {} columns and setting ' 38 | 'labels to {}'.format( 39 | row[0].strip(), row[8].strip(), 40 | row[9].strip(), self.test_label)) 41 | else: 42 | print_rank_0(' reading {} , {}, {}, and {} columns ' 43 | '...'.format( 44 | row[0].strip(), row[8].strip(), 45 | row[9].strip(), row[-1].strip())) 46 | continue 47 | 48 | text_a = clean_text(row[8].strip()) 49 | text_b = clean_text(row[9].strip()) 50 | unique_id = int(row[0].strip()) 51 | label = row[-1].strip() 52 | if is_test: 53 | label = self.test_label 54 | 55 | assert len(text_a) > 0 56 | assert len(text_b) > 0 57 | assert label in LABELS 58 | assert unique_id >= 0 59 | 60 | sample = {'text_a': text_a, 61 | 'text_b': text_b, 62 | 'label': LABELS[label], 63 | 'uid': unique_id} 64 | total += 1 65 | samples.append(sample) 66 | 67 | if total % 50000 == 0: 68 | print_rank_0(' > processed {} so far ...'.format(total)) 69 | 70 | print_rank_0(' >> processed {} samples.'.format(len(samples))) 71 | return samples 72 | -------------------------------------------------------------------------------- /tasks/latency/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
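# Worked example (illustrative) for wikitext_detokenizer() defined below:
#   input : "the robot @-@ like cat , however , was fast"
#   output: "the robot-like cat, however, was fast"
# These helpers undo the extra spaces and "@" separators found in the
# tokenized PTB / WikiText / LAMBADA releases before the text is scored.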
2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? ") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tasks/msdp/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Multi-Stage Prompting for Knowledgeable Dialogue Generation 3 | 4 | Below we present the steps to run our multi-stage dialogue prompting (MSDP) framework. 5 | 6 | ## Multi-Stage Dialogue Prompting 7 | 8 | ### Data Preparation 9 | 1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/) 10 | 2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets. 11 | 12 | ### Stage-1: Prompting for Knowledge Generation 13 | 1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation. 14 | 2. We provide the [`evaluation script`](../../examples/msdp/eval_knwl_generation.sh) for the automatic evaluation (i.e., F1, BLEU, METEOR, and ROUGE-L) of the knowledge generation. 15 | 16 | ### Stage-2: Prompting for Response Generation 17 | 1. We provide the script to [`prepare the input file`](../../examples/msdp/prep_resp_gen.sh) for the response generation (based on the previously generated knowledge file). 18 | 2. We provide the script to perform the [`second-stage prompting`](../../examples/msdp/prompt_resp_gen.sh) for the response generation. 19 | 3. 
We provide the [`evaluation script`](../../examples/msdp/eval_resp_generation.sh) for the automatic evaluation (i.e., F1, KF1, BLEU, METEOR, and ROUGE-L) of the response generation. 20 | -------------------------------------------------------------------------------- /tasks/msdp/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Model evaluation""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from tasks.msdp.metrics import F1Metric 8 | from tqdm import tqdm 9 | 10 | 11 | def evaluate_f1(guess_file, answer_file): 12 | """Evaluating F1 Score""" 13 | 14 | guess_list = [] 15 | print_rank_0('reading %s' % guess_file) 16 | with open(guess_file, "r") as f: 17 | for i, line in enumerate(tqdm(f)): 18 | line = line.strip() 19 | if "<|endoftext|>" in line: 20 | line = line.replace("<|endoftext|>", "") 21 | guess_list.append(line) 22 | 23 | answer_list = [] 24 | print_rank_0('reading %s' % answer_file) 25 | with open(answer_file, "r") as f: 26 | for i, line in enumerate(tqdm(f)): 27 | line = line.strip() 28 | if line == "no_passages_used": 29 | line = "" 30 | answer_list.append(line) 31 | 32 | assert len(guess_list) == len(answer_list), \ 33 | "lengths of guess and answer are different!" 34 | 35 | precision, recall, f1 = F1Metric.compute_all_pairs(guess_list, answer_list) 36 | print_rank_0('Precision: %.4f; recall: %.4f; f1: %.4f' % (precision, recall, f1)) 37 | 38 | print_rank_0('done :-)') 39 | 40 | 41 | def main(): 42 | args = get_args() 43 | 44 | evaluate_f1(args.guess_file, args.answer_file) 45 | 46 | -------------------------------------------------------------------------------- /tasks/msdp/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
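# Illustrative invocation of the F1 evaluation task (hypothetical paths; the
# usual Megatron model/tokenizer arguments are omitted for brevity, although
# initialize_megatron() below still requires them):
#   torchrun --nproc_per_node=1 tasks/msdp/main.py \
#       --task MSDP-EVAL-F1 \
#       --guess-file output/generated_responses.txt \
#       --answer-file data/golden_responses.txt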
2 | 3 | """Run multi-stage dialogue prompting (MSDP).""" 4 | 5 | import os 6 | import sys 7 | sys.path.append(os.path.abspath(os.path.join( 8 | os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir))) 9 | from megatron import get_args 10 | from megatron.initialize import initialize_megatron 11 | 12 | 13 | def get_tasks_args(parser): 14 | """Provide extra arguments required for tasks.""" 15 | group = parser.add_argument_group(title='tasks') 16 | 17 | # parameters for the knowledgeable dialogue generation 18 | group.add_argument('--task', type=str, required=True, 19 | help='Task name.') 20 | group.add_argument("--sample-input-file", type=str, default=None, 21 | help='Get input from file instead of interactive mode, ' 22 | 'each line is an input.') 23 | group.add_argument("--sample-output-file", type=str, default=None, 24 | help='Output file got from --sample-input-file') 25 | group.add_argument('--prompt-file', type=str, default=None, 26 | help='prompting file') 27 | group.add_argument('--prompt-type', type=str, default=None, 28 | choices=['knowledge', 'response'], 29 | help='prompt type (knowledge or response)') 30 | group.add_argument('--num-prompt-examples', type=int, default=10, 31 | help='number of prompt examples') 32 | group.add_argument('--guess-file', type=str, default=None, 33 | help='datapath for generated sentences') 34 | group.add_argument('--answer-file', type=str, default=None, 35 | help='datapath for golden sentences') 36 | group.add_argument('--out-seq-length', type=int, default=100, 37 | help='output sequence length') 38 | group.add_argument('--api-prompt', default=False, action="store_true", 39 | help='setup model api for prompting') 40 | group.add_argument('--megatron-api-url', type=str, default=None, 41 | help='url of the megatron api') 42 | 43 | return parser 44 | 45 | 46 | if __name__ == '__main__': 47 | 48 | initialize_megatron(extra_args_provider=get_tasks_args) 49 | 50 | args = get_args() 51 | 52 | if args.num_layers_per_virtual_pipeline_stage is not None: 53 | print("Interleaved pipeline schedule is not yet supported for downstream tasks.") 54 | exit() 55 | 56 | if args.task == 'MSDP-PROMPT': 57 | from tasks.msdp.prompt import main 58 | 59 | elif args.task == 'MSDP-EVAL-F1': 60 | from tasks.msdp.evaluate import main 61 | 62 | else: 63 | raise NotImplementedError('Task {} is not implemented.'.format( 64 | args.task)) 65 | 66 | main() 67 | -------------------------------------------------------------------------------- /tasks/msdp/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | # The following code is adapted from 3 | # https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 4 | # which is licensed under the MIT license. More details on the license can be 5 | # found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. 6 | 7 | """Provides standard metric evaluations for dialog.""" 8 | 9 | from collections import Counter 10 | from typing import List 11 | import numpy as np 12 | import re 13 | 14 | re_art = re.compile(r'\b(a|an|the)\b') 15 | re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') 16 | 17 | 18 | def normalize_answer(s): 19 | """ 20 | Lower text and remove punctuation, articles and extra whitespace. 21 | """ 22 | s = s.lower() 23 | s = re_punc.sub(' ', s) 24 | s = re_art.sub(' ', s) 25 | s = ' '.join(s.split()) 26 | return s 27 | 28 | 29 | class F1Metric: 30 | """ 31 | Helper class which computes token-level F1. 
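    Worked example (illustrative): with guess "the cat sat" and answer
    "a cat sat down", normalize_answer() drops the articles, giving token
    bags {cat, sat} and {cat, sat, down}; precision = 2/2 = 1.0,
    recall = 2/3, so F1 = 2 * 1.0 * (2/3) / (1.0 + 2/3) = 0.8.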
32 | """ 33 | 34 | @staticmethod 35 | def _prec_recall_f1_score(pred_items, gold_items): 36 | """ 37 | Compute precision, recall and f1 given a set of gold and prediction items. 38 | :param pred_items: iterable of predicted values 39 | :param gold_items: iterable of gold values 40 | :return: tuple (p, r, f1) for precision, recall, f1 41 | """ 42 | common = Counter(gold_items) & Counter(pred_items) 43 | num_same = sum(common.values()) 44 | if num_same == 0: 45 | return 0, 0, 0 46 | precision = 1.0 * num_same / len(pred_items) 47 | recall = 1.0 * num_same / len(gold_items) 48 | f1 = (2 * precision * recall) / (precision + recall) 49 | return precision, recall, f1 50 | 51 | @staticmethod 52 | def compute_each_pair(guess: str, answer: str): 53 | if answer == "": 54 | return None, None, None 55 | if guess == "": 56 | return 0, 0, 0 57 | g_tokens = normalize_answer(guess).split() 58 | a_tokens = normalize_answer(answer).split() 59 | 60 | precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) 61 | return precision, recall, f1 62 | 63 | @staticmethod 64 | def compute_all_pairs(guesses: List[str], answers: List[str]): 65 | # additional augment: 66 | assert len(guesses) == len(answers) 67 | 68 | precision_list, recall_list, f1_list = [], [], [] 69 | for guess, answer in zip(guesses, answers): 70 | precision, recall, f1 = F1Metric.compute_each_pair(guess, answer) 71 | if precision is None or recall is None or f1 is None: 72 | continue 73 | precision_list.append(precision) 74 | recall_list.append(recall) 75 | f1_list.append(f1) 76 | 77 | return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) 78 | -------------------------------------------------------------------------------- /tasks/orqa/README.md: -------------------------------------------------------------------------------- 1 | ## End-to-End Training of Neural Retrievers for Open-Domain Question Answering 2 | 3 | Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408). 4 | 5 | ## Retriever Training 6 | 7 | #### Unsupervised pretraining 8 | 1. Use `tools/preprocess_data.py` to preprocess the dataset for Inverse Cloze Task (ICT), which we call unsupervised pretraining. This script takes as input a corpus in loose JSON format and creates fixed-size blocks of text as the fundamental units of data. For a corpus like Wikipedia, this will mean multiple sentences per block and multiple blocks per document. Run [`tools/preprocess_data.py`](../../tools/preprocess_data.py) to construct one or more indexed datasets with the `--split-sentences` argument to make sentences the basic unit. We construct two datasets, one with the title of every document and another with the body. 9 | 10 |
11 | python tools/preprocess_data.py \
12 |     --input /path/to/corpus.json \
13 |     --json-keys text title \
14 |     --split-sentences \
15 |     --tokenizer-type BertWordPieceLowerCase \
16 |     --vocab-file /path/to/vocab.txt \
17 |     --output-prefix corpus_indexed \
18 |     --workers 10
19 | 
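
For reference, `--json-keys text title` means each line of the loose-JSON corpus is a JSON object carrying those two keys. A minimal sketch of producing such an input (the documents themselves are made-up placeholders):

```python
# Writes a tiny two-document corpus in the loose-JSON (one object per line)
# layout that tools/preprocess_data.py consumes.
import json

docs = [
    {"title": "Sample article A", "text": "First sentence of A. Second sentence of A."},
    {"title": "Sample article B", "text": "Only sentence of B."},
]
with open("corpus.json", "w") as f:
    for doc in docs:
        f.write(json.dumps(doc) + "\n")
```
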
20 | 21 | 2. The [`examples/pretrain_ict.sh`](../../examples/pretrain_ict.sh) script runs a single GPU 217M parameter biencoder model for ICT retriever training. Single GPU training is primarily intended for debugging purposes, as the code is developed for distributed training. The script uses a pretrained BERT model and we use a total of batch size of 4096 for the ICT training. 22 | 23 | 3. Evaluate the pretrained ICT model using [`examples/evaluate_retriever_nq.sh`](../../examples/evaluate_retriever_nq.sh) for [Google's Natural Questions Open dataset](https://arxiv.org/pdf/1906.00300.pdf). 24 | 25 | #### Supervised finetuning 26 | 27 | 1. Use the above pretrained ICT model to finetune using [Google's Natural Questions Open dataset](https://github.com/google-research/language/tree/master/language/orqa). The script [`examples/finetune_retriever_distributed.sh`](../../examples/finetune_retriever_distributed.sh) provides an example for how to perform the training. Our finetuning process includes retriever score scaling and longer training (80 epochs) on top [DPR training](https://arxiv.org/abs/2004.04906). 28 | 29 | 2. Evaluate the finetuned model using the same evaluation script as mentioned above for the unsupervised model. 30 | 31 | More details on the retriever are available in [our paper](https://arxiv.org/abs/2101.00408). 32 | 33 | ## Reader Training 34 | 35 | The reader component will be available soon. 36 | 37 | -------------------------------------------------------------------------------- /tasks/orqa/evaluate_orqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Main tasks functionality.""" 4 | 5 | from megatron import get_args, print_rank_0 6 | from megatron.indexer import IndexBuilder 7 | from tasks.orqa.evaluate_utils import ORQAEvaluator 8 | 9 | def main(): 10 | """ 11 | Main program 12 | """ 13 | 14 | args = get_args() 15 | 16 | """ 17 | Create a BlockData data structure by running an IndexBuilder over an 18 | ICT Dataset and then evaluate on NQ task 19 | """ 20 | 21 | print_rank_0("Starting index builder!") 22 | 23 | index_builder = IndexBuilder() 24 | index_builder.build_and_save_index() 25 | print_rank_0("Build and save indices: done!") 26 | 27 | 28 | print_rank_0("Starting evaluations!") 29 | 30 | # Set up the model and evaluator 31 | evaluator = ORQAEvaluator() 32 | 33 | # Run evaluation 34 | if args.qa_data_dev is not None: 35 | evaluator.evaluate(args.qa_data_dev, "DEV") 36 | 37 | if args.qa_data_test is not None: 38 | evaluator.evaluate(args.qa_data_test, "TEST") 39 | 40 | -------------------------------------------------------------------------------- /tasks/pruning/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? ") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tasks/pruning/exclude_layers.py: -------------------------------------------------------------------------------- 1 | # 843m 2 | # 96 prunable layers 3 | exclude_layers_12 = [ 4 | "module.language_model.encoder.layers.23.mlp.dense_h_to_4h", 5 | "module.language_model.encoder.layers.23.mlp.dense_4h_to_h", 6 | "module.language_model.encoder.layers.7.mlp.dense_h_to_4h", 7 | "module.language_model.encoder.layers.0.mlp.dense_h_to_4h", 8 | "module.language_model.encoder.layers.22.mlp.dense_h_to_4h", 9 | "module.language_model.encoder.layers.8.self_attention.query_key_value", 10 | "module.language_model.encoder.layers.7.mlp.dense_4h_to_h", 11 | "module.language_model.encoder.layers.19.mlp.dense_h_to_4h", 12 | "module.language_model.encoder.layers.23.self_attention.query_key_value", 13 | "module.language_model.encoder.layers.21.mlp.dense_h_to_4h", 14 | "module.language_model.encoder.layers.20.mlp.dense_h_to_4h", 15 | "module.language_model.encoder.layers.0.mlp.dense_4h_to_h", 16 | ] 17 | 18 | # 8b 19 | # 128 prunable layers 20 | exclude_layers_13 = [ 21 | "module.language_model.encoder.layers.31.mlp.dense_h_to_4h", 22 | "module.language_model.encoder.layers.31.mlp.dense_4h_to_h", 23 | "module.language_model.encoder.layers.30.self_attention.query_key_value", 24 | "module.language_model.encoder.layers.30.mlp.dense_h_to_4h", 25 | 
"module.language_model.encoder.layers.29.mlp.dense_h_to_4h", 26 | "module.language_model.encoder.layers.28.mlp.dense_h_to_4h", 27 | "module.language_model.encoder.layers.30.mlp.dense_4h_to_h", 28 | "module.language_model.encoder.layers.27.mlp.dense_h_to_4h", 29 | "module.language_model.encoder.layers.22.self_attention.query_key_value", 30 | "module.language_model.encoder.layers.25.mlp.dense_h_to_4h", 31 | "module.language_model.encoder.layers.26.self_attention.query_key_value", 32 | "module.language_model.encoder.layers.26.mlp.dense_h_to_4h", 33 | "module.language_model.encoder.layers.31.self_attention.query_key_value", 34 | ] 35 | 36 | exclude_layers_1000 = ["mlp.dense_4h_to_h"] -------------------------------------------------------------------------------- /tasks/pruning/layerwrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | # Define WrappedGPT class 6 | class WrappedGPT: 7 | """ 8 | This class wraps a GPT layer for specific operations. 9 | """ 10 | 11 | def __init__(self, layer, layer_id=0, layer_name="none"): 12 | self.layer = layer 13 | self.dev = self.layer.weight.device 14 | self.rows = layer.weight.data.shape[0] 15 | self.columns = layer.weight.data.shape[1] 16 | 17 | self.scaler_row = torch.zeros((self.columns), device=self.dev) 18 | self.nsamples = 0 19 | 20 | self.layer_id = layer_id 21 | self.layer_name = layer_name 22 | 23 | def add_batch(self, inp, out): 24 | if len(inp.shape) == 3: 25 | inp = inp.reshape((-1, inp.shape[-1])) 26 | if len(inp.shape) == 2: 27 | inp = inp.unsqueeze(0) 28 | tmp = inp.shape[0] 29 | if isinstance(self.layer, nn.Linear): 30 | if len(inp.shape) == 3: 31 | inp = inp.reshape((-1, inp.shape[-1])) 32 | inp = inp.t() 33 | 34 | self.scaler_row *= self.nsamples / (self.nsamples+tmp) 35 | self.nsamples += tmp 36 | 37 | inp = inp.type(torch.float32) 38 | self.scaler_row += torch.norm(inp, p=2, dim=1).squeeze(0) ** 2 / self.nsamples -------------------------------------------------------------------------------- /tasks/pruning/sparsity/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/tasks/pruning/sparsity/__init__.py -------------------------------------------------------------------------------- /tasks/pruning/sparsity/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVlabs/MaskLLM/50e50de8dc833714bb70a951d0ae37a5e79bcc7b/tasks/pruning/sparsity/utils/__init__.py -------------------------------------------------------------------------------- /tasks/race/finetune.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Race.""" 4 | 5 | from megatron import get_args 6 | from megatron import print_rank_0 7 | from megatron import get_tokenizer 8 | from megatron.model.multiple_choice import MultipleChoice 9 | from tasks.eval_utils import accuracy_func_provider 10 | from tasks.finetune_utils import finetune 11 | from tasks.race.data import RaceDataset 12 | from megatron.arguments import core_transformer_config_from_args 13 | 14 | 15 | def train_valid_datasets_provider(): 16 | """Provide train and validation datasets.""" 17 | args = get_args() 18 | tokenizer = get_tokenizer() 19 | 20 | train_dataset = RaceDataset('training', args.train_data, 21 | tokenizer, args.seq_length) 22 | valid_dataset = RaceDataset('validation', args.valid_data, 23 | tokenizer, args.seq_length) 24 | 25 | return train_dataset, valid_dataset 26 | 27 | 28 | def model_provider(pre_process=True, post_process=True): 29 | """Build the model.""" 30 | config = core_transformer_config_from_args(get_args()) 31 | print_rank_0('building multichoice model for RACE ...') 32 | model = MultipleChoice(config=config, 33 | num_tokentypes=2, 34 | pre_process=pre_process, 35 | post_process=post_process) 36 | 37 | return model 38 | 39 | 40 | def metrics_func_provider(): 41 | """Privde metrics callback function.""" 42 | args = get_args() 43 | tokenizer = get_tokenizer() 44 | 45 | def single_dataset_provider(datapath): 46 | name = datapath.split('RACE')[-1].strip('/').replace('/', '-') 47 | return RaceDataset(name, [datapath], tokenizer, args.seq_length) 48 | 49 | return accuracy_func_provider(single_dataset_provider) 50 | 51 | 52 | def main(): 53 | 54 | finetune(train_valid_datasets_provider, model_provider, 55 | end_of_epoch_callback_provider=metrics_func_provider) 56 | -------------------------------------------------------------------------------- /tasks/vision/classification/classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Vision-classification finetuning/evaluation.""" 4 | 5 | import torch.nn.functional as F 6 | from functools import partial 7 | from megatron import get_args, get_timers 8 | from megatron import print_rank_0 9 | from megatron.model.vision.classification import VitClassificationModel 10 | from megatron.data.vit_dataset import build_train_valid_datasets 11 | from tasks.vision.classification.eval_utils import accuracy_func_provider 12 | from tasks.vision.finetune_utils import finetune 13 | from megatron.utils import average_losses_across_data_parallel_group 14 | 15 | 16 | def classification(): 17 | def train_valid_datasets_provider(): 18 | """Build train and validation dataset.""" 19 | args = get_args() 20 | 21 | train_ds, valid_ds = build_train_valid_datasets( 22 | data_path=args.data_path, 23 | image_size=(args.img_h, args.img_w), 24 | ) 25 | return train_ds, valid_ds 26 | 27 | def model_provider(pre_process=True, post_process=True): 28 | """Build the model.""" 29 | args = get_args() 30 | 31 | print_rank_0("building classification model for ImageNet ...") 32 | 33 | return VitClassificationModel(num_classes=args.num_classes, finetune=True, 34 | pre_process=pre_process, post_process=post_process) 35 | 36 | def process_batch(batch): 37 | """Process batch and produce inputs for the model.""" 38 | images = batch[0].cuda().contiguous() 39 | labels = batch[1].cuda().contiguous() 40 | return images, labels 41 | 42 | def cross_entropy_loss_func(labels, output_tensor): 43 | logits = output_tensor 44 | 45 | # Cross-entropy loss. 46 | loss = F.cross_entropy(logits.contiguous().float(), labels) 47 | 48 | # Reduce loss for logging. 49 | averaged_loss = average_losses_across_data_parallel_group([loss]) 50 | 51 | return loss, {'lm loss': averaged_loss[0]} 52 | 53 | def _cross_entropy_forward_step(batch, model): 54 | """Simple forward step with cross-entropy loss.""" 55 | timers = get_timers() 56 | 57 | # Get the batch. 58 | timers("batch generator", log_level=2).start() 59 | try: 60 | batch_ = next(batch) 61 | except BaseException: 62 | batch_ = batch 63 | images, labels = process_batch(batch_) 64 | timers("batch generator").stop() 65 | 66 | # Forward model. 67 | output_tensor = model(images) 68 | 69 | return output_tensor, partial(cross_entropy_loss_func, labels) 70 | 71 | """Finetune/evaluate.""" 72 | finetune( 73 | train_valid_datasets_provider, 74 | model_provider, 75 | forward_step=_cross_entropy_forward_step, 76 | end_of_epoch_callback_provider=accuracy_func_provider, 77 | ) 78 | 79 | def main(): 80 | classification() 81 | 82 | -------------------------------------------------------------------------------- /tasks/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | """Main tasks functionality.""" 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.append( 9 | os.path.abspath( 10 | os.path.join( 11 | os.path.join(os.path.dirname(__file__), os.path.pardir), 12 | os.path.pardir, 13 | ) 14 | ) 15 | ) 16 | from megatron import get_args 17 | from megatron.initialize import initialize_megatron 18 | 19 | def get_tasks_args(parser): 20 | """Provide extra arguments required for tasks.""" 21 | group = parser.add_argument_group(title="tasks") 22 | 23 | group.add_argument('--task', type=str, default='segment', 24 | choices=['classify', 'segment_setr', 'segment_segformer'], 25 | help='task name.') 26 | group.add_argument("--epochs", type=int, default=None, 27 | help="Number of finetunning epochs. Zero results in " 28 | "evaluation only.") 29 | group.add_argument('--pretrained-checkpoint-type', type=str, default='default', 30 | choices=['default', 'external', 'constrastive'], 31 | help='Type of pretrained checkpoint') 32 | group.add_argument("--pretrained-checkpoint", type=str, default=None, 33 | help="Pretrained checkpoint used for finetunning.") 34 | group.add_argument('--seg-stride', type=int, default=None, 35 | help='sliding window stride during evaluation') 36 | return parser 37 | 38 | 39 | if __name__ == "__main__": 40 | 41 | initialize_megatron(extra_args_provider=get_tasks_args) 42 | args = get_args() 43 | 44 | if args.task == 'classify': 45 | from tasks.vision.classification.classification import main 46 | main() 47 | elif args.task == 'segment_setr': 48 | from tasks.vision.segmentation.finetune_setr import main 49 | main() 50 | elif args.task == 'segment_segformer': 51 | from tasks.vision.segmentation.finetune_segformer import main 52 | main() 53 | 54 | -------------------------------------------------------------------------------- /tasks/vision/segmentation/seg_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | import math 3 | import einops 4 | import torch 5 | import apex 6 | import torch.nn.functional as F 7 | from megatron import get_args 8 | from megatron.model.module import MegatronModule 9 | from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead 10 | from megatron.model.vision.mit_backbone import mit_b3, mit_b5 11 | from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead 12 | 13 | 14 | class SetrSegmentationModel(MegatronModule): 15 | 16 | def __init__(self, 17 | num_classes, 18 | pre_process=True, 19 | post_process=True): 20 | super(SetrSegmentationModel, self).__init__() 21 | args = get_args() 22 | assert post_process & pre_process 23 | self.hidden_size = args.hidden_size 24 | self.num_classes = num_classes 25 | self.backbone = VitBackbone( 26 | pre_process=pre_process, 27 | post_process=post_process, 28 | class_token=False, 29 | post_layer_norm=False, 30 | drop_path_rate=0.1 31 | ) 32 | 33 | self.head = SetrSegmentationHead( 34 | self.hidden_size, 35 | self.num_classes 36 | ) 37 | 38 | def set_input_tensor(self, input_tensor): 39 | """See megatron.model.transformer.set_input_tensor()""" 40 | pass 41 | 42 | def forward(self, input): 43 | # [b hw c] 44 | hidden_states = self.backbone(input) 45 | result_final = self.head(hidden_states) 46 | return result_final 47 | 48 | 49 | class SegformerSegmentationModel(MegatronModule): 50 | 51 | def __init__(self, 52 | num_classes, 53 | pre_process=True, 54 | post_process=True): 55 | super(SegformerSegmentationModel, self).__init__() 56 | args = get_args() 57 | self.hidden_size = args.hidden_size 58 | self.num_classes = num_classes 59 | self.pre_process = pre_process 60 | self.post_process = post_process 61 | 62 | self.backbone = mit_b5() 63 | self.head = SegformerSegmentationHead( 64 | feature_strides=[4, 8, 16, 32], 65 | in_channels=[64, 128, 320, 512], 66 | embedding_dim=768, 67 | dropout_ratio=0.1 68 | ) 69 | 70 | def set_input_tensor(self, input_tensor): 71 | """See megatron.model.transformer.set_input_tensor()""" 72 | pass 73 | 74 | def forward(self, input): 75 | # [b hw c] 76 | hidden_states = self.backbone(input) 77 | hidden_states = self.head(hidden_states) 78 | return hidden_states 79 | 80 | -------------------------------------------------------------------------------- /tasks/vision/segmentation/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import numpy as np 4 | from megatron import get_args 5 | 6 | def slidingcrops(img, mask): 7 | # img: [b c h w] 8 | # mask: [b h w] 9 | args = get_args() 10 | assert args.img_h == args.img_w 11 | crop_size = args.img_h 12 | stride = args.seg_stride 13 | ignore_index = args.ignore_index 14 | n, c, h, w = img.shape 15 | assert h >= crop_size 16 | assert w >= crop_size 17 | long_size = max(h, w) 18 | 19 | img_slices, mask_slices, slices_info = [], [], [] 20 | if long_size > crop_size: 21 | assert stride <= crop_size 22 | h_step_num = int(math.ceil((h - crop_size) / float(stride))) + 1 23 | w_step_num = int(math.ceil((w - crop_size) / float(stride))) + 1 24 | for yy in range(h_step_num): 25 | for xx in range(w_step_num): 26 | sy, sx = yy * stride, xx * stride 27 | ey, ex = sy + crop_size, sx + crop_size 28 | img_sub = img[:, :, sy: ey, sx: ex] 29 | mask_sub = mask[:, sy: ey, sx: ex] 30 | 31 | # padding 32 | sub_h, sub_w = img_sub.shape[2:] 33 | pad_h = max(crop_size - sub_h, 0) 34 | pad_w = max(crop_size - sub_w, 0) 35 | img_sub = torch.nn.functional.pad(img_sub, 
pad=(0, pad_w, 0, pad_h), value=ignore_index) 36 | mask_sub = torch.nn.functional.pad(mask_sub, pad=(0, pad_w, 0, pad_h)) 37 | 38 | img_slices.append(img_sub) 39 | mask_slices.append(mask_sub) 40 | slices_info.append([sy, ey, sx, ex, sub_h, sub_w]) 41 | 42 | return torch.cat(img_slices), torch.cat(mask_slices), slices_info, (h, w) 43 | else: 44 | return img, mask, [[0, h, 0, w, h, w]], (h, w) 45 | 46 | 47 | def slidingjoins(preds, probs, labels, slices_info, img_size): 48 | args = get_args() 49 | num_slices = len(slices_info) 50 | 51 | if num_slices == 1: 52 | return preds, labels 53 | 54 | h, w = img_size 55 | split_size = args.micro_batch_size 56 | 57 | preds_split = torch.split(preds, split_size) 58 | probs_split = torch.split(probs, split_size) 59 | labels_split = torch.split(labels, split_size) 60 | 61 | assert(len(preds_split) == num_slices) 62 | 63 | total_max_probs = torch.zeros((split_size, h, w), dtype=torch.float, device='cuda') 64 | total_preds = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda') 65 | total_labels = torch.zeros((split_size, h, w), dtype=torch.int, device='cuda') 66 | 67 | for i in range(num_slices): 68 | sy, ey, sx, ex, sub_h, sub_w = slices_info[i] 69 | assert sy + sub_h <= h 70 | assert sx + sub_w <= w 71 | curr_max_probs = total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] 72 | curr_preds = total_preds[:, sy:sy + sub_h, sx:sx + sub_w] 73 | 74 | local_max_probs = probs_split[i][:, :sub_h, : sub_w] 75 | local_preds = preds_split[i][:, :sub_h, :sub_w] 76 | 77 | result_max_probs = torch.maximum(curr_max_probs, local_max_probs) 78 | result_preds = torch.where(curr_max_probs >= local_max_probs, curr_preds, local_preds) 79 | 80 | total_max_probs[:, sy:sy + sub_h, sx:sx + sub_w] = result_max_probs 81 | total_preds[:, sy:sy + sub_h, sx:sx + sub_w] = result_preds 82 | total_labels[:, sy:sy + sub_h, sx:sx + sub_w] = labels_split[i][0, :sub_h, :sub_w] 83 | 84 | return total_preds, total_labels 85 | 86 | -------------------------------------------------------------------------------- /tasks/zeroshot_gpt/detokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | """Detokenization.""" 4 | 5 | import re 6 | 7 | 8 | def ptb_detokenizer(string): 9 | string = string.replace(" '", "'") 10 | string = string.replace(" \n", "\n") 11 | string = string.replace("\n ", "\n") 12 | string = string.replace(" n't", "n't") 13 | string = string.replace(" N ", "1 ") 14 | string = string.replace("$ 1", "$1") 15 | string = string.replace("# 1", "#1") 16 | return string 17 | 18 | 19 | def wikitext_detokenizer(string): 20 | # contractions 21 | string = string.replace("s '", "s'") 22 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 23 | # number separators 24 | string = string.replace(" @-@ ", "-") 25 | string = string.replace(" @,@ ", ",") 26 | string = string.replace(" @.@ ", ".") 27 | # punctuation 28 | string = string.replace(" : ", ": ") 29 | string = string.replace(" ; ", "; ") 30 | string = string.replace(" . ", ". ") 31 | string = string.replace(" ! ", "! ") 32 | string = string.replace(" ? ", "? 
") 33 | string = string.replace(" , ", ", ") 34 | # double brackets 35 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 36 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 37 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 38 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 39 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 40 | # miscellaneous 41 | string = string.replace("= = = =", "====") 42 | string = string.replace("= = =", "===") 43 | string = string.replace("= =", "==") 44 | string = string.replace(" " + chr(176) + " ", chr(176)) 45 | string = string.replace(" \n", "\n") 46 | string = string.replace("\n ", "\n") 47 | string = string.replace(" N ", " 1 ") 48 | string = string.replace(" 's", "'s") 49 | 50 | return string 51 | 52 | 53 | def lambada_detokenizer(string): 54 | return string 55 | 56 | 57 | _DETOKENIZERS = { 58 | 'ptb': ptb_detokenizer, 59 | 'wiki': wikitext_detokenizer, 60 | 'lambada': lambada_detokenizer, 61 | } 62 | 63 | 64 | def get_detokenizer(path): 65 | for key in _DETOKENIZERS.keys(): 66 | if key in path: 67 | return _DETOKENIZERS[key] 68 | -------------------------------------------------------------------------------- /tool_apply_sparsity.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description='Trim Lana checkpoint') 5 | parser.add_argument('--ckpt_dir', type=str, default='output/checkpoints/gpt3-843m-mask-only-simple-no-async-grad/train_iters_2000/ckpt/iter_0002000', help='Input checkpoint') 6 | args = parser.parse_args() 7 | 8 | def apply_sparsity(input, output): 9 | ckpt = torch.load(input, map_location='cpu') 10 | new_encoder_state_dict = {} 11 | mask_options = torch.zeros(1, 6, 4, dtype=torch.float32) 12 | mask_options[:, 0, :].data += torch.tensor([1, 1, 0, 0], dtype=torch.float32) 13 | mask_options[:, 1, :].data += torch.tensor([1, 0, 1, 0], dtype=torch.float32) 14 | mask_options[:, 2, :].data += torch.tensor([1, 0, 0, 1], dtype=torch.float32) 15 | mask_options[:, 3, :].data += torch.tensor([0, 1, 1, 0], dtype=torch.float32) 16 | mask_options[:, 4, :].data += torch.tensor([0, 1, 0, 1], dtype=torch.float32) 17 | mask_options[:, 5, :].data += torch.tensor([0, 0, 1, 1], dtype=torch.float32) 18 | 19 | for k,v in ckpt['model']['language_model']['encoder'].items(): 20 | if 'mask' not in k: 21 | new_encoder_state_dict[k] = v 22 | print("Save weights:", k) 23 | 24 | for k,v in ckpt['model']['language_model']['encoder'].items(): 25 | if '.diff_mask.gate' in k: 26 | gate = ckpt['model']['language_model']['encoder'][k].float() 27 | runtime_mask = ckpt['model']['language_model']['encoder'][k.replace('diff_mask.gate', 'mask')].float() 28 | winner_mask = mask_options[torch.arange(mask_options.shape[0]), gate.argmax(dim=-1)].view(*runtime_mask.shape) 29 | # set the type of winner mask the same as runtime_mask 30 | winner_mask = winner_mask.type_as(runtime_mask) 31 | new_encoder_state_dict[k.replace('diff_mask.gate', 'weight')] *= winner_mask 32 | print("freeze mask:", k.replace('diff_mask.gate', 'mask')) 33 | 34 | ckpt['model']['language_model']['encoder'] = new_encoder_state_dict 35 | print(ckpt['model']['language_model']['encoder'].keys()) 36 | torch.save(ckpt, output) 37 | 38 | import os 39 | import glob 40 | 41 | if args.ckpt_dir.endswith('/'): 42 | args.ckpt_dir = args.ckpt_dir[:-1] 43 | splited_dir = args.ckpt_dir.split('/') 44 | output_dir = os.path.join('/'.join(splited_dir[:-1]), 'iter_0000001') 45 | 
print(f"output_dir: {output_dir}") 46 | os.makedirs(output_dir, exist_ok=True) 47 | mp_rank_dirs = glob.glob(os.path.join(args.ckpt_dir, "mp_rank_*")) 48 | for mp_rank_dir in mp_rank_dirs: 49 | ckpt_file = os.path.join(mp_rank_dir, "model_optim_rng.pt") 50 | output_file = ckpt_file.replace(args.ckpt_dir, output_dir) 51 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 52 | apply_sparsity(ckpt_file, output_file) 53 | 54 | iteration_file = os.path.join( *splited_dir[:-1], 'latest_checkpointed_iteration.txt') 55 | print(iteration_file) 56 | with open(iteration_file, 'w') as f: 57 | f.write("1") 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /tool_compress_mask.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='Mask Compression') 7 | parser.add_argument('--mask_ckpt', type=str, help='path to the mask checkpoint') 8 | parser.add_argument('--output', type=str, help='output path') 9 | 10 | args = parser.parse_args() 11 | 12 | if __name__=='__main__': 13 | mask_ckpt = torch.load(args.mask_ckpt, map_location='cpu') 14 | compressed_mask = {} 15 | for k, mask in mask_ckpt.items(): 16 | # Compress with np.packbits 17 | print(f"Compressing {k}...") 18 | mask = mask.cpu().numpy().astype(bool) 19 | mask = np.packbits(mask) 20 | compressed_mask[k] = mask 21 | np.savez_compressed(args.output, **compressed_mask) 22 | 23 | -------------------------------------------------------------------------------- /tool_compute_mask_hf.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | import torch 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | from importlib.metadata import version 7 | 8 | import time 9 | import torch 10 | import torch.nn as nn 11 | 12 | # Import get_loaders function from data module within the same directory 13 | 14 | from collections import defaultdict 15 | import fnmatch 16 | 17 | # Code adapted from https://github.com/IST-DASLab/sparsegpt/blob/master/datautils.py 18 | 19 | import numpy as np 20 | import random 21 | import torch 22 | from datasets import load_dataset 23 | 24 | print('torch', version('torch')) 25 | print('transformers', version('transformers')) 26 | print('accelerate', version('accelerate')) 27 | print('# of gpus: ', torch.cuda.device_count()) 28 | 29 | def get_llm(model_name, cache_dir="llm_weights"): 30 | model = AutoModelForCausalLM.from_pretrained( 31 | model_name, 32 | torch_dtype=torch.float16, 33 | cache_dir=cache_dir, 34 | device_map="cpu" 35 | ) 36 | model.seqlen = model.config.max_position_embeddings 37 | return model 38 | 39 | def main(): 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--dense', type=str, help='Dense model') 42 | parser.add_argument('--sparse', type=str, help='Sparse model') 43 | parser.add_argument('--save', type=str, help='Save as') 44 | parser.add_argument("--cache_dir", default="llm_weights", type=str ) 45 | args = parser.parse_args() 46 | 47 | # Setting seeds for reproducibilit 48 | with torch.no_grad(): 49 | dense = get_llm(args.dense, args.cache_dir) 50 | sparse = get_llm(args.sparse, args.cache_dir) 51 | 52 | mask_ckpt = {} 53 | for (name_dense, param_dense), (name_sparse, param_sparse) in zip(dense.named_parameters(), sparse.named_parameters()): 54 | sparsity = 
(param_sparse==0).float().mean().item() 55 | print(f"{name_sparse} - sparsity {sparsity:.4f}") 56 | # Check 2:4 57 | if abs(sparsity-0.5)<0.0001: 58 | mask = (param_sparse!=0).float() 59 | assert torch.equal(mask * param_dense, param_sparse) 60 | mask_ckpt[name_sparse+'.mask'] = mask 61 | else: 62 | # assert equal of dense and sparse_weight 63 | assert torch.equal(param_dense, param_sparse) 64 | 65 | torch.save(mask_ckpt, args.save) 66 | print(mask_ckpt.keys()) 67 | print(f"Mask saved as {args.save}") 68 | 69 | 70 | 71 | if __name__ == '__main__': 72 | main() -------------------------------------------------------------------------------- /tool_trim_learnable_sparsity.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description='Trim Lana checkpoint') 5 | parser.add_argument('--ckpt_dir', type=str, default='output/checkpoints/llama-mask-only/train_iters_2000/ckpt/iter_0002000', help='Input checkpoint') 6 | args = parser.parse_args() 7 | 8 | def trim_ckpt(input, output): 9 | ckpt = torch.load(input, map_location='cpu') 10 | new_encoder_state_dict = {} 11 | mask_options = torch.zeros(1, 6, 4, dtype=torch.float32) 12 | mask_options[:, 0, :].data += torch.tensor([1, 1, 0, 0], dtype=torch.float32) 13 | mask_options[:, 1, :].data += torch.tensor([1, 0, 1, 0], dtype=torch.float32) 14 | mask_options[:, 2, :].data += torch.tensor([1, 0, 0, 1], dtype=torch.float32) 15 | mask_options[:, 3, :].data += torch.tensor([0, 1, 1, 0], dtype=torch.float32) 16 | mask_options[:, 4, :].data += torch.tensor([0, 1, 0, 1], dtype=torch.float32) 17 | mask_options[:, 5, :].data += torch.tensor([0, 0, 1, 1], dtype=torch.float32) 18 | 19 | for k,v in ckpt['model']['language_model']['encoder'].items(): 20 | if '.diff_mask.gate' in k: 21 | gate = ckpt['model']['language_model']['encoder'][k].float() 22 | runtime_mask = ckpt['model']['language_model']['encoder'][k.replace('diff_mask.gate', 'mask')].float() 23 | winner_mask = mask_options[torch.arange(mask_options.shape[0]), gate.argmax(dim=-1)].view(*runtime_mask.shape) 24 | # set the type of winner mask the same as runtime_mask 25 | winner_mask = winner_mask.type_as(runtime_mask) 26 | new_encoder_state_dict[k.replace('diff_mask.gate', 'mask')] = winner_mask 27 | print("save winner mask:", k.replace('diff_mask.gate', 'mask')) 28 | continue 29 | 30 | if '.mask' in k: continue 31 | if '.mask_options' in k: continue 32 | 33 | new_encoder_state_dict[k] = v 34 | 35 | ckpt['model']['language_model']['encoder'] = new_encoder_state_dict 36 | print(ckpt['model']['language_model']['encoder'].keys()) 37 | torch.save(ckpt, output) 38 | 39 | 40 | import os 41 | import glob 42 | 43 | # Create output directory 44 | splited_dir = args.ckpt_dir.split('/') 45 | output_dir = os.path.join('/'.join(splited_dir[:-1]), 'release') 46 | print(f"output_dir: {output_dir}") 47 | os.makedirs(output_dir, exist_ok=True) 48 | 49 | # Trim the checkpoints 50 | mp_rank_dirs = glob.glob(os.path.join(args.ckpt_dir, "mp_rank_*")) 51 | for mp_rank_dir in mp_rank_dirs: 52 | ckpt_file = os.path.join(mp_rank_dir, "model_optim_rng.pt") 53 | output_file = ckpt_file.replace(args.ckpt_dir, output_dir) 54 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 55 | print(f"Trim {ckpt_file} to {output_file}") 56 | trim_ckpt(ckpt_file, output_file) 57 | 58 | # update the latest iteration to "release" 59 | iteration_file = os.path.join( *splited_dir[:-1], 'latest_checkpointed_iteration.txt') 60 | 
print(iteration_file) 61 | with open(iteration_file, 'w') as f: 62 | f.write("release") 63 | 64 | 65 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import torch 3 | import os,sys 4 | import pathlib 5 | import logging 6 | 7 | sys.path.append(os.path.join(pathlib.Path(__file__).parent.resolve(),'umct')) -------------------------------------------------------------------------------- /tools/autoformat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 4 | 5 | # for now we just format core 6 | 7 | black ${SCRIPT_DIR}/../megatron/core 8 | isort ${SCRIPT_DIR}/../megatron/core 9 | -------------------------------------------------------------------------------- /tools/bert_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .embed import BertEmbedder, DiskDataParallelBertEmbedder 4 | -------------------------------------------------------------------------------- /tools/bert_embedding/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from megatron import get_args, get_tokenizer 7 | from megatron.data.bert_dataset import build_training_sample 8 | 9 | 10 | class BertEmbeddingDataset(torch.utils.data.Dataset): 11 | '''Dataset to convert a text dataset to Bert tokens.''' 12 | 13 | def __init__(self, text_dataset, max_seq_length): 14 | 15 | super().__init__() 16 | 17 | args = get_args() 18 | 19 | # Dataset, tokenizer. 20 | self.text_dataset = text_dataset 21 | self.bert_tokenizer = get_tokenizer() 22 | 23 | # Params to store. 24 | self.max_seq_length = max_seq_length 25 | self.seed = args.seed 26 | self.masked_lm_prob = args.mask_prob 27 | 28 | # Vocab stuff. 29 | self.vocab_id_list = list(self.bert_tokenizer.inv_vocab.keys()) 30 | self.vocab_id_to_token_dict = self.bert_tokenizer.inv_vocab 31 | self.cls_id = self.bert_tokenizer.cls 32 | self.sep_id = self.bert_tokenizer.sep 33 | self.mask_id = self.bert_tokenizer.mask 34 | self.pad_id = self.bert_tokenizer.pad 35 | 36 | def __len__(self): 37 | return len(self.text_dataset) 38 | 39 | def __getitem__(self, idx): 40 | 41 | # Text. 42 | text_sample = self.text_dataset[idx] 43 | text = text_sample["text"] 44 | text = text.replace("<|endoftext|>", "") 45 | 46 | # Bert/Wordpiece tokens (+truncate). 47 | bert_token_ids = self.bert_tokenizer.tokenize(text) 48 | bert_token_ids = bert_token_ids[:self.max_seq_length - 2] # cls+sep. 49 | if not bert_token_ids: 50 | bert_token_ids = [ self.bert_tokenizer.pad_id ] # hack when empty seq 51 | 52 | # Note that this rng state should be numpy and not python since 53 | # python randint is inclusive whereas the numpy one is exclusive. 54 | # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1 55 | np_rng = np.random.RandomState(seed=((self.seed + idx) % 2**32)) 56 | 57 | # Build sample. 
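# build_training_sample (from megatron.data.bert_dataset) applies Bert masked-LM masking using the mask/pad ids, masking probability and numpy RNG set up above; the returned sample dict's "text" entry holds the resulting token ids.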
58 | sample = build_training_sample([bert_token_ids], 59 | len(bert_token_ids), 60 | len(bert_token_ids) + 2, # for cls+sep 61 | self.vocab_id_list, 62 | self.vocab_id_to_token_dict, 63 | self.cls_id, self.sep_id, 64 | self.mask_id, self.pad_id, 65 | self.masked_lm_prob, np_rng, 66 | binary_head=False) 67 | sample["seq_length"] = len(sample["text"]) 68 | return sample 69 | -------------------------------------------------------------------------------- /tools/bert_embedding/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "h5py", 7 | "transformers", # for huggingface bert 8 | ] 9 | 10 | for lib in required_libs: 11 | try: 12 | globals()[lib] = importlib.import_module(lib) 13 | except ImportError as e: 14 | raise Exception(f"Missing one or more packages required for Bert embedding: {required_libs}. Tried importing '{lib}'.") 15 | -------------------------------------------------------------------------------- /tools/download_c4.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | # English only 4 | for i in range(20): 5 | en = load_dataset("allenai/c4", data_files={'train': f'en/c4-train.{str(i).zfill(5)}-of-01024.json.gz'}, cache_dir='./assets/data', split='train') 6 | print(len(en)) 7 | 8 | # save as json files 9 | en.to_json(f'./assets/data/en/c4-train.{str(i).zfill(5)}-of-01024.json', orient='records', lines=True) -------------------------------------------------------------------------------- /tools/linter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import pathlib 4 | import subprocess 5 | 6 | 7 | def recursively_lint_files(): 8 | """Recursively lint all python files in chosen subdirectories of megatron-lm""" 9 | 10 | try: 11 | import autopep8 12 | except ModuleNotFoundError: 13 | print("Please first install autopep8 via `pip install autopep8`") 14 | return 15 | 16 | # get all python file paths from top level directory 17 | file_dir = str(pathlib.Path(__file__).parent.absolute()) 18 | working_dir = osp.join(file_dir, os.pardir) 19 | all_py_paths = set(os.path.join(working_dir, fname) 20 | for fname in os.listdir(working_dir) if ".py" in fname) 21 | 22 | # get all python file paths from chosen subdirectories 23 | check_dirs = ['docker', 'megatron', 'openwebtext', 'scripts', 'tasks'] 24 | for sub_dir in check_dirs: 25 | for path, _, fnames in os.walk(osp.join(working_dir, sub_dir)): 26 | all_py_paths.update(set(osp.join(path, fname) for fname in fnames if ".py" in fname)) 27 | 28 | print("Linting the following: ") 29 | for py_path in all_py_paths: 30 | print(py_path) 31 | command = 'autopep8 --max-line-length 100 --aggressive --in-place {}'.format(py_path) 32 | subprocess.check_call(command) 33 | 34 | 35 | if __name__ == "__main__": 36 | recursively_lint_files() 37 | -------------------------------------------------------------------------------- /tools/merge_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | 6 | sys.path.append( 7 | os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 8 | ) 9 | 10 | from megatron.core.datasets.indexed_dataset import ( 11 | MMapIndexedDataset, 12 | MMapIndexedDatasetBuilder, 13 | get_bin_path, 14 
| get_idx_path, 15 | ) 16 | 17 | 18 | def get_args(): 19 | parser = argparse.ArgumentParser() 20 | 21 | group = parser.add_argument_group(title="input data") 22 | group.add_argument( 23 | "--input", 24 | type=str, 25 | required=True, 26 | help="Path to directory containing all document files to merge", 27 | ) 28 | 29 | group = parser.add_argument_group(title="output data") 30 | group.add_argument( 31 | "--output-prefix", 32 | type=str, 33 | required=True, 34 | help="Path to binary output file without suffix", 35 | ) 36 | 37 | group = parser.add_argument_group(title="miscellaneous") 38 | group.add_argument( 39 | "--multimodal", 40 | action="store_true", 41 | help="Whether the datasets are assumed to be multimodal" 42 | ) 43 | 44 | args = parser.parse_args() 45 | 46 | assert os.path.isdir( 47 | args.input 48 | ), f"ERROR: {args.input} is not a directory or does not exist" 49 | 50 | assert os.path.isdir( 51 | os.path.dirname(args.output_prefix) 52 | ), f"ERROR: {os.path.dirname(args.output_prefix)} is not a directory or does not exist" 53 | 54 | return args 55 | 56 | 57 | def main(): 58 | args = get_args() 59 | 60 | prefixes = set() 61 | for basename in os.listdir(args.input): 62 | prefix, ext = os.path.splitext(basename) 63 | 64 | if prefix in prefixes: 65 | continue 66 | 67 | if not os.path.isfile(os.path.join(args.input, basename)): 68 | continue 69 | 70 | ext_pair = ".bin" if ext == ".idx" else ".idx" 71 | assert os.path.isfile( 72 | os.path.join(args.input, prefix) + ext_pair 73 | ), f"ERROR: {ext_pair} file not provided for {os.path.join(args.input, prefix)}" 74 | 75 | prefixes.add(prefix) 76 | 77 | builder = None 78 | for prefix in sorted(prefixes): 79 | if builder is None: 80 | dataset = MMapIndexedDataset(os.path.join(args.input, prefix), multimodal=args.multimodal) 81 | builder = MMapIndexedDatasetBuilder( 82 | get_bin_path(args.output_prefix), dtype=dataset.index.dtype, multimodal=args.multimodal 83 | ) 84 | del dataset 85 | 86 | builder.add_index(os.path.join(args.input, prefix)) 87 | 88 | builder.finalize(get_idx_path(args.output_prefix)) 89 | 90 | 91 | if __name__ == '__main__': 92 | 93 | main() 94 | -------------------------------------------------------------------------------- /tools/openwebtext/add_id.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import argparse 4 | import json 5 | import os 6 | import time 7 | 8 | """ 9 | This code adds id to each json object in a json file. User can add prefix 10 | to the ids. 
11 | """ 12 | 13 | if __name__ == '__main__': 14 | 15 | print('parsing the arguments ...') 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--input-file', type=str, default=None, help='Input'\ 19 | ' json file where id needs to be added') 20 | parser.add_argument('--output-file', type=str, default=None, help=\ 21 | 'Output file name with id') 22 | parser.add_argument('--id-prefix', type=str, default=None, help=\ 23 | 'Id prefix') 24 | parser.add_argument('--log-interval', type=int, default=100, 25 | help='Log interval') 26 | args = parser.parse_args() 27 | 28 | print('Adding ids to dataset ...') 29 | 30 | f_input = open(args.input_file, 'r', encoding='utf-8') 31 | f_output = open(args.output_file, 'wb') 32 | 33 | unique_ids = 1 34 | start_time = time.time() 35 | for row in f_input: 36 | each_row = json.loads(row) 37 | adlr_id_string = args.id_prefix + '-{:010d}'.format(int(unique_ids)) 38 | each_row['adlr_id'] = adlr_id_string 39 | myjson = json.dumps(each_row, ensure_ascii=False) 40 | 41 | f_output.write(myjson.encode('utf-8')) 42 | f_output.write('\n'.encode('utf-8')) 43 | 44 | if unique_ids % args.log_interval == 0: 45 | print(' processed {:9d} documents in {:.2f} seconds ...'.format( \ 46 | unique_ids, time.time() - start_time), flush=True) 47 | 48 | unique_ids += 1 49 | 50 | # Close the file. 51 | f_input.close() 52 | f_output.close() 53 | 54 | print('done :-)', flush=True) 55 | -------------------------------------------------------------------------------- /tools/openwebtext/group_duplicate_url.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import json 4 | import time 5 | import sys 6 | 7 | 8 | if __name__ == '__main__': 9 | 10 | 11 | print('grouping duplicate urls ...') 12 | 13 | input = sys.argv[1] 14 | output = sys.argv[2] 15 | if len(sys.argv) > 3: 16 | jaccard_similarity_threshold = float(sys.argv[3]) 17 | else: 18 | jaccard_similarity_threshold = 0.7 19 | 20 | url_to_index = {} 21 | index_to_urls = [] 22 | counter = 0 23 | start_time = time.time() 24 | with open(input, 'r') as f: 25 | for line in f: 26 | counter += 1 27 | myjson = json.loads(line) 28 | urls = [] 29 | for main_url in myjson.keys(): 30 | urls.append(main_url) 31 | for value in myjson[main_url]: 32 | for other_url, js in value.items(): 33 | if js >= jaccard_similarity_threshold: 34 | urls.append(other_url) 35 | current_index = -1 36 | other_indices = set() 37 | for url in urls: 38 | if url in url_to_index: 39 | if current_index == -1: 40 | current_index = url_to_index[url] 41 | elif current_index != url_to_index[url]: 42 | other_indices.add(url_to_index[url]) 43 | if current_index == -1: 44 | current_index = len(index_to_urls) 45 | index_to_urls.append(set()) 46 | for url in urls: 47 | url_to_index[url] = current_index 48 | index_to_urls[current_index].add(url) 49 | for index in other_indices: 50 | for url in index_to_urls[index]: 51 | index_to_urls[current_index].add(url) 52 | url_to_index[url] = current_index 53 | index_to_urls[index] = None 54 | 55 | if counter % 100000 == 0: 56 | print(' > processed {} lines in {} seconds ...'.format( 57 | counter, time.time() - start_time)) 58 | 59 | 60 | total_remove = 0 61 | total_remain = 0 62 | for urls in index_to_urls: 63 | if urls is not None: 64 | if len(urls) > 1: 65 | total_remove += (len(urls) - 1) 66 | total_remain += 1 67 | print('out of {} urls, only {} are unique and {} should be removed'.format( 68 | total_remove+total_remain, 
total_remain, total_remove)) 69 | 70 | with open(output, 'wb') as f: 71 | for i, urls in enumerate(index_to_urls): 72 | if urls is not None: 73 | if len(urls) > 1: 74 | myjson = json.dumps({str(i): list(urls)}, 75 | ensure_ascii=False) 76 | f.write(myjson.encode('utf-8')) 77 | f.write('\n'.encode('utf-8')) 78 | -------------------------------------------------------------------------------- /tools/openwebtext/merge_jsons.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | import glob 5 | import sys 6 | import json 7 | import argparse 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--json_path", type=str, default=".", 13 | help="path where all the json files are located") 14 | 15 | parser.add_argument("--output_file", type=str, default="merged_output.json", 16 | help="filename where the merged json should go") 17 | 18 | args = parser.parse_args() 19 | 20 | json_path = args.json_path 21 | out_file = args.output_file 22 | 23 | json_files = glob.glob(json_path + '/*.json') 24 | 25 | counter = 0 26 | 27 | with open(out_file, 'w') as outfile: 28 | for fname in json_files: 29 | counter += 1 30 | 31 | if counter % 1024 == 0: 32 | print("Merging at ", counter, flush=True) 33 | 34 | with open(fname, 'r') as infile: 35 | for row in infile: 36 | each_row = json.loads(row) 37 | outfile.write(row) 38 | 39 | 40 | print("Merged file", out_file, flush=True) 41 | 42 | 43 | -------------------------------------------------------------------------------- /tools/openwebtext/remove_group_duplicates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
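# remove_group_duplicates.py keeps the first URL of each duplicate group emitted by group_duplicate_url.py and drops the documents belonging to every other URL in that group.
# Usage: python remove_group_duplicates.py <grouped_urls_file> <data_file> <output_file>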
2 | 3 | 4 | import json 5 | import time 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | url_filename = sys.argv[1] 12 | data_filename = sys.argv[2] 13 | output_filename = sys.argv[3] 14 | 15 | urls = set() 16 | with open(url_filename, 'r') as f: 17 | for line in f: 18 | myjson = json.loads(line) 19 | for key in myjson: 20 | this_urls = myjson[key] 21 | for i in range(1, len(this_urls)): 22 | urls.add(this_urls[i]) 23 | print('will be removing {} urls'.format(len(urls)), flush=True) 24 | 25 | written_docs = 0 26 | removed_docs = 0 27 | removed_chars = 0 28 | start_time = time.time() 29 | with open(output_filename, 'wb') as fout: 30 | with open(data_filename, 'r') as fin: 31 | for line in fin: 32 | try: 33 | myjson = json.loads(line) 34 | url = myjson['url'] 35 | if url in urls: 36 | print('removing', myjson) 37 | removed_docs += 1 38 | removed_chars += len(myjson['text']) 39 | continue 40 | myjson = json.dumps(myjson, ensure_ascii=False) 41 | fout.write(myjson.encode('utf-8')) 42 | fout.write('\n'.encode('utf-8')) 43 | written_docs += 1 44 | if written_docs % 10000 == 0: 45 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 46 | '| removed: {} (char: {})'.format( 47 | time.time() - start_time, 48 | written_docs, removed_docs, removed_chars)) 49 | except Exception as e: 50 | print('[SKIPPING]', line, e) 51 | 52 | print(' [PROCESSED] time (s): {:.2f} | written: {} ' 53 | '| removed: {} (char: {})'.format( 54 | time.time() - start_time, 55 | written_docs, removed_docs, removed_chars)) 56 | print('done :-)') 57 | -------------------------------------------------------------------------------- /tools/prepare_c4_megatron.sh: -------------------------------------------------------------------------------- 1 | # for i from 00000 to 00100 2 | mkdir -p assets/data/preprocessed 3 | for i in {00014..00020}; do 4 | echo "Processing ./assets/data/en/c4-train.${i}-of-01024.json" 5 | python tools/preprocess_data.py \ 6 | --input "./assets/data/en/c4-train.${i}-of-01024.json" \ 7 | --output-prefix assets/data/preprocessed/llama2_${i} \ 8 | --vocab-file ./assets/checkpoints/llama2_7b_hf/tokenizer.json \ 9 | --tokenizer-type Llama2Tokenizer \ 10 | --tokenizer-model ./assets/checkpoints/llama2_7b_hf/tokenizer.model \ 11 | --append-eod \ 12 | --workers 8 13 | done -------------------------------------------------------------------------------- /tools/retro/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .cli import retro 4 | -------------------------------------------------------------------------------- /tools/retro/cli/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | 5 | from . import retro 6 | 7 | 8 | if __name__ == "__main__": 9 | retro.init(os.environ["RETRO_WORKDIR"]) 10 | -------------------------------------------------------------------------------- /tools/retro/db/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import build_db 4 | -------------------------------------------------------------------------------- /tools/retro/db/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import json 4 | import numpy as np 5 | import torch 6 | from tqdm import tqdm 7 | 8 | from megatron import get_args, print_rank_0 9 | from tools.retro.external_libs import h5py 10 | from tools.retro.utils import get_gpt_tokenizer 11 | 12 | 13 | class DBDataset(torch.utils.data.Dataset): 14 | '''Dataset for iterating chunks. 15 | 16 | Requires: 17 | - List of indexed datasets 18 | - Chunk index array, with format: 19 | [dataset_idx, doc_id, start_idx, end_idx, bert_length]) 20 | ''' 21 | 22 | def __init__(self, db_path, indexed_datasets, chunks, max_chunk_length): 23 | 24 | assert chunks.shape[1] == 5, "expected 5 columns (dataset_idx, " \ 25 | "doc_idx, token_start_idx, token_end_idx, bert_chunk_length); " \ 26 | "found %d columns." % chunks.shape[1] 27 | 28 | self.db_path = db_path 29 | self.indexed_datasets = indexed_datasets 30 | self.chunks = chunks 31 | self.doc_chunk_map = None 32 | 33 | self.max_chunk_length = max_chunk_length 34 | self.eod_token_id = get_gpt_tokenizer().eod 35 | 36 | def __len__(self): 37 | return self.chunks.shape[0] 38 | 39 | def __getitem__(self, chunk_id): 40 | 41 | # Chunk start/end indexes. 42 | indexed_dataset_id, doc_id, token_start_idx, token_end_idx, _ = \ 43 | [ value.item() for value in self.chunks[chunk_id] ] 44 | chunk_length = token_end_idx - token_start_idx 45 | indexed_dataset = self.indexed_datasets[indexed_dataset_id] 46 | 47 | # Chunk token ids. 48 | token_ids = indexed_dataset.get(doc_id, 49 | offset=token_start_idx, 50 | length=chunk_length) 51 | 52 | # Extend chunks to max_chunk_length by padding with EOD tokens. 53 | if chunk_length != self.max_chunk_length: 54 | assert chunk_length < self.max_chunk_length, "invalid chunk len." 55 | token_ids = token_ids.tolist() 56 | token_ids += [self.eod_token_id] * \ 57 | (self.max_chunk_length - chunk_length) 58 | 59 | return { 60 | "doc_id" : doc_id, 61 | "text" : np.array(token_ids, dtype=np.int64), 62 | } 63 | 64 | def load_doc_tuples(self): 65 | '''Load the dataset & document ids. 66 | 67 | Load the dataset id & document id of each chunk in the database, to 68 | be used for causality filtering during querying. 69 | ''' 70 | self.doc_tuples = np.zeros(shape=(len(self), 2), dtype="uint32") 71 | block_size = int(1e6) 72 | for start_idx in tqdm(range(0, len(self), block_size)): 73 | end_idx = min(len(self), start_idx + block_size) 74 | self.doc_tuples[start_idx:end_idx]=self.chunks[start_idx:end_idx,:2] 75 | -------------------------------------------------------------------------------- /tools/retro/examples/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:23.09-py3 2 | 3 | RUN pip install -U faiss-gpu 4 | 5 | RUN apt update 6 | 7 | RUN apt install -qy htop 8 | 9 | RUN pip install -U transformers 10 | 11 | RUN pip install --upgrade google-api-python-client 12 | 13 | RUN pip install sentencepiece 14 | 15 | RUN pip install h5py 16 | 17 | RUN pip install nltk 18 | 19 | RUN pip install einops 20 | -------------------------------------------------------------------------------- /tools/retro/examples/pretrain_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -u 4 | 5 | unset NCCL_DEBUG 6 | export CUDA_DEVICE_MAX_CONNECTIONS=1 7 | 8 | ######## GPT or Retro?. ######## 9 | 10 | # 0 : GPT. 11 | # 1 : Retro 12 | 13 | ADD_RETRIEVER=1 14 | 15 | ######## Megatron, Retro dirs. ######## 16 | 17 | REPO_DIR="" 18 | RETRO_WORKDIR="" 19 | 20 | ######## Data. 
######## 21 | 22 | DATA_BLEND="" 23 | 24 | ######## Args. ######## 25 | 26 | ARGS=" \ 27 | --log-interval 1 \ 28 | --use-flash-attn \ 29 | --apply-layernorm-1p \ 30 | --untie-embeddings-and-output-weights \ 31 | --disable-bias-linear \ 32 | --no-position-embedding \ 33 | --use-rotary-position-embeddings \ 34 | --rotary-percent 0.5 \ 35 | --swiglu \ 36 | --attention-dropout 0.0 \ 37 | --hidden-dropout 0.0 \ 38 | --exit-duration-in-mins 220 \ 39 | --tensor-model-parallel-size 1 \ 40 | --pipeline-model-parallel-size 1 \ 41 | --num-layers 24 \ 42 | --hidden-size 1024 \ 43 | --num-attention-heads 16 \ 44 | --seq-length 512 \ 45 | --max-position-embeddings 512 \ 46 | --micro-batch-size 16 \ 47 | --global-batch-size 256 \ 48 | --train-samples 200000 \ 49 | --lr-decay-samples 175000 \ 50 | --lr-warmup-samples 10000 \ 51 | --lr 2.5e-5 \ 52 | --min-lr 2.5e-6 \ 53 | --lr-decay-style cosine \ 54 | --eval-iters 50 \ 55 | --eval-interval 2000 \ 56 | --tokenizer-type GPTSentencePieceTokenizer \ 57 | --tokenizer-model \ 58 | --data-path ${DATA_BLEND} \ 59 | --split 98,2,0 \ 60 | --clip-grad 1.0 \ 61 | --weight-decay 0.1 \ 62 | --adam-beta1 0.9 \ 63 | --adam-beta2 0.95 \ 64 | --init-method-std 0.007 \ 65 | --log-params-norm \ 66 | --log-num-zeros-in-grad \ 67 | --bf16 \ 68 | " 69 | 70 | ######## Retro. ######## 71 | 72 | if [ "$ADD_RETRIEVER" = "0" ]; then 73 | SCRIPT=pretrain_gpt.py 74 | else 75 | ARGS="${ARGS} \ 76 | --retro-workdir ${RETRO_WORKDIR} \ 77 | --retro-add-retriever \ 78 | " 79 | SCRIPT=pretrain_retro.py 80 | fi 81 | 82 | ######## Command. ######## 83 | 84 | NPROCS=8 85 | CMD="\ 86 | pwd && cd ${REPO_DIR} && pwd && \ 87 | export PYTHONPATH=$PYTHONPATH:${REPO_DIR} && \ 88 | python -m torch.distributed.run \ 89 | --nproc_per_node ${NPROCS} \ 90 | --nnodes 1 \ 91 | --node_rank ${NODE_RANK} \ 92 | --master_addr ${MASTER_ADDR} \ 93 | --master_port 6000 \ 94 | ${SCRIPT} ${ARGS} \ 95 | " 96 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 97 | echo "CMD = '$CMD'." 98 | echo "~~~~~~~~~~~~~~~~~~~~~~~~~~" 99 | eval $CMD 100 | -------------------------------------------------------------------------------- /tools/retro/external_libs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import importlib 4 | 5 | required_libs = [ 6 | "faiss", 7 | "h5py", 8 | "transformers", # for huggingface bert 9 | ] 10 | 11 | for lib in required_libs: 12 | try: 13 | globals()[lib] = importlib.import_module(lib) 14 | except ImportError as e: 15 | raise Exception(f"Missing one or more packages required for Retro preprocessing: {required_libs}. Tried importing '{lib}'.") 16 | -------------------------------------------------------------------------------- /tools/retro/index/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .build import add_to_index, build_index, train_index 4 | # from .index import Index 5 | -------------------------------------------------------------------------------- /tools/retro/index/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .indexes import FaissBaseIndex, FaissParallelAddIndex 4 | 5 | 6 | class IndexFactory: 7 | '''Get index. 8 | 9 | Index type generally read from argument '--retro-index-ty'. 
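Supported values are "faiss-base" (FaissBaseIndex) and "faiss-par-add" (FaissParallelAddIndex); see get_index_class below.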
10 | ''' 11 | 12 | @classmethod 13 | def get_index_class(cls, index_type): 14 | return { 15 | "faiss-base" : FaissBaseIndex, 16 | "faiss-par-add" : FaissParallelAddIndex, 17 | }[index_type] 18 | 19 | @classmethod 20 | def get_index(cls, index_type): 21 | index_class = cls.get_index_class(index_type) 22 | index = index_class() 23 | return index 24 | -------------------------------------------------------------------------------- /tools/retro/index/index.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import abc 4 | import numpy as np 5 | import os 6 | import torch 7 | 8 | from megatron import get_retro_args 9 | from tools.retro.external_libs import faiss 10 | 11 | from .utils import get_index_dir 12 | 13 | 14 | class Index(abc.ABC): 15 | 16 | '''Abstract base class for indexes. 17 | 18 | *Note* : While currently only Faiss-based classes are implemented, in the 19 | future, this class will be extended with other types of indexes that have 20 | different performance-accuracy trade-offs. 21 | 22 | The primary methods to override are: 23 | - train() : Train index on the sampled training chunks. 24 | - add() : Add all training chunks to index. 25 | ''' 26 | 27 | @classmethod 28 | def c_verbose(cls, index, v): 29 | '''Make index object verbose.''' 30 | assert isinstance(v, bool) 31 | faiss.ParameterSpace().set_index_parameter(index, "verbose", v) 32 | 33 | def get_empty_index_path(self): 34 | args = get_retro_args() 35 | return os.path.join( 36 | get_index_dir(), 37 | "empty_%.3f.faissindex" % args.retro_index_train_load_fraction, 38 | ) 39 | 40 | def get_empty_index(self): 41 | return faiss.read_index(self.get_empty_index_path()) 42 | 43 | def get_added_index_path(self): 44 | args = get_retro_args() 45 | return os.path.join( 46 | get_index_dir(), 47 | "added_%.3f_%.3f.faissindex" % ( 48 | args.retro_index_train_load_fraction, 49 | args.retro_index_add_load_fraction, 50 | ), 51 | ) 52 | 53 | def get_added_index(self): 54 | return faiss.read_index(self.get_added_index_path()) 55 | 56 | @abc.abstractmethod 57 | def train(self, *args): 58 | pass 59 | 60 | @abc.abstractmethod 61 | def add(self, *args): 62 | pass 63 | 64 | def embed_text_dataset_block(self, embedder, text_dataset, _range): 65 | '''Embed a range of a text dataset.''' 66 | sub_dataset = torch.utils.data.Subset(text_dataset, range(*_range)) 67 | return embedder.embed_text_dataset(sub_dataset) 68 | -------------------------------------------------------------------------------- /tools/retro/index/indexes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .faiss_base import FaissBaseIndex 4 | from .faiss_par_add import FaissParallelAddIndex 5 | -------------------------------------------------------------------------------- /tools/retro/index/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
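# utils.py groups the path helpers for the Retro index working directory: the per-index directory, the training-embedding block files and their merged binary, and the added-codes directory.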
2 | 3 | import concurrent 4 | import gc 5 | import glob 6 | import numpy as np 7 | import os 8 | import psutil 9 | import time 10 | import torch 11 | from tqdm import tqdm 12 | 13 | from megatron import get_retro_args, print_rank_0 14 | from tools.retro.db.utils import get_indexed_dataset_infos 15 | from tools.retro.external_libs import h5py 16 | 17 | 18 | def get_index_dir(): 19 | """Create sub-directory for this index.""" 20 | 21 | args = get_retro_args() 22 | 23 | # Directory path. 24 | index_dir_path = os.path.join( 25 | args.retro_workdir, 26 | "index", 27 | args.retro_index_type, 28 | args.retro_index_str, 29 | ) 30 | 31 | # Make directory. 32 | os.makedirs(index_dir_path, exist_ok=True) 33 | 34 | return index_dir_path 35 | 36 | 37 | def num_samples_to_block_ranges(num_samples): 38 | '''Split a range (length num_samples) into sequence of block ranges 39 | of size block_size.''' 40 | args = get_retro_args() 41 | block_size = args.retro_block_size 42 | start_idxs = list(range(0, num_samples, block_size)) 43 | end_idxs = [min(num_samples, s + block_size) for s in start_idxs] 44 | ranges = list(zip(start_idxs, end_idxs)) 45 | return ranges 46 | 47 | 48 | def get_training_data_root_dir(): 49 | args = get_retro_args() 50 | return os.path.join(args.retro_workdir, "index", "train_emb") 51 | 52 | 53 | def get_training_data_block_dir(): 54 | return os.path.join(get_training_data_root_dir(), "blocks") 55 | 56 | 57 | def get_training_data_block_paths(): 58 | return sorted(glob.glob(get_training_data_block_dir() + "/*.hdf5")) 59 | 60 | 61 | def get_training_data_merged_path(): 62 | args = get_retro_args() 63 | return os.path.join(get_training_data_root_dir(), 64 | "train_%.3f.bin" % args.retro_index_train_load_fraction) 65 | 66 | 67 | def get_added_codes_dir(): 68 | return os.path.join(get_index_dir(), "add_codes") 69 | 70 | 71 | def get_added_code_paths(): 72 | return sorted(glob.glob(get_added_codes_dir() + "/*.hdf5")) 73 | -------------------------------------------------------------------------------- /tools/retro/query/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | from .query import query_pretraining_neighbors 4 | -------------------------------------------------------------------------------- /tools/retro/query/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import hashlib 4 | import os 5 | 6 | from megatron import get_retro_args 7 | 8 | 9 | def get_query_workdir(): 10 | args = get_retro_args() 11 | return os.path.join(args.retro_workdir, "query") 12 | 13 | 14 | def get_neighbor_dirname(key, dataset): 15 | return os.path.join(get_query_workdir(), os.path.basename(f"{key}_{dataset.unique_description_hash}")) 16 | -------------------------------------------------------------------------------- /tools/retro/sft/README.md: -------------------------------------------------------------------------------- 1 | ## Note 2 | 3 | The content within this `sft` directory is still under active development and will be updated soon. 
-------------------------------------------------------------------------------- /tools/retro/sft/open_inst.sh: -------------------------------------------------------------------------------- 1 | DATA_BLEND="1.0 open_inst" 2 | -------------------------------------------------------------------------------- /tools/retro/text_generation/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | # The following code is adapted from 3 | # https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/metrics.py, 4 | # which is licensed under the MIT license. More details on the license can be 5 | # found at https://github.com/facebookresearch/ParlAI/blob/master/LICENSE. 6 | 7 | """Provides standard metric evaluations for dialog.""" 8 | 9 | from collections import Counter 10 | from typing import List 11 | import numpy as np 12 | import re 13 | from nltk import ngrams 14 | 15 | re_art = re.compile(r'\b(a|an|the)\b') 16 | re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') 17 | 18 | 19 | def normalize_answer(s): 20 | """ 21 | Lower text and remove punctuation, articles and extra whitespace. 22 | """ 23 | s = s.lower() 24 | s = re_punc.sub(' ', s) 25 | s = re_art.sub(' ', s) 26 | s = ' '.join(s.split()) 27 | return s 28 | 29 | 30 | class F1Metric: 31 | """ 32 | Helper class which computes token-level F1. 33 | """ 34 | 35 | @staticmethod 36 | def _prec_recall_f1_score(pred_items, gold_items): 37 | """ 38 | Compute precision, recall and f1 given a set of gold and prediction items. 39 | :param pred_items: iterable of predicted values 40 | :param gold_items: iterable of gold values 41 | :return: tuple (p, r, f1) for precision, recall, f1 42 | """ 43 | common = Counter(gold_items) & Counter(pred_items) 44 | num_same = sum(common.values()) 45 | if num_same == 0: 46 | return 0, 0, 0 47 | precision = 1.0 * num_same / len(pred_items) 48 | recall = 1.0 * num_same / len(gold_items) 49 | f1 = (2 * precision * recall) / (precision + recall) 50 | return precision, recall, f1 51 | 52 | @staticmethod 53 | def compute_each_pair(guess: str, answer: str, n=1): 54 | if answer == "": 55 | return None, None, None 56 | if guess == "": 57 | return 0, 0, 0 58 | g_tokens = normalize_answer(guess).split() 59 | a_tokens = normalize_answer(answer).split() 60 | g_tokens = list(ngrams(g_tokens, n)) 61 | a_tokens = list(ngrams(a_tokens, n)) 62 | precision, recall, f1 = F1Metric._prec_recall_f1_score(g_tokens, a_tokens) 63 | return precision, recall, f1 64 | 65 | @staticmethod 66 | def compute_all_pairs(guesses: List[str], answers: List[str], n=1): 67 | # additional augment: 68 | print("guess:", len(guesses), ", answers:", len(answers)) 69 | assert len(guesses) == len(answers) 70 | 71 | precision_list, recall_list, f1_list = [], [], [] 72 | for guess, answer in zip(guesses, answers): 73 | precision, recall, f1 = F1Metric.compute_each_pair(guess, answer, n) 74 | if precision is None or recall is None or f1 is None: 75 | continue 76 | precision_list.append(precision) 77 | recall_list.append(recall) 78 | f1_list.append(f1) 79 | 80 | return np.mean(precision_list), np.mean(recall_list), np.mean(f1_list) 81 | -------------------------------------------------------------------------------- /tools/retro/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
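# utils.py collects shared helpers for Retro preprocessing: the stored-args path, chunks-per-sample computation, GPT and Bert tokenizer construction, and GPTToTextDataset for detokenizing GPT token chunks back to text.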
2 | 3 | import os 4 | import torch 5 | import types 6 | 7 | from megatron import get_retro_args 8 | from megatron.tokenizer.tokenizer import ( 9 | _BertWordPieceTokenizer, 10 | _GPT2BPETokenizer, 11 | _GPTSentencePieceTokenizer, 12 | ) 13 | 14 | 15 | def get_args_path(workdir): 16 | '''Argument copy stored within retro workdir.''' 17 | return os.path.join(workdir, "args.json") 18 | 19 | 20 | def get_num_chunks_per_sample(): 21 | '''Compute seq_length // chunk_length.''' 22 | args = get_retro_args() 23 | sample_length = args.retro_gpt_seq_length 24 | chunk_length = args.retro_gpt_chunk_length 25 | assert sample_length % chunk_length == 0 26 | return sample_length // chunk_length 27 | 28 | 29 | def get_gpt_tokenizer(): 30 | '''GPT (BPE) tokenizer.''' 31 | args = get_retro_args() 32 | tokenizer_type = args.retro_gpt_tokenizer_type 33 | if tokenizer_type == "GPT2BPETokenizer": 34 | assert args.retro_gpt_vocab_file and args.retro_gpt_merge_file 35 | return _GPT2BPETokenizer( 36 | vocab_file=args.retro_gpt_vocab_file, 37 | merge_file=args.retro_gpt_merge_file, 38 | ) 39 | elif tokenizer_type == 'GPTSentencePieceTokenizer': 40 | assert args.retro_gpt_tokenizer_model is not None 41 | return _GPTSentencePieceTokenizer(args.retro_gpt_tokenizer_model) 42 | else: 43 | raise Exception("unrecognized gpt tokenizer, '%s'." % tokenizer_type) 44 | 45 | 46 | def get_bert_tokenizer(): 47 | '''Bert (Wordpiece) tokenizer.''' 48 | args = get_retro_args() 49 | lower_case = { 50 | "BertWordPieceLowerCase" : True, 51 | "BertWordPieceCase" : False, 52 | }[args.retro_bert_tokenizer_type] 53 | return _BertWordPieceTokenizer( 54 | vocab_file=args.retro_bert_vocab_file, 55 | lower_case=lower_case, 56 | ) 57 | 58 | 59 | class GPTToTextDataset(torch.utils.data.Dataset): 60 | '''Dataset to convert GPT tokens to text.''' 61 | 62 | def __init__(self, gpt_dataset): 63 | 64 | super().__init__() 65 | 66 | self.gpt_dataset = gpt_dataset 67 | self.gpt_tokenizer = get_gpt_tokenizer() 68 | 69 | def __len__(self): 70 | return len(self.gpt_dataset) 71 | 72 | def __getitem__(self, idx): 73 | gpt_token_ids = self.gpt_dataset[idx]["text"].tolist() 74 | text = self.gpt_tokenizer.detokenize(gpt_token_ids) 75 | return {"text": text} 76 | -------------------------------------------------------------------------------- /tools/text_generation_cli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | import sys 3 | import json 4 | import requests 5 | 6 | 7 | if __name__ == "__main__": 8 | url = sys.argv[1] 9 | url = 'http://' + url + '/api' 10 | headers = {'Content-Type': 'application/json'} 11 | 12 | while True: 13 | sentence = input("Enter prompt: ") 14 | tokens_to_generate = int(eval(input("Enter number of tokens to generate: "))) 15 | 16 | data = {"prompts": [sentence], "tokens_to_generate": tokens_to_generate} 17 | response = requests.put(url, data=json.dumps(data), headers=headers) 18 | 19 | if response.status_code != 200: 20 | print(f"Error {response.status_code}: {response.json()['message']}") 21 | else: 22 | print("Megatron Response: ") 23 | print(response.json()['text'][0]) 24 | --------------------------------------------------------------------------------
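Example usage of tools/text_generation_cli.py (a sketch, assuming a Megatron text-generation REST server is already running and reachable; "localhost:5000" below is a placeholder for whatever host:port that server actually listens on):

    python tools/text_generation_cli.py localhost:5000
    Enter prompt: Megatron-LM is
    Enter number of tokens to generate: 32

For each prompt the script sends an HTTP PUT to http://<host:port>/api with a JSON body {"prompts": [...], "tokens_to_generate": N} and prints the first completion from response.json()['text'][0].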