├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── eval ├── README.md ├── configs │ └── eval_qwen2_linear_moe.py ├── lm_eval_linear_moe.sh ├── models │ └── qwen2_linear_moe.py └── run.py ├── examples ├── linear_llama3 │ ├── README.md │ ├── pretrain_llama.py │ └── run_pretrain_mcore_llama.sh ├── linear_moe_deepseek_v2 │ ├── README.md │ ├── pretrain_deepseek.py │ └── run_pretrain_deepseek.sh ├── linear_moe_mixtral │ ├── README.md │ ├── pretrain_mcore_mistral.py │ └── run_pretrain_mistral.sh └── linear_moe_qwen2 │ ├── README.md │ ├── check_tensorboard.sh │ ├── evaluate_huggingface_qwen.py │ ├── evaluate_mcore_qwen.py │ ├── pretrain_qwen.py │ ├── run_evaluate_huggingface_qwen.sh │ ├── run_evaluate_mcore_qwen.sh │ └── run_pretrain_qwen.sh ├── images ├── linear-moe-fig1.png └── linear-moe-fig2.png ├── linear_moe ├── __init__.py ├── arguments.py ├── data │ ├── __init__.py │ ├── bloom.py │ ├── glm.py │ ├── llama.py │ ├── llava │ │ ├── constants.py │ │ ├── conversation.py │ │ ├── cvcuda_image_processing_clip.py │ │ ├── mm_pretrain_dataset.py │ │ └── mm_utils.py │ ├── qwen_vl.py │ ├── starcoder.py │ └── utils.py ├── finetune_utils.py ├── generation │ ├── api.py │ ├── generation.py │ ├── gpt_predictor.py │ ├── megatron.md │ └── tokenization.py ├── initialize.py ├── lm_evaluate.py ├── model │ ├── __init__.py │ ├── common_modules │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── feature_map.py │ │ ├── l2norm.py │ │ ├── layernorm.py │ │ └── rotary.py │ ├── deepseek_v2 │ │ ├── __init__.py │ │ ├── hybrid │ │ │ ├── hybrid_model.py │ │ │ └── hybrid_transformer_block.py │ │ ├── layer_specs.py │ │ ├── model.py │ │ ├── moe │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── moe_layer.py │ │ │ ├── router.py │ │ │ ├── router_old.py │ │ │ └── token_dispatcher.py │ │ ├── rms_norm.py │ │ ├── transformer │ │ │ ├── attention.py │ │ │ └── mlp.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── yarn_rotary_pos_embedding.py │ ├── llama3 │ │ ├── __init__.py │ │ ├── gpt_model.py │ │ ├── hybrid │ │ │ ├── hybrid_model.py │ │ │ └── hybrid_transformer_block.py │ │ ├── language_model.py │ │ ├── layer_specs.py │ │ ├── model.py │ │ ├── rms_norm.py │ │ ├── transformer │ │ │ ├── attention.py │ │ │ └── mlp.py │ │ ├── transformer_config.py │ │ ├── transformer_layer.py │ │ └── transformer_legacy.py │ ├── mixtral │ │ ├── __init__.py │ │ ├── hybrid │ │ │ ├── hybrid_model.py │ │ │ └── hybrid_transformer_block.py │ │ ├── layer_specs.py │ │ ├── model.py │ │ ├── moe │ │ │ ├── __init__.py │ │ │ ├── experts.py │ │ │ ├── grouped_gemm_util.py │ │ │ ├── moe_layer.py │ │ │ ├── moe_utils.py │ │ │ ├── router.py │ │ │ └── token_dispatcher.py │ │ ├── rms_norm.py │ │ ├── transformer │ │ │ ├── attention.py │ │ │ └── mlp.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ └── transformer_layer.py │ └── qwen2 │ │ ├── hybrid │ │ ├── hybrid_model.py │ │ └── hybrid_transformer_block.py │ │ ├── layer_specs.py │ │ ├── model.py │ │ ├── moe │ │ ├── __init__.py │ │ ├── experts.py │ │ ├── moe_layer.py │ │ ├── router.py │ │ └── token_dispatcher.py │ │ ├── rms_norm.py │ │ ├── transformer │ │ ├── attention.py │ │ └── mlp.py │ │ ├── transformer_block.py │ │ ├── transformer_config.py │ │ └── transformer_layer.py ├── sequence_modeling │ ├── attention │ │ ├── __init__.py │ │ └── dot_product_attention.py │ ├── based │ │ ├── __init__.py │ │ └── based.py │ ├── basic_linear_attention │ │ ├── __init__.py │ │ └── basic_linear_attention.py │ ├── deltanet │ │ ├── __init__.py │ │ └── deltanet.py │ ├── 
gated_deltanet │ │ ├── __init__.py │ │ └── gated_deltanet.py │ ├── gla │ │ ├── __init__.py │ │ ├── gla.py │ │ └── gla_gate.py │ ├── hgrn2 │ │ ├── __init__.py │ │ └── hgrn2.py │ ├── lasp2 │ │ ├── __init__.py │ │ ├── lasp2.py │ │ ├── lasp2_with_mask_triton_op.py │ │ └── lasp2_without_mask_triton_op.py │ ├── lightning_attention │ │ ├── __init__.py │ │ └── lightning_attention.py │ ├── linear_attention.py │ ├── linear_rnn.py │ ├── mamba2 │ │ ├── __init__.py │ │ ├── mamba_block.py │ │ ├── mamba_hybrid_layer_allocation.py │ │ ├── mamba_layer.py │ │ ├── mamba_mixer.py │ │ ├── mamba_model.py │ │ └── triton_cache_manager.py │ ├── mom_linear_attention.py │ ├── rebased │ │ ├── __init__.py │ │ └── rebased.py │ ├── retention │ │ ├── __init__.py │ │ └── retention.py │ ├── rwkv6 │ │ ├── __init__.py │ │ ├── dd_lerp_linear.py │ │ └── rwkv6.py │ ├── rwkv7 │ │ ├── __init__.py │ │ ├── lora_mlp.py │ │ └── rwkv7.py │ └── ssm.py ├── tokenizer │ ├── __init__.py │ ├── icetk_glm130b_tokenizer.py │ ├── jiebabpe_tokenizer.py │ ├── tokenization_baichuan.py │ ├── tokenization_qwen_vl.py │ └── tokenization_yi.py ├── training.py └── utils.py ├── requirements.txt └── toolkits ├── model_checkpoints_convertor ├── README.md ├── baichuan │ ├── checkpoint_reshaping_and_interoperability.py │ ├── configuration_baichuan.py │ ├── hf2te.py │ ├── model_convertor.sh │ └── te_model_convertor.sh ├── baichuan2 │ ├── checkpoint_reshaping_and_interoperability.py │ ├── configuration_baichuan.py │ ├── hf2te.py │ ├── hf2te_convertor.sh │ └── model_convertor.sh ├── bloom │ ├── checkpoint_reshaping_and_interoperability.py │ ├── deepspeed_to_megatron.py │ ├── deepspeed_to_megatron_ori.py │ ├── model_convertor_huggingface_megatron.sh │ ├── reward_model_convertor_megatron.sh │ ├── reward_model_to_megatron.py │ ├── run_convert_deepspeed_to_megatron.sh │ └── run_convert_deepspeed_to_transformers.sh ├── chatglm │ ├── checkpoint_reshaping_and_interoperability.py │ └── run_convert_huggingface_to_megatron.sh ├── deepseek │ ├── hf2mcore_deepseek_v2_moe.py │ └── hf2mcore_deepseek_v2_moe_convertor.sh ├── falcon │ ├── checkpoint_reshaping_and_interoperability.py │ ├── configuration_RW.py │ └── model_convertor.sh ├── falcon40b │ ├── checkpoint_reshaping_and_interoperability.py │ ├── configuration_RW.py │ └── model_convertor.sh ├── galactica │ ├── checkpoint_reshaping_and_interoperability.py │ └── run_convert_huggingface_to_megatron.sh ├── glm │ ├── checkpoint_reshaping_and_interoperability.py │ └── run_convert_transformers_to_megatron.sh ├── glm130b │ ├── checkpoint_reshaping_and_interoperability.py │ ├── merge_130b_ckpts.py │ └── run_convert_transformers_to_megatron.sh ├── llama │ ├── hf2mcore.py │ ├── hf2mcore_70b.py │ ├── hf2mcore_convertor.sh │ ├── hf2megatron.py │ ├── hf2megatron_convertor.sh │ └── hf_llama_moe │ │ ├── config_TEMPLATE.json │ │ └── llama_moe.py ├── mistral │ ├── hf2mcore.py │ ├── hf2mcore_convertor.sh │ ├── hf2mcore_mixtral.py │ ├── hf2megatron.py │ ├── hf2megatron_convertor.sh │ └── hf_mistral_moe │ │ └── config_TEMPLATE.json ├── qwen │ ├── hf2mcore_qwen1.5_dense_convertor.sh │ ├── hf2mcore_qwen1.5_dense_gqa.py │ ├── hf2mcore_qwen1.5_dense_mha.py │ ├── hf2mcore_qwen1.5_dense_mha_to_moe.py │ ├── hf2mcore_qwen1.5_dense_to_moe_convertor.sh │ ├── hf2mcore_qwen1.5_moe.py │ ├── hf2mcore_qwen1.5_moe_convertor.sh │ ├── hf2mcore_qwen2_convertor.sh │ ├── hf2mcore_qwen2_dense_and_moe_gqa.py │ ├── hf2megablocks_qwen1.5.py │ ├── hf2megablocks_qwen1.5_convertor.sh │ ├── hf2megatron_convertor.sh │ ├── hf2megatron_qwen1.0.py │ └── 
hf2megatron_qwen1.5.py ├── starcoder │ ├── checkpoint_reshaping_and_interoperability.py │ └── model_convertor.sh └── yi │ ├── checkpoint_reshaping_and_interoperability.py │ └── model_convertor.sh └── pretrain_data_preprocessing ├── README.md ├── clean_raw_text.py ├── convert_json_to_list.py ├── img.png ├── preprocess_data.py ├── preprocess_data_megatron.py ├── preprocess_wudao2.py ├── qwen_hf_preprocess_datasets.py ├── run_make_pretraining_dataset.sh ├── run_make_pretraining_dataset_megatron.sh ├── run_make_pretraining_dataset_megatron_slimpajama.sh ├── run_make_pretraining_dataset_megatron_slimpajama_chunk1_chunk2.sh ├── run_prepare_dataset.sh └── run_prepare_wudao.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | cmake 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 
110 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 111 | .pdm.toml 112 | .pdm-python 113 | .pdm-build/ 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | 165 | # Linear-MoE 166 | .idea 167 | Megatron-LM* 168 | LM-Evaluation-Harness* 169 | *.pyc 170 | output 171 | outputs 172 | checkpoint 173 | data-cache 174 | tensorboard 175 | log 176 | triton-cache 177 | data 178 | *.zip -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/Megatron-LM-0.4.0"] 2 | path = third_party/Megatron-LM-0.4.0 3 | url = https://github.com/NVIDIA/Megatron-LM.git 4 | [submodule "third_party/Megatron-LM-0.9.0"] 5 | path = third_party/Megatron-LM-0.9.0 6 | url = https://github.com/NVIDIA/Megatron-LM.git 7 | [submodule "third_party/flash-linear-attention-1018"] 8 | path = third_party/flash-linear-attention-1018 9 | url = https://github.com/sustcsonglin/flash-linear-attention.git 10 | [submodule "third_party/lm-evaluation-harness"] 11 | path = third_party/lm-evaluation-harness 12 | url = https://github.com/weigao266/lm-evaluation-harness.git 13 | [submodule "third_party/flash-linear-attention-250303"] 14 | path = third_party/flash-linear-attention-250303 15 | url = https://github.com/fla-org/flash-linear-attention 16 | -------------------------------------------------------------------------------- /eval/README.md: -------------------------------------------------------------------------------- 1 | # Linear-MoE Evaluation 2 | 3 | First you should install `lm-evaluation-harness` in [third_party/lm-evaluation-harness](third_party/lm-evaluation-harness) by: 4 | 5 | ```bash 6 | cd third_party/lm-evaluation-harness 7 | pip install -e . 8 | ``` 9 | 10 | Edit `lm_eval_linear_moe.sh` to set checkpoint path like: 11 | 12 | ```bash 13 | CHECKPOINT_DIR=/your/checkpoint/dir 14 | CHECKPOINT_PATH=${CHECKPOINT_DIR}/pretrain-mcore-linear_attention-qwen2-A0.3B-lr-1e-4-minlr-1e-5-bs-8-gbs-64-seqlen-2048-pr-bf16-tp-1-pp-1-ac-sel-do-true-sp-false-tt-15000000000-wt-10000 15 | ``` 16 | 17 | and set the model and training configurations like: 18 | 19 | ```bash 20 | MODEL_SIZE=A0.3B 21 | SEQ_LEN=2048 22 | PAD_LEN=2048 23 | PR=bf16 24 | TP=1 25 | PP=1 26 | EP=1 27 | 28 | ... 
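# LA_MODULE selects the linear sequence-modeling module and BASE_MODEL the backbone
# it plugs into. In LAYER_TYPE_LIST, "L" denotes a linear-module layer and "N" a
# standard (softmax) attention layer, so an all-"L" list gives a pure linear model
# while mixing in "N" layers gives the hybrid model mentioned below.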
29 | 30 | LA_MODULE="linear_attention" 31 | BASE_MODEL="qwen2" 32 | 33 | # set LAYER_TYPE_LIST="LLLNLLLNLLLN" for hybrid model 34 | LAYER_TYPE_LIST="LLLLLLLLLLLL" 35 | 36 | # Linear-MoE options 37 | linear_moe_options=" \ 38 | --use-la-module \ 39 | --la-module ${LA_MODULE} \ 40 | --la-mode fused_chunk \ 41 | --base-model ${BASE_MODEL} \ 42 | --la-feature-map elu \ 43 | --la-output-norm rmsnorm \ 44 | --la-gate-fn swish \ 45 | --layer-type-list ${LAYER_TYPE_LIST} \ 46 | " 47 | ``` 48 | 49 | then set the evaluation task and other configurations for `lm-evaluation-harness`: 50 | 51 | ```bash 52 | run_cmd="torchrun $DISTRIBUTED_ARGS --no-python lm_eval \ 53 | --model linear_moe \ 54 | --model_args path=${CHECKPOINT_PATH} max_length=2048 \ 55 | --tasks piqa \ 56 | --device cuda \ 57 | --batch_size 16 \ 58 | --output_path lm_eval_result \ 59 | ${megatron_options} ${pr_options} ${load_options} ${input_options} ${te_options} ${activation_checkpoint_options} ${do_options} ${flash_options} ${sp_options} ${moe_options} ${linear_moe_options}" 60 | ``` 61 | 62 | After finishing the above settings, evaluate Linear-MoE models by: 63 | 64 | ```bash 65 | sh lm_eval_linear_moe.sh 66 | ``` 67 | 68 | The evaluation results would be presented like below: 69 | 70 | ```bash 71 | linear_moe (path=/your/checkpoint/dir/pretrain-mcore-linear_attention-qwen2-A0.3B-lr-1e-4-minlr-1e-5-bs-8-gbs-64-seqlen-2048-pr-bf16-tp-1-pp-1-ac-sel-do-true-sp-false-tt-15000000000-wt-10000), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 16 72 | |Tasks|Version|Filter|n-shot| Metric | |Value | |Stderr| 73 | |-----|------:|------|-----:|--------|---|-----:|---|-----:| 74 | |piqa | 1|none | 0|acc |↑ |0.6436|± |0.0112| 75 | | | |none | 0|acc_norm|↑ |0.6436|± |0.0112| 76 | ``` 77 | -------------------------------------------------------------------------------- /eval/configs/eval_qwen2_linear_moe.py: -------------------------------------------------------------------------------- 1 | from mmengine.config import read_base 2 | 3 | with read_base(): 4 | from opencompass.configs.datasets.wikitext.wikitext_103_raw_ppl import wikitext_103_raw_datasets 5 | from opencompass.configs.datasets.lambada.lambada_gen import lambada_datasets 6 | from opencompass.configs.datasets.piqa.piqa_gen import piqa_datasets 7 | from opencompass.configs.datasets.hellaswag.hellaswag_gen import hellaswag_datasets 8 | from opencompass.configs.datasets.winogrande.winogrande_gen import winogrande_datasets 9 | from opencompass.configs.datasets.ARC_e.ARC_e_gen import ARC_e_datasets # ARC-easy 10 | from opencompass.configs.datasets.ARC_c.ARC_c_gen import ARC_c_datasets # ARC-challenge 11 | 12 | datasets = winogrande_datasets 13 | 14 | from eval.models.qwen2_linear_moe import Qwen2LinearMoe 15 | 16 | models = [ 17 | dict( 18 | type=Qwen2LinearMoe, 19 | path="", #'huggyllama/llama-7b', 20 | model_kwargs=dict(device_map='auto'), 21 | tokenizer_path="", # 'huggyllama/llama-7b', 22 | tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), 23 | max_seq_len=2048, 24 | max_out_len=512, 25 | run_cfg=dict(num_gpus=1, num_procs=1), 26 | batch_size=32, 27 | ) 28 | ] 29 | 30 | -------------------------------------------------------------------------------- /eval/run.py: -------------------------------------------------------------------------------- 1 | from cli.main import main 2 | 3 | if __name__ == '__main__': 4 | main() 5 | -------------------------------------------------------------------------------- /examples/linear_moe_qwen2/check_tensorboard.sh: 
-------------------------------------------------------------------------------- 1 | 2 | tensorboard --logdir=./output/tensorboard/ -------------------------------------------------------------------------------- /examples/linear_moe_qwen2/run_evaluate_huggingface_qwen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # sh run_evaluate_huggingface_qwen.sh 0.5B 1 256 256 bf16 /mnt/qwen-datasets/alpaca_zh-qwen-valid.json /mnt/qwen-ckpts/Qwen2-0.5B 3 | 4 | set -e 5 | CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" 6 | MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR})) 7 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240612 8 | export CUDA_DEVICE_MAX_CONNECTIONS=1 9 | 10 | export CUDA_VISIBLE_DEVICES=6 11 | MASTER_ADDR=localhost 12 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 13 | NNODES=1 14 | NODE_RANK=0 15 | GPUS_PER_NODE=1 16 | 17 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 18 | 19 | MODEL_SIZE=$1 20 | BATCH_SIZE=$2 21 | SEQ_LEN=$3 22 | PAD_LEN=$4 23 | PR=$5 24 | DATASET_PATH=$6 25 | PRETRAIN_CHECKPOINT_PATH=$7 26 | 27 | 28 | if [ $MODEL_SIZE = 0.5B ]; then 29 | 30 | HIDDEN_SIZE=896 31 | INTERMEDIATE_SIZE=4864 32 | MAX_POSITION_EMBEDDINGS=131072 33 | MAX_WINDOW_LAYERS=24 34 | NUM_ATTENTION_HEADS=14 35 | NUM_HIDDEN_LAYERS=24 36 | NUM_KEY_VALUE_HEADS=2 37 | RMS_NORM_EPS=1e-6 38 | ROPE_THETA=1000000 39 | SLIDING_WINDOW=131072 40 | EXTRA_VOCAB_SIZE=293 41 | 42 | moe_options=" \ 43 | " 44 | 45 | elif [ $MODEL_SIZE = 1.5B ]; then 46 | 47 | HIDDEN_SIZE=1536 48 | INTERMEDIATE_SIZE=8960 49 | MAX_POSITION_EMBEDDINGS=131072 50 | MAX_WINDOW_LAYERS=28 51 | NUM_ATTENTION_HEADS=12 52 | NUM_HIDDEN_LAYERS=28 53 | NUM_KEY_VALUE_HEADS=2 54 | RMS_NORM_EPS=1e-6 55 | ROPE_THETA=1000000 56 | SLIDING_WINDOW=131072 57 | EXTRA_VOCAB_SIZE=293 58 | 59 | moe_options=" \ 60 | " 61 | 62 | elif [ $MODEL_SIZE = 7B ]; then 63 | 64 | HIDDEN_SIZE=3584 65 | INTERMEDIATE_SIZE=18944 66 | MAX_POSITION_EMBEDDINGS=131072 67 | MAX_WINDOW_LAYERS=28 68 | NUM_ATTENTION_HEADS=28 69 | NUM_HIDDEN_LAYERS=28 70 | NUM_KEY_VALUE_HEADS=4 71 | RMS_NORM_EPS=1e-6 72 | ROPE_THETA=1000000 73 | SLIDING_WINDOW=131072 74 | EXTRA_VOCAB_SIZE=421 75 | 76 | moe_options=" \ 77 | " 78 | 79 | elif [ $MODEL_SIZE = 72B ]; then 80 | 81 | HIDDEN_SIZE=8192 82 | INTERMEDIATE_SIZE=29568 83 | MAX_POSITION_EMBEDDINGS=131072 84 | MAX_WINDOW_LAYERS=80 85 | NUM_ATTENTION_HEADS=64 86 | NUM_HIDDEN_LAYERS=80 87 | NUM_KEY_VALUE_HEADS=8 88 | RMS_NORM_EPS=1e-5 89 | ROPE_THETA=1000000 90 | SLIDING_WINDOW=131072 91 | EXTRA_VOCAB_SIZE=421 92 | 93 | moe_options=" \ 94 | " 95 | 96 | fi 97 | 98 | if [ $PR = fp16 ]; then 99 | pr_options=" \ 100 | --fp16" 101 | elif [ $PR = bf16 ]; then 102 | pr_options=" \ 103 | --bf16" 104 | fi 105 | 106 | if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then 107 | load_options=" \ 108 | --load $PRETRAIN_CHECKPOINT_PATH" 109 | fi 110 | 111 | 112 | megatron_options=" \ 113 | --transformer-type huggingface \ 114 | --valid-data-path ${DATASET_PATH} 115 | --micro-batch-size ${BATCH_SIZE} \ 116 | --num-layers ${NUM_HIDDEN_LAYERS} \ 117 | --hidden-size ${HIDDEN_SIZE} \ 118 | --num-attention-heads ${NUM_ATTENTION_HEADS} \ 119 | --seq-length ${SEQ_LEN} \ 120 | --max-position-embeddings ${SEQ_LEN} \ 121 | --log-interval 1 \ 122 | --eval-interval 100 \ 123 | --eval-iters 10 \ 124 | --tensor-model-parallel-size 1 \ 125 | --pipeline-model-parallel-size 1 \ 126 | --no-load-optim \ 
127 | --num-workers 0 \ 128 | --dataset LLama-SFT \ 129 | --use-distributed-optimizer \ 130 | --max-padding-length ${PAD_LEN} \ 131 | --extra-vocab-size ${EXTRA_VOCAB_SIZE} \ 132 | --patch-tokenizer-type LLamaTokenizer 133 | " 134 | 135 | run_cmd="torchrun $DISTRIBUTED_ARGS evaluate_huggingface_qwen.py 136 | ${megatron_options} ${pr_options} ${load_options}" 137 | 138 | echo ${run_cmd} 139 | eval ${run_cmd} 140 | set +x 141 | -------------------------------------------------------------------------------- /images/linear-moe-fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSparseLLMs/Linear-MoE/b312754b9b8a2cb9eb15e373baabe263b8d409c1/images/linear-moe-fig1.png -------------------------------------------------------------------------------- /images/linear-moe-fig2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSparseLLMs/Linear-MoE/b312754b9b8a2cb9eb15e373baabe263b8d409c1/images/linear-moe-fig2.png -------------------------------------------------------------------------------- /linear_moe/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /linear_moe/data/bloom.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import numpy as np 17 | import torch 18 | 19 | from linear_moe.tokenizer import get_tokenizer 20 | 21 | class BloomRawDataset(torch.utils.data.Dataset): 22 | """A class for processing a Bloom text dataset""" 23 | def __init__(self, datapaths, max_seq_length): 24 | """ 25 | Initializes the dataset. 26 | Args: 27 | path(str): The path of the dataset file. 28 | tokenizer(object): The tokenizer object. 29 | max_seq_length(int): The maximum length of sequences. 
30 | """ 31 | self.tokenizer = get_tokenizer() 32 | self.max_seq_length = max_seq_length 33 | self.prompt = '' 34 | self.samples = [] 35 | for datapath in datapaths: 36 | self.samples.extend( 37 | self.process_samples_from_single_path(datapath)) 38 | print(' >> total number of samples: {}'.format(len(self.samples))) 39 | 40 | def __len__(self): 41 | return len(self.samples) 42 | 43 | def __getitem__(self, idx): 44 | raw_sample = self.samples[idx] 45 | return self.gpt_convert_example_to_feature(raw_sample, self.tokenizer, 46 | self.max_seq_length) 47 | 48 | def truncate(self, tokenizer, array, max_length): 49 | """ 50 | Truncates an array to a maximum length or pads it with zeros if its length is less than `max_length`. 51 | Args: 52 | tokenizer: The tokenizer used to encode the input. 53 | array: The numpy array to truncate or pad. 54 | max_length: The maximum length of the array. 55 | Returns: 56 | A numpy array of length `max_length` containing the contents of `array`, truncated if necessary or padded with zeros. 57 | """ 58 | 59 | if len(array) < max_length: 60 | return np.pad(array, (0, max_length - len(array)), 61 | constant_values=tokenizer.eod) 62 | else: 63 | return array[:max_length] 64 | 65 | def process_samples_from_single_path(self, filename): 66 | """ 67 | Process a single file containing prompt-answer pairs and return a list of samples. 68 | """ 69 | 70 | print(' > Processing {} ...'.format(filename)) 71 | samples = [] 72 | total = 0 73 | with open(filename, encoding='utf-8-sig') as f: 74 | for example in f: 75 | text = json.loads(example)['text'] 76 | sample = { 77 | 'prompt': 78 | text + '' if not text.endswith('') else text, 79 | 'answer': text, 80 | } 81 | total += 1 82 | samples.append(sample) 83 | 84 | print(' >> processed {} samples.'.format(len(samples))) 85 | return samples 86 | 87 | def gpt_convert_example_to_feature(self, sample, tokenizer, 88 | max_seq_length): 89 | """ 90 | Convert a single sample containing a prompt-answer pair into a format suitable for GPT training. 91 | """ 92 | 93 | tokens = tokenizer(sample['prompt']) 94 | input_ids = tokens['input_ids'] 95 | input_ids = self.truncate(tokenizer, input_ids, max_seq_length + 1) 96 | train_sample = {'input_ids': np.array(input_ids)} 97 | return train_sample -------------------------------------------------------------------------------- /linear_moe/data/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "" 11 | DEFAULT_IM_START_TOKEN = "" 12 | DEFAULT_IM_END_TOKEN = "" 13 | -------------------------------------------------------------------------------- /linear_moe/data/llava/mm_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from PIL import Image 16 | from io import BytesIO 17 | import base64 18 | import torch 19 | from transformers import StoppingCriteria 20 | from linear_moe.data.llava.constants import IMAGE_TOKEN_INDEX 21 | 22 | 23 | def load_image_from_base64(image): 24 | return Image.open(BytesIO(base64.b64decode(image))) 25 | 26 | 27 | def expand2square(pil_img, background_color): 28 | width, height = pil_img.size 29 | if width == height: 30 | return pil_img 31 | elif width > height: 32 | result = Image.new(pil_img.mode, (width, width), background_color) 33 | result.paste(pil_img, (0, (width - height) // 2)) 34 | return result 35 | else: 36 | result = Image.new(pil_img.mode, (height, height), background_color) 37 | result.paste(pil_img, ((height - width) // 2, 0)) 38 | return result 39 | 40 | 41 | def process_images(images, image_processor, model_cfg): 42 | image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) 43 | new_images = [] 44 | if image_aspect_ratio == 'pad': 45 | for image in images: 46 | image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean)) 47 | image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0] 48 | new_images.append(image) 49 | else: 50 | return image_processor(images, return_tensors='pt')['pixel_values'] 51 | if all(x.shape == new_images[0].shape for x in new_images): 52 | new_images = torch.stack(new_images, dim=0) 53 | return new_images 54 | 55 | 56 | def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): 57 | prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] 58 | def insert_separator(X, sep): 59 | return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1] 60 | input_ids = [] 61 | offset = 0 62 | if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: 63 | offset = 1 64 | input_ids.append(prompt_chunks[0][0]) 65 | 66 | for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): 67 | input_ids.extend(x[offset:]) 68 | if return_tensors is not None: 69 | if return_tensors == 'pt': 70 | return torch.tensor(input_ids, dtype=torch.long) 71 | raise ValueError(f'Unsupported tensor type: {return_tensors}') 72 | 73 | return input_ids 74 | 75 | 76 | def get_model_name_from_path(model_path): 77 | model_path = model_path.strip("/") 78 | model_paths = model_path.split("/") 79 | if model_paths[-1].startswith('checkpoint-'): 80 | return model_paths[-2] + "_" + model_paths[-1] 81 | else: 82 | return model_paths[-1] 83 | 84 | 85 | 86 | 87 | class KeywordsStoppingCriteria(StoppingCriteria): 88 | def __init__(self, keywords, tokenizer, input_ids): 89 | self.keywords = keywords 90 | self.keyword_ids = [] 91 | self.max_keyword_len = 0 92 | for keyword in keywords: 93 | cur_keyword_ids = tokenizer(keyword).input_ids 94 | if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id: 95 | cur_keyword_ids = cur_keyword_ids[1:] 96 | if len(cur_keyword_ids) > self.max_keyword_len: 97 | self.max_keyword_len = len(cur_keyword_ids) 98 | self.keyword_ids.append(torch.tensor(cur_keyword_ids)) 99 | self.tokenizer = tokenizer 100 | self.start_len = input_ids.shape[1] 101 | 102 | def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 103 | assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO 104 | 
offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len) 105 | self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids] 106 | for keyword_id in self.keyword_ids: 107 | if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all(): 108 | return True 109 | outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0] 110 | for keyword in self.keywords: 111 | if keyword in outputs: 112 | return True 113 | return False -------------------------------------------------------------------------------- /linear_moe/data/qwen_vl.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | from typing import Dict 17 | import torch 18 | from torch.utils.data import Dataset 19 | import transformers 20 | from transformers.trainer_pt_utils import LabelSmoother 21 | try: 22 | from megatron import get_args 23 | except: 24 | from megatron.training import get_args 25 | 26 | from linear_moe.tokenizer import get_tokenizer 27 | 28 | IGNORE_TOKEN_ID = LabelSmoother.ignore_index 29 | 30 | def preprocess( 31 | sources, 32 | tokenizer: transformers.PreTrainedTokenizer, 33 | max_len: int, 34 | system_message: str = "You are a helpful assistant." 35 | ) -> Dict: 36 | """ 37 | Preprocess conversation data for the model input. 38 | 39 | Parameters: 40 | sources (List[Dict]): A list of conversation segments. 41 | tokenizer (PreTrainedTokenizer): A tokenizer instance. 42 | max_len (int): The maximum sequence length. 43 | system_message (str, optional): A default system message. 44 | 45 | Returns: 46 | Dict: A dictionary with 'input_ids', 'labels', and 'attention_mask'. 
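    Note:
        Conversations are rendered in Qwen's ChatML layout
        (<|im_start|>role\n...<|im_end|>\n). In `labels`, the system/user spans and the
        assistant role header are masked to IGNORE_TOKEN_ID so loss is only computed on
        assistant responses, and every sequence is padded or truncated to `max_len`.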
47 | """ 48 | roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"} 49 | im_start = tokenizer.im_start_id 50 | im_end = tokenizer.im_end_id 51 | nl_tokens = tokenizer('\n').input_ids 52 | _system = tokenizer('system').input_ids + nl_tokens 53 | _user = tokenizer('user').input_ids + nl_tokens 54 | _assistant = tokenizer('assistant').input_ids + nl_tokens 55 | 56 | # Apply prompt templates 57 | input_ids, targets = [], [] 58 | for i, source in enumerate(sources): 59 | if roles[source[0]["from"]] != roles["user"]: 60 | source = source[1:] 61 | 62 | input_id, target = [], [] 63 | system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens 64 | input_id += system 65 | target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens 66 | assert len(input_id) == len(target) 67 | for sentence in enumerate(source): 68 | role = roles[sentence["from"]] 69 | _input_id = tokenizer(role).input_ids + nl_tokens + \ 70 | tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens 71 | input_id += _input_id 72 | if role == '<|im_start|>user': 73 | _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens 74 | elif role == '<|im_start|>assistant': 75 | _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \ 76 | _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens 77 | else: 78 | raise NotImplementedError 79 | target += _target 80 | assert len(input_id) == len(target) 81 | input_id += [tokenizer.pad_token_id] * (max_len - len(input_id)) 82 | target += [IGNORE_TOKEN_ID] * (max_len - len(target)) 83 | input_ids.append(input_id[:max_len]) 84 | targets.append(target[:max_len]) 85 | input_ids = torch.tensor(input_ids, dtype=torch.int) 86 | targets = torch.tensor(targets, dtype=torch.int) 87 | 88 | return dict( 89 | input_ids=input_ids, 90 | labels=targets, 91 | attention_mask=input_ids.ne(tokenizer.pad_token_id), 92 | ) 93 | 94 | class LazySupervisedDataset(Dataset): 95 | """ 96 | A dataset class for supervised fine-tuning. 97 | 98 | Attributes: 99 | data_path (str): Path to the JSON file containing the conversational data. 100 | """ 101 | 102 | def __init__(self, data_path: str): 103 | super(LazySupervisedDataset, self).__init__() 104 | self.args = get_args() 105 | self.tokenizer = get_tokenizer() 106 | self.max_len = self.args.max_padding_length 107 | self.raw_data = json.load(open(data_path[0], "r")) 108 | self.cached_data_dict = {} 109 | 110 | def __len__(self): 111 | return len(self.raw_data) 112 | 113 | def __getitem__(self, i) -> Dict[str, torch.Tensor]: 114 | if i in self.cached_data_dict: 115 | return self.cached_data_dict[i] 116 | 117 | ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len) 118 | ret = dict( 119 | input_ids=ret["input_ids"][0], 120 | labels=ret["labels"][0], 121 | attention_mask=ret["attention_mask"][0], 122 | ) 123 | self.cached_data_dict[i] = ret 124 | 125 | return ret -------------------------------------------------------------------------------- /linear_moe/data/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI and Nvidia Megatron-LM Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | from megatron.core import mpu 17 | try: 18 | from megatron import get_args 19 | except: 20 | from megatron.training import get_args 21 | try: 22 | from megatron.utils import get_ltor_masks_and_position_ids 23 | except: 24 | from megatron.training.utils import get_ltor_masks_and_position_ids 25 | 26 | from linear_moe.tokenizer import get_tokenizer 27 | 28 | 29 | def get_batch_on_this_tp_rank_original(data_iterator): 30 | args = get_args() 31 | tokenizer = get_tokenizer() 32 | def _broadcast(item): 33 | torch.distributed.broadcast(item, mpu.get_tensor_model_parallel_src_rank(), 34 | group=mpu.get_tensor_model_parallel_group()) 35 | 36 | if mpu.get_tensor_model_parallel_rank() == 0: 37 | 38 | if isinstance(data_iterator, dict): 39 | data = data_iterator 40 | else: 41 | data = next(data_iterator) 42 | 43 | tokens_ = data['input_ids'].long() 44 | labels_ = data['labels'].long() 45 | tokens = tokens_[:, :-1].contiguous() 46 | labels = labels_[:, 1:].contiguous() 47 | # core/tensor_parallel/cross_entropy.py, target_mask = (target < vocab_start_index) | (target >= vocab_end_index) 48 | labels[labels == tokenizer.eos_token_id] = -100 49 | labels[labels == tokenizer.pad_token_id] = -100 50 | 51 | attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( 52 | labels, 53 | -100, 54 | args.reset_position_ids, 55 | args.reset_attention_mask, 56 | args.eod_mask_loss) 57 | 58 | batch = { 59 | 'tokens': tokens.cuda(non_blocking=True), 60 | 'labels': labels.cuda(non_blocking=True), 61 | 'loss_mask': loss_mask.cuda(non_blocking=True), 62 | 'attention_mask': attention_mask.cuda(non_blocking=True), 63 | 'position_ids': position_ids.cuda(non_blocking=True) 64 | } 65 | 66 | if args.pipeline_model_parallel_size == 1: 67 | _broadcast(batch['tokens']) 68 | _broadcast(batch['labels']) 69 | _broadcast(batch['loss_mask']) 70 | _broadcast(batch['attention_mask']) 71 | _broadcast(batch['position_ids']) 72 | 73 | elif mpu.is_pipeline_first_stage(): 74 | _broadcast(batch['tokens']) 75 | _broadcast(batch['attention_mask']) 76 | _broadcast(batch['position_ids']) 77 | 78 | elif mpu.is_pipeline_last_stage(): 79 | _broadcast(batch['labels']) 80 | _broadcast(batch['loss_mask']) 81 | _broadcast(batch['attention_mask']) 82 | 83 | else: 84 | 85 | tokens = torch.empty((args.micro_batch_size, args.seq_length), dtype=torch.int64, 86 | device=torch.cuda.current_device()) 87 | labels = torch.empty((args.micro_batch_size, args.seq_length), dtype=torch.int64, 88 | device=torch.cuda.current_device()) 89 | loss_mask = torch.empty((args.micro_batch_size, args.seq_length), dtype=torch.float32, 90 | device=torch.cuda.current_device()) 91 | attention_mask = torch.empty((args.micro_batch_size, 1, args.seq_length, args.seq_length), dtype=torch.bool, 92 | device=torch.cuda.current_device()) 93 | position_ids = torch.empty((args.micro_batch_size, args.seq_length), dtype=torch.int64, 94 | device=torch.cuda.current_device()) 95 | 96 | if args.pipeline_model_parallel_size == 1: 97 | _broadcast(tokens) 98 | _broadcast(labels) 99 | _broadcast(loss_mask) 100 | 
_broadcast(attention_mask) 101 | _broadcast(position_ids) 102 | 103 | elif mpu.is_pipeline_first_stage(): 104 | labels = None 105 | loss_mask = None 106 | 107 | _broadcast(tokens) 108 | _broadcast(attention_mask) 109 | _broadcast(position_ids) 110 | 111 | elif mpu.is_pipeline_last_stage(): 112 | tokens = None 113 | position_ids = None 114 | 115 | _broadcast(labels) 116 | _broadcast(loss_mask) 117 | _broadcast(attention_mask) 118 | 119 | batch = { 120 | 'tokens': tokens, 121 | 'labels': labels, 122 | 'loss_mask': loss_mask, 123 | 'attention_mask': attention_mask, 124 | 'position_ids': position_ids 125 | } 126 | 127 | return batch -------------------------------------------------------------------------------- /linear_moe/generation/gpt_predictor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI and Nvidia Megatron-LM Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import torch 17 | 18 | from megatron.core.enums import ModelType 19 | from megatron.training import get_args 20 | from megatron.training import get_timers 21 | from megatron.training import get_model 22 | from megatron.training.checkpointing import load_checkpoint 23 | 24 | from linear_moe.generation.api import generate_and_post_process 25 | from linear_moe.tokenizer import build_tokenizer 26 | 27 | class GPTPredictor(): 28 | """A Predictor for model.""" 29 | def __init__(self): 30 | super().__init__() 31 | 32 | def predict(self): 33 | """Run predict process """ 34 | 35 | args = get_args() 36 | build_tokenizer(args) 37 | timers = get_timers() 38 | 39 | args.train_iters = 1 40 | # Model, optimizer, and learning rate. 
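        # The model is built without DDP wrapping and only its weights are restored
        # from --load; optimizer and scheduler state are skipped for inference, which
        # is why load_checkpoint is called as load_checkpoint(model, None, None).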
41 | timers('model-and-optimizer-setup', log_level=0).start(barrier=True) 42 | model = get_model(self.model_provider, 43 | model_type=ModelType.encoder_or_decoder, 44 | wrap_with_ddp=False) 45 | assert args.load is not None 46 | if args.load is not None and args.no_load_optim: 47 | load_checkpoint(model, None, None) 48 | timers('model-and-optimizer-setup').stop() 49 | torch.distributed.barrier() 50 | 51 | timers = get_timers() 52 | timers('load-checkpoint', log_level=0).start(barrier=True) 53 | timers('load-checkpoint').stop() 54 | timers.log(['load-checkpoint']) 55 | timers.log(['model-and-optimizer-setup']) 56 | 57 | if not isinstance(model, list): 58 | model = [model] 59 | 60 | assert len(model) == 1, 'Above condition should have caught this' 61 | model = model[0] 62 | if args.text_generate_input_file != '': 63 | num_examples = len(open(args.text_generate_input_file).readlines()) 64 | prompts = [] 65 | pred_outputs = [] 66 | with open(args.text_generate_input_file, 67 | encoding='utf-8') as reader,\ 68 | open(args.text_generate_output_file, 69 | 'w', encoding='utf-8') as writer: 70 | buffer = [] 71 | 72 | for idx, line in enumerate(reader): 73 | line = line.strip() 74 | json_obj = json.loads(line) 75 | line = json_obj['query'][:args.seq_length] 76 | prompts.append(line) 77 | if len(buffer) < args.micro_batch_size: 78 | buffer.append(line) 79 | 80 | if len( 81 | buffer 82 | ) == args.micro_batch_size or idx == num_examples - 1: 83 | sl = args.out_seq_length 84 | tk = args.top_k 85 | tp = args.top_p 86 | temperature = args.temperature 87 | prompts_plus_generations, _, _, _ = \ 88 | generate_and_post_process(model, 89 | prompts=buffer, 90 | tokens_to_generate=sl, 91 | top_k_sampling=tk, 92 | temperature=temperature, 93 | top_p_sampling=tp) 94 | 95 | for prompt, p_and_g in zip(buffer, 96 | prompts_plus_generations): 97 | generation = p_and_g.replace('<|endoftext|>', '') 98 | print(p_and_g) 99 | writer.write(generation + '\n') 100 | pred_outputs.append(generation) 101 | buffer.clear() 102 | 103 | if idx % args.micro_batch_size == 0: 104 | print('processed {} examples'.format(idx)) 105 | -------------------------------------------------------------------------------- /linear_moe/generation/megatron.md: -------------------------------------------------------------------------------- 1 | ## Megatron推理 2 | 此处复用了Megatron-LM中的推理框架。 3 | 改动: 4 | - 修改tokenizer处理数据的部分,适配huggingface的tokenizer 5 | - 增加推理过程中对重复生成的处理,支持repetition_penalty. 
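
The repetition_penalty handling mentioned above can be illustrated with a minimal sketch of how such a penalty is commonly applied to next-token logits (the usual convention: values above 1 make already-generated tokens less likely). The function below is an illustrative assumption for clarity, not the implementation used in this repository:

```python
import torch

def apply_repetition_penalty(logits: torch.Tensor,
                             generated_ids: torch.Tensor,
                             penalty: float = 1.2) -> torch.Tensor:
    """Penalize tokens that already appear in `generated_ids`.

    logits: [vocab_size] next-token logits; generated_ids: 1-D LongTensor of token ids.
    """
    if penalty == 1.0 or generated_ids.numel() == 0:
        return logits
    logits = logits.clone()
    scores = logits[generated_ids]
    # Divide positive logits and multiply negative ones, so repeated tokens
    # always become less probable when penalty > 1.
    logits[generated_ids] = torch.where(scores > 0, scores / penalty, scores * penalty)
    return logits
```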
6 | 7 | 8 | ## 模型推理示例 9 | 对于Megatron-LM训练的模型,可以直接用Megatron-LM框架进行推理。 10 | 参数如下 11 | ```bash 12 | ENV=$1 # 运行环境: dlc, dsw 13 | MEGATRON_PATH=$2 # 设置开源Megatron的代码路径 14 | LINEAR_MOE_PATH=$3 # 设置LINEAR_MOE的代码路径 15 | CHECKPOINT_PATH=$4 # 模型微调阶段的模型保存路径 16 | MODEL_SIZE=$5 # 模型结构参数量级: 1.1B, 1.7B, 7.1B 17 | TP=$6 # 模型并行度 18 | BS=$7 # 每卡推理一次迭代样本数: 1, 4, 8 19 | SEQ_LEN=$8 # 序列长度: 256, 512, 1024 20 | PAD_LEN=$9 # PAD长度:需要将文本拼接到的长度 21 | EXTRA_VOCAB_SIZE=${10} # 模型转换时增加的token数量 22 | PR=${11} # 推理采用的精度: fp16, bf16 23 | TOP_K=${12} # 采样策略中选择排在前面的候选词数量(0-n): 0, 5, 10, 20 24 | INPUT_SEQ_LEN=${13} # 输入序列长度: 512 25 | OUTPUT_SEQ_LEN=${14} # 输出序列长度: 256 26 | INPUT_FILE=${15} # 需要推理的文本文件: input.txt, 每行为一个样本 27 | OUTPUT_FILE=${16} # 推理输出的文件: output.txt 28 | # TOP_K和TOP_P必须有一个为0 29 | TOP_P=${17} # 采样策略中选择排在前面的候选词百分比(0-1): 0, 0.85, 0.95 30 | TEMPERATURE=${18} # 采样策略中温度惩罚: 1-n 31 | REPETITION_PENALTY=${19} # 避免生成是产生大量重复,可以设置为(1-2)默认为1.2 32 | ``` 33 | 运行以下命令进行模型推理。 34 | 35 | 以下有监督微调过程保存模型的推理代码,需要将run_text_generation_megatron_llama.sh脚本中CUDA_VISIBLE_DEVICES参数设置为0;GPUS_PER_NODE参数设置为1;同时使用下列代码进行推理。此时使用单卡进行推理。注意:此处模型tp为1,可使用单卡推理;如果tp>1,则需使用相应卡数进行推理。 36 | ```bash 37 | export WORK_DIR=/mnt/workspace 38 | cd ${WORK_DIR}/PAI-Megatron-Patch/examples/llama2 39 | bash run_text_generation_megatron_llama.sh \ 40 | dsw \ 41 | /root/Megatron-LM-23.04 \ 42 | ${WORK_DIR}/PAI-Megatron-Patch \ 43 | ../../../llama2-train \ 44 | 7B \ 45 | 1 \ 46 | 1 \ 47 | 1024 \ 48 | 1024 \ 49 | 0 \ 50 | fp16 \ 51 | 10 \ 52 | 512 \ 53 | 512 \ 54 | ${WORK_DIR}/pred_input.jsonl \ 55 | ${WORK_DIR}/llama2_pred.txt \ 56 | 0 \ 57 | 1.0 \ 58 | 1.2 59 | ``` 60 | 61 | -------------------------------------------------------------------------------- /linear_moe/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /linear_moe/model/common_modules/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .layernorm import (GroupNorm, GroupNormLinear, LayerNorm, 4 | LayerNormLinear, RMSNorm, RMSNormLinear) 5 | from fla.modules.rotary import RotaryEmbedding 6 | from .l2norm import l2_norm_fn 7 | 8 | __all__ = [ 9 | 'GroupNorm', 'GroupNormLinear', 'LayerNorm', 'LayerNormLinear', 'RMSNorm', 'RMSNormLinear', 10 | 'RotaryEmbedding' 11 | 'l2_norm_fn' 12 | ] 13 | -------------------------------------------------------------------------------- /linear_moe/model/deepseek_v2/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import GPTModel 2 | -------------------------------------------------------------------------------- /linear_moe/model/deepseek_v2/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSparseLLMs/Linear-MoE/b312754b9b8a2cb9eb15e373baabe263b8d409c1/linear_moe/model/deepseek_v2/moe/__init__.py -------------------------------------------------------------------------------- /linear_moe/model/deepseek_v2/moe/moe_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI and Nvidia Megatron-LM Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | import torch 17 | import torch.nn.functional as F 18 | 19 | from megatron.core import parallel_state 20 | from megatron.core.transformer.module import MegatronModule 21 | from megatron.core.transformer.transformer_config import TransformerConfig 22 | 23 | from .experts import GroupedMLP, SequentialMLP 24 | from .router import TopKRouter 25 | from .token_dispatcher import ( 26 | MoEAllGatherTokenDispatcher, 27 | MoEAlltoAllTokenDispatcher, 28 | ) 29 | from ..transformer.mlp import MLPSubmodules, MLP 30 | 31 | class BaseMoELayer(MegatronModule, ABC): 32 | """Base class for a mixture of experts layer. 33 | 34 | Args: 35 | config (TransformerConfig): Configuration object for the transformer model. 
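    On construction, each expert-parallel rank derives its own slice of experts
    (config.num_moe_experts // expert_model_parallel_size per rank) and records their
    global positions in `local_expert_indices`; subclasses then wire up the router,
    the experts and the token dispatcher.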
36 | """ 37 | 38 | def __init__(self, config: TransformerConfig, layer_number: int = None): 39 | super(BaseMoELayer, self).__init__(config) 40 | self.config = config 41 | self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size() 42 | assert self.expert_parallel_size > 0, "Expected non-negative expert parallel size" 43 | assert self.config.num_moe_experts % self.expert_parallel_size == 0 44 | self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size 45 | local_expert_indices_offset = ( 46 | parallel_state.get_expert_model_parallel_rank() * self.num_local_experts 47 | ) 48 | self.local_expert_indices = [ 49 | local_expert_indices_offset + i for i in range(self.num_local_experts) 50 | ] 51 | assert all(map(lambda x: x < self.config.num_moe_experts, self.local_expert_indices)) 52 | self.router = None 53 | self.experts = None 54 | self.token_dispatcher = None 55 | self.layer_number = layer_number 56 | 57 | @abstractmethod 58 | def forward(self, hidden_states): 59 | pass 60 | 61 | def set_layer_number(self, layer_number: int): 62 | self.layer_number = layer_number 63 | self.router.set_layer_number(layer_number) 64 | 65 | 66 | class MoELayer(BaseMoELayer): 67 | """Mixture of experts Layer **currently only supports no token dropping**. 68 | 69 | Args: 70 | BaseMoELayer (MegatronModule): Base class for MoE layers 71 | """ 72 | 73 | def __init__( 74 | self, config: TransformerConfig, submodules: MLPSubmodules = None, layer_number: int = None 75 | ): 76 | self.submodules = submodules 77 | super(MoELayer, self).__init__(config=config, layer_number=layer_number) 78 | self.router = TopKRouter(config=self.config) 79 | self.enable_shared_experts = config.enable_shared_expert 80 | if config.enable_shared_expert: 81 | self.shared_expert = MLP(self.config, submodules, is_expert=False, is_shared_expert=True) 82 | 83 | if self.config.moe_grouped_gemm: 84 | self.experts = GroupedMLP(self.num_local_experts, self.config) 85 | else: 86 | assert isinstance(self.submodules, MLPSubmodules) 87 | self.experts = SequentialMLP(self.num_local_experts, self.config, self.submodules) 88 | if config.moe_token_dispatcher_type == "allgather": 89 | self.token_dispatcher = MoEAllGatherTokenDispatcher( 90 | self.num_local_experts, self.local_expert_indices, config=self.config 91 | ) 92 | elif config.moe_token_dispatcher_type == "alltoall": 93 | self.token_dispatcher = MoEAlltoAllTokenDispatcher( 94 | self.num_local_experts, self.local_expert_indices, config=self.config 95 | ) 96 | else: 97 | raise ValueError( 98 | f"Unsupported token dispatcher type: {config.moe_token_dispatcher_type}" 99 | ) 100 | 101 | def forward(self, hidden_states: torch.Tensor): 102 | # process MoE 103 | scores, indices = self.router(hidden_states) 104 | (dispatched_input, tokens_per_expert) = self.token_dispatcher.token_permutation( 105 | hidden_states, scores, indices 106 | ) 107 | expert_output, mlp_bias = self.experts(dispatched_input, tokens_per_expert) 108 | output, mlp_bias = self.token_dispatcher.token_unpermutation(expert_output, mlp_bias) 109 | if self.enable_shared_experts: 110 | shared_expert_output, shared_bias = self.shared_expert(hidden_states) 111 | output = output + shared_expert_output.view(-1, hidden_states.shape[-2], hidden_states.shape[-1]) 112 | 113 | return output, mlp_bias 114 | -------------------------------------------------------------------------------- /linear_moe/model/deepseek_v2/rms_norm.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | from torch import nn 3 | 4 | class DeepseekV2RMSNorm(nn.Module): 5 | def __init__(self, hidden_size, eps=1e-6, config=None): 6 | """ 7 | DeepseekV2RMSNorm is equivalent to T5LayerNorm 8 | """ 9 | super().__init__() 10 | self.weight = nn.Parameter(torch.ones(hidden_size)) 11 | self.variance_epsilon = eps 12 | 13 | def forward(self, hidden_states): 14 | input_dtype = hidden_states.dtype 15 | hidden_states = hidden_states.to(torch.float32) 16 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 17 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 18 | return self.weight * hidden_states.to(input_dtype) -------------------------------------------------------------------------------- /linear_moe/model/deepseek_v2/transformer_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from megatron.core.transformer import TransformerConfig 3 | 4 | 5 | @dataclass 6 | class DeepSeekV2TransformerConfig(TransformerConfig): 7 | 8 | moe_ffn_hidden_size: int = None 9 | 10 | enable_shared_expert: bool = False 11 | 12 | q_lora_rank: int = None 13 | 14 | kv_lora_rank: int = None 15 | 16 | qk_nope_head_dim: int = None 17 | 18 | qk_rope_head_dim: int = None 19 | 20 | v_head_dim: int = None 21 | 22 | num_shared_experts: int = None 23 | 24 | moe_layer_freq: int = None 25 | 26 | rotary_base: int = None 27 | 28 | rotary_scaling_factor: int = None 29 | 30 | max_position_embeddings: int = None 31 | 32 | moe_aux_loss_coeff: float = 0.0 33 | 34 | use_la_module: bool = False 35 | 36 | la_module: str = None 37 | 38 | la_mode: str = None 39 | 40 | base_model: str = None 41 | 42 | la_feature_map: str = None 43 | 44 | la_tie_feature_map_qk: bool = False 45 | 46 | la_norm_q: bool = False 47 | 48 | la_norm_k: bool = False 49 | 50 | la_do_feature_map_norm: bool = False 51 | 52 | la_output_norm: str = None 53 | 54 | la_checkpointing: bool = False 55 | 56 | la_elementwise_affine: bool = True 57 | 58 | la_norm_eps: float = 1e-5 59 | 60 | gla_la_gate_logit_normalizer: int = 16 61 | 62 | gla_la_gate_low_rank_dim: int = 16 63 | 64 | gla_la_clamp_min: float = None 65 | 66 | rwkv6_la_proj_low_rank_dim: int = 32 67 | 68 | rwkv6_la_gate_low_rank_dim: int = 64 69 | 70 | rwkv_7_la_decay_low_rank_dim: int = 64 71 | 72 | rwkv_7_la_gate_low_rank_dim: int = 128 73 | 74 | rwkv_7_a_low_rank_dim: int = 64 75 | 76 | la_gate_fn: str = 'swish' 77 | 78 | expand_k: float = 1.0 79 | 80 | expand_v: float = 1.0 81 | 82 | layer_type_list: str = None 83 | 84 | num_memories: int = 4 85 | 86 | topk: int = 2 87 | 88 | capacity: float = 1.0 89 | 90 | shared_mem: bool = True 91 | -------------------------------------------------------------------------------- /linear_moe/model/llama3/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import GPTModel 2 | -------------------------------------------------------------------------------- /linear_moe/model/llama3/rms_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class Llama3RMSNorm(nn.Module): 5 | def __init__(self, hidden_size, eps=1e-5, config=None): 6 | """ 7 | Llama3RMSNorm is equivalent to T5LayerNorm 8 | """ 9 | super().__init__() 10 | self.weight = nn.Parameter(torch.ones(hidden_size)) 11 | self.variance_epsilon = eps 12 | 13 | def forward(self, hidden_states): 14 | input_dtype = hidden_states.dtype 15 | hidden_states = 
hidden_states.to(torch.float32) 16 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 17 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 18 | return self.weight * hidden_states.to(input_dtype) -------------------------------------------------------------------------------- /linear_moe/model/llama3/transformer_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from megatron.core.transformer import TransformerConfig 17 | 18 | 19 | @dataclass 20 | class Llama3TransformerConfig(TransformerConfig): 21 | 22 | transformer_impl: str = 'transformer_engine' 23 | 24 | moe_ffn_hidden_size: int = None 25 | 26 | shared_moe_ffn_hidden_size: int = None 27 | 28 | enable_shared_expert: bool = False 29 | 30 | num_shared_experts: int = None 31 | 32 | moe_layer_freq: int = None 33 | 34 | moe_megablocks: bool = False 35 | """When set to True, use Megablocks for MoE layer.""" 36 | 37 | moe_train_capacity_factor: float = None 38 | 39 | moe_eval_capacity_factor: float = None 40 | 41 | moe_token_dropping: bool = False 42 | 43 | rotary_base: int = None 44 | 45 | rotary_scaling_factor: int = None 46 | 47 | max_position_embeddings: int = None 48 | 49 | moe_aux_loss_coeff: float = 0.0 50 | 51 | use_la_module: bool = False 52 | 53 | megatron_hybrid_mamba_method: bool = False 54 | 55 | la_module: str = None 56 | 57 | la_mode: str = None 58 | 59 | base_model: str = None 60 | 61 | la_feature_map: str = None 62 | 63 | la_tie_feature_map_qk: bool = False 64 | 65 | la_norm_q: bool = False 66 | 67 | la_norm_k: bool = False 68 | 69 | la_do_feature_map_norm: bool = False 70 | 71 | la_output_norm: str = None 72 | 73 | la_checkpointing: bool = False 74 | 75 | la_elementwise_affine: bool = True 76 | 77 | la_norm_eps: float = 1e-5 78 | 79 | gla_la_gate_logit_normalizer: int = 16 80 | 81 | gla_la_gate_low_rank_dim: int = 16 82 | 83 | gla_la_clamp_min: float = None 84 | 85 | rwkv6_la_proj_low_rank_dim: int = 32 86 | 87 | rwkv6_la_gate_low_rank_dim: int = 64 88 | 89 | rwkv_7_la_decay_low_rank_dim: int = 64 90 | 91 | rwkv_7_la_gate_low_rank_dim: int = 128 92 | 93 | rwkv_7_a_low_rank_dim: int = 64 94 | 95 | la_gate_fn: str = 'swish' 96 | 97 | expand_k: float = 1.0 98 | 99 | expand_v: float = 1.0 100 | 101 | layer_type_list: str = None 102 | 103 | num_memories: int = 4 104 | 105 | topk: int = 2 106 | 107 | capacity: float = 1.0 108 | 109 | shared_mem: bool = True 110 | 111 | def __post_init__(self): 112 | super().__post_init__() 113 | 114 | if self.moe_megablocks and self.moe_grouped_gemm: 115 | raise ValueError("moe_megablocks and moe_grouped_gemm cannot be both True.") 116 | -------------------------------------------------------------------------------- /linear_moe/model/mixtral/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .model import GPTModel 2 | -------------------------------------------------------------------------------- /linear_moe/model/mixtral/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSparseLLMs/Linear-MoE/b312754b9b8a2cb9eb15e373baabe263b8d409c1/linear_moe/model/mixtral/moe/__init__.py -------------------------------------------------------------------------------- /linear_moe/model/mixtral/moe/grouped_gemm_util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI and Nvidia Megatron-LM Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | try: 16 | import grouped_gemm 17 | except ImportError: 18 | grouped_gemm = None 19 | 20 | def grouped_gemm_is_available(): 21 | return grouped_gemm is not None 22 | 23 | def assert_grouped_gemm_is_available(): 24 | assert grouped_gemm_is_available(), ( 25 | "Grouped GEMM is not available. Please run " 26 | "`pip install git+https://github.com/fanshiqing/grouped_gemm@main`." 27 | ) 28 | 29 | ops = grouped_gemm.ops if grouped_gemm_is_available() else None 30 | -------------------------------------------------------------------------------- /linear_moe/model/mixtral/moe/moe_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI and Nvidia Megatron-LM Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | import torch 17 | 18 | from megatron.core import parallel_state 19 | from megatron.core.transformer.module import MegatronModule 20 | 21 | from .experts import GroupedMLP, SequentialMLP 22 | from .router import TopKRouter 23 | from .token_dispatcher import MoEDroplessTokenDispatcher 24 | from ..transformer_config import TransformerConfig 25 | from ..transformer.mlp import MLPSubmodules 26 | 27 | class BaseMoELayer(MegatronModule, ABC): 28 | """Base class for a mixture of experts layer. 29 | 30 | Args: 31 | config (TransformerConfig): Configuration object for the transformer model. 
32 | """ 33 | 34 | def __init__(self, config: TransformerConfig): 35 | super(BaseMoELayer, self).__init__(config) 36 | self.config = config 37 | self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size() 38 | assert self.config.num_moe_experts % self.expert_parallel_size == 0 39 | self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size 40 | local_expert_indices_offset = ( 41 | parallel_state.get_expert_model_parallel_rank() * self.num_local_experts 42 | ) 43 | self.local_expert_indices = [ 44 | local_expert_indices_offset + i for i in range(self.num_local_experts) 45 | ] 46 | self.router = None 47 | self.experts = None 48 | self.token_dispatcher = None 49 | 50 | @abstractmethod 51 | def forward(self, hidden_states): 52 | pass 53 | 54 | 55 | class MoELayer(BaseMoELayer): 56 | """Mixture of experts Layer **currently only supports no token dropping**. 57 | 58 | Args: 59 | BaseMoELayer (MegatronModule): Base class for MoE layers 60 | """ 61 | 62 | def __init__(self, config: TransformerConfig, submodules: MLPSubmodules = None): 63 | self.submodules = submodules 64 | super(MoELayer, self).__init__(config=config) 65 | self.router = TopKRouter( 66 | self.num_local_experts, self.local_expert_indices, config=self.config 67 | ) 68 | if self.config.moe_grouped_gemm: 69 | self.experts = GroupedMLP(self.num_local_experts, self.config) 70 | else: 71 | assert isinstance(self.submodules, MLPSubmodules) 72 | self.experts = SequentialMLP(self.num_local_experts, self.config, self.submodules) 73 | self.token_dispatcher = MoEDroplessTokenDispatcher( 74 | self.num_local_experts, self.local_expert_indices, config=self.config 75 | ) 76 | 77 | def forward(self, hidden_states: torch.Tensor): 78 | """ 79 | Forward pass for the MoE layer. 80 | 81 | The method routes input tokens to the appropriate expert networks, 82 | processes the tokens with the experts, and then combines the outputs. 83 | 84 | Args: 85 | hidden_states (torch.Tensor): The input tensor containing the hidden states 86 | from the previous layer of the transformer model.This tensor is expected to 87 | have a shape compatible with the expectations of the MoE layer, typically 88 | [batch_size, sequence_length, hidden_size]. 89 | 90 | Returns: 91 | Tupletorch.Tensor, torch.Tensor: A tuple containing two elements: 92 | - The first element is the output tensor after processing by the MoE layer. 93 | It has the same shape as the input hidden_states. 94 | - The second element is the bias introduced by the MLP experts, which may 95 | need to be accounted for in subsequent layers or loss calculations. 96 | """ 97 | # process MoE 98 | scores, indices = self.router(hidden_states) 99 | ( 100 | dispatched_input, 101 | tokens_per_expert, 102 | scores, 103 | indices, 104 | global_local_map, 105 | ) = self.token_dispatcher.token_permutation(hidden_states, scores, indices) 106 | expert_output, mlp_bias = self.experts(dispatched_input, tokens_per_expert) 107 | output, mlp_bias = self.token_dispatcher.token_unpermutation( 108 | expert_output, scores, indices, global_local_map, mlp_bias 109 | ) 110 | return output, mlp_bias 111 | -------------------------------------------------------------------------------- /linear_moe/model/mixtral/moe/moe_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI and Nvidia Megatron-LM Team. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | 17 | def switch_load_balancing_loss_func(gates, mask, moe_aux_loss_coeff): 18 | """Calculate the auxiliary loss for better load balacing. 19 | Please refer to the Switch Transformer paper (https://arxiv.org/abs/2101.03961) for details. 20 | 21 | Args: 22 | gates (torch.Tensor): The gates tensor representing the routing probabilities for each expert. 23 | mask (torch.Tensor): The 2D mask tensor indicating which experts are selected. 24 | 25 | Returns: 26 | torch.Tensor: The auxiliary loss for load balancing. 27 | """ 28 | num_experts = mask.size(-1) 29 | gates_mean = gates.mean(dim=0) 30 | selection_mean = mask.float().mean(dim=0) 31 | aux_loss = torch.sum(gates_mean * selection_mean) * num_experts 32 | aux_loss *= moe_aux_loss_coeff 33 | return aux_loss 34 | 35 | 36 | def z_loss_func(logits, z_loss_coeff): 37 | """Encourages the router's logits to remain small to enhance stability. 38 | Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details. 39 | 40 | Args: 41 | logits (torch.Tensor): The logits of the router. 42 | 43 | Returns: 44 | torch.Tensor: The logits after applying the z-loss. 45 | """ 46 | 47 | z_loss = torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff 48 | return z_loss 49 | 50 | 51 | def sinkhorn(cost: torch.Tensor, tol: float = 0.0001): 52 | """Sinkhorn based MoE routing function 53 | 54 | Args: 55 | cost: A 2D tensor representing the cost matrix to be normalized. 56 | tol: A float value specifying the tolerance for convergence. Default is 0.0001. 57 | 58 | Returns: 59 | A 2D tensor representing the doubly stochastic matrix after Sinkhorn normalization. 60 | """ 61 | cost = torch.exp(cost) 62 | d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype) 63 | d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype) 64 | 65 | eps = 0.00000001 66 | error = 1e9 67 | d1_old = d1 68 | while error > tol: 69 | d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps) 70 | d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps) 71 | error = torch.mean(torch.abs(d1_old - d1)) 72 | d1_old = d1 73 | return d1 * cost * d0.unsqueeze(1) 74 | 75 | class MoEAuxLossAutoScaler(torch.autograd.Function): 76 | """An AutoScaler that compute and scales the grad for auxiliary loss. 77 | 78 | """ 79 | 80 | main_loss_backward_scale: int = 1 81 | 82 | @staticmethod 83 | def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor): 84 | """Preserve the aux_loss by storing it in the context to avoid garbage collection. 85 | 86 | Args: 87 | output (torch.Tensor): The output tensor. 88 | aux_loss (torch.Tensor): The auxiliary loss tensor. 89 | 90 | Returns: 91 | torch.Tensor: The output tensor. 92 | """ 93 | ctx.save_for_backward(aux_loss) 94 | return output 95 | 96 | @staticmethod 97 | def backward(ctx, grad_output: torch.Tensor): 98 | """Compute and scale the gradient for auxiliary loss.. 
99 | 100 | Args: 101 | grad_output (torch.Tensor): The gradient of the output. 102 | 103 | Returns: 104 | Tuple[torch.Tensor, torch.Tensor]: The gradient of the output, scaled auxiliary loss gradient. 105 | """ 106 | (aux_loss,) = ctx.saved_tensors 107 | aux_loss_backward_scale = MoEAuxLossAutoScaler.main_loss_backward_scale 108 | scaled_aux_loss_grad = torch.ones_like(aux_loss) * aux_loss_backward_scale 109 | return grad_output, scaled_aux_loss_grad 110 | 111 | @staticmethod 112 | def set_loss_scale(scale: int): 113 | """set the scale of the aux loss. 114 | 115 | Args: 116 | scale (int): The scale value to set. Please ensure that the scale passed in matches the scale of the main_loss. 117 | """ 118 | MoEAuxLossAutoScaler.main_loss_backward_scale = scale 119 | -------------------------------------------------------------------------------- /linear_moe/model/mixtral/rms_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class MixtralRMSNorm(nn.Module): 5 | def __init__(self, hidden_size, eps=1e-6, config=None): 6 | """ 7 | Mixtral is equivalent to T5LayerNorm 8 | """ 9 | super().__init__() 10 | self.weight = nn.Parameter(torch.ones(hidden_size)) 11 | self.variance_epsilon = eps 12 | 13 | def forward(self, hidden_states): 14 | input_dtype = hidden_states.dtype 15 | hidden_states = hidden_states.to(torch.float32) 16 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 17 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 18 | return self.weight * hidden_states.to(input_dtype) -------------------------------------------------------------------------------- /linear_moe/model/mixtral/transformer_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from dataclasses import dataclass 16 | from megatron.core.transformer import TransformerConfig 17 | 18 | 19 | @dataclass 20 | class MixtralTransformerConfig(TransformerConfig): 21 | 22 | transformer_impl: str = 'transformer_engine' 23 | 24 | moe_ffn_hidden_size: int = None 25 | 26 | shared_moe_ffn_hidden_size: int = None 27 | 28 | enable_shared_expert: bool = False 29 | 30 | num_shared_experts: int = None 31 | 32 | moe_layer_freq: int = None 33 | 34 | moe_megablocks: bool = False 35 | """When set to True, use Megablocks for MoE layer.""" 36 | 37 | moe_train_capacity_factor: float = None 38 | 39 | moe_eval_capacity_factor: float = None 40 | 41 | moe_token_dropping: bool = False 42 | 43 | rotary_base: int = None 44 | 45 | rotary_scaling_factor: int = None 46 | 47 | max_position_embeddings: int = None 48 | 49 | moe_aux_loss_coeff: float = 0.0 50 | 51 | use_la_module: bool = False 52 | 53 | megatron_hybrid_mamba_method: bool = False 54 | 55 | la_module: str = None 56 | 57 | la_mode: str = None 58 | 59 | base_model: str = None 60 | 61 | la_feature_map: str = None 62 | 63 | la_tie_feature_map_qk: bool = False 64 | 65 | la_norm_q: bool = False 66 | 67 | la_norm_k: bool = False 68 | 69 | la_do_feature_map_norm: bool = False 70 | 71 | la_output_norm: str = None 72 | 73 | la_checkpointing: bool = False 74 | 75 | la_elementwise_affine: bool = True 76 | 77 | la_norm_eps: float = 1e-5 78 | 79 | gla_la_gate_logit_normalizer: int = 16 80 | 81 | gla_la_gate_low_rank_dim: int = 16 82 | 83 | gla_la_clamp_min: float = None 84 | 85 | rwkv6_la_proj_low_rank_dim: int = 32 86 | 87 | rwkv6_la_gate_low_rank_dim: int = 64 88 | 89 | rwkv_7_la_decay_low_rank_dim: int = 64 90 | 91 | rwkv_7_la_gate_low_rank_dim: int = 128 92 | 93 | rwkv_7_a_low_rank_dim: int = 64 94 | 95 | la_gate_fn: str = 'swish' 96 | 97 | expand_k: float = 1.0 98 | 99 | expand_v: float = 1.0 100 | 101 | layer_type_list: str = None 102 | 103 | num_memories: int = 4 104 | 105 | topk: int = 2 106 | 107 | capacity: float = 1.0 108 | 109 | shared_mem: bool = True 110 | 111 | def __post_init__(self): 112 | super().__post_init__() 113 | 114 | if self.moe_megablocks and self.moe_grouped_gemm: 115 | raise ValueError("moe_megablocks and moe_grouped_gemm cannot be both True.") 116 | -------------------------------------------------------------------------------- /linear_moe/model/qwen2/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSparseLLMs/Linear-MoE/b312754b9b8a2cb9eb15e373baabe263b8d409c1/linear_moe/model/qwen2/moe/__init__.py -------------------------------------------------------------------------------- /linear_moe/model/qwen2/rms_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class Qwen2RMSNorm(nn.Module): 5 | def __init__(self, hidden_size, eps=1e-6, config=None): 6 | """ 7 | Qwen2RMSNorm is equivalent to T5LayerNorm 8 | """ 9 | super().__init__() 10 | self.weight = nn.Parameter(torch.ones(hidden_size)) 11 | self.variance_epsilon = eps 12 | 13 | def forward(self, hidden_states): 14 | input_dtype = hidden_states.dtype 15 | hidden_states = hidden_states.to(torch.float32) 16 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 17 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 18 | return self.weight * hidden_states.to(input_dtype) -------------------------------------------------------------------------------- 
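The DeepseekV2RMSNorm, Llama3RMSNorm, MixtralRMSNorm and Qwen2RMSNorm modules above implement the same forward computation: cast to float32, multiply by the reciprocal root mean square over the last dimension, cast back to the input dtype, then scale by the learned weight. A minimal usage sketch of the Qwen2 variant (assuming the linear_moe package is importable; the hidden size, tensor shapes and bf16 dtype are illustrative assumptions, not repository defaults):

import torch
from linear_moe.model.qwen2.rms_norm import Qwen2RMSNorm

# Illustrative input: [batch, seq_len, hidden_size] activations cast to bf16.
hidden = torch.randn(2, 16, 1024).to(torch.bfloat16)
norm = Qwen2RMSNorm(hidden_size=1024, eps=1e-6)
out = norm(hidden)

# Reference: y = weight * (x / sqrt(mean(x^2) + eps)), computed in float32 and cast
# back to the input dtype, mirroring the forward pass defined above.
ref = hidden.to(torch.float32)
ref = ref * torch.rsqrt(ref.pow(2).mean(-1, keepdim=True) + 1e-6)
ref = norm.weight * ref.to(hidden.dtype)
assert torch.allclose(out, ref)

--------------------------------------------------------------------------------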
/linear_moe/model/qwen2/transformer_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from megatron.core.transformer import TransformerConfig 17 | 18 | 19 | @dataclass 20 | class Qwen2TransformerConfig(TransformerConfig): 21 | 22 | transformer_impl: str = 'transformer_engine' 23 | 24 | moe_ffn_hidden_size: int = None 25 | 26 | shared_moe_ffn_hidden_size: int = None 27 | 28 | enable_shared_expert: bool = False 29 | 30 | num_shared_experts: int = None 31 | 32 | moe_layer_freq: int = None 33 | 34 | moe_megablocks: bool = False 35 | """When set to True, use Megablocks for MoE layer.""" 36 | 37 | moe_train_capacity_factor: float = None 38 | 39 | moe_eval_capacity_factor: float = None 40 | 41 | moe_token_dropping: bool = False 42 | 43 | rotary_base: int = None 44 | 45 | rotary_scaling_factor: int = None 46 | 47 | max_position_embeddings: int = None 48 | 49 | moe_aux_loss_coeff: float = 0.0 50 | 51 | use_la_module: bool = False 52 | 53 | megatron_hybrid_mamba_method: bool = False 54 | 55 | la_module: str = None 56 | 57 | la_mode: str = None 58 | 59 | base_model: str = None 60 | 61 | la_feature_map: str = None 62 | 63 | la_tie_feature_map_qk: bool = False 64 | 65 | la_norm_q: bool = False 66 | 67 | la_norm_k: bool = False 68 | 69 | la_do_feature_map_norm: bool = False 70 | 71 | la_output_norm: str = None 72 | 73 | la_checkpointing: bool = False 74 | 75 | la_elementwise_affine: bool = True 76 | 77 | la_norm_eps: float = 1e-5 78 | 79 | gla_la_gate_logit_normalizer: int = 16 80 | 81 | gla_la_gate_low_rank_dim: int = 16 82 | 83 | gla_la_clamp_min: float = None 84 | 85 | rwkv6_la_proj_low_rank_dim: int = 32 86 | 87 | rwkv6_la_gate_low_rank_dim: int = 64 88 | 89 | rwkv_7_la_decay_low_rank_dim: int = 64 90 | 91 | rwkv_7_la_gate_low_rank_dim: int = 128 92 | 93 | rwkv_7_a_low_rank_dim: int = 64 94 | 95 | la_gate_fn: str = 'swish' 96 | 97 | expand_k: float = 1.0 98 | 99 | expand_v: float = 1.0 100 | 101 | layer_type_list: str = None 102 | 103 | num_memories: int = 4 104 | 105 | topk: int = 2 106 | 107 | capacity: float = 1.0 108 | 109 | shared_mem: bool = True 110 | 111 | def __post_init__(self): 112 | super().__post_init__() 113 | 114 | if self.moe_megablocks and self.moe_grouped_gemm: 115 | raise ValueError("moe_megablocks and moe_grouped_gemm cannot be both True.") 116 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/attention/__init__.py: -------------------------------------------------------------------------------- 1 | from .dot_product_attention import ( 2 | DotProductAttention 3 | ) 4 | 5 | __all__ = [ 6 | "DotProductAttention" 7 | ] 8 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/based/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .based import ( 2 | Based 3 | ) 4 | 5 | __all__ = [ 6 | "Based" 7 | ] 8 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/based/based.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | from megatron.core.transformer.module import MegatronModule 7 | from linear_moe.model.common_modules.feature_map import TaylorFeatureMap 8 | from fla.ops.based import parallel_based 9 | from fla.ops.linear_attn import chunk_linear_attn, fused_chunk_linear_attn 10 | 11 | 12 | class Based(MegatronModule): 13 | 14 | def __init__( 15 | self, 16 | config, 17 | expand_k: float = 1.0, 18 | expand_v: float = 1.0, 19 | ): 20 | super().__init__(config) 21 | 22 | self.la_mode = config.la_mode 23 | self.hidden_size = config.hidden_size 24 | self.key_dim = int(config.hidden_size * expand_k) 25 | self.value_dim = int(config.hidden_size * expand_v) 26 | self.num_heads = config.num_attention_heads 27 | # num_kv_heads here means num_query_groups 28 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 29 | self.num_kv_groups = self.num_heads // self.num_kv_heads 30 | self.head_qk_dim = self.key_dim // self.num_heads 31 | self.head_v_dim = self.value_dim // self.num_heads 32 | self.la_feature_map_fn = TaylorFeatureMap(self.head_qk_dim) 33 | 34 | assert self.la_mode in ['chunk', 'fused_chunk', 'parallel'], f"Not supported mode `{self.la_mode}`." 35 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 36 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 37 | 38 | if self.la_mode == 'chunk': 39 | self._la_impl = chunk_linear_attn 40 | elif self.la_mode == 'fused_chunk': 41 | self._la_impl = fused_chunk_linear_attn 42 | elif self.la_mode == 'parallel': 43 | self._la_impl = parallel_based 44 | 45 | self.apply(self._initialize_weights) 46 | 47 | def _initialize_weights(self, module: torch.nn.Module): 48 | if getattr(module, "_is_hf_initialized", False): 49 | return 50 | if isinstance(module, torch.nn.Linear): 51 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 52 | if module.bias is not None: 53 | torch.nn.init.zeros_(module.bias) 54 | module._is_hf_initialized = True 55 | 56 | 57 | def forward( 58 | self, 59 | q: torch.Tensor, 60 | k: torch.Tensor, 61 | v: torch.Tensor, 62 | ) -> torch.Tensor: 63 | # torch.Size([128, 4, 16, 32]) 64 | q, k, v = (rearrange(x, 'n b h d -> b h n d') for x in (q, k, v)) 65 | 66 | # expects q: B, H, T, K 67 | if self.la_mode in ['chunk', 'fused_chunk']: 68 | q, k = map(self.la_feature_map_fn, (q, k)) 69 | output, _ = self._la_impl(q, k, v, normalize=True, scale=1) 70 | elif self.la_mode == 'parallel': 71 | assert q.shape[-1] <= 128 72 | output, _ = self._la_impl(q, k, v, True, True) 73 | 74 | output = rearrange(output, 'b h n d -> n b (h d)') 75 | 76 | return output 77 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/basic_linear_attention/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic_linear_attention import ( 2 | BasicLinearAttention 3 | ) 4 | 5 | __all__ = [ 6 | "BasicLinearAttention" 7 | ] 8 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/deltanet/__init__.py: -------------------------------------------------------------------------------- 1 | from .deltanet import DeltaNet 2 | 3 | __all__ = [ 4 | "DeltaNet" 5 | ] 6 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/deltanet/deltanet.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | import torch.nn.functional as F 7 | from megatron.core.transformer.module import MegatronModule 8 | from linear_moe.model.common_modules.activations import ACT2FN 9 | 10 | from linear_moe.model.common_modules import RMSNorm, l2_norm_fn 11 | from fla.ops.delta_rule import (chunk_delta_rule, fused_chunk_delta_rule, 12 | fused_recurrent_delta_rule) 13 | 14 | def simple_norm(x): 15 | return (F.normalize(x, dim=-1) * x.shape[-1] ** 0.5).to(x) 16 | 17 | 18 | # @torch.jit.script 19 | def elu_p1(x): 20 | return (F.elu(x, 1., False) + 1.).to(x) 21 | 22 | 23 | # @torch.jit.script 24 | def sum_norm(x): 25 | return (x / x.sum(-1, keepdim=True)).to(x) 26 | 27 | 28 | # @torch.jit.script 29 | def elu_norm(x): 30 | dtype = x.dtype 31 | x = F.elu(x, 1., False) + 1. 32 | return (x / x.sum(-1, keepdim=True)).to(dtype) 33 | 34 | class DeltaNet(MegatronModule): 35 | 36 | def __init__( 37 | self, 38 | config, 39 | expand_k: float = 1.0, 40 | expand_v: float = 1.0, 41 | chunk_size: int = 64, 42 | qk_activation: str = 'silu', 43 | qk_norm: str = 'l2', 44 | ): 45 | super().__init__(config) 46 | 47 | self.la_mode = config.la_mode 48 | self.hidden_size = config.hidden_size 49 | self.num_heads = config.num_attention_heads 50 | # num_kv_heads here means num_query_groups 51 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 52 | self.num_kv_groups = self.num_heads // self.num_kv_heads 53 | 54 | self.qk_activation = qk_activation 55 | self.qk_norm = qk_norm 56 | self.chunk_size = chunk_size 57 | 58 | assert self.qk_activation in ['silu', 'relu', 'elu', 'identity'] 59 | assert self.qk_norm in ['l2', 'sum'] 60 | self.key_dim = int(config.hidden_size * expand_k) 61 | self.value_dim = int(config.hidden_size * expand_v) 62 | 63 | assert self.la_mode in ['chunk', 'fused_chunk', 'fused_recurrent'], f"Not supported mode `{self.la_mode}`." 64 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 65 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 66 | 67 | self.head_qk_dim = self.key_dim // self.num_heads 68 | self.head_v_dim = self.value_dim // self.num_heads 69 | 70 | if config.la_output_norm == 'rmsnorm': 71 | self.la_output_norm = RMSNorm(hidden_size=self.head_v_dim, elementwise_affine=config.la_elementwise_affine, eps=config.la_norm_eps) 72 | elif config.la_output_norm == 'identity': 73 | self.la_output_norm = torch.nn.Identity() 74 | else: 75 | raise NotImplementedError(f"Not supported output norm `{config.la_output_norm}`.") 76 | 77 | if self.la_mode == 'chunk': 78 | self._la_impl = chunk_delta_rule 79 | elif self.la_mode == 'fused_chunk': 80 | self._la_impl = fused_chunk_delta_rule 81 | elif self.la_mode == 'fused_recurrent': 82 | self._la_impl = fused_recurrent_delta_rule 83 | 84 | self.apply(self._initialize_weights) 85 | 86 | def _initialize_weights(self, module: torch.nn.Module): 87 | if getattr(module, "_is_hf_initialized", False): 88 | return 89 | if isinstance(module, torch.nn.Linear): 90 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 91 | if module.bias is not None: 92 | torch.nn.init.zeros_(module.bias) 93 | module._is_hf_initialized = True 94 | 95 | 96 | def forward( 97 | self, 98 | q: torch.Tensor, 99 | k: torch.Tensor, 100 | v: torch.Tensor, 101 | beta: torch.Tensor, 102 | ) -> torch.Tensor: 103 | 104 | # torch.Size([128, 4, 16, 32]) 105 | q, k, v = (rearrange(x, 'n b h d -> b h n d') for x in (q, k, v)) 106 | 107 | if self.qk_activation != 'silu': 108 | if self.qk_activation == 'relu': 109 | q, k = q.relu(), k.relu() 110 | elif self.qk_activation == 'elu': 111 | q, k = elu_p1(q), elu_p1(k) 112 | elif self.qk_activation == 'identity': 113 | pass 114 | else: 115 | raise NotImplementedError 116 | 117 | if self.qk_norm is not None: 118 | if self.qk_norm == 'l2': 119 | q = l2_norm_fn(q) 120 | k = l2_norm_fn(k) 121 | elif self.qk_norm == 'sum': 122 | q = sum_norm(q).to(v) 123 | k = sum_norm(k).to(v) 124 | 125 | # expects q: B, H, T, K 126 | if self.la_mode == 'fused_recurrent': 127 | output, _ = self._la_impl(q, k, v, beta) 128 | else: 129 | assert self.chunk_size in [16, 32, 64] 130 | output, _ = self._la_impl(q, k, v, beta, self.chunk_size) 131 | 132 | output = self.la_output_norm(output) 133 | output = rearrange(output, 'b h n d -> n b (h d)') 134 | 135 | return output 136 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/gated_deltanet/__init__.py: -------------------------------------------------------------------------------- 1 | from .gated_deltanet import GatedDeltaNet 2 | 3 | __all__ = [ 4 | "GatedDeltaNet" 5 | ] 6 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/gated_deltanet/gated_deltanet.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | import torch.nn.functional as F 7 | from megatron.core.transformer.module import MegatronModule 8 | from linear_moe.model.common_modules.activations import ACT2FN 9 | 10 | from linear_moe.model.common_modules import RMSNorm, l2_norm_fn 11 | from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule 12 | 13 | def
simple_norm(x): 14 | return (F.normalize(x, dim=-1) * x.shape[-1] ** 0.5).to(x) 15 | 16 | 17 | # @torch.jit.script 18 | def elu_p1(x): 19 | return (F.elu(x, 1., False) + 1.).to(x) 20 | 21 | 22 | # @torch.jit.script 23 | def sum_norm(x): 24 | return (x / x.sum(-1, keepdim=True)).to(x) 25 | 26 | 27 | # @torch.jit.script 28 | def elu_norm(x): 29 | dtype = x.dtype 30 | x = F.elu(x, 1., False) + 1. 31 | return (x / x.sum(-1, keepdim=True)).to(dtype) 32 | 33 | class GatedDeltaNet(MegatronModule): 34 | 35 | def __init__( 36 | self, 37 | config, 38 | expand_k: float = 1.0, 39 | expand_v: float = 1.0, 40 | chunk_size: int = 64, 41 | qk_activation: str = 'silu', 42 | qk_norm: str = 'l2', 43 | ): 44 | super().__init__(config) 45 | 46 | self.la_mode = config.la_mode 47 | self.hidden_size = config.hidden_size 48 | self.num_heads = config.num_attention_heads 49 | # num_kv_heads here mains num_query_groups 50 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 51 | self.num_kv_groups = self.num_heads // self.num_kv_heads 52 | 53 | self.qk_activation = qk_activation 54 | self.qk_norm = qk_norm 55 | self.chunk_size = chunk_size 56 | 57 | assert self.qk_activation in ['silu', 'relu', 'elu', 'identity'] 58 | assert self.qk_norm in ['l2', 'sum'] 59 | self.key_dim = int(config.hidden_size * expand_k) 60 | self.value_dim = int(config.hidden_size * expand_v) 61 | 62 | assert self.la_mode in ['chunk', 'fused_chunk', 'fused_recurrent'], f"Not supported mode `{self.la_mode}`." 63 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 64 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 65 | 66 | self.head_qk_dim = self.key_dim // self.num_heads 67 | self.head_v_dim = self.value_dim // self.num_heads 68 | 69 | 70 | if self.la_mode == 'chunk': 71 | self._la_impl = chunk_gated_delta_rule 72 | elif self.la_mode == 'fused_recurrent': 73 | self._la_impl = fused_recurrent_gated_delta_rule 74 | else: 75 | raise NotImplementedError('Not supported la_mode') 76 | 77 | self.apply(self._initialize_weights) 78 | 79 | def _initialize_weights(self, module: torch.nn.Module): 80 | if getattr(module, "_is_hf_initialized", False): 81 | return 82 | if isinstance(module, torch.nn.Linear): 83 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 84 | if module.bias is not None: 85 | torch.nn.init.zeros_(module.bias) 86 | module._is_hf_initialized = True 87 | 88 | 89 | def forward( 90 | self, 91 | q: torch.Tensor, 92 | k: torch.Tensor, 93 | v: torch.Tensor, 94 | beta: torch.Tensor, 95 | gk: torch.Tensor, 96 | ) -> torch.Tensor: 97 | 98 | # torch.Size([128, 4, 16, 32]) 99 | q, k, v, beta, gk = (x.transpose(0, 1).contiguous() for x in (q, k, v, beta, gk)) 100 | 101 | q, k, v = torch.nn.SiLU()(q), torch.nn.SiLU()(k), torch.nn.SiLU()(v) 102 | 103 | if self.qk_norm is not None: 104 | if self.qk_norm == 'l2': 105 | q = l2_norm_fn(q) 106 | k = l2_norm_fn(k) 107 | elif self.qk_norm == 'sum': 108 | q = sum_norm(q).to(v) 109 | k = sum_norm(k).to(v) 110 | 111 | # expects q: B, H, T, K 112 | output, _ = self._la_impl(q, k, v, gk, beta, head_first=False) 113 | 114 | output = rearrange(output, 'b n h d -> n b (h d)').contiguous() 115 | 116 | return output 117 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/gla/__init__.py: -------------------------------------------------------------------------------- 1 
| from .gla import GLA 2 | from .gla_gate import GLAGate 3 | 4 | __all__ = [ 5 | "GLA", 6 | "GLAGate" 7 | ] 8 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/gla/gla.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | import torch.nn.functional as F 7 | from megatron.core.transformer.module import MegatronModule 8 | from linear_moe.model.common_modules.activations import ACT2FN 9 | 10 | from linear_moe.model.common_modules import RMSNorm 11 | from fla.ops.gla import chunk_gla, fused_chunk_gla, fused_recurrent_gla 12 | 13 | 14 | class GLA(MegatronModule): 15 | 16 | def __init__( 17 | self, 18 | config, 19 | expand_k: float = 1.0, 20 | expand_v: float = 1.0, 21 | ): 22 | super().__init__(config) 23 | 24 | self.la_mode = config.la_mode 25 | self.hidden_size = config.hidden_size 26 | self.num_heads = config.num_attention_heads 27 | # num_kv_heads here means num_query_groups 28 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 29 | self.num_kv_groups = self.num_heads // self.num_kv_heads 30 | 31 | self.la_feature_map = config.la_feature_map 32 | self.la_feature_map_fn = ACT2FN[self.la_feature_map] if self.la_feature_map is not None else None 33 | 34 | self.key_dim = int(config.hidden_size * expand_k) 35 | self.value_dim = int(config.hidden_size * expand_v) 36 | 37 | assert self.la_mode in ['chunk', 'fused_chunk', 'fused_recurrent'], f"Not supported mode `{self.la_mode}`." 38 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 39 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 40 | 41 | self.head_qk_dim = self.key_dim // self.num_heads 42 | self.head_v_dim = self.value_dim // self.num_heads 43 | 44 | if config.la_output_norm == 'rmsnorm': 45 | self.la_output_norm = RMSNorm(hidden_size=self.head_v_dim, elementwise_affine=config.la_elementwise_affine, eps=config.la_norm_eps) 46 | elif config.la_output_norm == 'identity': 47 | self.la_output_norm = torch.nn.Identity() 48 | else: 49 | raise NotImplementedError(f"Not supported output norm `{config.la_output_norm}`.") 50 | 51 | self.gla_la_gate_logit_normalizer = config.gla_la_gate_logit_normalizer 52 | self.gla_la_clamp_min = config.gla_la_clamp_min 53 | 54 | if self.la_mode == 'chunk': 55 | self._la_impl = chunk_gla 56 | elif self.la_mode == 'fused_chunk': 57 | self._la_impl = fused_chunk_gla 58 | elif self.la_mode == 'fused_recurrent': 59 | self._la_impl = fused_recurrent_gla 60 | 61 | self.apply(self._initialize_weights) 62 | 63 | def _initialize_weights(self, module: torch.nn.Module): 64 | if getattr(module, "_is_hf_initialized", False): 65 | return 66 | if isinstance(module, torch.nn.Linear): 67 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 68 | if module.bias is not None: 69 | torch.nn.init.zeros_(module.bias) 70 | module._is_hf_initialized = True 71 | 72 | 73 | def forward( 74 | self, 75 | q: torch.Tensor, 76 | k: torch.Tensor, 77 | v: torch.Tensor, 78 | gk: torch.Tensor, 79 | ) -> torch.Tensor: 80 | 81 | # torch.Size([128, 4, 16, 32]) 82 | q, k, v = (rearrange(x, 'n b h d -> b h n d').contiguous() for x in (q, k, v)) 83 | 84 | gk = rearrange(gk, 'n b (h d) -> b h n d', h=self.num_kv_heads).contiguous() 85 | gk =
F.logsigmoid(gk) / self.gla_la_gate_logit_normalizer 86 | 87 | if self.la_feature_map_fn is not None: 88 | q, k = map(self.la_feature_map_fn, (q, k)) 89 | 90 | if self.gla_la_clamp_min is not None: 91 | gk = torch.clamp_min(gk, self.gla_la_clamp_min) 92 | 93 | # expects q: B, H, T, K 94 | output, _ = self._la_impl(q, k, v, gk) 95 | output = self.la_output_norm(output) 96 | 97 | output = rearrange(output, 'b h n d -> n b (h d)').contiguous() 98 | 99 | return output 100 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/gla/gla_gate.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | from megatron.core.transformer.module import MegatronModule 7 | 8 | 9 | class GLAGate(MegatronModule): 10 | 11 | def __init__(self, config): 12 | super().__init__(config) 13 | 14 | self.hidden_size = config.hidden_size 15 | self.key_dim = config.hidden_size 16 | self.num_heads = config.num_attention_heads 17 | # num_kv_heads here mains num_query_groups 18 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 19 | self.num_kv_groups = self.num_heads // self.num_kv_heads 20 | self.key_dim_per_group = self.key_dim // self.num_kv_groups 21 | self.gla_la_gate_low_rank_dim = config.gla_la_gate_low_rank_dim 22 | self.gk_proj = torch.nn.Sequential(torch.nn.Linear(self.hidden_size, self.gla_la_gate_low_rank_dim, bias=False), 23 | torch.nn.Linear(self.gla_la_gate_low_rank_dim, self.key_dim_per_group, bias=True)) 24 | 25 | def forward( 26 | self, 27 | hidden_states: torch.Tensor, 28 | ) -> torch.Tensor: 29 | gk = self.gk_proj(hidden_states) 30 | 31 | return gk 32 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/hgrn2/__init__.py: -------------------------------------------------------------------------------- 1 | from .hgrn2 import HGRN2 2 | 3 | __all__ = [ 4 | "HGRN2", 5 | ] 6 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/hgrn2/hgrn2.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | import torch.nn.functional as F 7 | from megatron.core.transformer.module import MegatronModule 8 | from transformers.activations import ACT2FN 9 | 10 | from linear_moe.model.common_modules import RMSNorm 11 | from fla.ops.gla import chunk_gla, fused_chunk_gla, fused_recurrent_gla 12 | 13 | 14 | class HGRN2(MegatronModule): 15 | 16 | def __init__( 17 | self, 18 | config, 19 | expand_ratio: Optional[int] = 128, 20 | ): 21 | super().__init__(config) 22 | 23 | self.la_mode = config.la_mode 24 | self.hidden_size = config.hidden_size 25 | self.num_heads = config.num_attention_heads 26 | self.head_dim = self.hidden_size // self.num_heads 27 | # num_kv_heads here mains num_query_groups 28 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 29 | self.num_kv_groups = self.num_heads // self.num_kv_heads 30 | 31 | if config.hidden_size is not None and config.num_attention_heads is not None: 32 | expand_ratio = self.hidden_size // self.num_heads 33 | self.expand_ratio = expand_ratio 34 | self.forget_dim = 
int(self.num_heads * self.expand_ratio) 35 | self.input_dim = self.hidden_size 36 | 37 | assert self.la_mode in ['chunk', 'fused_recurrent', 'fused_chunk'], f"Not supported mode `{self.la_mode}`." 38 | assert self.forget_dim % self.num_heads == 0, f"forget key dim must be divisible by num_heads of {self.num_heads}" 39 | assert self.input_dim % self.num_heads == 0, f"input value dim must be divisible by num_heads of {self.num_heads}" 40 | 41 | if config.la_output_norm == 'rmsnorm': 42 | self.la_output_norm = RMSNorm(hidden_size=self.head_dim, elementwise_affine=config.la_elementwise_affine, eps=config.la_norm_eps) 43 | elif config.la_output_norm == 'identity': 44 | self.la_output_norm = torch.nn.Identity() 45 | else: 46 | raise NotImplementedError(f"Not supported output norm `{self.la_output_norm}`.") 47 | 48 | if self.la_mode == 'chunk': 49 | self._la_impl = chunk_gla 50 | elif self.la_mode == 'fused_chunk': 51 | self._la_impl = fused_chunk_gla 52 | elif self.la_mode == 'fused_recurrent': 53 | self._la_impl = fused_recurrent_gla 54 | 55 | self.apply(self._initialize_weights) 56 | 57 | def _initialize_weights(self, module: torch.nn.Module): 58 | if getattr(module, "_is_hf_initialized", False): 59 | return 60 | if isinstance(module, torch.nn.Linear): 61 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 62 | if module.bias is not None: 63 | torch.nn.init.zeros_(module.bias) 64 | module._is_hf_initialized = True 65 | 66 | 67 | def forward( 68 | self, 69 | q: torch.Tensor, 70 | k: torch.Tensor, 71 | i: torch.Tensor, 72 | g: torch.Tensor, 73 | ) -> torch.Tensor: 74 | 75 | # expects q: b, h, n, d 76 | output, _ = self._la_impl(q, k, i, g) 77 | # import pdb; pdb.set_trace() 78 | output = self.la_output_norm(output) 79 | output = rearrange(output, 'b h n d -> n b (h d)') 80 | 81 | return output 82 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/lasp2/__init__.py: -------------------------------------------------------------------------------- 1 | from .lasp2 import ( 2 | LASP2 3 | ) 4 | 5 | __all__ = [ 6 | "LASP2" 7 | ] 8 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/lasp2/lasp2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from einops import rearrange 3 | from megatron.core.transformer.module import MegatronModule 4 | from transformers.activations import ACT2FN 5 | 6 | from linear_moe.model.common_modules import RMSNorm 7 | from .lasp2_with_mask_triton_op import lasp2_with_mask_triton_op 8 | from .lasp2_without_mask_triton_op import lasp2_without_mask_triton_op 9 | from megatron.core.parallel_state import get_context_parallel_group 10 | 11 | 12 | class LASP2(MegatronModule): 13 | 14 | def __init__( 15 | self, 16 | config, 17 | expand_k: float = 1.0, 18 | expand_v: float = 1.0, 19 | ): 20 | super().__init__(config) 21 | 22 | self.la_mode = config.la_mode 23 | self.hidden_size = config.hidden_size 24 | self.num_heads = config.num_attention_heads 25 | # num_kv_heads here mains num_query_groups 26 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 27 | self.num_kv_groups = self.num_heads // self.num_kv_heads 28 | 29 | self.la_feature_map = config.la_feature_map 30 | self.la_feature_map_fn = ACT2FN[self.la_feature_map] if self.la_feature_map is not None else None 31 | 32 | self.key_dim = int(config.hidden_size * expand_k) 33 | 
self.value_dim = int(config.hidden_size * expand_v) 34 | 35 | assert self.la_mode in ['chunk', 'fused_chunk', 'parallel', 'fused_recurrent'], f"Not supported mode `{self.la_mode}`." 36 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 37 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 38 | 39 | self.head_qk_dim = self.key_dim // self.num_heads 40 | self.head_v_dim = self.value_dim // self.num_heads 41 | 42 | if config.la_output_norm == 'rmsnorm': 43 | self.la_output_norm = RMSNorm(hidden_size=self.head_v_dim, elementwise_affine=config.la_elementwise_affine, eps=config.la_norm_eps) 44 | elif config.la_output_norm == 'identity': 45 | self.la_output_norm = torch.nn.Identity() 46 | else: 47 | raise NotImplementedError(f"Not supported output norm `{self.la_output_norm}`.") 48 | 49 | self._la_impl = lasp2_without_mask_triton_op 50 | 51 | self.apply(self._initialize_weights) 52 | 53 | def _initialize_weights(self, module: torch.nn.Module): 54 | if getattr(module, "_is_hf_initialized", False): 55 | return 56 | if isinstance(module, torch.nn.Linear): 57 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 58 | if module.bias is not None: 59 | torch.nn.init.zeros_(module.bias) 60 | module._is_hf_initialized = True 61 | 62 | def forward( 63 | self, 64 | q: torch.Tensor, 65 | k: torch.Tensor, 66 | v: torch.Tensor, 67 | ) -> torch.Tensor: 68 | q, k, v = (rearrange(x, 'n b h d -> b h n d') for x in (q, k, v)) 69 | 70 | if self.la_feature_map_fn is not None: 71 | q, k = map(self.la_feature_map_fn, (q, k)) 72 | 73 | # expects q: b, h, n, d 74 | output = self._la_impl(q, k, v, get_context_parallel_group()) 75 | 76 | output = self.la_output_norm(output) 77 | output = rearrange(output, 'b h n d -> n b (h d)') 78 | 79 | return output 80 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/lightning_attention/__init__.py: -------------------------------------------------------------------------------- 1 | from .lightning_attention import ( 2 | LightningAttention 3 | ) 4 | 5 | __all__ = [ 6 | "LightningAttention" 7 | ] 8 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/mamba2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSparseLLMs/Linear-MoE/b312754b9b8a2cb9eb15e373baabe263b8d409c1/linear_moe/sequence_modeling/mamba2/__init__.py -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/mamba2/mamba_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, Tri Dao, Albert Gu. 2 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | 4 | # Some of this code was adopted from https://github.com/state-spaces/mamba/ 5 | # This source code is licensed under the Apache license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | 8 | from dataclasses import dataclass 9 | from typing import Union 10 | 11 | import torch 12 | from torch import Tensor 13 | 14 | from megatron.core.transformer.identity_op import IdentityOp 15 | from megatron.core.transformer.module import MegatronModule 16 | from megatron.core.transformer.spec_utils import ModuleSpec, build_module 17 | from megatron.core.transformer.transformer_config import TransformerConfig 18 | 19 | 20 | @dataclass 21 | class MambaLayerSubmodules: 22 | norm: Union[ModuleSpec, type] = IdentityOp 23 | mixer: Union[ModuleSpec, type] = IdentityOp 24 | mamba_bda: Union[ModuleSpec, type] = IdentityOp 25 | 26 | 27 | class MambaLayer(MegatronModule): 28 | def __init__( 29 | self, 30 | config: TransformerConfig, 31 | submodules: MambaLayerSubmodules, 32 | mamba_ssm_ngroups=8, 33 | layer_number: int = 1, 34 | residual_in_fp32=False, 35 | ): 36 | """ 37 | Top level Mamba Layer 38 | """ 39 | super().__init__(config) 40 | self.config = config 41 | self.layer_number = layer_number 42 | self.residual_in_fp32 = residual_in_fp32 43 | self.hidden_dropout = config.hidden_dropout 44 | self.mixer = build_module( 45 | submodules.mixer, 46 | self.config, 47 | d_model=self.config.hidden_size, 48 | ngroups=mamba_ssm_ngroups, 49 | layer_number=layer_number, 50 | ) 51 | self.norm = build_module(submodules.norm, self.config, self.config.hidden_size) 52 | self.mamba_bda = build_module(submodules.mamba_bda) 53 | self.bias_dropout_add_exec_handler = torch.enable_grad 54 | 55 | def forward( 56 | self, 57 | hidden_states: Tensor, 58 | attention_mask: Tensor, # Not used in MambaLayer 59 | inference_params=None, 60 | rotary_pos_emb: Tensor = None, # Not used in MambaLayer 61 | ): 62 | 63 | residual = hidden_states 64 | if self.residual_in_fp32: 65 | residual = residual.to(torch.float32) 66 | 67 | hidden_states = hidden_states.to(dtype=self.config.params_dtype) 68 | hidden_states = self.norm(hidden_states) 69 | 70 | mixer_out_with_bias = self.mixer(hidden_states, inference_params=inference_params) 71 | 72 | with self.bias_dropout_add_exec_handler(): 73 | hidden_states = self.mamba_bda(self.training, self.config.bias_dropout_fusion)( 74 | mixer_out_with_bias, residual, self.hidden_dropout 75 | ) 76 | 77 | return hidden_states 78 | 79 | def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None): 80 | return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype) 81 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/mamba2/triton_cache_manager.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import os 4 | import socket 5 | from pathlib import Path 6 | 7 | import torch 8 | 9 | try: 10 | from triton.runtime.cache import FileCacheManager 11 | except ImportError: 12 | raise ImportError("triton is required by the Mamba model but cannot be imported") 13 | 14 | 15 | def get_rank(): 16 | return torch.distributed.get_rank() 17 | 18 | 19 | def default_cache_dir(): 20 | return os.path.join(Path.home(), ".triton", "cache") 21 | 22 | 23 | class ParallelFileCacheManager(FileCacheManager): 24 | 25 | # See https://github.com/triton-lang/triton/blob/main/python/triton/runtime/cache.py 26 | 27 | # When running Triton with multiple ranks, they each create their own cache manager. 
Their input 28 | # keys to that class are mostly (but not entirely) the same across ranks, which leads many ranks 29 | # to write to the same 'key' directories in the cache dir at the same time during compilation, 30 | # leading to conflicts. This works around that by making each cache dir be rank specific by 31 | # adding "rank__" to the cache directory. 32 | 33 | def __init__(self, key): 34 | self.key = key 35 | self.lock_path = None 36 | # create cache directory if it doesn't exist 37 | self.cache_dir = os.environ.get('TRITON_CACHE_DIR', default_cache_dir()) 38 | self.cache_dir = os.path.join( 39 | self.cache_dir, "rank_{}_{}".format(socket.gethostname(), os.getpid()) 40 | ) 41 | if self.cache_dir: 42 | self.cache_dir = os.path.join(self.cache_dir, self.key) 43 | self.lock_path = os.path.join(self.cache_dir, "lock") 44 | os.makedirs(self.cache_dir, exist_ok=True) 45 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/rebased/__init__.py: -------------------------------------------------------------------------------- 1 | from .rebased import ( 2 | Rebased 3 | ) 4 | 5 | __all__ = [ 6 | "Rebased" 7 | ] 8 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/rebased/rebased.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | from megatron.core.transformer.module import MegatronModule 7 | from linear_moe.model.common_modules.feature_map import RebasedFeatureMap 8 | from fla.ops.linear_attn import chunk_linear_attn, fused_chunk_linear_attn 9 | from fla.ops.rebased import parallel_rebased 10 | 11 | 12 | class Rebased(MegatronModule): 13 | 14 | def __init__( 15 | self, 16 | config, 17 | expand_k: float = 1.0, 18 | expand_v: float = 1.0, 19 | use_gamma: Optional[bool] = True, 20 | use_beta: Optional[bool] = True, 21 | normalize: Optional[bool] = True, 22 | eps: float = 1e-5, 23 | ): 24 | super().__init__(config) 25 | 26 | self.la_mode = config.la_mode 27 | self.hidden_size = config.hidden_size 28 | self.key_dim = int(config.hidden_size * expand_k) 29 | self.value_dim = int(config.hidden_size * expand_v) 30 | self.num_heads = config.num_attention_heads 31 | # num_kv_heads here mains num_query_groups 32 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 33 | self.num_kv_groups = self.num_heads // self.num_kv_heads 34 | self.head_qk_dim = self.key_dim // self.num_heads 35 | self.head_v_dim = self.value_dim // self.num_heads 36 | self.la_eps = eps 37 | self.la_feature_map_fn = RebasedFeatureMap(self.head_qk_dim, use_gamma, use_beta, normalize) 38 | 39 | 40 | assert self.la_mode in ['chunk', 'fused_chunk', 'parallel'], f"Not supported mode `{self.la_mode}`." 
41 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 42 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 43 | 44 | if self.la_mode == 'chunk': 45 | self._la_impl = chunk_linear_attn 46 | elif self.la_mode == 'fused_chunk': 47 | self._la_impl = fused_chunk_linear_attn 48 | elif self.la_mode == 'parallel': 49 | self._la_impl = parallel_rebased 50 | 51 | self.apply(self._initialize_weights) 52 | 53 | def _initialize_weights(self, module: torch.nn.Module): 54 | if getattr(module, "_is_hf_initialized", False): 55 | return 56 | if isinstance(module, torch.nn.Linear): 57 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 58 | if module.bias is not None: 59 | torch.nn.init.zeros_(module.bias) 60 | module._is_hf_initialized = True 61 | 62 | 63 | def forward( 64 | self, 65 | q: torch.Tensor, 66 | k: torch.Tensor, 67 | v: torch.Tensor, 68 | ) -> torch.Tensor: 69 | # torch.Size([128, 4, 16, 32]) 70 | q, k, v = (rearrange(x, 'n b h d -> b h n d') for x in (q, k, v)) 71 | 72 | q, k = self.la_feature_map_fn(q, flatten=(self.la_mode != 'parallel')), self.la_feature_map_fn(k, flatten=(self.la_mode != 'parallel')) 73 | 74 | # expects q: B, H, T, K 75 | if self.la_mode in ['chunk', 'fused_chunk']: 76 | output, _ = self._la_impl(q, k, v, normalize=True, scale=1) 77 | elif self.la_mode == 'parallel': 78 | assert q.shape[-1] <= 128 79 | output, _ = self._la_impl(q, k, v, self.la_eps, True, True) 80 | 81 | output = rearrange(output, 'b h n d -> n b (h d)') 82 | return output 83 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/retention/__init__.py: -------------------------------------------------------------------------------- 1 | from .retention import ( 2 | Retention 3 | ) 4 | 5 | __all__ = [ 6 | "Retention" 7 | ] 8 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/retention/retention.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | from megatron.core.transformer.module import MegatronModule 7 | from transformers.activations import ACT2FN 8 | import torch.nn.functional as F 9 | 10 | from linear_moe.model.common_modules import RMSNorm 11 | from fla.ops.retention import (chunk_retention, fused_chunk_retention, 12 | fused_recurrent_retention, parallel_retention) 13 | 14 | 15 | class Retention(MegatronModule): 16 | 17 | def __init__( 18 | self, 19 | config, 20 | expand_k: float = 1.0, 21 | expand_v: float = 1.0, 22 | ): 23 | super().__init__(config) 24 | 25 | self.la_mode = config.la_mode 26 | self.hidden_size = config.hidden_size 27 | self.num_heads = config.num_attention_heads 28 | # num_kv_heads here mains num_query_groups 29 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 30 | self.num_kv_groups = self.num_heads // self.num_kv_heads 31 | 32 | self.la_feature_map = config.la_feature_map 33 | if self.la_feature_map == 'elu': 34 | def elu(x): 35 | return F.elu(x) + 1 36 | self.la_feature_map_fn = elu 37 | else: 38 | self.la_feature_map_fn = ACT2FN[self.la_feature_map] if self.la_feature_map is not None else None 39 | 40 | self.key_dim = int(config.hidden_size * expand_k) 41 | self.value_dim = int(config.hidden_size * expand_v) 42 
| 43 | assert self.la_mode in ['chunk', 'fused_chunk', 'parallel', 'fused_recurrent'], f"Not supported mode `{self.la_mode}`." 44 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 45 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 46 | 47 | self.head_qk_dim = self.key_dim // self.num_heads 48 | self.head_v_dim = self.value_dim // self.num_heads 49 | 50 | if config.la_output_norm == 'rmsnorm': 51 | self.la_output_norm = RMSNorm(hidden_size=self.head_v_dim, elementwise_affine=config.la_elementwise_affine, eps=config.la_norm_eps) 52 | elif config.la_output_norm == 'identity': 53 | self.la_output_norm = torch.nn.Identity() 54 | else: 55 | raise NotImplementedError(f"Not supported output norm `{self.la_output_norm}`.") 56 | 57 | if self.la_mode == 'chunk': 58 | self._la_impl = chunk_retention 59 | elif self.la_mode == 'fused_chunk': 60 | self._la_impl = fused_chunk_retention 61 | elif self.la_mode == 'fused_recurrent': 62 | self._la_impl = fused_recurrent_retention 63 | elif self.la_mode == 'parallel': 64 | self._la_impl = parallel_retention 65 | 66 | self.apply(self._initialize_weights) 67 | 68 | def _initialize_weights(self, module: torch.nn.Module): 69 | if getattr(module, "_is_hf_initialized", False): 70 | return 71 | if isinstance(module, torch.nn.Linear): 72 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 73 | if module.bias is not None: 74 | torch.nn.init.zeros_(module.bias) 75 | module._is_hf_initialized = True 76 | 77 | 78 | def forward( 79 | self, 80 | q: torch.Tensor, 81 | k: torch.Tensor, 82 | v: torch.Tensor, 83 | ) -> torch.Tensor: 84 | # torch.Size([128, 4, 16, 32]) 85 | q, k, v = (rearrange(x, 'n b h d -> b h n d') for x in (q, k, v)) 86 | 87 | if self.la_feature_map_fn is not None: 88 | q, k = map(self.la_feature_map_fn, (q, k)) 89 | 90 | # expects q: B, H, T, K 91 | output, _ = self._la_impl(q, k, v) 92 | output = self.la_output_norm(output) 93 | 94 | output = rearrange(output, 'b h n d -> n b (h d)') 95 | 96 | return output 97 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/rwkv6/__init__.py: -------------------------------------------------------------------------------- 1 | from .rwkv6 import RWKV6 2 | from .dd_lerp_linear import LerpLinear, DDLerpLinear 3 | 4 | __all__ = [ 5 | "RWKV6", 6 | "LerpLinear", 7 | "DDLerpLinear", 8 | ] 9 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/rwkv6/dd_lerp_linear.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | import torch.nn as nn 5 | from typing import Optional 6 | from einops import rearrange 7 | from megatron.core.transformer.module import MegatronModule 8 | 9 | 10 | class LerpLinear(nn.Module): 11 | 12 | def __init__( 13 | self, 14 | input_dim: int, 15 | output_dim: int, 16 | low_rank_dim: Optional[int] = None 17 | ): 18 | super().__init__() 19 | 20 | self.input_dim = input_dim 21 | self.output_dim = output_dim 22 | self.low_rank_dim = low_rank_dim 23 | 24 | self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) 25 | if low_rank_dim is None: 26 | self.linear = nn.Linear(input_dim, output_dim, bias=False) 27 | else: 28 | self.linear = LoRA(input_dim, output_dim, low_rank_dim) 29 | self.mu = nn.Parameter(torch.zeros(input_dim)) 30 | 31 | def __repr__(self) -> str: 32 | s = 
f"{self.__class__.__name__}({self.input_dim}, {self.output_dim}" 33 | if self.low_rank_dim is not None: 34 | s += f", low_rank_dim={self.low_rank_dim}" 35 | s += ")" 36 | return s 37 | 38 | def forward(self, x: torch.Tensor, delta: Optional[torch.Tensor] = None) -> torch.Tensor: 39 | if delta is None: 40 | shifted = self.time_shift(x) 41 | if len(shifted.shape) == 2: 42 | shifted = shifted.unsqueeze(1) 43 | delta = shifted - x 44 | return self.linear(x + delta * self.mu) 45 | 46 | 47 | class DDLerpLinear(MegatronModule): 48 | 49 | def __init__( 50 | self, 51 | config, 52 | input_dim: int, 53 | output_dim: int, 54 | low_rank_dim: Optional[int] = None 55 | ): 56 | super().__init__(config) 57 | 58 | self.input_dim = input_dim 59 | self.output_dim = output_dim 60 | self.low_rank_dim = low_rank_dim 61 | 62 | self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) 63 | if low_rank_dim is None: 64 | self.linear = nn.Linear(input_dim, output_dim, bias=False) 65 | else: 66 | self.linear = LoRA(input_dim, output_dim, low_rank_dim) 67 | 68 | def __repr__(self) -> str: 69 | s = f"{self.__class__.__name__}({self.input_dim}, {self.output_dim}" 70 | if self.low_rank_dim is not None: 71 | s += f", low_rank_dim={self.low_rank_dim}" 72 | s += ")" 73 | return s 74 | 75 | def forward(self, x: torch.Tensor, mu: torch.Tensor, delta: Optional[torch.Tensor] = None) -> torch.Tensor: 76 | if delta is None: 77 | shifted = self.time_shift(x) 78 | if len(shifted.shape) == 2: 79 | shifted = shifted.unsqueeze(1) 80 | delta = shifted - x 81 | return self.linear(x + delta * mu) 82 | 83 | 84 | class LoRA(nn.Module): 85 | 86 | def __init__( 87 | self, 88 | input_dim: int, 89 | output_dim: int, 90 | low_rank_dim: int, 91 | bias: Optional[bool] = True 92 | ): 93 | super().__init__() 94 | 95 | self.input_dim = input_dim 96 | self.output_dim = output_dim 97 | self.low_rank_dim = low_rank_dim 98 | self.bias = bias 99 | 100 | self.lora = nn.Sequential( 101 | nn.Linear(input_dim, low_rank_dim, bias=False), 102 | nn.Tanh(), 103 | nn.Linear(low_rank_dim, output_dim, bias=bias) 104 | ) 105 | 106 | def __repr__(self) -> str: 107 | s = f"{self.__class__.__name__}(" 108 | s += f"input_dim={self.input_dim}, low_rank_dim={self.low_rank_dim}, output_dim={self.output_dim}" 109 | if not self.bias: 110 | s += f", bias={self.bias}" 111 | s += ")" 112 | return s 113 | 114 | def forward(self, x: torch.Tensor) -> torch.Tensor: 115 | return self.lora(x) -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/rwkv6/rwkv6.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | import torch.nn.functional as F 7 | from megatron.core.transformer.module import MegatronModule 8 | from transformers.activations import ACT2FN 9 | 10 | from linear_moe.model.common_modules import RMSNorm, GroupNorm 11 | from fla.ops.rwkv6 import chunk_rwkv6, fused_recurrent_rwkv6 12 | 13 | 14 | class RWKV6(MegatronModule): 15 | 16 | def __init__( 17 | self, 18 | config, 19 | expand_k: float = 0.5, 20 | expand_v: float = 1.0, 21 | ): 22 | super().__init__(config) 23 | 24 | self.la_mode = config.la_mode 25 | self.hidden_size = config.hidden_size 26 | self.num_heads = config.num_attention_heads 27 | # num_kv_heads here mains num_query_groups 28 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 29 | 
self.num_kv_groups = self.num_heads // self.num_kv_heads 30 | 31 | # self.la_feature_map = config.la_feature_map 32 | # self.la_feature_map_fn = ACT2FN[self.la_feature_map] if self.la_feature_map is not None else None 33 | 34 | self.key_dim = int(config.hidden_size * expand_k) 35 | self.value_dim = int(config.hidden_size * expand_v) 36 | 37 | assert self.la_mode in ['chunk', 'fused_recurrent'], f"Not supported mode `{self.la_mode}`." 38 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 39 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 40 | 41 | self.head_dim = self.hidden_size // self.num_heads 42 | self.head_qk_dim = self.key_dim // self.num_heads 43 | self.head_v_dim = self.value_dim // self.num_heads 44 | 45 | if config.la_output_norm == 'rmsnorm': 46 | self.la_output_norm = RMSNorm(hidden_size=self.head_v_dim, elementwise_affine=config.la_elementwise_affine, eps=config.la_norm_eps) 47 | elif config.la_output_norm == 'identity': 48 | self.la_output_norm = torch.nn.Identity() 49 | elif config.la_output_norm == 'groupnorm': 50 | self.la_output_norm = GroupNorm(self.num_heads, self.value_dim, elementwise_affine=config.la_elementwise_affine, bias=True, eps=config.la_norm_eps) 51 | else: 52 | raise NotImplementedError(f"Not supported output norm `{self.la_output_norm}`.") 53 | 54 | if self.la_mode == 'chunk': 55 | self._la_impl = chunk_rwkv6 56 | elif self.la_mode == 'fused_recurrent': 57 | self._la_impl = fused_recurrent_rwkv6 58 | 59 | self.apply(self._initialize_weights) 60 | 61 | def _initialize_weights(self, module: torch.nn.Module): 62 | if getattr(module, "_is_hf_initialized", False): 63 | return 64 | if isinstance(module, torch.nn.Linear): 65 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 66 | if module.bias is not None: 67 | torch.nn.init.zeros_(module.bias) 68 | module._is_hf_initialized = True 69 | 70 | 71 | def forward( 72 | self, 73 | r: torch.Tensor, 74 | k: torch.Tensor, 75 | v: torch.Tensor, 76 | w: torch.Tensor, 77 | u: torch.Tensor, 78 | scale: float, 79 | ) -> torch.Tensor: 80 | 81 | # expects q: b, h, n, d 82 | output, _ = self._la_impl(r, k, v, w, u, scale) 83 | if isinstance(self.la_output_norm, GroupNorm): 84 | output = self.la_output_norm(rearrange(output, 'b h n d -> b n (h d)')) 85 | output = rearrange(output, 'b n (h d) -> n b (h d)', h = self.head_dim) 86 | elif isinstance(self.la_output_norm, RMSNorm): 87 | output = self.la_output_norm(output) 88 | output = rearrange(output, 'b h n d -> n b (h d)') 89 | 90 | return output 91 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/rwkv7/__init__.py: -------------------------------------------------------------------------------- 1 | from .rwkv7 import RWKV7 2 | from .lora_mlp import LoRA 3 | __all__ = [ 4 | "RWKV7", 5 | "LoRA", 6 | ] 7 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/rwkv7/lora_mlp.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | import torch.nn as nn 5 | from typing import Optional 6 | from einops import rearrange 7 | from megatron.core.transformer.module import MegatronModule 8 | 9 | 10 | class LoRA(MegatronModule): 11 | 12 | def __init__( 13 | self, 14 | config, 15 | input_dim: int, 16 | output_dim: int, 17 | low_rank_dim: int, 18 | bias: 
Optional[bool] = True, 19 | activation: Optional[str] = 'tanh' 20 | ): 21 | super().__init__(config) 22 | 23 | self.input_dim = input_dim 24 | self.output_dim = output_dim 25 | self.low_rank_dim = low_rank_dim 26 | self.bias = bias 27 | 28 | if activation is None: 29 | self.activation = nn.Identity() 30 | elif activation == 'sigmoid': 31 | self.activation = nn.Sigmoid() 32 | elif activation == 'tanh': 33 | self.activation = nn.Tanh() 34 | elif activation == 'relu': 35 | self.activation = nn.ReLU() 36 | else: 37 | raise ValueError(f"Not supported activation `{activation}`.") 38 | 39 | self.lora = nn.Sequential( 40 | nn.Linear(input_dim, low_rank_dim, bias=False), 41 | self.activation, 42 | nn.Linear(low_rank_dim, output_dim, bias=bias) 43 | ) 44 | 45 | def __repr__(self) -> str: 46 | s = f"{self.__class__.__name__}(" 47 | s += f"input_dim={self.input_dim}, low_rank_dim={self.low_rank_dim}, output_dim={self.output_dim}" 48 | if not self.bias: 49 | s += f", bias={self.bias}" 50 | s += ")" 51 | return s 52 | 53 | def forward(self, x: torch.Tensor) -> torch.Tensor: 54 | return self.lora(x) 55 | -------------------------------------------------------------------------------- /linear_moe/sequence_modeling/rwkv7/rwkv7.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | from typing import Optional 5 | from einops import rearrange 6 | import torch.nn.functional as F 7 | from megatron.core.transformer.module import MegatronModule 8 | from transformers.activations import ACT2FN 9 | 10 | from linear_moe.model.common_modules import RMSNorm, GroupNorm 11 | from fla.ops.rwkv7 import chunk_rwkv7, fused_recurrent_rwkv7 12 | 13 | 14 | class RWKV7(MegatronModule): 15 | 16 | def __init__( 17 | self, 18 | config, 19 | expand_k: float = 1.0, 20 | expand_v: float = 1.0, 21 | ): 22 | super().__init__(config) 23 | 24 | self.la_mode = config.la_mode 25 | self.hidden_size = config.hidden_size 26 | self.num_heads = config.num_attention_heads 27 | # num_kv_heads here mains num_query_groups 28 | self.num_kv_heads = config.num_query_groups if config.num_query_groups is not None else config.num_attention_heads 29 | self.num_kv_groups = self.num_heads // self.num_kv_heads 30 | 31 | # self.la_feature_map = config.la_feature_map 32 | # self.la_feature_map_fn = ACT2FN[self.la_feature_map] if self.la_feature_map is not None else None 33 | 34 | self.key_dim = int(config.hidden_size * expand_k) 35 | self.value_dim = int(config.hidden_size * expand_v) 36 | 37 | assert self.la_mode in [ 38 | 'chunk', 'fused_recurrent'], f"Not supported mode `{self.la_mode}`." 
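# Worked example with hypothetical numbers (not taken from any shipped config):
# for hidden_size=2048, num_attention_heads=32 and the default expand_k=expand_v=1.0,
# key_dim = value_dim = 2048, so head_qk_dim and head_v_dim computed below are both
# 2048 // 32 = 64; the divisibility asserts below guarantee this split is exact.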
39 | assert self.key_dim % self.num_heads == 0, f"key dim must be divisible by num_heads of {self.num_heads}" 40 | assert self.value_dim % self.num_heads == 0, f"value dim must be divisible by num_heads of {self.num_heads}" 41 | 42 | self.head_dim = self.hidden_size // self.num_heads 43 | self.head_qk_dim = self.key_dim // self.num_heads 44 | self.head_v_dim = self.value_dim // self.num_heads 45 | 46 | if config.la_output_norm == 'rmsnorm': 47 | self.la_output_norm = RMSNorm( 48 | hidden_size=self.head_v_dim, elementwise_affine=config.la_elementwise_affine, eps=config.la_norm_eps) 49 | elif config.la_output_norm == 'identity': 50 | self.la_output_norm = torch.nn.Identity() 51 | elif config.la_output_norm == 'groupnorm': 52 | self.la_output_norm = GroupNorm( 53 | self.num_heads, self.value_dim, elementwise_affine=config.la_elementwise_affine, bias=True, eps=config.la_norm_eps) 54 | else: 55 | raise NotImplementedError( 56 | f"Not supported output norm `{self.la_output_norm}`.") 57 | 58 | if self.la_mode == 'chunk': 59 | self._la_impl = chunk_rwkv7 60 | elif self.la_mode == 'fused_recurrent': 61 | self._la_impl = fused_recurrent_rwkv7 62 | 63 | self.apply(self._initialize_weights) 64 | 65 | def _initialize_weights(self, module: torch.nn.Module): 66 | if getattr(module, "_is_hf_initialized", False): 67 | return 68 | if isinstance(module, torch.nn.Linear): 69 | torch.nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5) 70 | if module.bias is not None: 71 | torch.nn.init.zeros_(module.bias) 72 | module._is_hf_initialized = True 73 | 74 | def forward( 75 | self, 76 | r: torch.Tensor, 77 | w: torch.Tensor, 78 | k: torch.Tensor, 79 | v: torch.Tensor, 80 | a: torch.Tensor, 81 | b: torch.Tensor, 82 | scale: float, 83 | ) -> torch.Tensor: 84 | 85 | # expects q: b, h, n, d 86 | output, _ = self._la_impl(r, w, k, v, a, b, scale) 87 | if isinstance(self.la_output_norm, GroupNorm): 88 | output = self.la_output_norm(rearrange(output, 'b h n d -> b n (h d)')) 89 | output = rearrange(output, 'b n (h d) -> n b (h d)', h=self.head_dim) 90 | elif isinstance(self.la_output_norm, RMSNorm): 91 | output = self.la_output_norm(output) 92 | output = rearrange(output, 'b h n d -> n b (h d)') 93 | 94 | return output 95 | -------------------------------------------------------------------------------- /linear_moe/tokenizer/jiebabpe_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Alibaba PAI Team. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
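# The tokenizer below pre-segments raw text with rjieba and then encodes the segments
# with a HuggingFace `tokenizers` BPE model as pre-tokenized input; code-like input
# (is_code=True) skips the jieba pass. Minimal usage sketch (the JSON path is hypothetical):
#
#   tok = JiebaBPETokenizer('/path/to/tokenizer.json')
#   ids = tok.tokenize('今天天气不错')   # jieba cut, then BPE encode -> token ids
#   text = tok.detokenize(ids)           # decode back to a string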
14 | 15 | from megatron.tokenizer.tokenizer import AbstractTokenizer 16 | from tokenizers import Tokenizer 17 | 18 | 19 | class JiebaBPETokenizer(AbstractTokenizer): 20 | """SentencePiece BPE tokenizer with Jieba integration""" 21 | def __init__(self, tokenizer_json_file): 22 | name = 'Jieba BPE Tokenizer' 23 | super().__init__(name) 24 | 25 | self.tokenizer = Tokenizer.from_file(tokenizer_json_file) 26 | self.eod_id = self.tokenizer.token_to_id('<|endoftext|>') 27 | 28 | try: 29 | import rjieba 30 | except ImportError: 31 | raise ImportError( 32 | 'You need to install rjieba to use JiebaBPETokenizer. ' 33 | 'See https://pypi.org/project/rjieba/ for installation.') 34 | self.jieba = rjieba 35 | self.new_line = self.vocab['\n'] 36 | self.sep_token = self.vocab[''] 37 | 38 | @property 39 | def vocab_size(self): 40 | return self.tokenizer.get_vocab_size(with_added_tokens=True) 41 | 42 | @property 43 | def vocab(self): 44 | return self.tokenizer.get_vocab(with_added_tokens=True) 45 | 46 | @property 47 | def inv_vocab(self): 48 | vocab = self.vocab 49 | inv_vocab = dict() 50 | for key, val in vocab.items(): 51 | inv_vocab[val] = key 52 | return inv_vocab 53 | 54 | def tokenize(self, text, is_code=False, only_cut=False): 55 | if only_cut: 56 | seg_list = [x for x in self.jieba.cut(text)] 57 | return seg_list 58 | if not is_code: 59 | seg_list = [x for x in self.jieba.cut(text)] 60 | return self.tokenizer.encode(seg_list, 61 | is_pretokenized=True, 62 | add_special_tokens=True).ids 63 | else: 64 | return self.tokenizer.encode(text, 65 | is_pretokenized=False, 66 | add_special_tokens=True).ids 67 | 68 | def detokenize(self, token_ids): 69 | text = self.tokenizer.decode(token_ids, skip_special_tokens=False) 70 | return text 71 | 72 | def convert_tokens_to_ids(self, tokens): 73 | return self.tokenizer.encode(tokens, 74 | is_pretokenized=True, 75 | add_special_tokens=True).ids 76 | 77 | @property 78 | def eod(self): 79 | return self.eod_id 80 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | icetk 2 | ftfy 3 | hjson 4 | ninja 5 | tokenizers 6 | transformers 7 | accelerate 8 | datasets 9 | wcwidth 10 | pyarrow 11 | jieba 12 | sentencepiece 13 | rjieba 14 | sqlitedict 15 | sacrebleu 16 | tensorboard 17 | lm_dataformat 18 | tiktoken 19 | mamba-ssm 20 | megablocks==0.5.1 # torch<3.0,>=2.3.0 21 | stanford-stk==0.7.1 22 | git+https://github.com/fanshiqing/grouped_gemm@v1.0 23 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/README.md: -------------------------------------------------------------------------------- 1 | ## hf-to-megatron 2 | hf-to-megatron is a model checkpoint conversion tool that makes it easy to convert HuggingFace-format checkpoints to the Megatron format, so that Megatron-LM's distributed training capabilities can be used to train large language models. Converted models must be used together with the PAI-Megatron-Patch codebase. The following models are currently supported: 3 | 4 | + bloom 5 | + llama/alpaca 6 | + chatglm 7 | + galactica 8 | + glm 9 | + glm130B 10 | + falcon 11 | + starcoder 12 | 13 | Converted models are stored at: oss://atp-modelzoo/release/models/pai-megatron-patch/ 14 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/baichuan/configuration_baichuan.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
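# Note: the defaults below (hidden_size=5120, num_hidden_layers=40, num_attention_heads=40,
# vocab_size=64000) correspond to the Baichuan-13B scale targeted by the convertor scripts
# in this directory; other model sizes can be set through the keyword arguments.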
2 | 3 | from transformers.configuration_utils import PretrainedConfig 4 | 5 | 6 | class BaichuanConfig(PretrainedConfig): 7 | model_type = "baichuan" 8 | keys_to_ignore_at_inference = ["past_key_values"] 9 | 10 | def __init__( 11 | self, 12 | vocab_size=64000, 13 | hidden_size=5120, 14 | intermediate_size=13696, 15 | num_hidden_layers=40, 16 | num_attention_heads=40, 17 | hidden_act="silu", 18 | model_max_length=4096, 19 | initializer_range=0.02, 20 | rms_norm_eps=1e-6, 21 | use_cache=True, 22 | pad_token_id=0, 23 | bos_token_id=1, 24 | eos_token_id=2, 25 | tie_word_embeddings=False, 26 | gradient_checkpointing=False, 27 | z_loss_weight=0, 28 | **kwargs, 29 | ): 30 | self.vocab_size = vocab_size 31 | self.model_max_length = model_max_length 32 | self.hidden_size = hidden_size 33 | self.intermediate_size = intermediate_size 34 | self.num_hidden_layers = num_hidden_layers 35 | self.num_attention_heads = num_attention_heads 36 | self.hidden_act = hidden_act 37 | self.initializer_range = initializer_range 38 | self.rms_norm_eps = rms_norm_eps 39 | self.use_cache = use_cache 40 | self.z_loss_weight = z_loss_weight 41 | self.gradient_checkpointing = (gradient_checkpointing,) 42 | super().__init__( 43 | pad_token_id=pad_token_id, 44 | bos_token_id=bos_token_id, 45 | eos_token_id=eos_token_id, 46 | tie_word_embeddings=tie_word_embeddings, 47 | **kwargs, 48 | ) 49 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/baichuan/model_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # transformers to megatron 3 | # sh model_convertor.sh /root/Megatron-LM-23.04/ /mnt/baichuan-ckpts/baichuan-13b-base/ /mnt/baichuan-ckpts/baichuan-13b-base-hf-to-megatron-tp1-pp1 1 1 baichuan-13b 0 false 4 | set -e 5 | START_TIME=$SECONDS 6 | 7 | MEGATRON_PATH=$1 8 | SOURCE_CKPT_PATH=$2 9 | TARGET_CKPT_PATH=$3 10 | TP=$4 11 | PP=$5 12 | MN=$6 #baichuan-13b 13 | EXTRA_VOCAB_SIZE=$7 14 | mg2hf=$8 15 | 16 | if [ $mg2hf = true ]; then 17 | do_options=" 18 | --convert_checkpoint_from_megatron_to_transformers 19 | " 20 | elif [ $mg2hf = false ]; then 21 | do_options="" 22 | fi 23 | 24 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 25 | 26 | python checkpoint_reshaping_and_interoperability.py \ 27 | --load_path ${SOURCE_CKPT_PATH} \ 28 | --save_path ${TARGET_CKPT_PATH} \ 29 | --target_params_dtype fp16 \ 30 | --megatron-path ${MEGATRON_PATH} \ 31 | --target_tensor_model_parallel_size ${TP} \ 32 | --target_pipeline_model_parallel_size ${PP} \ 33 | --model_name ${MN} \ 34 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 35 | ${do_options} 36 | 37 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 38 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 39 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/baichuan/te_model_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | START_TIME=$SECONDS 5 | 6 | MEGATRON_PATH=$1 7 | SOURCE_CKPT_PATH=$2 8 | TARGET_CKPT_PATH=$3 9 | TP=$4 10 | PP=$5 11 | MN=$6 #baichuan-13b 12 | EXTRA_VOCAB_SIZE=$7 13 | 14 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 15 | 16 | python hf2te.py \ 17 | --load_path ${SOURCE_CKPT_PATH} \ 18 | --save_path ${TARGET_CKPT_PATH} \ 19 | --target_params_dtype fp16 \ 20 | --megatron-path ${MEGATRON_PATH} \ 21 | --target_tensor_model_parallel_size ${TP} \ 22 | --target_pipeline_model_parallel_size 
${PP} \ 23 | --model_name ${MN} \ 24 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 25 | 26 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 27 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 28 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/baichuan2/configuration_baichuan.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. 2 | 3 | from transformers.configuration_utils import PretrainedConfig 4 | 5 | 6 | class BaichuanConfig(PretrainedConfig): 7 | model_type = "baichuan" 8 | keys_to_ignore_at_inference = ["past_key_values"] 9 | 10 | def __init__( 11 | self, 12 | vocab_size=64000, 13 | hidden_size=5120, 14 | intermediate_size=13696, 15 | num_hidden_layers=40, 16 | num_attention_heads=40, 17 | hidden_act="silu", 18 | model_max_length=4096, 19 | initializer_range=0.02, 20 | rms_norm_eps=1e-6, 21 | use_cache=True, 22 | pad_token_id=0, 23 | bos_token_id=1, 24 | eos_token_id=2, 25 | tie_word_embeddings=False, 26 | gradient_checkpointing=False, 27 | z_loss_weight=0, 28 | **kwargs, 29 | ): 30 | self.vocab_size = vocab_size 31 | self.model_max_length = model_max_length 32 | self.hidden_size = hidden_size 33 | self.intermediate_size = intermediate_size 34 | self.num_hidden_layers = num_hidden_layers 35 | self.num_attention_heads = num_attention_heads 36 | self.hidden_act = hidden_act 37 | self.initializer_range = initializer_range 38 | self.rms_norm_eps = rms_norm_eps 39 | self.use_cache = use_cache 40 | self.z_loss_weight = z_loss_weight 41 | self.gradient_checkpointing = (gradient_checkpointing,) 42 | super().__init__( 43 | pad_token_id=pad_token_id, 44 | bos_token_id=bos_token_id, 45 | eos_token_id=eos_token_id, 46 | tie_word_embeddings=tie_word_embeddings, 47 | **kwargs, 48 | ) 49 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/baichuan2/hf2te_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | START_TIME=$SECONDS 5 | 6 | MEGATRON_PATH=$1 7 | SOURCE_CKPT_PATH=$2 8 | TARGET_CKPT_PATH=$3 9 | TP=$4 10 | PP=$5 11 | MN=$6 #baichuan2-7b 12 | EXTRA_VOCAB_SIZE=$7 13 | 14 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 15 | 16 | python hf2te.py \ 17 | --load_path ${SOURCE_CKPT_PATH} \ 18 | --save_path ${TARGET_CKPT_PATH} \ 19 | --target_params_dtype fp16 \ 20 | --megatron-path ${MEGATRON_PATH} \ 21 | --target_tensor_model_parallel_size ${TP} \ 22 | --target_pipeline_model_parallel_size ${PP} \ 23 | --model_name ${MN} \ 24 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 25 | 26 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 27 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 28 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/baichuan2/model_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # transformers to megatron 3 | # sh model_convertor.sh /root/Megatron-LM-23.04/ /mnt/baichuan-ckpts/baichuan-13b-base/ /mnt/baichuan-ckpts/baichuan-13b-base-hf-to-megatron-tp1-pp1 1 1 baichuan-13b 0 false 4 | # megatron to transformers 5 | # sh model_convertor.sh ../../../../Megatron-LM/ ../../../../baichuan/baichuan-13b-base-hf-to-megatron-tp4-pp1/release/ ../../../../baichuan/baichuan2-13b-mg2hf41 4 1 baichuan2-13b 0 true 6 | set -e 7 | 
START_TIME=$SECONDS 8 | 9 | MEGATRON_PATH=$1 10 | SOURCE_CKPT_PATH=$2 11 | TARGET_CKPT_PATH=$3 12 | TP=$4 13 | PP=$5 14 | MN=$6 #baichuan2-7b, baichuan2-13b 15 | EXTRA_VOCAB_SIZE=$7 16 | mg2hf=$8 17 | 18 | if [ $mg2hf = true ]; then 19 | do_options=" 20 | --convert_checkpoint_from_megatron_to_transformers 21 | " 22 | elif [ $mg2hf = false ]; then 23 | do_options="" 24 | fi 25 | 26 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 27 | 28 | python checkpoint_reshaping_and_interoperability.py \ 29 | --load_path ${SOURCE_CKPT_PATH} \ 30 | --save_path ${TARGET_CKPT_PATH} \ 31 | --target_params_dtype fp16 \ 32 | --megatron-path ${MEGATRON_PATH} \ 33 | --target_tensor_model_parallel_size ${TP} \ 34 | --target_pipeline_model_parallel_size ${PP} \ 35 | --model_name ${MN} \ 36 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 37 | ${do_options} 38 | 39 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 40 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 41 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/bloom/model_convertor_huggingface_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # transformers2megatron 3 | # bash model_convertor_huggingface_megatron.sh ../../Megatron-LM/ ../../bloomwcp-shrink ../../bloomwcp-shrink_mg_test 1 1 false 4 | # megatron2transformers 5 | # bash model_convertor_huggingface_megatron.sh ../../Megatron-LM/ ../../bloomwcp-shrink_mg_test/release/ ../../bloomwcp-shrink_mg2hf-wotrain 1 1 true 6 | set -e 7 | START_TIME=$SECONDS 8 | 9 | MEGATRON_PATH=$1 10 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 11 | 12 | SOURCE_CKPT_PATH=$2 13 | TARGET_CKPT_PATH=$3 14 | TP=$4 15 | PP=$5 16 | mg2hf=$6 17 | 18 | if [ $mg2hf = true ]; then 19 | do_options=" 20 | --convert_checkpoint_from_megatron_to_transformers 21 | " 22 | 23 | elif [ $mg2hf = false ]; then 24 | do_options="" 25 | fi 26 | 27 | python checkpoint_reshaping_and_interoperability.py \ 28 | --load_path ${SOURCE_CKPT_PATH} \ 29 | --save_path ${TARGET_CKPT_PATH} \ 30 | --target_params_dtype fp16 \ 31 | --megatron-path ${MEGATRON_PATH} \ 32 | --target_tensor_model_parallel_size ${TP} \ 33 | --target_pipeline_model_parallel_size ${PP}\ 34 | ${do_options} 35 | 36 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 37 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 38 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/bloom/reward_model_convertor_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # transformers2megatron 3 | # bash reward_model_convertor_megatron.sh ../../Megatron-LM/ ../../convert_models/hf-reward/reward-bloom-7b1 ../../convert_models/megatron-reward/megatron-reward-7b1-8/ 8 1 false 4 | set -e 5 | START_TIME=$SECONDS 6 | 7 | MEGATRON_PATH=$1 8 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 9 | 10 | SOURCE_CKPT_PATH=$2 11 | TARGET_CKPT_PATH=$3 12 | TP=$4 13 | PP=$5 14 | mg2hf=$6 15 | 16 | if [ $mg2hf = true ]; then 17 | do_options=" 18 | --convert_checkpoint_from_megatron_to_transformers 19 | " 20 | 21 | elif [ $mg2hf = false ]; then 22 | do_options="" 23 | fi 24 | 25 | python reward_model_to_megatron.py \ 26 | --load_path ${SOURCE_CKPT_PATH} \ 27 | --save_path ${TARGET_CKPT_PATH} \ 28 | --target_params_dtype fp16 \ 29 | --megatron-path ${MEGATRON_PATH} \ 30 | --target_tensor_model_parallel_size ${TP} \ 31 | --target_pipeline_model_parallel_size 
${PP}\ 32 | ${do_options} 33 | 34 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 35 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 36 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/bloom/run_convert_deepspeed_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_DEBUG=WARN 3 | export LC_ALL=C.UTF-8 4 | export CUDA_VISIBLE_DEVICES=7 5 | export PYTHONPATH=/workspace/RapidformerPro/:/workspace/RapidformerPro/Megatron-LM-main/:$PYTHONPATH 6 | 7 | DS_CKPT_PATH=/mnt/bloom-ckpts/bloomz-1b7-optimizer-states 8 | MG_CKPT_PATH=/mnt/bloom-ckpts/bloomz-1b7-optimizer-states-to-megatron 9 | 10 | python deepspeed_to_megatron.py \ 11 | --input_folder ${DS_CKPT_PATH} \ 12 | --output_folder ${MG_CKPT_PATH} \ 13 | --target_tp 1 \ 14 | --target_pp 1 \ 15 | --for_release 16 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/bloom/run_convert_deepspeed_to_transformers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export NCCL_DEBUG=WARN 3 | export LC_ALL=C.UTF-8 4 | export CUDA_VISIBLE_DEVICES=7 5 | export PYTHONPATH=/workspace/RapidformerPro/:/workspace/RapidformerPro/Megatron-LM-main/:$PYTHONPATH 6 | 7 | DS_CKPT_PATH=/mnt/bloom-ckpts/bloomz-1b7-optimizer-states 8 | HF_CKPT_PATH=/mnt/bloom-ckpts/bloomz-1b7-optimizer-states-to-transformers 9 | 10 | python convert_bloom_original_checkpoint_to_pytorch.py \ 11 | --bloom_checkpoint_path ${DS_CKPT_PATH} \ 12 | --pytorch_dump_folder_path ${HF_CKPT_PATH} \ 13 | --pretraining_tp 1 \ 14 | --bloom_config_file /mnt/bloom-ckpts/bloomz-1b7/config.json 15 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/chatglm/run_convert_huggingface_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | START_TIME=$SECONDS 4 | 5 | MEGATRON_PATH=$1 6 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 7 | 8 | HF_CKPT_PATH=$2 9 | MG_CKPT_PATH=$3 10 | TP=$4 11 | PP=$5 12 | 13 | python checkpoint_reshaping_and_interoperability.py \ 14 | --load_path ${HF_CKPT_PATH} \ 15 | --save_path ${MG_CKPT_PATH} \ 16 | --target_params_dtype fp32 \ 17 | --megatron-path ${MEGATRON_PATH} \ 18 | --target_tensor_model_parallel_size ${TP} \ 19 | --target_pipeline_model_parallel_size ${PP} \ 20 | 21 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 22 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 23 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/deepseek/hf2mcore_deepseek_v2_moe_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | export CUDA_VISIBLE_DEVICES=7 4 | START_TIME=$SECONDS 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 7 | 8 | MODEL_SIZE=$1 9 | SOURCE_CKPT_PATH=$2 10 | TARGET_CKPT_PATH=$3 11 | TP=$4 12 | PP=$5 13 | EP=$6 14 | mg2hf=$7 15 | HF_CKPT_PATH=$8 16 | 17 | CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" 18 | MEGATRON_PATH=$( dirname $(dirname $( dirname ${CURRENT_DIR}))) 19 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240405 20 | 21 | if [ $MODEL_SIZE = A2.4B ]; then 22 | 23 | HIDDEN_SIZE=2048 24 | NUM_ATTN_HEADS=16 25 | NUM_LAYERS=27 26 | INTERMEDIATE_SIZE=10944 27 
| MOE_INTERMEDIATE_SIZE=1408 28 | MAX_POSITION_EMBEDDINGS=163840 29 | EXTRA_VOCAB_SIZE=2400 30 | KV_LORA_RANK=512 31 | QK_NOPE_HEAD_DIM=128 32 | QK_ROPE_HEAD_DIM=64 33 | V_HEAD_DIM=128 34 | ROPE_THETA=10000 35 | SCALE_FACTOR=40 36 | NUM_EXPERTS=64 37 | ROUTER_TOPK=6 38 | NUM_SHARED_EXPERTS=2 39 | MOE_LAYER_FREQ=1 40 | 41 | moe_options=" \ 42 | --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \ 43 | --enable-shared-expert \ 44 | --moe-layer-freq ${MOE_LAYER_FREQ} \ 45 | --num-shared-experts ${NUM_SHARED_EXPERTS} \ 46 | --moe-router-topk ${ROUTER_TOPK} \ 47 | --num-experts ${NUM_EXPERTS} \ 48 | --moe-aux-loss-coeff 1e-2 \ 49 | --expert-model-parallel-size 1 \ 50 | --target-expert-model-parallel-size ${EP} \ 51 | --kv-lora-rank ${KV_LORA_RANK} \ 52 | --qk-nope-head-dim ${QK_NOPE_HEAD_DIM} \ 53 | --qk-rope-head-dim ${QK_ROPE_HEAD_DIM} \ 54 | --v-head-dim ${V_HEAD_DIM} \ 55 | --moe-router-load-balancing-type aux_loss" 56 | 57 | cpu_options=" \ 58 | --use-cpu-initialization" 59 | 60 | elif [ $MODEL_SIZE = A21B ]; then 61 | 62 | HIDDEN_SIZE=5120 63 | NUM_ATTN_HEADS=128 64 | NUM_LAYERS=60 65 | INTERMEDIATE_SIZE=12288 66 | MOE_INTERMEDIATE_SIZE=1536 67 | MAX_POSITION_EMBEDDINGS=163840 68 | EXTRA_VOCAB_SIZE=2400 69 | Q_LORA_RANK=1536 70 | KV_LORA_RANK=512 71 | QK_NOPE_HEAD_DIM=128 72 | QK_ROPE_HEAD_DIM=64 73 | V_HEAD_DIM=128 74 | ROPE_THETA=10000 75 | SCALE_FACTOR=40 76 | NUM_EXPERTS=160 77 | ROUTER_TOPK=6 78 | NUM_SHARED_EXPERTS=2 79 | MOE_LAYER_FREQ=1 80 | 81 | moe_options=" \ 82 | --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \ 83 | --enable-shared-expert \ 84 | --moe-layer-freq ${MOE_LAYER_FREQ} \ 85 | --num-shared-experts ${NUM_SHARED_EXPERTS} \ 86 | --moe-router-topk ${ROUTER_TOPK} \ 87 | --num-experts ${NUM_EXPERTS} \ 88 | --moe-aux-loss-coeff 1e-2 \ 89 | --expert-model-parallel-size 1 \ 90 | --target-expert-model-parallel-size ${EP} \ 91 | --q-lora-rank ${Q_LORA_RANK} \ 92 | --kv-lora-rank ${KV_LORA_RANK} \ 93 | --qk-nope-head-dim ${QK_NOPE_HEAD_DIM} \ 94 | --qk-rope-head-dim ${QK_ROPE_HEAD_DIM} \ 95 | --v-head-dim ${V_HEAD_DIM} \ 96 | --moe-router-load-balancing-type aux_loss" 97 | 98 | cpu_options=" \ 99 | --use-cpu-initialization" 100 | 101 | fi 102 | 103 | 104 | if [ $mg2hf = true ]; then 105 | convert_options=" \ 106 | --convert-checkpoint-from-megatron-to-transformers \ 107 | --hf-ckpt-path ${HF_CKPT_PATH}" 108 | 109 | elif [ $mg2hf = false ]; then 110 | convert_options="" 111 | fi 112 | 113 | 114 | DISTRIBUTED_ARGS="--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 115 | 116 | torchrun ${DISTRIBUTED_ARGS} hf2mcore_deepseek_v2_moe.py \ 117 | --load ${SOURCE_CKPT_PATH} \ 118 | --save ${TARGET_CKPT_PATH} \ 119 | --target-tensor-model-parallel-size ${TP} \ 120 | --pipeline-model-parallel-size ${PP} \ 121 | --micro-batch-size 1 \ 122 | --save-interval 1 \ 123 | --bf16 \ 124 | --swiglu \ 125 | --norm-epsilon 1e-6 \ 126 | --num-layers ${NUM_LAYERS} \ 127 | --hidden-size ${HIDDEN_SIZE} \ 128 | --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \ 129 | --ffn-hidden-size ${INTERMEDIATE_SIZE} \ 130 | --num-attention-heads ${NUM_ATTN_HEADS} \ 131 | --max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \ 132 | --seq-length 1 \ 133 | --no-async-tensor-model-parallel-allreduce \ 134 | --patch-tokenizer-type LLamaTokenizer \ 135 | --extra-vocab-size ${EXTRA_VOCAB_SIZE} \ 136 | --untie-embeddings-and-output-weights \ 137 | --no-bias-swiglu-fusion \ 138 | --no-rope-fusion \ 139 | --use-rotary-position-embeddings \ 140 | --transformer-impl transformer_engine \ 
141 | --disable-bias-linear \ 142 | --normalization RMSNorm \ 143 | --use-mcore-models \ 144 | --attention-dropout 0.0 \ 145 | --hidden-dropout 0.0 \ 146 | --rotary-base ${ROPE_THETA} \ 147 | --rotary-scaling-factor ${SCALE_FACTOR} \ 148 | ${convert_options} \ 149 | ${moe_options} \ 150 | ${cpu_options} 151 | 152 | 153 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 154 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/falcon/configuration_RW.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Bloom configuration""" 16 | from transformers.configuration_utils import PretrainedConfig 17 | from transformers.utils import logging 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | class RWConfig(PretrainedConfig): 24 | model_type = "RefinedWebModel" 25 | keys_to_ignore_at_inference = ["past_key_values"] 26 | attribute_map = { 27 | "num_hidden_layers": "n_layer", 28 | "num_attention_heads": "n_head", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | apply_residual_connection_post_layernorm=False, 43 | hidden_dropout=0.0, 44 | attention_dropout=0.0, 45 | multi_query=False, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | **kwargs, 50 | ): 51 | self.vocab_size = vocab_size 52 | # Backward compatibility with n_embed kwarg 53 | n_embed = kwargs.pop("n_embed", None) 54 | self.hidden_size = hidden_size if n_embed is None else n_embed 55 | self.n_layer = n_layer 56 | self.n_head = n_head 57 | self.layer_norm_epsilon = layer_norm_epsilon 58 | self.initializer_range = initializer_range 59 | self.use_cache = use_cache 60 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.alibi = alibi 68 | self.bias = bias 69 | self.parallel_attn = parallel_attn 70 | 71 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 72 | 73 | @property 74 | def head_dim(self): 75 | return self.hidden_size // self.n_head 76 | 77 | @property 78 | def rotary(self): 79 | return not self.alibi 80 | 81 | 82 | class RWConfig_40b(PretrainedConfig): 83 | model_type = "RefinedWeb" 84 | keys_to_ignore_at_inference = ["past_key_values"] 85 | attribute_map = { 86 | "num_hidden_layers": "n_layer", 87 | "num_attention_heads": "n_head", 88 | } 89 | 90 | def __init__( 91 | self, 92 | 
vocab_size=250880, 93 | hidden_size=64, 94 | n_layer=2, 95 | n_head=8, 96 | layer_norm_epsilon=1e-5, 97 | initializer_range=0.02, 98 | use_cache=True, 99 | bos_token_id=1, 100 | eos_token_id=2, 101 | apply_residual_connection_post_layernorm=False, 102 | hidden_dropout=0.0, 103 | attention_dropout=0.0, 104 | n_head_kv=None, 105 | alibi=False, 106 | **kwargs, 107 | ): 108 | self.vocab_size = vocab_size 109 | # Backward compatibility with n_embed kwarg 110 | n_embed = kwargs.pop("n_embed", None) 111 | self.hidden_size = hidden_size if n_embed is None else n_embed 112 | self.n_layer = n_layer 113 | self.n_head = n_head 114 | self.layer_norm_epsilon = layer_norm_epsilon 115 | self.initializer_range = initializer_range 116 | self.use_cache = use_cache 117 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm 118 | self.hidden_dropout = hidden_dropout 119 | self.attention_dropout = attention_dropout 120 | 121 | self.bos_token_id = bos_token_id 122 | self.eos_token_id = eos_token_id 123 | self.n_head_kv = n_head if n_head_kv is None else n_head_kv 124 | self.alibi = alibi 125 | 126 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 127 | 128 | @property 129 | def head_dim(self): 130 | return self.hidden_size // self.n_head 131 | 132 | @property 133 | def rotary(self): 134 | return not self.alibi 135 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/falcon/model_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # huggingface to megatron 3 | # bash model_convertor.sh /workspace/Megatron-LM/ /mnt/falcon-ckpts/falcon-7b-hf/ /mnt/falcon-ckpts/falcon-7b-hf-to-megatron-tp1-pp1 1 1 falcon-7b 0 false 4 | # megatron to huggingface: you need to copy the corresponding tokenizer files into the save dir 5 | # bash model_convertor.sh /workspace/Megatron-LM/ /mnt/falcon-ckpts/falcon-7b-hf-to-megatron-tp1-pp1/release/ /mnt/falcon-ckpts/falcon-7b-mg2hf/ 1 1 falcon-7b 0 true 6 | 7 | set -e 8 | START_TIME=$SECONDS 9 | 10 | MEGATRON_PATH=$1 11 | HF_CKPT_PATH=$2 12 | MG_CKPT_PATH=$3 13 | TP=$4 14 | PP=$5 15 | MN=$6 #falcon-7b, falcon-40b 16 | EXTRA_VOCAB_SIZE=$7 17 | mg2hf=$8 18 | 19 | if [ $mg2hf = true ]; then 20 | do_options=" 21 | --convert_checkpoint_from_megatron_to_transformers 22 | " 23 | elif [ $mg2hf = false ]; then 24 | do_options="" 25 | fi 26 | 27 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 28 | 29 | python checkpoint_reshaping_and_interoperability.py \ 30 | --load_path ${HF_CKPT_PATH} \ 31 | --save_path ${MG_CKPT_PATH} \ 32 | --target_params_dtype fp16 \ 33 | --megatron-path ${MEGATRON_PATH} \ 34 | --target_tensor_model_parallel_size ${TP} \ 35 | --target_pipeline_model_parallel_size ${PP} \ 36 | --model_name ${MN} \ 37 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 38 | ${do_options} 39 | 40 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 41 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 42 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/falcon40b/configuration_RW.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Bloom configuration""" 16 | from transformers.configuration_utils import PretrainedConfig 17 | from transformers.utils import logging 18 | 19 | 20 | logger = logging.get_logger(__name__) 21 | 22 | 23 | class RWConfig(PretrainedConfig): 24 | model_type = "RefinedWeb" 25 | keys_to_ignore_at_inference = ["past_key_values"] 26 | attribute_map = { 27 | "num_hidden_layers": "n_layer", 28 | "num_attention_heads": "n_head", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | apply_residual_connection_post_layernorm=False, 43 | hidden_dropout=0.0, 44 | attention_dropout=0.0, 45 | n_head_kv=None, 46 | alibi=False, 47 | **kwargs, 48 | ): 49 | self.vocab_size = vocab_size 50 | # Backward compatibility with n_embed kwarg 51 | n_embed = kwargs.pop("n_embed", None) 52 | self.hidden_size = hidden_size if n_embed is None else n_embed 53 | self.n_layer = n_layer 54 | self.n_head = n_head 55 | self.layer_norm_epsilon = layer_norm_epsilon 56 | self.initializer_range = initializer_range 57 | self.use_cache = use_cache 58 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm 59 | self.hidden_dropout = hidden_dropout 60 | self.attention_dropout = attention_dropout 61 | 62 | self.bos_token_id = bos_token_id 63 | self.eos_token_id = eos_token_id 64 | self.n_head_kv = n_head if n_head_kv is None else n_head_kv 65 | self.alibi = alibi 66 | 67 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 68 | 69 | @property 70 | def head_dim(self): 71 | return self.hidden_size // self.n_head 72 | 73 | @property 74 | def rotary(self): 75 | return not self.alibi 76 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/falcon40b/model_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # transformers to megatron 3 | # bash model_convertor.sh /root/Megatron-LM/ /mnt/falcon-ckpts/falcon-40b-hf /mnt/falcon-ckpts/falcon-40b-hf-to-megatron-tp2-pp1 2 1 falcon-40b 0 false 4 | # megatron to transformers: You need to copy the tokenizer files into the save_path 5 | # bash model_convertor.sh /root/Megatron-LM/ /mnt/falcon-ckpts/falcon-40b-hf-to-megatron-tp2-pp1/release /mnt/falcon-ckpts/falcon-40b-mg2hf 2 1 falcon-40b 0 true 6 | 7 | set -e 8 | START_TIME=$SECONDS 9 | 10 | MEGATRON_PATH=$1 11 | HF_CKPT_PATH=$2 12 | MG_CKPT_PATH=$3 13 | TP=$4 14 | PP=$5 15 | MN=$6 #falcon-40b 16 | EXTRA_VOCAB_SIZE=$7 17 | mg2hf=$8 18 | 19 | if [ $mg2hf = true ]; then 20 | do_options=" 21 | --convert_checkpoint_from_megatron_to_transformers 22 | " 23 | elif [ $mg2hf = false ]; then 24 | do_options="" 25 | fi 26 | 27 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 28 | 29 | python checkpoint_reshaping_and_interoperability.py \ 30 | --load_path ${HF_CKPT_PATH} \ 31 | --save_path ${MG_CKPT_PATH} \ 32 | --target_params_dtype fp16 \ 
33 | --megatron-path ${MEGATRON_PATH} \ 34 | --target_tensor_model_parallel_size ${TP} \ 35 | --target_pipeline_model_parallel_size ${PP} \ 36 | --model_name ${MN} \ 37 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 38 | ${do_options} 39 | 40 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 41 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 42 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/galactica/run_convert_huggingface_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | START_TIME=$SECONDS 4 | 5 | MEGATRON_PATH=$1 6 | HF_CKPT_PATH=$2 7 | MG_CKPT_PATH=$3 8 | TP=$4 9 | PP=$5 10 | MN=$6 #galactica-6.7b, galactica-30b 11 | EXTRA_VOCAB_SIZE=$7 12 | 13 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 14 | 15 | python checkpoint_reshaping_and_interoperability.py \ 16 | --load_path ${HF_CKPT_PATH} \ 17 | --save_path ${MG_CKPT_PATH} \ 18 | --target_params_dtype fp16 \ 19 | --megatron-path ${MEGATRON_PATH} \ 20 | --target_tensor_model_parallel_size ${TP} \ 21 | --target_pipeline_model_parallel_size ${PP} \ 22 | --model_name ${MN} \ 23 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} 24 | 25 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 26 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 27 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/glm/run_convert_transformers_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | START_TIME=$SECONDS 4 | 5 | MEGATRON_PATH=$1 6 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 7 | 8 | HF_CKPT_PATH=$2 9 | MG_CKPT_PATH=$3 10 | TP=$4 11 | PP=$5 12 | 13 | python checkpoint_reshaping_and_interoperability.py \ 14 | --load_path ${HF_CKPT_PATH} \ 15 | --save_path ${MG_CKPT_PATH} \ 16 | --target_params_dtype fp16 \ 17 | --megatron-path ${MEGATRON_PATH} \ 18 | --target_tensor_model_parallel_size ${TP} \ 19 | --target_pipeline_model_parallel_size ${PP} \ 20 | 21 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 22 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 23 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/glm130b/run_convert_transformers_to_megatron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | START_TIME=$SECONDS 4 | 5 | MEGATRON_PATH=$1 6 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 7 | 8 | HF_CKPT_PATH=$2 9 | MG_CKPT_PATH=$3 10 | TP=$4 11 | PP=$5 12 | 13 | python checkpoint_reshaping_and_interoperability.py \ 14 | --load_path ${HF_CKPT_PATH} \ 15 | --save_path ${MG_CKPT_PATH} \ 16 | --target_params_dtype fp16 \ 17 | --megatron-path ${MEGATRON_PATH} \ 18 | --target_tensor_model_parallel_size ${TP} \ 19 | --target_pipeline_model_parallel_size ${PP} \ 20 | 21 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 22 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 23 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/llama/hf2mcore_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | export CUDA_VISIBLE_DEVICES=7 5 | START_TIME=$SECONDS 6 | MASTER_ADDR=localhost 7 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 8 | 9 | MODEL_SIZE=$1 10 | HG_CKPT_PATH=$2 11 | MEGATRON_PATH=$3 12 | 
export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240126 13 | SOURCE_CKPT_PATH=$4 14 | TARGET_CKPT_PATH=$5 15 | TP=$6 16 | PP=$7 17 | EXTRA_VOCAB_SIZE=$8 18 | NUM_EXPERTS=$9 19 | EXPERTS_TOPK=${10} 20 | EP=${11} 21 | NUM_EXPERT_SPLITS=${12} 22 | mg2hf=${13} 23 | 24 | if [ $MODEL_SIZE = 7B ]; then 25 | 26 | NUM_LAYERS=32 27 | HIDDEN_SIZE=4096 28 | NUM_ATTN_HEADS=32 29 | INTERMEDIATE_SIZE=11008 30 | NUM_KV_HEADS=32 31 | VOCAB_SIZE=32000 32 | ROPE_THETA=10000 33 | 34 | gqa_options="" 35 | 36 | elif [ $MODEL_SIZE = 13B ]; then 37 | 38 | NUM_LAYERS=40 39 | HIDDEN_SIZE=5120 40 | NUM_ATTN_HEADS=40 41 | INTERMEDIATE_SIZE=13824 42 | NUM_KV_HEADS=40 43 | VOCAB_SIZE=32000 44 | ROPE_THETA=10000 45 | gqa_options="" 46 | 47 | elif [ $MODEL_SIZE = 70B ]; then 48 | 49 | NUM_LAYERS=80 50 | HIDDEN_SIZE=8192 51 | NUM_ATTN_HEADS=64 52 | INTERMEDIATE_SIZE=28672 53 | NUM_KV_HEADS=8 54 | VOCAB_SIZE=32000 55 | ROPE_THETA=10000 56 | gqa_options=" \ 57 | --group-query-attention \ 58 | --num-query-groups 8" 59 | 60 | elif [ $MODEL_SIZE = 8B ]; then 61 | 62 | NUM_LAYERS=32 63 | HIDDEN_SIZE=4096 64 | NUM_ATTN_HEADS=32 65 | INTERMEDIATE_SIZE=14336 66 | NUM_KV_HEADS=8 67 | VOCAB_SIZE=128256 68 | ROPE_THETA=500000 69 | 70 | gqa_options=" \ 71 | --group-query-attention \ 72 | --num-query-groups 8" 73 | 74 | fi 75 | 76 | if [ $NUM_EXPERT_SPLITS -gt 0 ]; then 77 | 78 | INTERMEDIATE_SIZE=$(( ${INTERMEDIATE_SIZE} / ${NUM_EXPERT_SPLITS})) 79 | 80 | fi 81 | 82 | if [ $NUM_EXPERTS -gt 0 ]; then 83 | expert_options=" 84 | --moe-router-topk ${EXPERTS_TOPK} \ 85 | --num-experts ${NUM_EXPERTS} \ 86 | --expert-model-parallel-size 1 \ 87 | --target_expert_model_parallel_size ${EP} \ 88 | --num_expert_split_size ${NUM_EXPERT_SPLITS} \ 89 | " 90 | fi 91 | 92 | if [ $mg2hf = true ]; then 93 | convert_options=" 94 | --convert_checkpoint_from_megatron_to_transformers 95 | " 96 | elif [ $mg2hf = false ]; then 97 | convert_options="" 98 | fi 99 | 100 | template_json="./hf_llama_moe/config_TEMPLATE.json" 101 | config_json="./hf_llama_moe/config.json" 102 | sed "s/CONFIG_HIDDEN_SIZE/${HIDDEN_SIZE}/" ${template_json} \ 103 | | sed "s/CONFIG_INTERMEDIATE_SIZE/${INTERMEDIATE_SIZE}/" \ 104 | | sed "s/CONFIG_ATTENTION_HEADS/${NUM_ATTN_HEADS}/" \ 105 | | sed "s/CONFIG_HIDDEN_LAYERS/${NUM_LAYERS}/" \ 106 | | sed "s/CONFIG_NUM_EXPERTS/${NUM_EXPERTS}/" \ 107 | | sed "s/CONFIG_EXPERTS_topk/${EXPERTS_TOPK}/" \ 108 | | sed "s/CONFIG_KV_HEADS/${NUM_KV_HEADS}/" \ 109 | | sed "s/CONFIG_VOCAB_SIZE/${VOCAB_SIZE}/" \ 110 | | sed "s/CONFIG_ROPE_THETA/${ROPE_THETA}/" \ 111 | > ${config_json} 112 | 113 | DISTRIBUTED_ARGS="--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 114 | 115 | if [ $MODEL_SIZE != 70B ]; then 116 | 117 | torchrun ${DISTRIBUTED_ARGS} hf2mcore.py \ 118 | --load_path ${SOURCE_CKPT_PATH} \ 119 | --save_path ${TARGET_CKPT_PATH} \ 120 | --load ${HG_CKPT_PATH} \ 121 | --huggingface_model_path ${HG_CKPT_PATH} \ 122 | --megatron-path ${MEGATRON_PATH} \ 123 | --target_tensor_model_parallel_size ${TP} \ 124 | --target_pipeline_model_parallel_size ${PP} \ 125 | --micro-batch-size 1 \ 126 | --fp16 \ 127 | --swiglu \ 128 | --num-layers 1 \ 129 | --hidden-size 1 \ 130 | --ffn-hidden-size 1 \ 131 | --norm-epsilon 1e-5 \ 132 | --num-attention-heads 1 \ 133 | --max-position-embeddings 1 \ 134 | --seq-length 1 \ 135 | --no-async-tensor-model-parallel-allreduce \ 136 | --patch-tokenizer-type LLamaTokenizer \ 137 | --extra-vocab-size ${EXTRA_VOCAB_SIZE} \ 138 | 
--untie-embeddings-and-output-weights \ 139 | --no-rope-fusion \ 140 | --use-rotary-position-embeddings \ 141 | --transformer-impl transformer_engine \ 142 | --disable-bias-linear \ 143 | --normalization RMSNorm \ 144 | --use-mcore-models \ 145 | --attention-dropout 0.0 \ 146 | --hidden-dropout 0.0 \ 147 | ${expert_options} \ 148 | ${convert_options} \ 149 | ${gqa_options} 150 | 151 | else 152 | python hf2mcore_70b.py \ 153 | --load ${HG_CKPT_PATH} \ 154 | --megatron-path ${MEGATRON_PATH} \ 155 | --load_path ${SOURCE_CKPT_PATH} \ 156 | --save_path ${TARGET_CKPT_PATH} \ 157 | --target_params_dtype bf16 \ 158 | --target_tensor_model_parallel_size ${TP} \ 159 | --target_pipeline_model_parallel_size ${PP} \ 160 | ${convert_options} \ 161 | 162 | fi 163 | 164 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 165 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/llama/hf2megatron_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | START_TIME=$SECONDS 5 | 6 | MEGATRON_PATH=$1 7 | SOURCE_CKPT_PATH=$2 8 | TARGET_CKPT_PATH=$3 9 | TP=$4 10 | PP=$5 11 | MN=$6 #llama-7b, llama-13b, llama-30b, llama-65b, llama2-7b, llama2-13b, llama2-70b 12 | EXTRA_VOCAB_SIZE=$7 13 | mg2hf=$8 14 | 15 | if [ $mg2hf = true ]; then 16 | do_options=" 17 | --convert_checkpoint_from_megatron_to_transformers 18 | " 19 | elif [ $mg2hf = false ]; then 20 | do_options="" 21 | fi 22 | 23 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-231007 24 | 25 | python hf2megatron.py \ 26 | --load_path ${SOURCE_CKPT_PATH} \ 27 | --save_path ${TARGET_CKPT_PATH} \ 28 | --target_params_dtype fp16 \ 29 | --megatron-path ${MEGATRON_PATH} \ 30 | --target_tensor_model_parallel_size ${TP} \ 31 | --target_pipeline_model_parallel_size ${PP} \ 32 | --model_name ${MN} \ 33 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 34 | ${do_options} 35 | 36 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 37 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 38 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/llama/hf_llama_moe/config_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "LlamaMoeForCausalLM" 4 | ], 5 | "auto_map": { 6 | "AutoModelForCausalLM": "llama_moe.LlamaMoeForCausalLM", 7 | "AutoConfig": "llama_moe.LlamaMoeConfig" 8 | }, 9 | "bos_token_id": 1, 10 | "eos_token_id": 2, 11 | "pad_token_id": 0, 12 | "hidden_act": "silu", 13 | "hidden_size": CONFIG_HIDDEN_SIZE, 14 | "initializer_range": 0.02, 15 | "intermediate_size": CONFIG_INTERMEDIATE_SIZE, 16 | "max_position_embeddings": 2048, 17 | "model_type": "llama", 18 | "num_attention_heads": CONFIG_ATTENTION_HEADS, 19 | "num_hidden_layers": CONFIG_HIDDEN_LAYERS, 20 | "num_key_value_heads": CONFIG_KV_HEADS, 21 | "pretraining_tp": 2, 22 | "rms_norm_eps": 1e-05, 23 | "rope_theta": CONFIG_ROPE_THETA, 24 | "rope_scaling": null, 25 | "tie_word_embeddings": false, 26 | "torch_dtype": "float16", 27 | "transformers_version": "4.36.0.dev0", 28 | "use_cache": true, 29 | "vocab_size": CONFIG_VOCAB_SIZE, 30 | "num_local_experts": CONFIG_NUM_EXPERTS, 31 | "num_experts_per_tok": CONFIG_EXPERTS_topk 32 | } -------------------------------------------------------------------------------- 
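A minimal loading sketch for the artifacts above, assuming the converted HuggingFace output directory contains the generated config.json together with the llama_moe.py module defined next (the checkpoint path is hypothetical):

    from transformers import AutoConfig, AutoModelForCausalLM

    ckpt_dir = '/path/to/converted-llama-moe-hf'  # hypothetical converter output
    config = AutoConfig.from_pretrained(ckpt_dir, trust_remote_code=True)           # auto_map -> LlamaMoeConfig
    model = AutoModelForCausalLM.from_pretrained(ckpt_dir, trust_remote_code=True)  # auto_map -> LlamaMoeForCausalLM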
/toolkits/model_checkpoints_convertor/llama/hf_llama_moe/llama_moe.py: -------------------------------------------------------------------------------- 1 | from transformers.models.llama.modeling_llama import LlamaConfig 2 | from transformers.models.llama.modeling_llama import LlamaForCausalLM 3 | from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock 4 | 5 | 6 | def get_hidden_output(module, args, output): 7 | return output[0] 8 | 9 | 10 | class LlamaMoeForCausalLM(LlamaForCausalLM): 11 | def __init__(self, config): 12 | super().__init__(config) 13 | for layer in self.model.layers: 14 | mlp = MixtralSparseMoeBlock(config) 15 | mlp.register_forward_hook(get_hidden_output) 16 | layer.mlp = mlp 17 | 18 | 19 | class LlamaMoeConfig(LlamaConfig): 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | # create params used in MixtralSparseMoeBlock 23 | self.hidden_dim = self.hidden_size 24 | self.ffn_dim = self.intermediate_size 25 | self.num_local_experts = kwargs.get('num_local_experts', 0) 26 | self.top_k = kwargs.get('num_experts_per_tok', 2) 27 | 28 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/mistral/hf2mcore_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | START_TIME=$SECONDS 5 | export CUDA_VISIBLE_DEVICES=0 6 | MASTER_ADDR=localhost 7 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 8 | NNODES=1 9 | NODE_RANK=0 10 | GPUS_PER_NODE=1 11 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 12 | 13 | MODEL_SIZE=$1 14 | HG_CKPT_PATH=$2 15 | MEGATRON_PATH=$3 16 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240126 17 | SOURCE_CKPT_PATH=$4 18 | TARGET_CKPT_PATH=$5 19 | TP=$6 20 | PP=$7 21 | EXTRA_VOCAB_SIZE=$8 22 | NUM_EXPERTS=$9 23 | EXPERTS_TOPK=${10} 24 | EP=${11} 25 | mg2hf=${12} 26 | 27 | if [ $MODEL_SIZE = 7B ]; then 28 | 29 | NUM_LAYERS=32 30 | HIDDEN_SIZE=4096 31 | NUM_ATTN_HEADS=32 32 | INTERMEDIATE_SIZE=14336 33 | NUM_KEY_VALUE_HEADS=8 34 | 35 | gqa_options=" \ 36 | --group-query-attention \ 37 | --num-query-groups 8" 38 | 39 | elif [ $MODEL_SIZE = 8x7B ]; then 40 | 41 | NUM_LAYERS=32 42 | HIDDEN_SIZE=4096 43 | NUM_ATTN_HEADS=32 44 | INTERMEDIATE_SIZE=14336 45 | NUM_KEY_VALUE_HEADS=8 46 | WS=${13} 47 | gqa_options=" \ 48 | --group-query-attention \ 49 | --num-query-groups 8" 50 | 51 | fi 52 | 53 | 54 | if [ $NUM_EXPERTS -gt 0 ]; then 55 | expert_options=" 56 | --moe-router-topk ${EXPERTS_TOPK} \ 57 | --num-experts ${NUM_EXPERTS} \ 58 | --target_expert_model_parallel_size ${EP} 59 | " 60 | fi 61 | 62 | if [ $mg2hf = true ]; then 63 | convert_options=" 64 | --convert_checkpoint_from_megatron_to_transformers 65 | " 66 | elif [ $mg2hf = false ]; then 67 | convert_options="" 68 | fi 69 | 70 | template_json="./hf_mistral_moe/config_TEMPLATE.json" 71 | config_json="./hf_mistral_moe/config.json" 72 | sed "s/CONFIG_HIDDEN_SIZE/${HIDDEN_SIZE}/" ${template_json} \ 73 | | sed "s/CONFIG_INTERMEDIATE_SIZE/${INTERMEDIATE_SIZE}/" \ 74 | | sed "s/CONFIG_ATTENTION_HEADS/${NUM_ATTN_HEADS}/" \ 75 | | sed "s/CONFIG_HIDDEN_LAYERS/${NUM_LAYERS}/" \ 76 | | sed "s/CONFIG_NUM_EXPERTS/${NUM_EXPERTS}/" \ 77 | | sed "s/CONFIG_EXPERTS_topk/${EXPERTS_TOPK}/" \ 78 | | sed "s/CONFIG_KV_HEADS/${NUM_KEY_VALUE_HEADS}/" \ 79 | > ${config_json} 80 | 81 | if [ $MODEL_SIZE = 7B ]; then 82 | 83 | 
torchrun ${DISTRIBUTED_ARGS} hf2mcore.py \ 84 | --load_path ${SOURCE_CKPT_PATH} \ 85 | --save_path ${TARGET_CKPT_PATH} \ 86 | --load ${HG_CKPT_PATH} \ 87 | --huggingface_model_path ${HG_CKPT_PATH} \ 88 | --megatron-path ${MEGATRON_PATH} \ 89 | --target_tensor_model_parallel_size ${TP} \ 90 | --target_pipeline_model_parallel_size ${PP} \ 91 | --micro-batch-size 1 \ 92 | --fp16 \ 93 | --swiglu \ 94 | --num-layers 1 \ 95 | --hidden-size 1 \ 96 | --ffn-hidden-size 1 \ 97 | --norm-epsilon 1e-5 \ 98 | --num-attention-heads 1 \ 99 | --max-position-embeddings 1 \ 100 | --seq-length 1 \ 101 | --no-async-tensor-model-parallel-allreduce \ 102 | --patch-tokenizer-type LLamaTokenizer \ 103 | --extra-vocab-size ${EXTRA_VOCAB_SIZE} \ 104 | --untie-embeddings-and-output-weights \ 105 | --no-rope-fusion \ 106 | --use-rotary-position-embeddings \ 107 | --transformer-impl transformer_engine \ 108 | --disable-bias-linear \ 109 | --normalization RMSNorm \ 110 | --use-mcore-models \ 111 | --attention-dropout 0.0 \ 112 | --hidden-dropout 0.0 \ 113 | ${expert_options} \ 114 | ${convert_options} \ 115 | ${gqa_options} \ 116 | 117 | elif [ $MODEL_SIZE = 8x7B ]; then 118 | 119 | python hf2mcore_mixtral.py \ 120 | --megatron-path ${MEGATRON_PATH} \ 121 | --load_path ${SOURCE_CKPT_PATH} \ 122 | --save_path ${TARGET_CKPT_PATH} \ 123 | --target_params_dtype bf16 \ 124 | --target_tensor_model_parallel_size ${TP} \ 125 | --target_pipeline_model_parallel_size ${PP} \ 126 | --target_expert_model_parallel_size ${EP} \ 127 | --world_size ${WS} \ 128 | ${convert_options} \ 129 | 130 | fi 131 | 132 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 133 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/mistral/hf2megatron_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | START_TIME=$SECONDS 5 | 6 | MEGATRON_PATH=$1 7 | SOURCE_CKPT_PATH=$2 8 | TARGET_CKPT_PATH=$3 9 | TP=$4 10 | PP=$5 11 | MN=$6 #mistral-7b 12 | EXTRA_VOCAB_SIZE=$7 13 | mg2hf=$8 14 | 15 | if [ $mg2hf = true ]; then 16 | do_options=" 17 | --convert_checkpoint_from_megatron_to_transformers 18 | " 19 | elif [ $mg2hf = false ]; then 20 | do_options="" 21 | fi 22 | 23 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-231007 24 | 25 | python hf2megatron.py \ 26 | --load_path ${SOURCE_CKPT_PATH} \ 27 | --save_path ${TARGET_CKPT_PATH} \ 28 | --target_params_dtype fp16 \ 29 | --megatron-path ${MEGATRON_PATH} \ 30 | --target_tensor_model_parallel_size ${TP} \ 31 | --target_pipeline_model_parallel_size ${PP} \ 32 | --model_name ${MN} \ 33 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 34 | ${do_options} 35 | 36 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 37 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 38 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/mistral/hf_mistral_moe/config_TEMPLATE.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "MixtralForCausalLM" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": CONFIG_HIDDEN_SIZE, 10 | "initializer_range": 0.02, 11 | "intermediate_size": CONFIG_INTERMEDIATE_SIZE, 12 | "max_position_embeddings": 32768, 13 | "model_type": "mixtral", 14 | "num_attention_heads": 
CONFIG_ATTENTION_HEADS, 15 | "num_experts_per_tok": CONFIG_EXPERTS_topk, 16 | "num_hidden_layers": CONFIG_HIDDEN_LAYERS, 17 | "num_key_value_heads": CONFIG_KV_HEADS, 18 | "num_local_experts": CONFIG_NUM_EXPERTS, 19 | "rms_norm_eps": 1e-05, 20 | "rope_theta": 1000000.0, 21 | "router_aux_loss_coef": 0.02, 22 | "sliding_window": 4096, 23 | "tie_word_embeddings": false, 24 | "torch_dtype": "float16", 25 | "transformers_version": "4.36.0.dev0", 26 | "use_cache": true, 27 | "vocab_size": 32000 28 | } -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_dense_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # bash hf2mcore_qwen1.5_dense_convertor.sh 0.5B /mnt/qwen-ckpts/Qwen1.5-0.5B /mnt/qwen-ckpts/Qwen1.5-0.5B-hf-to-mcore-tp2-pp1 2 1 false 3 | 4 | set -e 5 | export CUDA_VISIBLE_DEVICES=7 6 | START_TIME=$SECONDS 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 9 | 10 | MODEL_SIZE=$1 11 | SOURCE_CKPT_PATH=$2 12 | TARGET_CKPT_PATH=$3 13 | TP=$4 14 | PP=$5 15 | mg2hf=$6 16 | HF_CKPT_PATH=$7 17 | 18 | CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" 19 | MEGATRON_PATH=$( dirname $(dirname $( dirname ${CURRENT_DIR}))) 20 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240405 21 | 22 | if [ $MODEL_SIZE = 0.5B ]; then 23 | 24 | NUM_LAYERS=24 25 | HIDDEN_SIZE=1024 26 | NUM_ATTN_HEADS=16 27 | INTERMEDIATE_SIZE=2816 28 | EXTRA_VOCAB_SIZE=293 29 | 30 | gqa_options="" 31 | cpu_options="" 32 | 33 | elif [ $MODEL_SIZE = 1.8B ]; then 34 | 35 | NUM_LAYERS=24 36 | HIDDEN_SIZE=2048 37 | NUM_ATTN_HEADS=16 38 | INTERMEDIATE_SIZE=5504 39 | EXTRA_VOCAB_SIZE=293 40 | 41 | gqa_options="" 42 | cpu_options="" 43 | 44 | elif [ $MODEL_SIZE = 7B ]; then 45 | 46 | NUM_LAYERS=32 47 | HIDDEN_SIZE=4096 48 | NUM_ATTN_HEADS=32 49 | INTERMEDIATE_SIZE=11008 50 | EXTRA_VOCAB_SIZE=293 51 | 52 | gqa_options="" 53 | cpu_options="" 54 | 55 | elif [ $MODEL_SIZE = 14B ]; then 56 | 57 | NUM_LAYERS=40 58 | HIDDEN_SIZE=5120 59 | NUM_ATTN_HEADS=40 60 | INTERMEDIATE_SIZE=13696 61 | EXTRA_VOCAB_SIZE=293 62 | 63 | gqa_options="" 64 | cpu_options="" 65 | 66 | elif [ $MODEL_SIZE = 32B ]; then 67 | 68 | NUM_LAYERS=64 69 | HIDDEN_SIZE=5120 70 | NUM_ATTN_HEADS=40 71 | INTERMEDIATE_SIZE=27392 72 | EXTRA_VOCAB_SIZE=293 73 | 74 | cpu_options="" 75 | gqa_options=" \ 76 | --group-query-attention \ 77 | --num-query-groups 8" 78 | 79 | elif [ $MODEL_SIZE = 72B ]; then 80 | 81 | NUM_LAYERS=80 82 | HIDDEN_SIZE=8192 83 | NUM_ATTN_HEADS=64 84 | INTERMEDIATE_SIZE=24576 85 | EXTRA_VOCAB_SIZE=421 86 | 87 | gqa_options="" 88 | cpu_options=" \ 89 | --use-cpu-initialization" 90 | 91 | fi 92 | 93 | if [ $mg2hf = true ]; then 94 | convert_options=" \ 95 | --convert-checkpoint-from-megatron-to-transformers \ 96 | --hf-ckpt-path ${HF_CKPT_PATH}" 97 | 98 | elif [ $mg2hf = false ]; then 99 | convert_options="" 100 | fi 101 | 102 | 103 | DISTRIBUTED_ARGS="--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 104 | 105 | if [ $MODEL_SIZE != 32B ]; then 106 | 107 | torchrun ${DISTRIBUTED_ARGS} hf2mcore_qwen1.5_dense_mha.py \ 108 | --load ${SOURCE_CKPT_PATH} \ 109 | --save ${TARGET_CKPT_PATH} \ 110 | --target-tensor-model-parallel-size ${TP} \ 111 | --pipeline-model-parallel-size ${PP} \ 112 | --micro-batch-size 1 \ 113 | --save-interval 1 \ 114 | --fp16 \ 115 | --swiglu \ 116 | --norm-epsilon 1e-6 \ 117 | --num-layers ${NUM_LAYERS} 
\ 118 | --hidden-size ${HIDDEN_SIZE} \ 119 | --ffn-hidden-size ${INTERMEDIATE_SIZE} \ 120 | --num-attention-heads ${NUM_ATTN_HEADS} \ 121 | --max-position-embeddings 1 \ 122 | --seq-length 1 \ 123 | --no-async-tensor-model-parallel-allreduce \ 124 | --patch-tokenizer-type Qwen2Tokenizer \ 125 | --extra-vocab-size ${EXTRA_VOCAB_SIZE} \ 126 | --untie-embeddings-and-output-weights \ 127 | --no-rope-fusion \ 128 | --use-rotary-position-embeddings \ 129 | --transformer-impl transformer_engine \ 130 | --disable-bias-linear \ 131 | --normalization RMSNorm \ 132 | --add-qkv-bias \ 133 | --use-mcore-models \ 134 | --attention-dropout 0.0 \ 135 | --hidden-dropout 0.0 \ 136 | ${convert_options} \ 137 | ${gqa_options} \ 138 | ${cpu_options} 139 | 140 | else 141 | python hf2mcore_qwen1.5_dense_gqa.py \ 142 | --load ${SOURCE_CKPT_PATH} \ 143 | --save ${TARGET_CKPT_PATH} \ 144 | --target-params-dtype bf16 \ 145 | --target-tensor-model-parallel-size ${TP} \ 146 | --target-pipeline-model-parallel-size ${PP} \ 147 | ${convert_options} \ 148 | 149 | fi 150 | 151 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 152 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_dense_to_moe_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # bash hf2mcore_qwen1.5_dense_to_moe_convertor.sh 1.8B /mnt/qwen-ckpts/Qwen1.5-1.8B /mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B-to-mcore-tp1-pp1-ep4 1 1 4 60 4 1408 3 | 4 | set -e 5 | export CUDA_VISIBLE_DEVICES=7 6 | START_TIME=$SECONDS 7 | MASTER_ADDR=localhost 8 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 9 | 10 | MODEL_SIZE=$1 11 | SOURCE_CKPT_PATH=$2 12 | TARGET_CKPT_PATH=$3 13 | TP=$4 14 | PP=$5 15 | EP=$6 16 | NUM_EXPERTS=$7 17 | NUM_SPLITS=$8 18 | MOE_INTERMEDIATE_SIZE=$9 19 | 20 | 21 | CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" 22 | MEGATRON_PATH=$( dirname $(dirname $( dirname ${CURRENT_DIR}))) 23 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240405 24 | 25 | if [ $MODEL_SIZE = 1.8B ]; then 26 | 27 | NUM_LAYERS=24 28 | HIDDEN_SIZE=2048 29 | NUM_ATTN_HEADS=16 30 | INTERMEDIATE_SIZE=5504 31 | EXTRA_VOCAB_SIZE=293 32 | SHARED_EXPERT_INTERMEDIATE_SIZE=$(( ${MOE_INTERMEDIATE_SIZE} * 4 )) 33 | 34 | gqa_options="" 35 | cpu_options=" \ 36 | --use-cpu-initialization" 37 | 38 | fi 39 | 40 | DISTRIBUTED_ARGS="--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 41 | 42 | if [ $MODEL_SIZE != 32B ]; then 43 | 44 | torchrun ${DISTRIBUTED_ARGS} hf2mcore_qwen1.5_dense_mha_to_moe.py \ 45 | --load ${SOURCE_CKPT_PATH} \ 46 | --save ${TARGET_CKPT_PATH} \ 47 | --target-tensor-model-parallel-size ${TP} \ 48 | --pipeline-model-parallel-size ${PP} \ 49 | --micro-batch-size 1 \ 50 | --save-interval 1 \ 51 | --fp16 \ 52 | --swiglu \ 53 | --norm-epsilon 1e-6 \ 54 | --num-layers ${NUM_LAYERS} \ 55 | --hidden-size ${HIDDEN_SIZE} \ 56 | --ffn-hidden-size ${INTERMEDIATE_SIZE} \ 57 | --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \ 58 | --shared-moe-ffn-hidden-size ${SHARED_EXPERT_INTERMEDIATE_SIZE} \ 59 | --num-attention-heads ${NUM_ATTN_HEADS} \ 60 | --max-position-embeddings 1 \ 61 | --seq-length 1 \ 62 | --no-async-tensor-model-parallel-allreduce \ 63 | --patch-tokenizer-type Qwen2Tokenizer \ 64 | --extra-vocab-size ${EXTRA_VOCAB_SIZE} \ 65 | --untie-embeddings-and-output-weights \ 66 | --no-rope-fusion \ 67 | 
--use-rotary-position-embeddings \ 68 | --transformer-impl transformer_engine \ 69 | --disable-bias-linear \ 70 | --normalization RMSNorm \ 71 | --add-qkv-bias \ 72 | --use-mcore-models \ 73 | --attention-dropout 0.0 \ 74 | --hidden-dropout 0.0 \ 75 | --enable-shared-expert \ 76 | --num-experts ${NUM_EXPERTS} \ 77 | --num-splits ${NUM_SPLITS} \ 78 | --target-expert-model-parallel-size ${EP} \ 79 | ${gqa_options} \ 80 | ${cpu_options} 81 | 82 | fi 83 | 84 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 85 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_moe_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # bash hf2mcore_qwen1.5_moe_convertor.sh A2.7B /mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B /mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B-to-mcore-tp1-pp1-ep4 1 1 4 false 3 | # bash hf2mcore_qwen1.5_moe_convertor.sh A2.7B /mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B-to-mcore-tp1-pp1-ep4 /mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B-to-hf 1 1 4 true /mnt/qwen-ckpts/Qwen1.5-MoE-A2.7B 4 | 5 | set -e 6 | export CUDA_VISIBLE_DEVICES=7 7 | START_TIME=$SECONDS 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 10 | 11 | MODEL_SIZE=$1 12 | SOURCE_CKPT_PATH=$2 13 | TARGET_CKPT_PATH=$3 14 | TP=$4 15 | PP=$5 16 | EP=$6 17 | mg2hf=$7 18 | HF_CKPT_PATH=$8 19 | 20 | CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" 21 | MEGATRON_PATH=$( dirname $(dirname $( dirname ${CURRENT_DIR}))) 22 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240405 23 | 24 | if [ $MODEL_SIZE = A2.7B ]; then 25 | 26 | HIDDEN_SIZE=2048 27 | NUM_ATTN_HEADS=16 28 | NUM_LAYERS=24 29 | INTERMEDIATE_SIZE=5632 30 | MOE_INTERMEDIATE_SIZE=1408 31 | SHARED_EXPERT_INTERMEDIATE_SIZE=5632 32 | MAX_POSITION_EMBEDDINGS=8192 33 | EXTRA_VOCAB_SIZE=293 34 | NUM_EXPERTS=60 35 | EXPERTS_TOPK=4 36 | ROPE_THETA=1000000 37 | 38 | gqa_options="" 39 | cpu_options=" \ 40 | --use-cpu-initialization" 41 | 42 | fi 43 | 44 | 45 | if [ $NUM_EXPERTS -gt 0 ]; then 46 | expert_options=" \ 47 | --moe-router-topk ${EXPERTS_TOPK} \ 48 | --num-experts ${NUM_EXPERTS} \ 49 | --target-expert-model-parallel-size ${EP}" 50 | fi 51 | 52 | if [ $mg2hf = true ]; then 53 | convert_options=" \ 54 | --convert-checkpoint-from-megatron-to-transformers \ 55 | --hf-ckpt-path ${HF_CKPT_PATH}" 56 | 57 | elif [ $mg2hf = false ]; then 58 | convert_options="" 59 | fi 60 | 61 | 62 | DISTRIBUTED_ARGS="--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 63 | 64 | torchrun ${DISTRIBUTED_ARGS} hf2mcore_qwen1.5_moe.py \ 65 | --load ${SOURCE_CKPT_PATH} \ 66 | --save ${TARGET_CKPT_PATH} \ 67 | --target-tensor-model-parallel-size ${TP} \ 68 | --pipeline-model-parallel-size ${PP} \ 69 | --micro-batch-size 1 \ 70 | --save-interval 1 \ 71 | --bf16 \ 72 | --swiglu \ 73 | --norm-epsilon 1e-6 \ 74 | --num-layers ${NUM_LAYERS} \ 75 | --hidden-size ${HIDDEN_SIZE} \ 76 | --moe-ffn-hidden-size ${MOE_INTERMEDIATE_SIZE} \ 77 | --shared-moe-ffn-hidden-size ${SHARED_EXPERT_INTERMEDIATE_SIZE} \ 78 | --ffn-hidden-size ${INTERMEDIATE_SIZE} \ 79 | --num-attention-heads ${NUM_ATTN_HEADS} \ 80 | --max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \ 81 | --seq-length 1 \ 82 | --no-async-tensor-model-parallel-allreduce \ 83 | --patch-tokenizer-type Qwen2Tokenizer \ 84 | --extra-vocab-size ${EXTRA_VOCAB_SIZE} \ 85 | --untie-embeddings-and-output-weights \ 86 
| --no-rope-fusion \ 87 | --use-rotary-position-embeddings \ 88 | --transformer-impl transformer_engine \ 89 | --disable-bias-linear \ 90 | --normalization RMSNorm \ 91 | --add-qkv-bias \ 92 | --use-mcore-models \ 93 | --attention-dropout 0.0 \ 94 | --hidden-dropout 0.0 \ 95 | --enable-shared-expert \ 96 | --rotary-base ${ROPE_THETA} \ 97 | ${expert_options} \ 98 | ${convert_options} \ 99 | ${gqa_options} \ 100 | ${cpu_options} 101 | 102 | 103 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 104 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/qwen/hf2megablocks_qwen1.5_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # hf2megablocks: tp1_pp1_ep8_exp8_top2 4 | 5 | # sh hf2megablocks_convertor_1.5.sh 0.5B /mnt/qwen-ckpts/Qwen1.5-0.5B ../../../ /mnt/qwen-ckpts/Qwen1.5-0.5B /mnt/qwen-ckpts/Qwen1.5-0.5B_megablocks_tp1_pp1_ep8_exp8 1 1 293 8 8 2 false 6 | 7 | set -e 8 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 9 | START_TIME=$SECONDS 10 | MASTER_ADDR=localhost 11 | MASTER_PORT=$(shuf -n 1 -i 10000-65535) 12 | NNODES=1 13 | NODE_RANK=0 14 | GPUS_PER_NODE=8 15 | 16 | MODEL_SIZE=$1 17 | HG_CKPT_PATH=$2 18 | MEGATRON_PATH=$3 19 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-MegaBlocks 20 | SOURCE_CKPT_PATH=$4 21 | TARGET_CKPT_PATH=$5 22 | TP=$6 23 | PP=$7 24 | EXTRA_VOCAB_SIZE=$8 25 | NUM_EXPERTS=$9 26 | EXPERTS_TOPK=${10} 27 | EP=${11} 28 | mg2hf=${12} 29 | 30 | if [ $MODEL_SIZE = 0.5B ]; then 31 | 32 | NUM_LAYERS=24 33 | HIDDEN_SIZE=1024 34 | NUM_ATTN_HEADS=16 35 | INTERMEDIATE_SIZE=2816 36 | 37 | elif [ $MODEL_SIZE = 1.8B ]; then 38 | 39 | NUM_LAYERS=24 40 | HIDDEN_SIZE=2048 41 | NUM_ATTN_HEADS=16 42 | INTERMEDIATE_SIZE=5504 43 | 44 | elif [ $MODEL_SIZE = 7B ]; then 45 | 46 | NUM_LAYERS=32 47 | HIDDEN_SIZE=4096 48 | NUM_ATTN_HEADS=32 49 | INTERMEDIATE_SIZE=11008 50 | 51 | elif [ $MODEL_SIZE = 14B ]; then 52 | 53 | NUM_LAYERS=40 54 | HIDDEN_SIZE=5120 55 | NUM_ATTN_HEADS=40 56 | INTERMEDIATE_SIZE=13696 57 | 58 | fi 59 | 60 | 61 | if [ $NUM_EXPERTS -gt 0 ]; then 62 | expert_options=" 63 | --moe-top-k ${EXPERTS_TOPK} \ 64 | --moe-num-experts ${NUM_EXPERTS} \ 65 | --moe-expert-model-parallelism \ 66 | --target_expert_model_parallel_size ${EP} 67 | " 68 | fi 69 | 70 | if [ $mg2hf = true ]; then 71 | convert_options=" 72 | --convert_checkpoint_from_megatron_to_transformers 73 | " 74 | elif [ $mg2hf = false ]; then 75 | convert_options="" 76 | fi 77 | 78 | template_json="./hf_qwen1.5_moe/config_TEMPLATE.json" 79 | config_json="./hf_qwen1.5_moe/config.json" 80 | sed "s/CONFIG_HIDDEN_SIZE/${HIDDEN_SIZE}/" ${template_json} \ 81 | | sed "s/CONFIG_INTERMEDIATE_SIZE/${INTERMEDIATE_SIZE}/" \ 82 | | sed "s/CONFIG_ATTENTION_HEADS/${NUM_ATTN_HEADS}/" \ 83 | | sed "s/CONFIG_HIDDEN_LAYERS/${NUM_LAYERS}/" \ 84 | | sed "s/CONFIG_NUM_EXPERTS/${NUM_EXPERTS}/" \ 85 | | sed "s/CONFIG_EXPERTS_topk/${EXPERTS_TOPK}/" \ 86 | | sed "s/CONFIG_KV_HEADS/${NUM_ATTN_HEADS}/" \ 87 | > ${config_json} 88 | 89 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 90 | torchrun ${DISTRIBUTED_ARGS} hf2megablocks_qwen1.5.py \ 91 | --load_path ${SOURCE_CKPT_PATH} \ 92 | --save_path ${TARGET_CKPT_PATH} \ 93 | --load ${HG_CKPT_PATH} \ 94 | --huggingface_model_path ${HG_CKPT_PATH} \ 95 | --megatron-path ${MEGATRON_PATH} \ 96 
| --target_tensor_model_parallel_size ${TP} \ 97 | --target_pipeline_model_parallel_size ${PP} \ 98 | --micro-batch-size 1 \ 99 | --fp16 \ 100 | --swiglu \ 101 | --num-layers 1 \ 102 | --hidden-size 1 \ 103 | --ffn-hidden-size 1 \ 104 | --norm-epsilon 1e-6 \ 105 | --num-attention-heads 1 \ 106 | --max-position-embeddings 1 \ 107 | --seq-length 1 \ 108 | --no-async-tensor-model-parallel-allreduce \ 109 | --patch-tokenizer-type Qwen2Tokenizer \ 110 | --extra-vocab-size ${EXTRA_VOCAB_SIZE} \ 111 | --untie-embeddings-and-output-weights \ 112 | --use-llama2-rotary-position-embeddings \ 113 | --disable-bias-linear \ 114 | --normalization RMSNorm \ 115 | --add-qkv-bias \ 116 | --attention-dropout 0.0 \ 117 | --hidden-dropout 0.0 \ 118 | ${expert_options} \ 119 | ${convert_options} 120 | 121 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 122 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/qwen/hf2megatron_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # bash model_convertor.sh ../../../../Megatron-LM/ ../../../../qwen-14b-hf-to-mg-tp2-pp1/release/ ../../../../qwen-14b-mg2hf21 2 1 qwen-14b 0 true 3 | # bash model_convertor.sh ../../../Megatron-LM-231007/ ../../../../qianwen/models--Qwen--Qwen1.5-7B-Chat/ ../../../../qianwen/models--Qwen--Qwen1.5-7B-Chat-hf2mg41/ 4 1 qwen1.5 0 false 4 | # bash model_convertor.sh ../../../Megatron-LM-231007/ ../../../../qianwen/models--Qwen--Qwen1.5-7B-Chat-hf2mg41/release/ ../../../../qianwen/models--Qwen--Qwen1.5-7B-Chat-mg2hf41/ 4 1 qwen1.5 0 true 5 | set -e 6 | START_TIME=$SECONDS 7 | 8 | MEGATRON_PATH=$1 9 | SOURCE_CKPT_PATH=$2 10 | TARGET_CKPT_PATH=$3 11 | TP=$4 12 | PP=$5 13 | MN=$6 #qwen-7b,qwen-14b,qwen-72b;qwen1.5-0.5b,qwen1.5-1.8b,qwen1.5-4b,qwen1.5-7b,qwen1.5-14b,qwen1.5-72b 14 | EXTRA_VOCAB_SIZE=$7 # 0 for all models 15 | mg2hf=$8 16 | 17 | if [ $mg2hf = true ]; then 18 | do_options=" 19 | --convert_checkpoint_from_megatron_to_transformers 20 | " 21 | elif [ $mg2hf = false ]; then 22 | do_options="" 23 | fi 24 | 25 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-231007 26 | 27 | if [[ "$MN" == *"qwen1.5"* ]]; then 28 | 29 | python hf2megatron_qwen1.5.py \ 30 | --load_path ${SOURCE_CKPT_PATH} \ 31 | --save_path ${TARGET_CKPT_PATH} \ 32 | --target_params_dtype bf16 \ 33 | --megatron-path ${MEGATRON_PATH} \ 34 | --target_tensor_model_parallel_size ${TP} \ 35 | --target_pipeline_model_parallel_size ${PP} \ 36 | --model_name ${MN} \ 37 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 38 | ${do_options} 39 | 40 | else 41 | 42 | python hf2megatron_qwen1.0.py \ 43 | --load_path ${SOURCE_CKPT_PATH} \ 44 | --save_path ${TARGET_CKPT_PATH} \ 45 | --target_params_dtype fp16 \ 46 | --megatron-path ${MEGATRON_PATH} \ 47 | --target_tensor_model_parallel_size ${TP} \ 48 | --target_pipeline_model_parallel_size ${PP} \ 49 | --model_name ${MN} \ 50 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 51 | ${do_options} 52 | 53 | fi 54 | 55 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 56 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 57 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/starcoder/model_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # megatron to transformers: You need to copy the tokenizer files into the 
save_path 3 | # bash model_convertor.sh ../../Megatron-LM/ ../../starcoder-mg21/release ../../starcoder-mg2hf 2 1 true 4 | # transformers to megatron 5 | # bash model_convertor.sh ../../Megatron-LM/ ../../starcoder-16b/ ../../starcoder-mg21 2 1 false 6 | set -e 7 | START_TIME=$SECONDS 8 | 9 | MEGATRON_PATH=$1 10 | SOURCE_CKPT_PATH=$2 11 | TARGET_CKPT_PATH=$3 12 | TP=$4 13 | PP=$5 14 | mg2hf=$6 15 | 16 | if [ $mg2hf = true ]; then 17 | do_options=" 18 | --convert_checkpoint_from_megatron_to_transformers 19 | " 20 | elif [ $mg2hf = false ]; then 21 | do_options="" 22 | fi 23 | 24 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 25 | 26 | python checkpoint_reshaping_and_interoperability.py \ 27 | --load_path ${SOURCE_CKPT_PATH} \ 28 | --save_path ${TARGET_CKPT_PATH} \ 29 | --target_params_dtype fp16 \ 30 | --megatron-path ${MEGATRON_PATH} \ 31 | --target_tensor_model_parallel_size ${TP} \ 32 | --target_pipeline_model_parallel_size ${PP} \ 33 | ${do_options} 34 | 35 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 36 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 37 | -------------------------------------------------------------------------------- /toolkits/model_checkpoints_convertor/yi/model_convertor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | START_TIME=$SECONDS 4 | 5 | MEGATRON_PATH=$1 6 | SOURCE_CKPT_PATH=$2 7 | TARGET_CKPT_PATH=$3 8 | TP=$4 9 | PP=$5 10 | MN=$6 #yi-6b 11 | EXTRA_VOCAB_SIZE=$7 12 | mg2hf=$8 13 | 14 | if [ $mg2hf = true ]; then 15 | do_options=" 16 | --convert_checkpoint_from_megatron_to_transformers 17 | " 18 | elif [ $mg2hf = false ]; then 19 | do_options="" 20 | fi 21 | 22 | export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH 23 | 24 | python checkpoint_reshaping_and_interoperability.py \ 25 | --load_path ${SOURCE_CKPT_PATH} \ 26 | --save_path ${TARGET_CKPT_PATH} \ 27 | --target_params_dtype fp16 \ 28 | --megatron-path ${MEGATRON_PATH} \ 29 | --target_tensor_model_parallel_size ${TP} \ 30 | --target_pipeline_model_parallel_size ${PP} \ 31 | --model_name ${MN} \ 32 | --extra_num_vocabs ${EXTRA_VOCAB_SIZE} \ 33 | ${do_options} 34 | 35 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 36 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 37 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/clean_raw_text.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import codecs 3 | import json 4 | import multiprocessing 5 | import os.path 6 | import re 7 | from glob import glob 8 | 9 | from tqdm import tqdm 10 | 11 | 12 | def clean_text(raw): 13 | httpcom = re.compile( 14 | r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@' 15 | r'.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') # 匹配模式 16 | raw = httpcom.sub('', raw) 17 | 18 | space = re.compile(r' +') 19 | raw = space.sub(' ', raw) 20 | 21 | fil = re.compile( 22 | u'[^0-9a-zA-Z\u4e00-\u9fa5., ,\\-。%' 23 | u'《*》/•、&&(—)(+):?!!“”·]+', re.UNICODE) 24 | raw = fil.sub('', raw) 25 | return raw.strip() 26 | 27 | 28 | def run_preprocess(input_fp, output_fp): 29 | with codecs.open(output_fp, 'w', encoding='utf8') as json_file: 30 | with open(input_fp) as f: 31 | try: 32 | file_json = json.load(f) 33 | except ValueError: 34 | file_json = {} 35 | 36 | if 'output' in file_json[0].keys(): 37 | temp = 'output' 38 | elif 'content' in file_json[0].keys(): 39 | temp = 'content' 40 | for obj in file_json: 41 | text = obj[temp] 42 | text = clean_text(text) 43 | 
di = {'text': text} 44 | dumped_di = json.dumps(di, ensure_ascii=False) 45 | json_file.write(dumped_di + '\n') 46 | 47 | 48 | def main(): 49 | parser = argparse.ArgumentParser( 50 | formatter_class=argparse.RawTextHelpFormatter) 51 | parser.add_argument('--input-dir', 52 | '-input_dir', 53 | '-i', 54 | help='folder name of checkpoint files', 55 | required=True) 56 | 57 | parser.add_argument('--output-dir', 58 | '-output_dir', 59 | '-o', 60 | help='folder name of checkpoint files', 61 | required=True) 62 | 63 | parser.add_argument('--num-processes', 64 | '-num_processes', 65 | '-p', 66 | type=int, 67 | default=None, 68 | help='Number of processes') 69 | 70 | args = parser.parse_args() 71 | po = multiprocessing.Pool(args.num_processes) 72 | 73 | if not os.path.exists(args.output_dir): 74 | os.makedirs(args.output_dir) 75 | 76 | for input_file in tqdm(glob(args.input_dir + '/*.json')): 77 | fn = input_file.split('/')[-1] 78 | output_file = os.path.join(args.output_dir, fn) 79 | po.apply_async(func=run_preprocess, args=(input_file, output_file)) 80 | po.close() 81 | po.join() 82 | print('done') 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/convert_json_to_list.py: -------------------------------------------------------------------------------- 1 | import json 2 | json_file_path = ['wudao_train.json', 'wudao_valid.json'] 3 | for path in json_file_path: 4 | b = [] 5 | with open (path,encoding='utf-8') as json_file: 6 | for line in json_file.readlines(): 7 | dict=json.loads(line) 8 | b.append(dict) 9 | with open(path,'w',encoding='utf-8') as file_obj: 10 | json.dump(b,file_obj,ensure_ascii=False, indent=4) 11 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenSparseLLMs/Linear-MoE/b312754b9b8a2cb9eb15e373baabe263b8d409c1/toolkits/pretrain_data_preprocessing/img.png -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/preprocess_wudao2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import codecs 3 | import json 4 | import multiprocessing 5 | import os.path 6 | import re 7 | from glob import glob 8 | from tqdm import tqdm 9 | 10 | 11 | def clean_text(raw): 12 | httpcom = re.compile( 13 | r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@' 14 | r'.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') # 匹配模式 15 | raw = httpcom.sub('', raw) 16 | 17 | space = re.compile(r' +') 18 | raw = space.sub(' ', raw) 19 | 20 | fil = re.compile( 21 | u'[^0-9a-zA-Z\u4e00-\u9fa5., ,\\-。%' 22 | u'《*》/•、&&(—)(+):?!!“”·]+', re.UNICODE) 23 | raw = fil.sub('', raw) 24 | return raw.strip() 25 | 26 | 27 | def run_preprocess(input_fp, output_fp, target_key): 28 | with codecs.open(output_fp, 'w', encoding='utf8') as json_file: 29 | with open(input_fp) as f: 30 | try: 31 | file_json = json.load(f) 32 | except ValueError: 33 | file_json = {} 34 | for obj in file_json: 35 | text = obj['content'] 36 | text = clean_text(text) 37 | di = { 38 | 'instruction': "", 39 | 'input': "", 40 | target_key : text 41 | } 42 | dumped_di = json.dumps(di, ensure_ascii=False) 43 | json_file.write(dumped_di + '\n') 44 | 45 | 46 | def main(): 47 | parser = argparse.ArgumentParser( 48 | 
formatter_class=argparse.RawTextHelpFormatter) 49 | parser.add_argument('--input-dir', 50 | '-input_dir', 51 | '-i', 52 | help='folder name of checkpoint files', 53 | required=True) 54 | 55 | parser.add_argument('--output-dir', 56 | '-output_dir', 57 | '-o', 58 | help='folder name of checkpoint files', 59 | required=True) 60 | 61 | parser.add_argument('--target-key', 62 | '-target_key', 63 | '-k', 64 | type=str, 65 | default='content', 66 | help='target_key', 67 | ) 68 | 69 | parser.add_argument('--num-processes', 70 | '-num_processes', 71 | '-p', 72 | type=int, 73 | default=None, 74 | help='Number of processes') 75 | 76 | args = parser.parse_args() 77 | po = multiprocessing.Pool(args.num_processes) 78 | 79 | if not os.path.exists(args.output_dir): 80 | os.makedirs(args.output_dir) 81 | 82 | for input_file in tqdm(glob(args.input_dir + '/*.json')): 83 | fn = input_file.split('/')[-1] 84 | output_file = os.path.join(args.output_dir, fn) 85 | po.apply_async(func=run_preprocess, args=(input_file, output_file, args.target_key)) 86 | po.close() 87 | po.join() 88 | print('done') 89 | 90 | 91 | if __name__ == '__main__': 92 | main() 93 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/qwen_hf_preprocess_datasets.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import fire 3 | import glob 4 | import transformers 5 | from datasets import load_dataset 6 | 7 | 8 | def build_dataset( 9 | data_path: str, 10 | tokenizer: transformers.PreTrainedTokenizer, 11 | sequence_length: int, 12 | cache_dir: str, 13 | ): 14 | def group_texts(examples): 15 | # Concatenate all texts. 16 | concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} 17 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 18 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 19 | # customize this part to your needs. 20 | if total_length >= sequence_length: 21 | total_length = (total_length // sequence_length) * sequence_length 22 | # Split by chunks of block_size. 
23 | result = { 24 | k: [t[i: i + sequence_length] for i in range(0, total_length, sequence_length)] 25 | for k, t in concatenated_examples.items() 26 | } 27 | result["labels"] = result["input_ids"].copy() 28 | return result 29 | 30 | num_workers = 16 31 | 32 | raw_datasets = load_dataset( 33 | "json", 34 | data_files=data_path, 35 | split="train", 36 | cache_dir=cache_dir, 37 | ) 38 | 39 | dataset = raw_datasets.map( 40 | lambda example: tokenizer(example["text"]), 41 | batched=True, 42 | batch_size=3000, 43 | num_proc=num_workers, 44 | remove_columns=raw_datasets.column_names, 45 | load_from_cache_file=True, 46 | desc="Running tokenization", 47 | ) 48 | 49 | dataset = dataset.map( 50 | group_texts, 51 | batched=True, 52 | num_proc=num_workers, 53 | load_from_cache_file=True, 54 | desc=f"Grouping texts with sequence length {sequence_length}", 55 | ) 56 | 57 | return dataset 58 | 59 | def run_preprocess(jsonl_path, encoded_path, tokenizer, sequence_length, cache_dir): 60 | dataset = build_dataset( 61 | data_path=jsonl_path, 62 | tokenizer=tokenizer, 63 | sequence_length=sequence_length, 64 | cache_dir=cache_dir, 65 | ) 66 | dataset.save_to_disk(encoded_path) 67 | 68 | def main( 69 | data_dir: str, 70 | tokenizer_name_or_path: str, 71 | sequence_length: int = 2048, 72 | cache_dir: str = "./hf-cache", 73 | ): 74 | tokenizer = transformers.AutoTokenizer.from_pretrained( 75 | tokenizer_name_or_path, 76 | cache_dir=cache_dir, 77 | model_max_length=sequence_length, 78 | padding_side="right", 79 | use_fast=True, 80 | trust_remote_code=True 81 | ) 82 | if tokenizer.pad_token is None: 83 | tokenizer.add_special_tokens(special_tokens_dict=dict(pad_token="<|extra_0|>")) 84 | 85 | ds_paths = {} 86 | for file in glob.glob(data_dir+"/*.jsonl"): 87 | ds_paths[file] = file.replace(".jsonl", ".encode") 88 | 89 | for k, v in ds_paths.items(): 90 | print("===========================================") 91 | print(k) 92 | print(v) 93 | print("===========================================") 94 | 95 | po = multiprocessing.Pool(8) 96 | for jsonl_path, encoded_path in ds_paths.items(): 97 | po.apply_async(func=run_preprocess, args=(jsonl_path, encoded_path, tokenizer, sequence_length, cache_dir)) 98 | po.close() 99 | po.join() 100 | print('done') 101 | 102 | 103 | if __name__ == "__main__": 104 | fire.Fire(main) 105 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/run_make_pretraining_dataset.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | START_TIME=$SECONDS 3 | LINEAR_MOE_PATH=$1 4 | MEGATRON_PATH=${LINEAR_MOE_PATH}/Megatron-LM-240126 5 | export PYTHONPATH=${MEGATRON_PATH}:${LINEAR_MOE_PATH}:$PYTHONPATH 6 | input_data_dir=$2 7 | tokenizer=$3 8 | output_data_dir=$4 9 | load_dir=$5 10 | 11 | INPUT="${input_data_dir}" 12 | 13 | 14 | 15 | if [ $tokenizer = "jiebabpe" ]; then 16 | 17 | if [ ! 
-f tokenizer.json ]; then 18 | wget https://easynlp-dev.oss-cn-zhangjiakou.aliyuncs.com/225247/RapidformerPro/tokenizer.json 19 | fi 20 | 21 | python preprocess_data.py \ 22 | --input ${INPUT} \ 23 | --output-prefix ${output_data_dir}/wudao_jiebabpe \ 24 | --dataset-impl mmap \ 25 | --vocab tokenizer.json \ 26 | --patch-tokenizer-type JiebaBPETokenizer \ 27 | --load ${load_dir} \ 28 | --workers 16 \ 29 | --append-eod 30 | 31 | elif [ $tokenizer = "bloombpe" ]; then 32 | 33 | python preprocess_data.py \ 34 | --input ${INPUT} \ 35 | --output-prefix ${output_data_dir}/wudao_bloombpe \ 36 | --dataset-impl mmap \ 37 | --patch-tokenizer-type BloomTokenizerFromHF \ 38 | --load ${load_dir} \ 39 | --workers 16 \ 40 | --append-eod 41 | 42 | elif [ $tokenizer = "glmchinesebpe" ]; then 43 | 44 | python preprocess_data.py \ 45 | --input ${INPUT} \ 46 | --output-prefix ${output_data_dir}/wudao_glmchinesebpe \ 47 | --dataset-impl mmap \ 48 | --patch-tokenizer-type GLM10BZHTokenizerFromHF \ 49 | --load ${load_dir} \ 50 | --workers 16 \ 51 | --append-eod 52 | 53 | elif [ $tokenizer = "glm130bbpe" ]; then 54 | 55 | python preprocess_data.py \ 56 | --input ${INPUT} \ 57 | --output-prefix ${output_data_dir}/wudao_glm130bbpe \ 58 | --dataset-impl mmap \ 59 | --patch-tokenizer-type IcetkGLM130BTokenizer \ 60 | --load ${load_dir} \ 61 | --workers 16 \ 62 | --append-eod 63 | 64 | elif [ $tokenizer = "llamabpe" ]; then 65 | 66 | python preprocess_data.py \ 67 | --input ${INPUT} \ 68 | --output-prefix ${output_data_dir}/wudao_llama3bpe \ 69 | --dataset-impl mmap \ 70 | --patch-tokenizer-type LLamaTokenizer \ 71 | --load ${load_dir} \ 72 | --workers 16 \ 73 | --append-eod 74 | 75 | elif [ $tokenizer = "falconbpe" ]; then 76 | 77 | python preprocess_data.py \ 78 | --input ${INPUT} \ 79 | --output-prefix ${output_data_dir}/wudao_falconbpe \ 80 | --dataset-impl mmap \ 81 | --patch-tokenizer-type FalconTokenizer \ 82 | --load ${load_dir} \ 83 | --workers 16 \ 84 | --append-eod 85 | 86 | elif [ $tokenizer = "galacticabpe" ]; then 87 | 88 | python preprocess_data.py \ 89 | --input ${INPUT} \ 90 | --output-prefix ${output_data_dir}/wudao_galacticabpe \ 91 | --dataset-impl mmap \ 92 | --patch-tokenizer-type OPTTokenizer \ 93 | --load ${load_dir} \ 94 | --workers 16 \ 95 | --append-eod 96 | 97 | elif [ $tokenizer = "starcoderbpe" ]; then 98 | python preprocess_data.py \ 99 | --input ${INPUT} \ 100 | --output-prefix ${output_data_dir}/wudao_starcoderbpe \ 101 | --dataset-impl mmap \ 102 | --patch-tokenizer-type StarcoderTokenizerFromHF \ 103 | --load ${load_dir} \ 104 | --workers 16 \ 105 | --append-eod 106 | 107 | elif [ $tokenizer = "qwenbpe" ]; then 108 | python preprocess_data.py \ 109 | --input ${INPUT} \ 110 | --output-prefix ${output_data_dir}/wudao_qwenbpe \ 111 | --dataset-impl mmap \ 112 | --patch-tokenizer-type QwenTokenizer \ 113 | --load ${load_dir} \ 114 | --workers 16 \ 115 | --append-eod 116 | 117 | elif [ $tokenizer = "mistralbpe" ]; then 118 | python preprocess_data.py \ 119 | --input ${INPUT} \ 120 | --output-prefix ${output_data_dir}/SlimPajama_mistralbpe \ 121 | --dataset-impl mmap \ 122 | --patch-tokenizer-type MistralTokenizer \ 123 | --load ${load_dir} \ 124 | --workers 16 \ 125 | --append-eod 126 | 127 | fi 128 | 129 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 130 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 131 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/run_make_pretraining_dataset_megatron.sh: 
-------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | export HF_ENDPOINT=https://hf-mirror.com 3 | 4 | START_TIME=$SECONDS 5 | 6 | CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" 7 | MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR})) 8 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240405 9 | 10 | input_data_dir=$1 11 | tokenizer=$2 12 | json_keys=$3 13 | output_data_dir=$4 14 | load_dir=$5 15 | 16 | INPUT="${input_data_dir}" 17 | 18 | if [ $tokenizer = "Qwen2Tokenizer" ]; then 19 | python preprocess_data_megatron.py \ 20 | --input ${INPUT} \ 21 | --output-prefix ${output_data_dir}/mmap_qwen2_datasets \ 22 | --patch-tokenizer-type Qwen2Tokenizer \ 23 | --json-keys ${json_keys} \ 24 | --load ${load_dir} \ 25 | --workers 2 \ 26 | --partitions 2 \ 27 | --keep-sequential-samples \ 28 | --append-eod 29 | 30 | elif [ $tokenizer = "DeepSeekV2Tokenizer" ]; then 31 | python preprocess_data_megatron.py \ 32 | --input ${INPUT} \ 33 | --output-prefix ${output_data_dir}/mmap_deepseekv2_datasets \ 34 | --patch-tokenizer-type DeepSeekV2Tokenizer \ 35 | --json-keys ${json_keys} \ 36 | --load ${load_dir} \ 37 | --workers 8 \ 38 | --partitions 1 \ 39 | --keep-sequential-samples \ 40 | --append-eod 41 | 42 | elif [ $tokenizer = "LLamaTokenizer" ]; then 43 | python preprocess_data_megatron.py \ 44 | --input ${INPUT} \ 45 | --output-prefix ${output_data_dir}/mmap_llama_datasets \ 46 | --patch-tokenizer-type LLamaTokenizer \ 47 | --load ${load_dir} \ 48 | --workers 16 \ 49 | --partitions 1 \ 50 | --keep-sequential-samples \ 51 | --append-eod 52 | 53 | fi 54 | 55 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 56 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 57 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/run_make_pretraining_dataset_megatron_slimpajama.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | export HF_ENDPOINT=https://hf-mirror.com 3 | 4 | START_TIME=$SECONDS 5 | 6 | CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" 7 | MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR})) 8 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240726 9 | 10 | input_data_dir=/cpfs04/shared/MOE/datasets/data-SlimPajama/SlimPajama-627B-train-split.json 11 | tokenizer=Qwen2Tokenizer 12 | json_keys=text 13 | output_data_dir=/cpfs04/shared/MOE/datasets/data-SlimPajama/slimpajama_megatron_bin_data 14 | load_dir=/cpfs04/shared/MOE/checkpoints/qwen-ckpts/Qwen2-0.5B 15 | 16 | INPUT="${input_data_dir}" 17 | 18 | if [ $tokenizer = "Qwen2Tokenizer" ]; then 19 | python preprocess_data_megatron.py \ 20 | --input ${INPUT} \ 21 | --output-prefix ${output_data_dir}/mmap_qwen2_datasets \ 22 | --patch-tokenizer-type Qwen2Tokenizer \ 23 | --json-keys ${json_keys} \ 24 | --load ${load_dir} \ 25 | --workers 32 \ 26 | --partitions 1 \ 27 | --keep-sequential-samples \ 28 | --append-eod 29 | 30 | elif [ $tokenizer = "DeepSeekV2Tokenizer" ]; then 31 | python preprocess_data_megatron.py \ 32 | --input ${INPUT} \ 33 | --output-prefix ${output_data_dir}/mmap_deepseekv2_datasets \ 34 | --patch-tokenizer-type DeepSeekV2Tokenizer \ 35 | --json-keys ${json_keys} \ 36 | --load ${load_dir} \ 37 | --workers 8 \ 38 | --partitions 1 \ 39 | --keep-sequential-samples \ 40 | --append-eod 41 | 42 | elif [ $tokenizer = "LLamaTokenizer" ]; then 43 | python preprocess_data_megatron.py \ 44 | --input ${INPUT} \ 45 | --output-prefix ${output_data_dir}/mmap_llama_datasets \ 46 | --patch-tokenizer-type LLamaTokenizer \ 47 | --load ${load_dir} \ 48 | --workers 16 \ 49 | --partitions 1 \ 50 | --keep-sequential-samples \ 51 | --append-eod 52 | 53 | fi 54 | 55 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 56 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 57 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/run_make_pretraining_dataset_megatron_slimpajama_chunk1_chunk2.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | export HF_ENDPOINT=https://hf-mirror.com 3 | 4 | START_TIME=$SECONDS 5 | 6 | CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )" 7 | MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR})) 8 | export PYTHONPATH=$PYTHONPATH:${MEGATRON_PATH}:${MEGATRON_PATH}/Megatron-LM-240726 9 | 10 | input_data_dir=/cpfs04/shared/MOE/datasets/data-SlimPajama/SlimPajama-627B-train-split-chunk1-chunk2.json 11 | tokenizer=Qwen2Tokenizer 12 | json_keys=text 13 | output_data_dir=/cpfs04/shared/MOE/datasets/data-SlimPajama/slimpajama_chunk1_chunk2_megatron_bin_data 14 | load_dir=/cpfs04/shared/MOE/checkpoints/qwen-ckpts/Qwen2-0.5B 15 | 16 | INPUT="${input_data_dir}" 17 | 18 | if [ $tokenizer = "Qwen2Tokenizer" ]; then 19 | python preprocess_data_megatron.py \ 20 | --input ${INPUT} \ 21 | --output-prefix ${output_data_dir}/mmap_qwen2_datasets \ 22 | --patch-tokenizer-type Qwen2Tokenizer \ 23 | --json-keys ${json_keys} \ 24 | --load ${load_dir} \ 25 | --workers 32 \ 26 | --partitions 1 \ 27 | --keep-sequential-samples \ 28 | --append-eod 29 | 30 | elif [ $tokenizer = "DeepSeekV2Tokenizer" ]; then 31 | python preprocess_data_megatron.py \ 32 | --input ${INPUT} \ 33 | --output-prefix ${output_data_dir}/mmap_deepseekv2_datasets \ 34 | --patch-tokenizer-type DeepSeekV2Tokenizer \ 35 | --json-keys ${json_keys} \ 36 | --load ${load_dir} \ 37 | --workers 8 \ 38 | --partitions 1 \ 39 | --keep-sequential-samples \ 40 | --append-eod 41 | 42 | elif [ $tokenizer = "LLamaTokenizer" ]; then 43 | python preprocess_data_megatron.py \ 44 | --input ${INPUT} \ 45 | --output-prefix ${output_data_dir}/mmap_llama_datasets \ 46 | --patch-tokenizer-type LLamaTokenizer \ 47 | --load ${load_dir} \ 48 | --workers 16 \ 49 | --partitions 1 \ 50 | --keep-sequential-samples \ 51 | --append-eod 52 | 53 | fi 54 | 55 | ELAPSED_TIME=$(($SECONDS - $START_TIME)) 56 | echo "$(($ELAPSED_TIME/60)) min $(($ELAPSED_TIME%60)) sec" 57 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/run_prepare_dataset.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | #clean 4 | python clean_raw_text.py -i WuDaoCorpus2.0_base_200G -o cleaned_wudao_dataset -p 32 5 | 6 | #merge 7 | find cleaned_wudao_dataset -name "*.json" -exec cat {} + > /cpfs01/user/paigpt/wudao/merged_wudao_cleaned.json 8 | 9 | #build zst 10 | split -l 6000000 --numeric-suffixes --additional-suffix=.jsonl /cpfs01/user/paigpt/wudao/merged_wudao_cleaned.json /cpfs01/user/paigpt/wudao/ 11 | zstd -z /cpfs01/user/paigpt/wudao/00.jsonl -o /cpfs01/user/paigpt/wudao/00.jsonl.zst & 12 | zstd -z /cpfs01/user/paigpt/wudao/01.jsonl -o /cpfs01/user/paigpt/wudao/01.jsonl.zst & 13 | zstd -z /cpfs01/user/paigpt/wudao/02.jsonl -o /cpfs01/user/paigpt/wudao/02.jsonl.zst & 14 | zstd -z /cpfs01/user/paigpt/wudao/03.jsonl -o /cpfs01/user/paigpt/wudao/03.jsonl.zst & 15 | zstd -z /cpfs01/user/paigpt/wudao/04.jsonl -o /cpfs01/user/paigpt/wudao/04.jsonl.zst & 16 | zstd -z /cpfs01/user/paigpt/wudao/05.jsonl -o /cpfs01/user/paigpt/wudao/05.jsonl.zst & 17 | zstd -z /cpfs01/user/paigpt/wudao/06.jsonl -o /cpfs01/user/paigpt/wudao/06.jsonl.zst & 18 | zstd -z /cpfs01/user/paigpt/wudao/07.jsonl -o /cpfs01/user/paigpt/wudao/07.jsonl.zst & 19 | zstd -z /cpfs01/user/paigpt/wudao/08.jsonl -o /cpfs01/user/paigpt/wudao/08.jsonl.zst & 20 | zstd -z /cpfs01/user/paigpt/wudao/09.jsonl -o /cpfs01/user/paigpt/wudao/09.jsonl.zst & 21 | -------------------------------------------------------------------------------- /toolkits/pretrain_data_preprocessing/run_prepare_wudao.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | set -ex 3 | # Set the directory of the raw data here 4 | data_dir=/mnt/mixtral-datasets/wudao_200g 5 | 6 | # Start the data cleaning process 7 | dataset_dir=$(dirname $data_dir) 8 | mkdir -p ${dataset_dir}/cleaned_wudao_dataset 9 | cd ${dataset_dir}/cleaned_wudao_dataset 10 | wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models/pai-megatron-patch/llama2-codes/preprocess_wudao2.py 11 | # Unlike the previous section, the key argument is added here and set to text 12 | python preprocess_wudao2.py -i ${data_dir} -o ${dataset_dir}/cleaned_wudao_dataset -k text -p 32 13 | 14 | # Merge the cleaned data 15 | mkdir -p ${dataset_dir}/wudao 16 | cd ${dataset_dir}/wudao 17 | find ${dataset_dir}/cleaned_wudao_dataset -name "*.json" -exec cat {} + > ${dataset_dir}/wudao/merged_wudao_cleaned.json 18 | rm -rf ${dataset_dir}/cleaned_wudao_dataset 19 | 20 | # The number of chunks is set to 10 here; increase it if data processing is slow 21 | NUM_PIECE=10 22 | # Process the merged_wudao_cleaned.json file 23 | mkdir -p ${dataset_dir}/cleaned_zst/ 24 | # Get the total number of lines and split the data accordingly 25 | NUM=$(sed -n '$=' ${dataset_dir}/wudao/merged_wudao_cleaned.json) 26 | echo "total lines of dataset is $NUM, data will be split into $NUM_PIECE pieces for processing" 27 | NUM=`expr $NUM / $NUM_PIECE` 28 | echo "each group processes $NUM samples" 29 | split_dir=${dataset_dir}/split 30 | mkdir $split_dir 31 | split -l $NUM --numeric-suffixes --additional-suffix=.jsonl ${dataset_dir}/wudao/merged_wudao_cleaned.json $split_dir/ 32 | 33 | # Compress the data 34 | o_path=${dataset_dir}/cleaned_zst/ 35 | mkdir -p $o_path 36 | files=$(ls $split_dir/*.jsonl) 37 | for filename in $files 38 | do 39 | f=$(basename $filename) 40 | zstd -z $filename -o $o_path/$f.zst & 41 | done 42 | rm -rf $split_dir 43 | rm ${dataset_dir}/wudao/merged_wudao_cleaned.json --------------------------------------------------------------------------------
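After run_prepare_wudao.sh finishes, the compressed shards under ${dataset_dir}/cleaned_zst/ are the input that the tokenization scripts above (e.g. run_make_pretraining_dataset.sh) consume. A small sanity-check sketch, assuming the zstd CLI is on PATH; the shard path is illustrative, derived from the default data_dir above:

import json
import subprocess

shard = "/mnt/mixtral-datasets/cleaned_zst/00.jsonl.zst"  # illustrative: dirname(data_dir)/cleaned_zst/00.jsonl.zst

# Stream-decompress with the zstd CLI and read only the first JSONL record.
proc = subprocess.Popen(["zstd", "-dc", shard], stdout=subprocess.PIPE, text=True)
first_line = proc.stdout.readline()
proc.kill()

record = json.loads(first_line)
# preprocess_wudao2.py with -k text writes {'instruction': '', 'input': '', 'text': ...} records.
print(sorted(record.keys()))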