├── mftcoder_accelerate ├── src │ ├── model │ │ ├── __init__.py │ │ ├── gpt_neox │ │ │ ├── generation_config.json │ │ │ ├── config.json │ │ │ ├── __init__.py │ │ │ ├── configuration_gpt_neox.py │ │ │ └── tokenization_gpt_neox_fast.py │ │ ├── qwen │ │ │ ├── tokenizer_config.json │ │ │ ├── cpp_kernels.py │ │ │ ├── configuration_qwen.py │ │ │ └── cache_autogptq_cuda_256.cpp │ │ ├── chatglm2 │ │ │ ├── tokenizer_config.json │ │ │ ├── config.json │ │ │ └── configuration_chatglm.py │ │ ├── chatglm3 │ │ │ ├── config.json │ │ │ └── configuration_chatglm.py │ │ ├── deepseek_v2 │ │ │ └── tokenization_deepseek_fast.py │ │ ├── baichuan2 │ │ │ ├── configuration_baichuan.py │ │ │ └── generation_utils.py │ │ ├── phi │ │ │ └── configuration_mixformer_sequential.py │ │ ├── code_llama │ │ │ └── __init__.py │ │ ├── gpt_bigcode │ │ │ ├── __init__.py │ │ │ └── configuration_gpt_bigcode.py │ │ └── aquila2 │ │ │ └── configuration_aquila.py │ ├── data │ │ ├── __init__.py │ │ ├── Makefile │ │ └── blendable_dataset.py │ ├── utils │ │ ├── __init__.py │ │ ├── model_mapping.py │ │ ├── common_utils.py │ │ └── agd.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── chat_template.py │ │ └── tokenizer.py │ ├── run_offline_tokenization.sh │ ├── accelerate_ds_config.yaml │ ├── accelerate_fsdp_config.yaml │ ├── configs │ │ ├── dpo_train_config.json │ │ ├── full_train_config.json │ │ ├── lora_train_config.json │ │ ├── qlora_train_config.json │ │ └── coba_train_config.json │ ├── ds_single_launch.sh │ ├── ds_zero3_single_launch.sh │ ├── offline_tokenization │ │ ├── writer.py │ │ └── concat_sst_bin_tokenization.py │ ├── fsdp_single_launch.sh │ ├── ds_multinode_launch.sh │ ├── xxpo │ │ ├── custom_callbacks.py │ │ └── xxpo_arguments.py │ ├── pefts │ │ ├── merge_base_and_lora_to_hf.py │ │ └── mft_arguments.py │ └── mpt │ │ └── mpt_arguments.py └── inference │ └── hf_inference.py ├── mftcoder_atorch ├── data │ ├── __init__.py │ ├── helpers.cpython-38-x86_64-linux-gnu.so │ └── Makefile ├── train │ ├── __init__.py │ ├── run_gpt_mft.sh │ ├── run_gpt_mft_peft.sh │ └── run_train.py ├── utils │ ├── __init__.py │ ├── merge_base_and_lora_to_hf.py │ └── learning_rates.py ├── .gitignore ├── model │ ├── gpt_neox │ │ ├── generation_config.json │ │ ├── config.json │ │ ├── __init__.py │ │ ├── configuration_gpt_neox.py │ │ └── tokenization_gpt_neox_fast.py │ ├── peft │ │ ├── tuner │ │ │ ├── pe_base_model.py │ │ │ ├── __init__.py │ │ │ ├── bitfit.py │ │ │ └── roem.py │ │ ├── __init__.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ └── mapping.py │ ├── __init__.py │ └── build_model.py ├── tokenizer │ ├── __init__.py │ └── train_tokenizer.py └── README_cn.md ├── .gitignore ├── assets ├── img.jpg ├── img_1.jpg ├── CodeFuse-AI群.png └── github-codefuse-logo-update.jpg ├── init_env.sh ├── requirements.txt └── LEGAL.md /mftcoder_accelerate/src/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mftcoder_atorch/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import * -------------------------------------------------------------------------------- /mftcoder_accelerate/src/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import * -------------------------------------------------------------------------------- /mftcoder_atorch/train/__init__.py: -------------------------------------------------------------------------------- 1 | from .run_train import * -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .DS_Store 3 | *.log 4 | */__pycache__/ 5 | *.pyc -------------------------------------------------------------------------------- /assets/img.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/assets/img.jpg -------------------------------------------------------------------------------- /assets/img_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/assets/img_1.jpg -------------------------------------------------------------------------------- /assets/CodeFuse-AI群.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/assets/CodeFuse-AI群.png -------------------------------------------------------------------------------- /mftcoder_accelerate/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .common_utils import * 2 | from .loss_utils import * 3 | -------------------------------------------------------------------------------- /mftcoder_atorch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .common_utils import * 2 | from .auto_accelerate_utils import * -------------------------------------------------------------------------------- /mftcoder_atorch/.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | */__pycache__/ 3 | *.pyc 4 | *.ipynb 5 | .DS_Store 6 | .idea/ 7 | evals/ -------------------------------------------------------------------------------- /assets/github-codefuse-logo-update.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/assets/github-codefuse-logo-update.jpg -------------------------------------------------------------------------------- /init_env.sh: -------------------------------------------------------------------------------- 1 | pip install torch==2.1.0 && \ 2 | pip install tensorboard==2.11.0 && \ 3 | pip install packaging && \ 4 | pip install -r requirements.txt 5 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenizer import build_tokenizer 2 | from .tokenizer import init_tokenizer 3 | from .chat_template import MFTCoder_template -------------------------------------------------------------------------------- /mftcoder_atorch/data/helpers.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/mftcoder_atorch/data/helpers.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /mftcoder_atorch/model/gpt_neox/generation_config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "bos_token_id": 50256, 3 | "eos_token_id": 50256, 4 | "transformers_version": "4.26.0.dev0", 5 | "_from_model_config": true 6 | } 7 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/gpt_neox/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token_id": 50256, 3 | "eos_token_id": 50256, 4 | "transformers_version": "4.26.0.dev0", 5 | "_from_model_config": true 6 | } 7 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/qwen/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_max_length": 8192, 3 | "tokenizer_class": "QWenTokenizer", 4 | "auto_map": { 5 | "AutoTokenizer": [ 6 | "tokenization_qwen.QWenTokenizer", 7 | null 8 | ] 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /mftcoder_atorch/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/chatglm2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name_or_path": "THUDM/chatglm2-6b", 3 | "remove_space": false, 4 | "do_lower_case": false, 5 | "tokenizer_class": "ChatGLMTokenizer", 6 | "auto_map": { 7 | "AutoTokenizer": [ 8 | "tokenization_chatglm.ChatGLMTokenizer", 9 | null 10 | ] 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.5 2 | pandas==2.2.1 3 | torch==2.1.0 4 | tensorboard==2.11.0 5 | deepspeed==0.14.0 6 | transformers==4.44.2 7 | accelerate==0.31.0 8 | peft==0.10.0 9 | BitsAndBytes==0.43.0 10 | xformers==0.0.22.post7 11 | datasets 12 | ftfy 13 | packaging 14 | einops 15 | sentencepiece 16 | ujson 17 | jsonlines 18 | tiktoken 19 | transformers_stream_generator -------------------------------------------------------------------------------- /mftcoder_accelerate/src/run_offline_tokenization.sh: -------------------------------------------------------------------------------- 1 | MODEL_PATH= 2 | DATA_PATH= 3 | DATASET_NAME= 4 | OUTPUT_PATH= 5 | 6 | python offline_tokenization/concat_sst_bin_tokenization.py \ 7 | --model-path ${MODEL_PATH} \ 8 | --data-path ${DATA_PATH} \ 9 | --dataset-name ${DATASET_NAME} \ 10 | --output-path ${OUTPUT_PATH} 
\ 11 | --parallel 16 \ 12 | --seq-length 4096 \ 13 | --sample-percent 1.0 14 | -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | Legal Disclaimer 2 | 3 | Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. 4 | 5 | 法律免责声明 6 | 7 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 -------------------------------------------------------------------------------- /mftcoder_atorch/model/peft/tuner/pe_base_model.py: -------------------------------------------------------------------------------- 1 | class PEBaseModel: 2 | """PEtuning的基类模型,定义了PEtuning模型都该有的方法""" 3 | 4 | def __init__(): 5 | return 6 | 7 | def get_model(self): 8 | """对模型进行修改,冻结参数或者插入可训模块""" 9 | pass 10 | 11 | @classmethod 12 | def restore(self, model=None, path=None): 13 | """从path恢复PE模型 14 | 15 | Args: 16 | model (_type_, optional): 原始模型. Defaults to None. 17 | path (_type_, optional): 增量路径. Defaults to None. 18 | """ 19 | pass 20 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/accelerate_ds_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: cpu 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero3_save_16bit_model: true 9 | zero_stage: 2 10 | # steps_per_print: 1 11 | distributed_type: DEEPSPEED 12 | downcast_bf16: 'no' 13 | dynamo_backend: 'NO' 14 | fsdp_config: {} 15 | machine_rank: 0 16 | main_training_function: main 17 | megatron_lm_config: {} 18 | mixed_precision: 'bf16' 19 | num_machines: 1 20 | num_processes: 8 21 | rdzv_backend: static 22 | same_network: true 23 | use_cpu: false -------------------------------------------------------------------------------- /mftcoder_atorch/model/peft/__init__.py: -------------------------------------------------------------------------------- 1 | """peft models interface.""" 2 | 3 | from . 
import utils, tuner 4 | from peft.mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING 5 | from peft.utils import TaskType 6 | from .modeling_peft import AntPeftForCausalLM, AntPeftForEmbedding 7 | 8 | 9 | SUPPORTED_PEFT_TYPES = ["prefix", "lora", "adalora", "bitfit", "roem", "unipelt", "prompt", "ptuning"] 10 | 11 | # Register the Ant Causal Language Model 12 | MODEL_TYPE_TO_PEFT_MODEL_MAPPING["ANT_CAUSAL_LM"] = AntPeftForCausalLM 13 | TaskType.ANT_CAUSAL_LM = "ANT_CAUSAL_LM" 14 | 15 | MODEL_TYPE_TO_PEFT_MODEL_MAPPING["ANT_EMBEDDING"] = AntPeftForEmbedding 16 | TaskType.ANT_EMBEDDING = "ANT_EMBEDDING" 17 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/accelerate_fsdp_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | dynamo_backend: 'NO' 6 | fsdp_config: 7 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 8 | fsdp_backward_prefetch_policy: BACKWARD_PRE 9 | fsdp_offload_params: false 10 | fsdp_sharding_strategy: 1 11 | fsdp_state_dict_type: FULL_STATE_DICT 12 | fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer 13 | machine_rank: 0 14 | main_training_function: main 15 | megatron_lm_config: {} 16 | mixed_precision: bf16 17 | num_machines: 1 18 | num_processes: 2 19 | rdzv_backend: static 20 | same_network: true 21 | use_cpu: false -------------------------------------------------------------------------------- /mftcoder_atorch/model/gpt_neox/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "GPTNeoXForCausalLM" 4 | ], 5 | "bos_token_id": 100256, 6 | "eos_token_id": 100256, 7 | "hidden_act": "gelu", 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-05, 12 | "max_position_embeddings": 4096, 13 | "model_type": "gpt_neox", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "rotary_emb_base": 10000, 17 | "rotary_pct": 1.0, 18 | "tie_word_embeddings": false, 19 | "torch_dtype": "float16", 20 | "transformers_version": "4.26.1", 21 | "use_cache": true, 22 | "use_parallel_residual": true, 23 | "vocab_size": 100864 24 | } 25 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/gpt_neox/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "GPTNeoXForCausalLM" 4 | ], 5 | "attention_probs_dropout_prob": 0, 6 | "bos_token_id": 0, 7 | "eos_token_id": 0, 8 | "hidden_act": "gelu_fast", 9 | "hidden_dropout_prob": 0, 10 | "hidden_size": 768, 11 | "initializer_range": 0.02, 12 | "intermediate_size": 3072, 13 | "layer_norm_eps": 1e-05, 14 | "max_position_embeddings": 2048, 15 | "model_type": "gpt_neox", 16 | "num_attention_heads": 12, 17 | "num_hidden_layers": 12, 18 | "rotary_emb_base": 10000, 19 | "rotary_pct": 0.25, 20 | "tie_word_embeddings": false, 21 | "transformers_version": "4.28.1", 22 | "use_cache": false, 23 | "vocab_size": 50432 24 | } 25 | -------------------------------------------------------------------------------- /mftcoder_atorch/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /mftcoder_atorch/model/peft/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """peft utils interface.""" 2 | 3 | from .config import PeftConfig, PetuningConfig 4 | 5 | from .mapping import TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING 6 | from .mapping import TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING 7 | from .mapping import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING 8 | from .mapping import TRANSFORMERS_MODELS_TO_LORA_LAGE_TARGET_MODULES_MAPPING 9 | from .mapping import TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING 10 | from .mapping import TRANSFORMERS_MODELS_TO_ROUTELORA_TARGET_MODULES_MAPPING 11 | from .mapping import WEIGHTS_NAME, CONFIG_NAME 12 | from .mapping import bloom_model_postprocess_past_key_value 13 | 14 | from .others import get_peft_model_state_dict, set_peft_model_state_dict, _freeze_model, prepare_model_for_kbit_training -------------------------------------------------------------------------------- /mftcoder_atorch/model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # from .gpt2_model import GPT2ModelPipe 19 | # from .utils import get_params_for_weight_decay_optimization 20 | # from .word_embeddings import SoftEmbedding 21 | -------------------------------------------------------------------------------- /mftcoder_atorch/model/peft/utils/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import sys 5 | sys.path.append("..") 6 | sys.path.append("../..") 7 | from typing import List, Optional 8 | from dataclasses import dataclass, field 9 | from peft.utils import PeftConfig 10 | 11 | 12 | @dataclass 13 | class PetuningConfig(PeftConfig): 14 | """ 15 | This is the base configuration class to store the configuration of [`ROEM`], or [`BitFit`]. 
16 | 17 | Args: 18 | modules_to_save (`List[str]`):List of modules apart from LoRA layers to be set as trainable 19 | and saved in the final checkpoint. 20 | """ 21 | 22 | modules_to_save: Optional[List[str]] = field( 23 | default=None, 24 | metadata={ 25 | "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " 26 | "For example, in Sequence Classification or Token Classification tasks, " 27 | "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 28 | }, 29 | ) -------------------------------------------------------------------------------- /mftcoder_accelerate/src/configs/dpo_train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "xxpo": "dpo", 3 | "data_paths": "$DATA_PATHS", 4 | "output_dir": "$OUTPUT_DIR", 5 | "tb_dir": "$TensorBoard_DIR", 6 | "pretrained_model_path": "$MODEL_NAME_OR_PATH", 7 | "model_type": "$MODEL_TYPE", 8 | "data_split": "99,1", 9 | "attn_implementation": "flash_attention_2", 10 | "beta": 0.1, 11 | "rpo_alpha": 0.5, 12 | "peft_type": "lora", 13 | "lora_rank": 64, 14 | "lora_alpha": 128, 15 | "lora_dropout": 0.0, 16 | "per_device_train_batch_size": 1, 17 | "per_device_eval_batch_size": 1, 18 | "tokenizer_type": "AutoTokenizer", 19 | "dataset_num_proc": 1, 20 | "learning_rate": 5e-7, 21 | "weight_decay": 0.01, 22 | "gradient_accumulation_steps": 8, 23 | "lr_scheduler_type": "cosine", 24 | "warmup_steps": 100, 25 | "num_train_epochs": 2, 26 | "seed": 1105, 27 | "max_prompt_length": 2048, 28 | "max_length": 4096, 29 | "logging_steps": 20, 30 | "save_steps": 500, 31 | "eval_steps": 500, 32 | "epoch_checkpointing": false, 33 | "saving_limit": 5 34 | } -------------------------------------------------------------------------------- /mftcoder_accelerate/src/ds_single_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Author: Chaoyu Chen 3 | # Last Modified: 2023/12/11 4 | # Description: An alternative(Command line) way to launch DeepSpeed training 5 | 6 | # Launch script on single node 7 | N_GPU_PER_NODE=8 8 | 9 | # config path 10 | CONFIG="configs/xxx_train_config.json" 11 | 12 | # envs used inside training 13 | export OMP_NUM_THREADS=4 14 | export TOKENIZERS_PARALLELISM=False 15 | 16 | TODAY=$(date +%Y-%m%d-%H%M) 17 | 18 | # accelerate launch --config_file accelerate_ds_config.yaml \ 19 | accelerate launch \ 20 | --num_machines 1 \ 21 | --num_processes $N_GPU_PER_NODE \ 22 | --use_deepspeed \ 23 | --zero_stage 2 \ 24 | --offload_optimizer_device 'cpu' \ 25 | --offload_param_device 'none' \ 26 | --gradient_accumulation_steps 1 \ 27 | --gradient_clipping 1.0 \ 28 | --zero3_init_flag false \ 29 | --zero3_save_16bit_model false \ 30 | --main_training_function 'main' \ 31 | --mixed_precision 'bf16' \ 32 | --dynamo_backend 'no' \ 33 | --same_network \ 34 | --machine_rank 0 \ 35 | --rdzv_backend 'static' \ 36 | pefts/mft_accelerate.py --train_config "$CONFIG" \ 37 | --distributed_type "deepspeed" \ 38 | > MFTCoder-training-"$TODAY".log 2>&1 & 39 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/ds_zero3_single_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Author: Chaoyu Chen 3 | # Last Modified: 2024/5/20 4 | # Description: An alternative(Command line) way to launch DeepSpeed training 5 | 6 | # Launch script on single node 7 | 
N_GPU_PER_NODE=8 8 | 9 | # config path 10 | CONFIG="configs/xxx_train_config.json" 11 | 12 | # envs used inside training 13 | export OMP_NUM_THREADS=4 14 | export TOKENIZERS_PARALLELISM=False 15 | 16 | TODAY=$(date +%Y-%m%d-%H%M) 17 | 18 | # accelerate launch --config_file accelerate_ds_config.yaml \ 19 | accelerate launch \ 20 | --num_machines 1 \ 21 | --num_processes $N_GPU_PER_NODE \ 22 | --use_deepspeed \ 23 | --zero_stage 3 \ 24 | --offload_optimizer_device 'cpu' \ 25 | --offload_param_device 'cpu' \ 26 | --gradient_accumulation_steps 1 \ 27 | --gradient_clipping 1.0 \ 28 | --zero3_init_flag true \ 29 | --zero3_save_16bit_model true \ 30 | --main_training_function 'main' \ 31 | --mixed_precision 'bf16' \ 32 | --dynamo_backend 'no' \ 33 | --same_network \ 34 | --machine_rank 0 \ 35 | --rdzv_backend 'static' \ 36 | pefts/mft_accelerate.py --train_config "$CONFIG" \ 37 | --distributed_type "deepspeed" \ 38 | > MFTCoder-training-"$TODAY".log 2>&1 & 39 | -------------------------------------------------------------------------------- /mftcoder_atorch/utils/merge_base_and_lora_to_hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import shutil 5 | import torch 6 | import transformers 7 | sys.path.append("..") 8 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel 9 | from peft import LoraConfig, get_peft_model 10 | from peft import PeftModel 11 | from model_mapping import MODEL_SPECIAL_TOKENS 12 | 13 | 14 | model_path='path to base model' 15 | lora_adapter='path to lora adaptor ckpt' 16 | save_path='path to new merged model' 17 | model_type = 'gpt_neox' 18 | 19 | t0 = time.time() 20 | config = {"model_type": model_type} 21 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 22 | 23 | base_model = AutoModelForCausalLM.from_pretrained( 24 | model_path, 25 | trust_remote_code=True, 26 | torch_dtype=torch.bfloat16, 27 | return_dict=True, 28 | device_map="auto" 29 | ) 30 | print(base_model) 31 | 32 | # merge, save model and tokenizer 33 | model_to_merge = PeftModel.from_pretrained(base_model, lora_adapter) 34 | merged_model = model_to_merge.merge_and_unload() 35 | print(merged_model.config) 36 | merged_model.save_pretrained(save_path) 37 | tokenizer.save_pretrained(save_path) 38 | print(f"Merge finised: {save_path} saved, Cost {time.time()-t0:.2f}s") -------------------------------------------------------------------------------- /mftcoder_accelerate/src/offline_tokenization/writer.py: -------------------------------------------------------------------------------- 1 | 2 | import threading 3 | import fcntl 4 | import json 5 | 6 | class JSONLWriter(): 7 | """ 8 | A writer used to save jsonl lines into a file. 
9 | """ 10 | def __init__(self, output_path, dataset_name): 11 | self.output_path = output_path 12 | self.out_file = open(output_path, 'w') 13 | self.cache = [] 14 | self.cache_size = 4096 15 | self.dataset_name = dataset_name 16 | self.index = 0 17 | 18 | def pack_into_jsonl(self, line_text): 19 | new_item = { 20 | "data_name": self.dataset_name, 21 | "id": self.index, 22 | "content": line_text 23 | } 24 | 25 | return new_item 26 | 27 | 28 | def add_item(self, line_text): 29 | if len(self.cache) >= self.cache_size: 30 | self.flush() 31 | 32 | item = self.pack_into_jsonl(line_text) 33 | self.cache.append(json.dumps(item)) 34 | self.index += 1 35 | 36 | 37 | def flush(self): 38 | content = '\n'.join(self.cache) 39 | fcntl.flock(self.out_file, fcntl.LOCK_EX) 40 | self.out_file.write(f'{content}\n') 41 | fcntl.flock(self.out_file, fcntl.LOCK_UN) 42 | self.cache = [] 43 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/configs/full_train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_paths": "$DATA_PATHS", 3 | "output_dir": "$OUTPUT_DIR", 4 | "tb_dir": "$TensorBoard_DIR", 5 | "pretrained_model_path": "$MODEL_NAME_OR_PATH", 6 | "model_type": "$MODEL_TYPE", 7 | "load_raw_dataset": true, 8 | "data_split": "98,2,0", 9 | "padding_mode": "padding", 10 | "use_dynamic_padding": true, 11 | "tokenize_mode": "sft", 12 | "tokenizer_type": "AutoTokenizer", 13 | "weighted_loss_mode": "case3", 14 | "attn_implementation": "flash_attention_2", 15 | "seq_length": 4096, 16 | "seed": 1234, 17 | "peft_type": null, 18 | "per_device_train_batch_size": 2, 19 | "per_device_eval_batch_size": 2, 20 | "learning_rate": 5e-5, 21 | "min_lr": 5e-6, 22 | "weight_decay": 0.1, 23 | "gradient_accumulation_steps": 1, 24 | "lr_scheduler_type": "cosine", 25 | "num_warmup_steps": 300, 26 | "num_train_epochs": 4, 27 | "resume_from_checkpoint": null, 28 | "log_interval": 10, 29 | "checkpointing_steps": 100, 30 | "evaluation_steps": 100, 31 | "max_train_steps": null, 32 | "epoch_checkpointing": true, 33 | "shuffle_before_split": true, 34 | "early_stopping": true, 35 | "early_stopping_stall_num": 5, 36 | "saving_limit": 3 37 | } -------------------------------------------------------------------------------- /mftcoder_atorch/model/peft/tuner/__init__.py: -------------------------------------------------------------------------------- 1 | """peft tuner methods interface.""" 2 | 3 | from peft.utils import PeftType 4 | from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING 5 | from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING 6 | 7 | from .adalora import AdaLoraConfig, AdaLoraModel 8 | from .routelora import RouteLoraConfig, RouteLoraModel 9 | from .unipelt import UniPELTConfig, UniPELTModel, PEUniPELTModel 10 | from .pe_base_model import PEBaseModel 11 | from .bitfit import PeftBitfitConfig, PEBitfitModel, PeftBitfitModel 12 | from .roem import PeftROEMConfig, PEROEMModel, PeftROEMModel 13 | 14 | # Register new ant peft methods 15 | PeftType.ROUTELORA = "ROUTELORA" 16 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ROUTELORA] = RouteLoraModel 17 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ROUTELORA] = RouteLoraConfig 18 | 19 | PeftType.UNIPELT = "UNIPELT" 20 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.UNIPELT] = UniPELTModel 21 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.UNIPELT] = UniPELTConfig 22 | 23 | PeftType.ROEM = "ROEM" 24 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ROEM] = PeftROEMModel 25 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ROEM] = 
PeftROEMConfig 26 | 27 | PeftType.BITFIT = "BITFIT" 28 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.BITFIT] = PeftBitfitModel 29 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.BITFIT] = PeftBitfitConfig -------------------------------------------------------------------------------- /mftcoder_accelerate/src/fsdp_single_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Author: Chaoyu Chen 3 | # Last Modified: 2023/12/11 4 | # Description: An alternative(command line) way to launch FSDP training 5 | 6 | # Launch script on single node 7 | N_GPU_PER_NODE=8 8 | 9 | # config path 10 | CONFIG="configs/xxx_train_config.json" 11 | 12 | # fsdp_transformer_layer_cls_to_wrap, choose the DecoderLayer 13 | WRAP_MODULE="LlamaDecoderLayer" 14 | 15 | 16 | 17 | # envs used inside training 18 | export OMP_NUM_THREADS=4 19 | export TOKENIZERS_PARALLELISM=False 20 | 21 | TODAY=$(date +%Y-%m%d-%H%M) 22 | 23 | # accelerate launch --config_file accelerate_fsdp_config.yaml \ 24 | accelerate launch \ 25 | --use_fsdp \ 26 | --num_machines=1 \ 27 | --num_processes=$N_GPU_PER_NODE \ 28 | --fsdp_sharding_strategy=1 \ 29 | --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \ 30 | --fsdp_state_dict_type=FULL_STATE_DICT \ 31 | --fsdp_backward_prefetch_policy=BACKWARD_PRE \ 32 | --fsdp_transformer_layer_cls_to_wrap=$WRAP_MODULE \ 33 | --fsdp_offload_params=false \ 34 | --main_training_function=main \ 35 | --mixed_precision=bf16 \ 36 | --dynamo_backend=no \ 37 | --same_network \ 38 | --machine_rank=0 \ 39 | --rdzv_backend=static \ 40 | pefts/mft_accelerate.py --train_config "$CONFIG" \ 41 | --distributed_type "fsdp" \ 42 | > MFTCoder-training-"$TODAY".log 2>&1 & 43 | 44 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/configs/lora_train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_paths": "$DATA_PATHS", 3 | "output_dir": "$OUTPUT_DIR", 4 | "tb_dir": "$TensorBoard_DIR", 5 | "pretrained_model_path": "$MODEL_NAME_OR_PATH", 6 | "model_type": "$MODEL_TYPE", 7 | "load_raw_dataset": true, 8 | "data_split": "98,2,0", 9 | "padding_mode": "padding", 10 | "use_dynamic_padding": true, 11 | "tokenize_mode": "sft", 12 | "tokenizer_type": "AutoTokenizer", 13 | "weighted_loss_mode": "case3", 14 | "attn_implementation": "flash_attention_2", 15 | "seq_length": 4096, 16 | "seed": 1234, 17 | "peft_type": "lora", 18 | "quantization": null, 19 | "lora_rank": 96, 20 | "lora_alpha": 32, 21 | "lora_dropout": 0.05, 22 | "per_device_train_batch_size": 2, 23 | "per_device_eval_batch_size": 2, 24 | "learning_rate": 5e-5, 25 | "min_lr": 5e-6, 26 | "weight_decay": 0.1, 27 | "gradient_accumulation_steps": 1, 28 | "lr_scheduler_type": "cosine", 29 | "num_warmup_steps": 300, 30 | "num_train_epochs": 4, 31 | "resume_from_checkpoint": null, 32 | "log_interval": 10, 33 | "checkpointing_steps": 100, 34 | "evaluation_steps": 100, 35 | "max_train_steps": null, 36 | "epoch_checkpointing": true, 37 | "shuffle_before_split": true, 38 | "early_stopping": true, 39 | "early_stopping_stall_num": 5, 40 | "saving_limit": null 41 | } -------------------------------------------------------------------------------- /mftcoder_accelerate/src/configs/qlora_train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_paths": "$DATA_PATHS", 3 | "output_dir": "$OUTPUT_DIR", 4 | "tb_dir": "$TensorBoard_DIR", 5 | "pretrained_model_path": "$MODEL_NAME_OR_PATH", 6 | 
"model_type": "$MODEL_TYPE", 7 | "load_raw_dataset": true, 8 | "data_split": "98,2,0", 9 | "padding_mode": "padding", 10 | "use_dynamic_padding": true, 11 | "tokenize_mode": "sft", 12 | "tokenizer_type": "AutoTokenizer", 13 | "weighted_loss_mode": "case3", 14 | "attn_implementation": "flash_attention_2", 15 | "seq_length": 4096, 16 | "seed": 1234, 17 | "peft_type": "qlora", 18 | "quantization": "4bit", 19 | "lora_rank": 96, 20 | "lora_alpha": 32, 21 | "lora_dropout": 0.05, 22 | "per_device_train_batch_size": 2, 23 | "per_device_eval_batch_size": 2, 24 | "learning_rate": 5e-5, 25 | "min_lr": 5e-6, 26 | "weight_decay": 0.1, 27 | "gradient_accumulation_steps": 1, 28 | "lr_scheduler_type": "cosine", 29 | "num_warmup_steps": 300, 30 | "num_train_epochs": 4, 31 | "resume_from_checkpoint": null, 32 | "log_interval": 10, 33 | "checkpointing_steps": 100, 34 | "evaluation_steps": 100, 35 | "max_train_steps": null, 36 | "epoch_checkpointing": true, 37 | "shuffle_before_split": true, 38 | "early_stopping": true, 39 | "early_stopping_stall_num": 5, 40 | "saving_limit": null 41 | } -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/chatglm2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "THUDM/chatglm2-6b", 3 | "model_type": "chatglm", 4 | "architectures": [ 5 | "ChatGLMModel" 6 | ], 7 | "auto_map": { 8 | "AutoConfig": "configuration_chatglm.ChatGLMConfig", 9 | "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", 10 | "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", 11 | "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration" 12 | }, 13 | "add_bias_linear": false, 14 | "add_qkv_bias": true, 15 | "apply_query_key_layer_scaling": true, 16 | "apply_residual_connection_post_layernorm": false, 17 | "attention_dropout": 0.0, 18 | "attention_softmax_in_fp32": true, 19 | "bias_dropout_fusion": true, 20 | "ffn_hidden_size": 13696, 21 | "fp32_residual_connection": false, 22 | "hidden_dropout": 0.0, 23 | "hidden_size": 4096, 24 | "kv_channels": 128, 25 | "layernorm_epsilon": 1e-05, 26 | "multi_query_attention": true, 27 | "multi_query_group_num": 2, 28 | "num_attention_heads": 32, 29 | "num_layers": 28, 30 | "original_rope": true, 31 | "padded_vocab_size": 65024, 32 | "post_layer_norm": true, 33 | "rmsnorm": true, 34 | "seq_length": 32768, 35 | "use_cache": true, 36 | "torch_dtype": "float16", 37 | "transformers_version": "4.27.1", 38 | "tie_word_embeddings": false, 39 | "eos_token_id": 2, 40 | "pad_token_id": 0 41 | } -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/chatglm3/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "THUDM/chatglm3-6b", 3 | "model_type": "chatglm", 4 | "architectures": [ 5 | "ChatGLMModel" 6 | ], 7 | "auto_map": { 8 | "AutoConfig": "configuration_chatglm.ChatGLMConfig", 9 | "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration", 10 | "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration", 11 | "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration", 12 | "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification" 13 | }, 14 | "add_bias_linear": false, 15 | "add_qkv_bias": true, 16 | "apply_query_key_layer_scaling": true, 17 | "apply_residual_connection_post_layernorm": false, 18 | 
"attention_dropout": 0.0, 19 | "attention_softmax_in_fp32": true, 20 | "bias_dropout_fusion": true, 21 | "ffn_hidden_size": 13696, 22 | "fp32_residual_connection": false, 23 | "hidden_dropout": 0.0, 24 | "hidden_size": 4096, 25 | "kv_channels": 128, 26 | "layernorm_epsilon": 1e-05, 27 | "multi_query_attention": true, 28 | "multi_query_group_num": 2, 29 | "num_attention_heads": 32, 30 | "num_layers": 28, 31 | "original_rope": true, 32 | "padded_vocab_size": 65024, 33 | "post_layer_norm": true, 34 | "rmsnorm": true, 35 | "seq_length": 8192, 36 | "use_cache": true, 37 | "torch_dtype": "float16", 38 | "transformers_version": "4.30.2", 39 | "tie_word_embeddings": false, 40 | "eos_token_id": 2, 41 | "pad_token_id": 0 42 | } -------------------------------------------------------------------------------- /mftcoder_accelerate/src/ds_multinode_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Author: Chaoyu Chen 3 | # Last Modified: 2024/5/20 4 | # Description: # Launch script on Multiple Nodes 5 | 6 | # Run this script on all Nodes. 7 | 8 | # You need to export your number of nodes and number of GPUs per node first. 9 | N_NODE=4 10 | N_GPU_PER_NODE=8 11 | 12 | # You need to export $MACHINE_RANK, $MASTER_ADDR, $MASTER_PORT automatically for each Node. 13 | 14 | # config path 15 | CONFIG="configs/xxx_train_config.json" 16 | 17 | # envs used inside training 18 | export OMP_NUM_THREADS=4 19 | export TOKENIZERS_PARALLELISM=False 20 | 21 | TODAY=$(date +%Y-%m%d-%H%M) 22 | 23 | # accelerate launch --config_file accelerate_ds_config.yaml \ 24 | accelerate launch \ 25 | --num_machines $N_NODE \ 26 | --num_processes $(($N_NODE*$N_GPU_PER_NODE)) \ 27 | --use_deepspeed \ 28 | --deepspeed_multinode_launcher 'standard' \ 29 | --zero_stage 2 \ 30 | --offload_optimizer_device 'cpu' \ 31 | --offload_param_device 'none' \ 32 | --gradient_accumulation_steps 1 \ 33 | --gradient_clipping 1.0 \ 34 | --zero3_init_flag false \ 35 | --zero3_save_16bit_model false \ 36 | --main_training_function 'main' \ 37 | --mixed_precision 'bf16' \ 38 | --dynamo_backend 'no' \ 39 | --same_network \ 40 | --machine_rank $MACHINE_RANK \ 41 | --main_process_ip $MASTER_ADDR \ 42 | --main_process_port $MASTER_PORT \ 43 | --rdzv_backend 'static' \ 44 | pefts/mft_accelerate.py --train_config "$CONFIG" --distributed_type "deepspeed" -------------------------------------------------------------------------------- /mftcoder_accelerate/src/configs/coba_train_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_paths": "$DATA_PATHS", 3 | "output_dir": "$OUTPUT_DIR", 4 | "tb_dir": "$TensorBoard_DIR", 5 | "pretrained_model_path": "$MODEL_NAME_OR_PATH", 6 | "model_type": "$MODEL_TYPE", 7 | "load_raw_dataset": true, 8 | "data_split": "95,5,0", 9 | "padding_mode": "padding", 10 | "use_dynamic_padding": true, 11 | "tokenize_mode": "sft", 12 | "tokenizer_type": "AutoTokenizer", 13 | "weighted_loss_mode": "coba", 14 | "coba_warmup_steps": 100, 15 | "coba_history_length": 200, 16 | "coba_tau": 5, 17 | "coba_update_interval": 1, 18 | "coba_sample_valid_num": 1, 19 | "attn_implementation": "flash_attention_2", 20 | "seq_length": 4096, 21 | "seed": 1234, 22 | "peft_type": "qlora", 23 | "quantization": "4bit", 24 | "lora_rank": 96, 25 | "lora_alpha": 32, 26 | "lora_dropout": 0.05, 27 | "per_device_train_batch_size": 8, 28 | "per_device_eval_batch_size": 8, 29 | "learning_rate": 5e-5, 30 | "min_lr": 5e-6, 31 | "weight_decay": 0.1, 32 | 
"gradient_accumulation_steps": 1, 33 | "lr_scheduler_type": "cosine", 34 | "num_warmup_steps": 300, 35 | "num_train_epochs": 4, 36 | "resume_from_checkpoint": null, 37 | "log_interval": 10, 38 | "checkpointing_steps": 100, 39 | "evaluation_steps": 100, 40 | "max_train_steps": null, 41 | "epoch_checkpointing": true, 42 | "shuffle_before_split": true, 43 | "early_stopping": true, 44 | "early_stopping_stall_num": 5, 45 | "saving_limit": null 46 | } -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/deepseek_v2/tokenization_deepseek_fast.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | 4 | from transformers.models.llama import LlamaTokenizerFast 5 | 6 | 7 | class DeepseekTokenizerFast(LlamaTokenizerFast): 8 | 9 | def convert_ids_to_tokens( 10 | self, ids: Union[int, List[int]], skip_special_tokens: bool = False 11 | ) -> Union[str, List[str]]: 12 | """ 13 | Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and 14 | added tokens. 15 | 16 | Args: 17 | ids (`int` or `List[int]`): 18 | The token id (or token ids) to convert to tokens. 19 | skip_special_tokens (`bool`, *optional*, defaults to `False`): 20 | Whether or not to remove special tokens in the decoding. 21 | 22 | Returns: 23 | `str` or `List[str]`: The decoded token(s). 24 | """ 25 | if isinstance(ids, int): 26 | return self._convert_id_to_token(ids) 27 | tokens = [] 28 | for index in ids: 29 | index = int(index) 30 | if skip_special_tokens and index in self.all_special_ids: 31 | continue 32 | token = self._tokenizer.id_to_token(index) 33 | tokens.append(token if token is not None else "") 34 | return tokens 35 | 36 | def _convert_id_to_token(self, index: int) -> Optional[str]: 37 | token = self._tokenizer.id_to_token(int(index)) 38 | return token if token is not None else "" 39 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/baichuan2/configuration_baichuan.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. 
2 | 3 | from transformers.configuration_utils import PretrainedConfig 4 | 5 | 6 | class BaichuanConfig(PretrainedConfig): 7 | model_type = "baichuan" 8 | keys_to_ignore_at_inference = ["past_key_values"] 9 | 10 | def __init__( 11 | self, 12 | vocab_size=64000, 13 | hidden_size=5120, 14 | intermediate_size=13696, 15 | num_hidden_layers=40, 16 | num_attention_heads=40, 17 | hidden_act="silu", 18 | model_max_length=4096, 19 | initializer_range=0.02, 20 | rms_norm_eps=1e-6, 21 | use_cache=True, 22 | pad_token_id=0, 23 | bos_token_id=1, 24 | eos_token_id=2, 25 | tie_word_embeddings=False, 26 | gradient_checkpointing=False, 27 | z_loss_weight=0, 28 | **kwargs, 29 | ): 30 | self.vocab_size = vocab_size 31 | self.model_max_length = model_max_length 32 | self.hidden_size = hidden_size 33 | self.intermediate_size = intermediate_size 34 | self.num_hidden_layers = num_hidden_layers 35 | self.num_attention_heads = num_attention_heads 36 | self.hidden_act = hidden_act 37 | self.initializer_range = initializer_range 38 | self.rms_norm_eps = rms_norm_eps 39 | self.use_cache = use_cache 40 | self.z_loss_weight = z_loss_weight 41 | self.gradient_checkpointing = (gradient_checkpointing,) 42 | super().__init__( 43 | pad_token_id=pad_token_id, 44 | bos_token_id=bos_token_id, 45 | eos_token_id=eos_token_id, 46 | tie_word_embeddings=tie_word_embeddings, 47 | **kwargs, 48 | ) 49 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/phi/configuration_mixformer_sequential.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import math 5 | from typing import Any, Dict, List, Optional, Union 6 | 7 | from transformers import PretrainedConfig 8 | 9 | 10 | class MixFormerSequentialConfig(PretrainedConfig): 11 | """MixFormer (sequential for DeepSpeed) configuration.""" 12 | 13 | model_type = "mixformer-sequential" 14 | 15 | attribute_map = { 16 | "max_position_embeddings": "n_positions", 17 | "hidden_size": "n_embd", 18 | "num_attention_heads": "n_head", 19 | "num_hidden_layers": "n_layer", 20 | } 21 | 22 | def __init__( 23 | self, 24 | vocab_size: Optional[int] = 50304, 25 | n_positions: Optional[int] = 2048, 26 | n_embd: Optional[int] = 1024, 27 | n_layer: Optional[int] = 20, 28 | n_inner: Optional[int] = None, 29 | n_head: Optional[int] = 16, 30 | rotary_dim: Optional[int] = 32, 31 | activation_function: Optional[str] = "gelu_new", 32 | embd_pdrop: Optional[float] = 0.0, 33 | resid_pdrop: Optional[float] = 0.0, 34 | layer_norm_epsilon: Optional[float] = 1e-5, 35 | initializer_range: Optional[float] = 0.02, 36 | tie_word_embeddings: Optional[bool] = False, 37 | pad_vocab_size_multiple: Optional[int] = 64, 38 | **kwargs 39 | ) -> None: 40 | self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple) 41 | self.n_positions = n_positions 42 | self.n_embd = n_embd 43 | self.n_layer = n_layer 44 | self.n_inner = n_inner 45 | self.n_head = n_head 46 | self.rotary_dim = min(rotary_dim, n_embd // n_head) 47 | self.activation_function = activation_function 48 | self.embd_pdrop = embd_pdrop 49 | self.resid_pdrop = resid_pdrop 50 | self.layer_norm_epsilon = layer_norm_epsilon 51 | self.initializer_range = initializer_range 52 | 53 | super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) 54 | -------------------------------------------------------------------------------- 
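The configuration modules collected under model/ (BaichuanConfig and MixFormerSequentialConfig above, plus the ChatGLM, QWen and GPT-BigCode variants elsewhere in this tree) all follow the standard Hugging Face PretrainedConfig pattern: constructor keyword arguments become attributes, and anything else is forwarded to super().__init__(). A minimal sketch of that round-trip, using a hypothetical TinyDemoConfig rather than any class from this repository:

from transformers import PretrainedConfig

class TinyDemoConfig(PretrainedConfig):
    model_type = "tiny_demo"  # hypothetical model_type, for illustration only

    def __init__(self, hidden_size=64, num_hidden_layers=2, **kwargs):
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        super().__init__(**kwargs)

cfg = TinyDemoConfig(hidden_size=128)
cfg.save_pretrained("./tiny_demo")                        # writes ./tiny_demo/config.json
reloaded = TinyDemoConfig.from_pretrained("./tiny_demo")  # reads it back
assert reloaded.hidden_size == 128

This same save_pretrained/from_pretrained round-trip is how config.json files like the gpt_neox, chatglm2 and chatglm3 ones dumped in this tree are produced and loaded.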
/mftcoder_accelerate/src/model/code_llama/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 MetaAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available 17 | 18 | 19 | _import_structure = {} 20 | 21 | try: 22 | if not is_sentencepiece_available(): 23 | raise OptionalDependencyNotAvailable() 24 | except OptionalDependencyNotAvailable: 25 | pass 26 | else: 27 | _import_structure["tokenization_code_llama"] = ["CodeLlamaTokenizer"] 28 | 29 | try: 30 | if not is_tokenizers_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["tokenization_code_llama_fast"] = ["CodeLlamaTokenizerFast"] 36 | 37 | if TYPE_CHECKING: 38 | try: 39 | if not is_sentencepiece_available(): 40 | raise OptionalDependencyNotAvailable() 41 | except OptionalDependencyNotAvailable: 42 | pass 43 | else: 44 | from .tokenization_code_llama import CodeLlamaTokenizer 45 | 46 | try: 47 | if not is_tokenizers_available(): 48 | raise OptionalDependencyNotAvailable() 49 | except OptionalDependencyNotAvailable: 50 | pass 51 | else: 52 | from .tokenization_code_llama_fast import CodeLlamaTokenizerFast 53 | 54 | else: 55 | import sys 56 | 57 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 58 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/qwen/cpp_kernels.py: -------------------------------------------------------------------------------- 1 | from torch.utils import cpp_extension 2 | import pathlib 3 | import os 4 | import subprocess 5 | 6 | def _get_cuda_bare_metal_version(cuda_dir): 7 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], 8 | universal_newlines=True) 9 | output = raw_output.split() 10 | release_idx = output.index("release") + 1 11 | release = output[release_idx].split(".") 12 | bare_metal_major = release[0] 13 | bare_metal_minor = release[1][0] 14 | 15 | return raw_output, bare_metal_major, bare_metal_minor 16 | 17 | def _create_build_dir(buildpath): 18 | try: 19 | os.mkdir(buildpath) 20 | except OSError: 21 | if not os.path.isdir(buildpath): 22 | print(f"Creation of the build directory {buildpath} failed") 23 | 24 | # Check if cuda 11 is installed for compute capability 8.0 25 | cc_flag = [] 26 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME) 27 | if int(bare_metal_major) >= 11: 28 | cc_flag.append('-gencode') 29 | cc_flag.append('arch=compute_80,code=sm_80') 30 | if int(bare_metal_minor) >= 7: 31 | cc_flag.append('-gencode') 32 | cc_flag.append('arch=compute_90,code=sm_90') 33 | 34 | # Build path 35 | srcpath = 
pathlib.Path(__file__).parent.absolute() 36 | buildpath = srcpath / 'build' 37 | _create_build_dir(buildpath) 38 | 39 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags): 40 | return cpp_extension.load( 41 | name=name, 42 | sources=sources, 43 | build_directory=buildpath, 44 | extra_cflags=['-O3', ], 45 | extra_cuda_cflags=['-O3', 46 | '-gencode', 'arch=compute_70,code=sm_70', 47 | '--use_fast_math'] + extra_cuda_flags + cc_flag, 48 | verbose=1 49 | ) 50 | 51 | extra_flags = [] 52 | 53 | cache_autogptq_cuda_256_sources = ["./cache_autogptq_cuda_256.cpp", 54 | "./cache_autogptq_cuda_kernel_256.cu"] 55 | cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags) 56 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/utils/model_mapping.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author qumu 3 | transformers==4.40 is stable now 4 | """ 5 | 6 | # Models that Transformers support Code and FA2 when flash_attn>=2.1.0 7 | from transformers import ( 8 | GPTNeoXForCausalLM, 9 | GPTBigCodeForCausalLM, 10 | LlamaForCausalLM, 11 | MistralForCausalLM, 12 | MixtralForCausalLM, 13 | PhiForCausalLM, 14 | GemmaForCausalLM, 15 | Qwen2ForCausalLM, 16 | Qwen2MoeForCausalLM, 17 | Starcoder2ForCausalLM, 18 | ) 19 | 20 | # model in local model dir and support transformers FA2 21 | from model.deepseek_v2.modeling_deepseek import DeepseekV2ForCausalLM 22 | 23 | # model in local model and self-contained 24 | from model.aquila2.modeling_aquila import AquilaForCausalLM 25 | from model.baichuan2.modeling_baichuan import BaichuanForCausalLM 26 | from model.qwen.modeling_qwen import QWenLMHeadModel 27 | from model.chatglm2.modeling_chatglm import ChatGLMForConditionalGeneration as ChatGLMForConditionalGeneration2 28 | from model.chatglm3.modeling_chatglm import ChatGLMForConditionalGeneration as ChatGLMForConditionalGeneration3 29 | 30 | # from model.phi.modeling_mixformer_sequential import MixFormerSequentialForCausalLM 31 | 32 | MODEL_TYPES = { 33 | "aquila2": AquilaForCausalLM, 34 | "baichuan": BaichuanForCausalLM, 35 | "chatglm2": ChatGLMForConditionalGeneration2, 36 | "chatglm3": ChatGLMForConditionalGeneration3, 37 | "code_llama": LlamaForCausalLM, 38 | "deepseek": LlamaForCausalLM, 39 | "gpt_neox": GPTNeoXForCausalLM, 40 | "llama": LlamaForCausalLM, 41 | "mistral": MistralForCausalLM, 42 | "mixtral": MixtralForCausalLM, 43 | "phi": PhiForCausalLM, 44 | "qwen": QWenLMHeadModel, 45 | "starcoder": GPTBigCodeForCausalLM, 46 | "qwen2": Qwen2ForCausalLM, 47 | "gemma": GemmaForCausalLM, 48 | "qwen2_moe": Qwen2MoeForCausalLM, 49 | "starcoder2": Starcoder2ForCausalLM, 50 | "deepseek_v2": DeepseekV2ForCausalLM, 51 | } 52 | 53 | SUPPORT_IN_TRANSFORMERS = [ 54 | "code_llama", 55 | "llama", 56 | "deepseek", 57 | "mistral", 58 | "mixtral", 59 | "gpt_neox", 60 | "phi", 61 | "starcoder", 62 | "qwen2", 63 | "qwen2_moe", 64 | "gemma", 65 | "starcoder2", 66 | "deepseek_v2", 67 | ] 68 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/gpt_bigcode/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import TYPE_CHECKING 16 | 17 | from transformers.utils import ( 18 | OptionalDependencyNotAvailable, 19 | _LazyModule, 20 | is_torch_available, 21 | ) 22 | 23 | 24 | _import_structure = { 25 | "configuration_gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig"], 26 | } 27 | 28 | try: 29 | if not is_torch_available(): 30 | raise OptionalDependencyNotAvailable() 31 | except OptionalDependencyNotAvailable: 32 | pass 33 | else: 34 | _import_structure["modeling_gpt_bigcode"] = [ 35 | "GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST", 36 | "GPTBigCodeForSequenceClassification", 37 | "GPTBigCodeForTokenClassification", 38 | "GPTBigCodeForCausalLM", 39 | "GPTBigCodeModel", 40 | "GPTBigCodePreTrainedModel", 41 | ] 42 | 43 | if TYPE_CHECKING: 44 | from .configuration_gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig 45 | 46 | try: 47 | if not is_torch_available(): 48 | raise OptionalDependencyNotAvailable() 49 | except OptionalDependencyNotAvailable: 50 | pass 51 | else: 52 | from .modeling_gpt_bigcode import ( 53 | GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST, 54 | GPTBigCodeForCausalLM, 55 | GPTBigCodeForSequenceClassification, 56 | GPTBigCodeForTokenClassification, 57 | GPTBigCodeModel, 58 | GPTBigCodePreTrainedModel, 59 | ) 60 | 61 | 62 | else: 63 | import sys 64 | 65 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) 66 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/tokenizer/chat_template.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author Chaoyu Chen 3 | # @date 2023/12/25 4 | 5 | # store possible chat_template for tokenizers to prepare input string 6 | # -------------------------------------------------- Import ------------------------------------------------------------ 7 | """ 8 | Usage: 9 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 10 | messages = [ 11 | {"role": "system", "content": "Be smart"}, 12 | {"role": "human", "content": "Hello, how are you?"}, 13 | {"role": "bot", "content": "I'm doing great. 
How can I help you today?"}, 14 | {"role": "human", "content": "I'd like to show off how chat templating works!"}, 15 | ] 16 | prompts = tokenizer.apply_chat_template(message, chat_template=MFTCoder_template, tokenize=False, add_generation_prompt=True) 17 | """ 18 | 19 | MFTCoder_template = ( 20 | "{% if messages[0]['role'] == 'system' %}" 21 | "{% set loop_messages = messages[1:] %}" # Extract system message if it's present 22 | "{% set system_message = messages[0]['content'] %}" 23 | "{% else %}" 24 | "{% set loop_messages = messages %}" 25 | "{% set system_message = false %}" 26 | "{% endif %}" 27 | "{% for message in loop_messages %}" # Loop over all non-system messages 28 | "{% if (message['role'] == 'user' or message['role'] == 'human') != (loop.index0 % 2 == 0) %}" 29 | "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" 30 | "{% endif %}" 31 | "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message 32 | "{% set content = 'system\n' + system_message + '\n' %}" 33 | "{% else %}" 34 | "{% set content = '' %}" 35 | "{% endif %}" 36 | "{% if message['role'] == 'user' or message['role'] == 'human' %}" 37 | "{{ content + 'human\n' + message['content'] + '\n' }}" 38 | "{% elif message['role'] == 'assistant' or message['role'] == 'bot' %}" 39 | "{{ 'bot\n' + message['content'] + '\n' + eos_token + '\n'}}" 40 | "{% else %}" 41 | "{{ raise_exception('Only user/human and assistant/bot roles are supported!') }}" 42 | "{% endif %}" 43 | "{% endfor %}" 44 | "{% if add_generation_prompt %}" 45 | "{{ 'bot\n' }}" 46 | "{% endif %}" 47 | ) 48 | 49 | if __name__ == "__main__": 50 | pass 51 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/chatglm2/configuration_chatglm.py: -------------------------------------------------------------------------------- 1 | from transformers import PretrainedConfig 2 | 3 | 4 | class ChatGLMConfig(PretrainedConfig): 5 | model_type = "chatglm" 6 | def __init__( 7 | self, 8 | num_layers=28, 9 | padded_vocab_size=65024, 10 | hidden_size=4096, 11 | ffn_hidden_size=13696, 12 | kv_channels=128, 13 | num_attention_heads=32, 14 | seq_length=2048, 15 | hidden_dropout=0.0, 16 | attention_dropout=0.0, 17 | layernorm_epsilon=1e-5, 18 | rmsnorm=True, 19 | apply_residual_connection_post_layernorm=False, 20 | post_layer_norm=True, 21 | add_bias_linear=False, 22 | add_qkv_bias=False, 23 | bias_dropout_fusion=True, 24 | multi_query_attention=False, 25 | multi_query_group_num=1, 26 | apply_query_key_layer_scaling=True, 27 | attention_softmax_in_fp32=True, 28 | fp32_residual_connection=False, 29 | quantization_bit=0, 30 | pre_seq_len=None, 31 | prefix_projection=False, 32 | **kwargs 33 | ): 34 | self.num_layers = num_layers 35 | self.vocab_size = padded_vocab_size 36 | self.padded_vocab_size = padded_vocab_size 37 | self.hidden_size = hidden_size 38 | self.ffn_hidden_size = ffn_hidden_size 39 | self.kv_channels = kv_channels 40 | self.num_attention_heads = num_attention_heads 41 | self.seq_length = seq_length 42 | self.hidden_dropout = hidden_dropout 43 | self.attention_dropout = attention_dropout 44 | self.layernorm_epsilon = layernorm_epsilon 45 | self.rmsnorm = rmsnorm 46 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm 47 | self.post_layer_norm = post_layer_norm 48 | self.add_bias_linear = add_bias_linear 49 | self.add_qkv_bias = add_qkv_bias 50 | self.bias_dropout_fusion = bias_dropout_fusion 51 | 
self.multi_query_attention = multi_query_attention 52 | self.multi_query_group_num = multi_query_group_num 53 | self.apply_query_key_layer_scaling = apply_query_key_layer_scaling 54 | self.attention_softmax_in_fp32 = attention_softmax_in_fp32 55 | self.fp32_residual_connection = fp32_residual_connection 56 | self.quantization_bit = quantization_bit 57 | self.pre_seq_len = pre_seq_len 58 | self.prefix_projection = prefix_projection 59 | super().__init__(**kwargs) -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/chatglm3/configuration_chatglm.py: -------------------------------------------------------------------------------- 1 | from transformers import PretrainedConfig 2 | 3 | 4 | class ChatGLMConfig(PretrainedConfig): 5 | model_type = "chatglm" 6 | def __init__( 7 | self, 8 | num_layers=28, 9 | padded_vocab_size=65024, 10 | hidden_size=4096, 11 | ffn_hidden_size=13696, 12 | kv_channels=128, 13 | num_attention_heads=32, 14 | seq_length=2048, 15 | hidden_dropout=0.0, 16 | classifier_dropout=None, 17 | attention_dropout=0.0, 18 | layernorm_epsilon=1e-5, 19 | rmsnorm=True, 20 | apply_residual_connection_post_layernorm=False, 21 | post_layer_norm=True, 22 | add_bias_linear=False, 23 | add_qkv_bias=False, 24 | bias_dropout_fusion=True, 25 | multi_query_attention=False, 26 | multi_query_group_num=1, 27 | apply_query_key_layer_scaling=True, 28 | attention_softmax_in_fp32=True, 29 | fp32_residual_connection=False, 30 | quantization_bit=0, 31 | pre_seq_len=None, 32 | prefix_projection=False, 33 | **kwargs 34 | ): 35 | self.num_layers = num_layers 36 | self.vocab_size = padded_vocab_size 37 | self.padded_vocab_size = padded_vocab_size 38 | self.hidden_size = hidden_size 39 | self.ffn_hidden_size = ffn_hidden_size 40 | self.kv_channels = kv_channels 41 | self.num_attention_heads = num_attention_heads 42 | self.seq_length = seq_length 43 | self.hidden_dropout = hidden_dropout 44 | self.classifier_dropout = classifier_dropout 45 | self.attention_dropout = attention_dropout 46 | self.layernorm_epsilon = layernorm_epsilon 47 | self.rmsnorm = rmsnorm 48 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm 49 | self.post_layer_norm = post_layer_norm 50 | self.add_bias_linear = add_bias_linear 51 | self.add_qkv_bias = add_qkv_bias 52 | self.bias_dropout_fusion = bias_dropout_fusion 53 | self.multi_query_attention = multi_query_attention 54 | self.multi_query_group_num = multi_query_group_num 55 | self.apply_query_key_layer_scaling = apply_query_key_layer_scaling 56 | self.attention_softmax_in_fp32 = attention_softmax_in_fp32 57 | self.fp32_residual_connection = fp32_residual_connection 58 | self.quantization_bit = quantization_bit 59 | self.pre_seq_len = pre_seq_len 60 | self.prefix_projection = prefix_projection 61 | super().__init__(**kwargs) -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/qwen/configuration_qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | 6 | from transformers import PretrainedConfig 7 | 8 | 9 | class QWenConfig(PretrainedConfig): 10 | model_type = "qwen" 11 | keys_to_ignore_at_inference = ["past_key_values"] 12 | 13 | def __init__( 14 | self, 15 | vocab_size=151936, 16 | hidden_size=4096, 17 | num_hidden_layers=32, 18 | num_attention_heads=32, 19 | emb_dropout_prob=0.0, 20 | attn_dropout_prob=0.0, 21 | layer_norm_epsilon=1e-6, 22 | initializer_range=0.02, 23 | max_position_embeddings=8192, 24 | scale_attn_weights=True, 25 | use_cache=True, 26 | bf16=False, 27 | fp16=False, 28 | fp32=False, 29 | kv_channels=128, 30 | rotary_pct=1.0, 31 | rotary_emb_base=10000, 32 | use_dynamic_ntk=True, 33 | use_logn_attn=True, 34 | use_flash_attn="auto", 35 | intermediate_size=22016, 36 | no_bias=True, 37 | tie_word_embeddings=False, 38 | use_cache_quantization=False, 39 | use_cache_kernel=False, 40 | softmax_in_fp32=False, 41 | **kwargs, 42 | ): 43 | self.vocab_size = vocab_size 44 | self.hidden_size = hidden_size 45 | self.intermediate_size = intermediate_size 46 | self.num_hidden_layers = num_hidden_layers 47 | self.num_attention_heads = num_attention_heads 48 | self.emb_dropout_prob = emb_dropout_prob 49 | self.attn_dropout_prob = attn_dropout_prob 50 | self.layer_norm_epsilon = layer_norm_epsilon 51 | self.initializer_range = initializer_range 52 | self.scale_attn_weights = scale_attn_weights 53 | self.use_cache = use_cache 54 | self.max_position_embeddings = max_position_embeddings 55 | self.bf16 = bf16 56 | self.fp16 = fp16 57 | self.fp32 = fp32 58 | self.kv_channels = kv_channels 59 | self.rotary_pct = rotary_pct 60 | self.rotary_emb_base = rotary_emb_base 61 | self.use_dynamic_ntk = use_dynamic_ntk 62 | self.use_logn_attn = use_logn_attn 63 | self.use_flash_attn = use_flash_attn 64 | self.no_bias = no_bias 65 | self.use_cache_quantization = use_cache_quantization 66 | self.use_cache_kernel = use_cache_kernel 67 | self.softmax_in_fp32 = softmax_in_fp32 68 | super().__init__( 69 | tie_word_embeddings=tie_word_embeddings, 70 | **kwargs 71 | ) 72 | -------------------------------------------------------------------------------- /mftcoder_atorch/model/gpt_neox/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
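# This __init__ follows the standard HuggingFace lazy-import pattern: `_import_structure` maps submodules to their public symbols, the optional-dependency checks (tokenizers / torch) gate which entries get registered, and `_LazyModule` defers the real imports until an attribute is first accessed.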
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.file_utils import _LazyModule, is_tokenizers_available, is_torch_available 17 | from transformers.utils import OptionalDependencyNotAvailable 18 | # from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available 19 | # from ...utils import OptionalDependencyNotAvailable 20 | 21 | 22 | _import_structure = {"configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"]} 23 | 24 | try: 25 | if not is_tokenizers_available(): 26 | raise OptionalDependencyNotAvailable() 27 | except OptionalDependencyNotAvailable: 28 | pass 29 | else: 30 | _import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"] 31 | 32 | try: 33 | if not is_torch_available(): 34 | raise OptionalDependencyNotAvailable() 35 | except OptionalDependencyNotAvailable: 36 | pass 37 | else: 38 | _import_structure["modeling_gpt_neox"] = [ 39 | "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST", 40 | "GPTNeoXForCausalLM", 41 | "GPTNeoXLayer", 42 | "GPTNeoXModel", 43 | "GPTNeoXPreTrainedModel", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig 49 | 50 | try: 51 | if not is_tokenizers_available(): 52 | raise OptionalDependencyNotAvailable() 53 | except OptionalDependencyNotAvailable: 54 | pass 55 | else: 56 | from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast 57 | 58 | try: 59 | if not is_torch_available(): 60 | raise OptionalDependencyNotAvailable() 61 | except OptionalDependencyNotAvailable: 62 | pass 63 | else: 64 | from .modeling_gpt_neox import ( 65 | GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST, 66 | GPTNeoXForCausalLM, 67 | GPTNeoXLayer, 68 | GPTNeoXModel, 69 | GPTNeoXPreTrainedModel, 70 | ) 71 | 72 | 73 | else: 74 | import sys 75 | 76 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/gpt_neox/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.file_utils import _LazyModule, is_tokenizers_available, is_torch_available 17 | from transformers.utils import OptionalDependencyNotAvailable 18 | # from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available 19 | # from ...utils import OptionalDependencyNotAvailable 20 | 21 | 22 | _import_structure = {"configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"]} 23 | 24 | try: 25 | if not is_tokenizers_available(): 26 | raise OptionalDependencyNotAvailable() 27 | except OptionalDependencyNotAvailable: 28 | pass 29 | else: 30 | _import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"] 31 | 32 | try: 33 | if not is_torch_available(): 34 | raise OptionalDependencyNotAvailable() 35 | except OptionalDependencyNotAvailable: 36 | pass 37 | else: 38 | _import_structure["modeling_gpt_neox"] = [ 39 | "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST", 40 | "GPTNeoXForCausalLM", 41 | "GPTNeoXLayer", 42 | "GPTNeoXModel", 43 | "GPTNeoXPreTrainedModel", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig 49 | 50 | try: 51 | if not is_tokenizers_available(): 52 | raise OptionalDependencyNotAvailable() 53 | except OptionalDependencyNotAvailable: 54 | pass 55 | else: 56 | from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast 57 | 58 | try: 59 | if not is_torch_available(): 60 | raise OptionalDependencyNotAvailable() 61 | except OptionalDependencyNotAvailable: 62 | pass 63 | else: 64 | from .modeling_gpt_neox import ( 65 | GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST, 66 | GPTNeoXForCausalLM, 67 | GPTNeoXLayer, 68 | GPTNeoXModel, 69 | GPTNeoXPreTrainedModel, 70 | ) 71 | 72 | 73 | else: 74 | import sys 75 | 76 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -------------------------------------------------------------------------------- /mftcoder_atorch/train/run_gpt_mft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | LOAD_RAW_DATASET=True 3 | if [ ${LOAD_RAW_DATASET} = "True" ]; then 4 | LOAD_RAW_DATASET="--load_raw_dataset" 5 | DATA_PATHS="$DATA_PATHS" 6 | DATA_WEIGHTS="[1.,1.,...,1.]" 7 | DATA_SPLIT="95,5,0" 8 | SHUFFLE_BEFORE_SPLIT="" 9 | USE_RANDOM_SAMPLER="" 10 | USE_WEIGHTED_LOSS="" 11 | WEIGHT_BY_NUM_DOCUMENTS="" 12 | else 13 | LOAD_RAW_DATASET="" 14 | DATA_PATHS="$DATA_PATHS" 15 | DATA_WEIGHTS="[1.,1.,...,1.]" 16 | DATA_SPLIT="95,5,0" 17 | SHUFFLE_BEFORE_SPLIT="--shuffle_before_split" 18 | USE_RANDOM_SAMPLER="--use_random_sampler" 19 | USE_WEIGHTED_LOSS="--use_weighted_loss" 20 | WEIGHT_BY_NUM_DOCUMENTS="--weight_by_num_documents" 21 | fi 22 | 23 | VOCAB_FILE="../utils/vocab.json" 24 | MODEL_TYPE="gpt_neox" 25 | 26 | PRETRAINED_MODEL_PATH="$MODEL_NAME_OR_PATH" 27 | RESUME_FROM_CHECKPOINT="false" 28 | 29 | PER_DEVICE_BATCH_SIZE=$1 30 | TP=$2 31 | DP=$3 32 | EPOCH=$4 33 | TOTAL_TRAIN_BATCH_SIZE=$(($PER_DEVICE_BATCH_SIZE * $TP * $DP)) 34 | GPU=$(($TP * $DP)) 35 | OUTPUT="$OUTPUT_DIR" 36 | TENSORBOARD_PATH="$TensorBoard_DIR" 37 | PREFIX="master-0" 38 | mkdir -p $OUTPUT || true 39 | echo "output to $OUTPUT" 40 | mkdir -p $TENSORBOARD_PATH 41 | chmod 777 $OUTPUT 42 | chmod 777 $TENSORBOARD_PATH 43 | 44 | python -m atorch.distributed.launch \ 45 | --nproc_per_node=$(nvidia-smi -L | wc -l) \ 46 | run_train.py \ 47 | ${LOAD_RAW_DATASET} \ 48 | --tokenize_mode 'sft' \ 49 | --train_mode 
'sft' \ 50 | --padding_mode 'padding' \ 51 | --pretrained_model_path $PRETRAINED_MODEL_PATH \ 52 | --vocab_file $VOCAB_FILE \ 53 | --model_type $MODEL_TYPE \ 54 | --padding \ 55 | --data_paths $DATA_PATHS \ 56 | --data_weights $DATA_WEIGHTS \ 57 | --data_split $DATA_SPLIT \ 58 | ${SHUFFLE_BEFORE_SPLIT} \ 59 | ${USE_RANDOM_SAMPLER} \ 60 | ${USE_WEIGHTED_LOSS} \ 61 | ${WEIGHT_BY_NUM_DOCUMENTS} \ 62 | --train_iters 100 \ 63 | --num_warmup_steps 500 \ 64 | --custom_lr_scheduler_type 'cosine' \ 65 | --learning_rate 1.0e-4 \ 66 | --min_lr 1.0e-5 \ 67 | --valid_iters 10 \ 68 | --valid_interval 2000 \ 69 | --num_train_epochs $EPOCH \ 70 | --seq_length 4096 \ 71 | --total_train_batch_size $TOTAL_TRAIN_BATCH_SIZE \ 72 | --per_device_valid_batch_size $PER_DEVICE_BATCH_SIZE \ 73 | --seed 42 \ 74 | --preprocessing_num_workers 6 \ 75 | --num_workers 8 \ 76 | --output_dir $OUTPUT \ 77 | --tensorboard_dir $TENSORBOARD_PATH \ 78 | --ignore_mismatched_sizes \ 79 | --skip_atorch_autoacc_dryrun \ 80 | --tp $TP \ 81 | --dp $DP \ 82 | --bf16 \ 83 | --checkpointing_steps 2000 \ 84 | --log_interval 10 \ 85 | --make_vocab_size_divisible_by 128 \ 86 | --weighted_loss_mode 'case3' \ 87 | --checkpoint_activations \ 88 | --resume_from_checkpoint $RESUME_FROM_CHECKPOINT \ 89 | --max_grad_norm 1 \ 90 | --evaluation_strategy "steps,epoch" \ 91 | --save_strategy "steps" \ 92 | --save_total_limit 2 \ 93 | --extra_save_by_epoch \ 94 | --metric_for_best_model 'loss' \ 95 | --greater_is_better 'false' \ 96 | --early_stopping_patience 10 2>&1 | tee $OUTPUT/$PREFIX-output.txt -------------------------------------------------------------------------------- /mftcoder_atorch/train/run_gpt_mft_peft.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | LOAD_RAW_DATASET=True 3 | if [ ${LOAD_RAW_DATASET} = "True" ]; then 4 | LOAD_RAW_DATASET="--load_raw_dataset" 5 | DATA_PATHS="$DATA_PATHS" 6 | DATA_WEIGHTS="[1.,1.,...,1.]" 7 | DATA_SPLIT="90,10,0" 8 | SHUFFLE_BEFORE_SPLIT="" 9 | USE_RANDOM_SAMPLER="" 10 | USE_WEIGHTED_LOSS="" 11 | WEIGHT_BY_NUM_DOCUMENTS="" 12 | else 13 | LOAD_RAW_DATASET="" 14 | DATA_PATHS="$DATA_PATHS" 15 | DATA_WEIGHTS="[1.,1.,...,1.]" 16 | DATA_SPLIT="95,5,0" 17 | SHUFFLE_BEFORE_SPLIT="--shuffle_before_split" 18 | USE_RANDOM_SAMPLER="--use_random_sampler" 19 | USE_WEIGHTED_LOSS="--use_weighted_loss" 20 | WEIGHT_BY_NUM_DOCUMENTS="--weight_by_num_documents" 21 | fi 22 | 23 | VOCAB_FILE="../utils/vocab.json" 24 | MODEL_TYPE="gpt_neox" 25 | 26 | PRETRAINED_MODEL_PATH="$MODEL_NAME_OR_PATH" 27 | RESUME_FROM_CHECKPOINT="false" 28 | 29 | PER_DEVICE_BATCH_SIZE=$1 30 | TP=$2 31 | DP=$3 32 | EPOCH=$4 33 | TOTAL_TRAIN_BATCH_SIZE=$(($PER_DEVICE_BATCH_SIZE * $TP * $DP)) 34 | GPU=$(($TP * $DP)) 35 | OUTPUT="$OUTPUT_DIR" 36 | TENSORBOARD_PATH="$TensorBoard_DIR" 37 | PREFIX="master-0" 38 | mkdir -p $OUTPUT || true 39 | echo "output to $OUTPUT" 40 | mkdir -p $TENSORBOARD_PATH 41 | chmod 777 $OUTPUT 42 | chmod 777 $TENSORBOARD_PATH 43 | 44 | python -m atorch.distributed.launch \ 45 | --nproc_per_node=$(nvidia-smi -L | wc -l) \ 46 | run_train.py \ 47 | ${LOAD_RAW_DATASET} \ 48 | --tokenize_mode 'sft' \ 49 | --padding_mode 'padding' \ 50 | --pretrained_model_path $PRETRAINED_MODEL_PATH \ 51 | --vocab_file $VOCAB_FILE \ 52 | --model_type $MODEL_TYPE \ 53 | --padding \ 54 | --data_paths $DATA_PATHS \ 55 | --data_weights $DATA_WEIGHTS \ 56 | --data_split $DATA_SPLIT \ 57 | ${SHUFFLE_BEFORE_SPLIT} \ 58 | ${USE_RANDOM_SAMPLER} \ 59 | ${USE_WEIGHTED_LOSS} \ 60 | ${WEIGHT_BY_NUM_DOCUMENTS} 
\ 61 | --train_iters 100 \ 62 | --num_warmup_steps 30 \ 63 | --custom_lr_scheduler_type 'cosine' \ 64 | --learning_rate 5.0e-5 \ 65 | --min_lr 1.0e-6 \ 66 | --valid_iters 400 \ 67 | --valid_interval 500 \ 68 | --num_train_epochs $EPOCH \ 69 | --seq_length 4096 \ 70 | --total_train_batch_size $TOTAL_TRAIN_BATCH_SIZE \ 71 | --per_device_valid_batch_size $PER_DEVICE_BATCH_SIZE \ 72 | --seed 42 \ 73 | --preprocessing_num_workers 6 \ 74 | --num_workers 8 \ 75 | --output_dir $OUTPUT \ 76 | --tensorboard_dir $TENSORBOARD_PATH \ 77 | --ignore_mismatched_sizes \ 78 | --skip_atorch_autoacc_dryrun \ 79 | --tp $TP \ 80 | --dp $DP \ 81 | --bf16 \ 82 | --checkpointing_steps 500 \ 83 | --log_interval 10 \ 84 | --make_vocab_size_divisible_by 128 \ 85 | --weighted_loss_mode 'case3' \ 86 | --peft_type 'lora' \ 87 | --checkpoint_activations \ 88 | --resume_from_checkpoint $RESUME_FROM_CHECKPOINT \ 89 | --max_grad_norm 1 \ 90 | --evaluation_strategy "steps,epoch" \ 91 | --save_strategy "steps" \ 92 | --save_total_limit 2 \ 93 | --extra_save_by_epoch \ 94 | --metric_for_best_model 'loss' \ 95 | --greater_is_better 'false' \ 96 | --early_stopping_patience 5 \ 97 | --use_dynamic_padding 2>&1 | tee $OUTPUT/$PREFIX-output.txt 98 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Blendable dataset.""" 19 | 20 | import time 21 | 22 | import numpy as np 23 | import torch 24 | 25 | from utils.common_utils import print_rank_0 26 | 27 | 28 | class BlendableDataset(torch.utils.data.Dataset): 29 | def __init__(self, datasets, weights): 30 | self.datasets = datasets 31 | num_datasets = len(datasets) 32 | assert num_datasets == len(weights) 33 | 34 | self.size = 0 35 | for dataset in self.datasets: 36 | self.size += len(dataset) 37 | 38 | # Normalize weights. 39 | weights = np.array(weights, dtype=np.float64) 40 | sum_weights = np.sum(weights) 41 | assert sum_weights > 0.0 42 | weights /= sum_weights 43 | 44 | # recompute weights 45 | weights = self.calc_weights() 46 | 47 | # Build indices. 
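# The compiled `helpers.build_blending_indices` call below (assumed to be built from the Cython/C++ sources via the accompanying data Makefile; not shown here)
# fills `dataset_index[i]` with the dataset that global sample i comes from and `dataset_sample_index[i]` with that sample's position inside its dataset,
# interleaving the datasets roughly in proportion to `weights`.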
48 | start_time = time.time() 49 | assert num_datasets < 255 50 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 51 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 52 | 53 | from data import helpers 54 | 55 | helpers.build_blending_indices( 56 | self.dataset_index, 57 | self.dataset_sample_index, 58 | weights, 59 | num_datasets, 60 | self.size, 61 | torch.distributed.get_rank() == 0, 62 | ) 63 | 64 | print( 65 | "> RANK {} elapsed time for building blendable dataset indices: " 66 | "{:.2f} (sec)".format(torch.distributed.get_rank(), time.time() - start_time) 67 | ) 68 | 69 | def calc_weights(self): 70 | dataset_sample_cnt = [len(ds) for ds in self.datasets] 71 | total_cnt = sum(dataset_sample_cnt) 72 | weights = np.array([(cnt + 0.0) / total_cnt for cnt in dataset_sample_cnt], dtype=np.float64) 73 | return weights 74 | 75 | def __len__(self): 76 | return self.size 77 | 78 | def __getitem__(self, idx): 79 | try: 80 | dataset_idx = self.dataset_index[idx] 81 | sample_idx = self.dataset_sample_index[idx] 82 | return self.datasets[dataset_idx][sample_idx] 83 | except IndexError: 84 | new_idx = idx % len(self) 85 | print( 86 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" 87 | ) 88 | return self[new_idx] 89 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/baichuan2/generation_utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from queue import Queue 3 | 4 | import torch 5 | 6 | 7 | def build_chat_input(model, tokenizer, messages: List[dict], max_new_tokens: int=0): 8 | def _parse_messages(messages, split_role="user"): 9 | system, rounds = "", [] 10 | round = [] 11 | for i, message in enumerate(messages): 12 | if message["role"] == "system": 13 | assert i == 0 14 | system = message["content"] 15 | continue 16 | if message["role"] == split_role and round: 17 | rounds.append(round) 18 | round = [] 19 | round.append(message) 20 | if round: 21 | rounds.append(round) 22 | return system, rounds 23 | 24 | max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens 25 | max_input_tokens = model.config.model_max_length - max_new_tokens 26 | system, rounds = _parse_messages(messages, split_role="user") 27 | system_tokens = tokenizer.encode(system) 28 | max_history_tokens = max_input_tokens - len(system_tokens) 29 | 30 | history_tokens = [] 31 | for round in rounds[::-1]: 32 | round_tokens = [] 33 | for message in round: 34 | if message["role"] == "user": 35 | round_tokens.append(model.generation_config.user_token_id) 36 | else: 37 | round_tokens.append(model.generation_config.assistant_token_id) 38 | round_tokens.extend(tokenizer.encode(message["content"])) 39 | if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: 40 | history_tokens = round_tokens + history_tokens # concat left 41 | if len(history_tokens) < max_history_tokens: 42 | continue 43 | break 44 | 45 | input_tokens = system_tokens + history_tokens 46 | if messages[-1]["role"] != "assistant": 47 | input_tokens.append(model.generation_config.assistant_token_id) 48 | input_tokens = input_tokens[-max_input_tokens:] # truncate left 49 | return torch.LongTensor([input_tokens]).to(model.device) 50 | 51 | 52 | class TextIterStreamer: 53 | def __init__(self, tokenizer, skip_prompt=False, skip_special_tokens=False): 54 | self.tokenizer = tokenizer 55 | self.skip_prompt = skip_prompt 
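# When skip_prompt is True, put() silently drops the first chunk of ids it receives (the prompt tokens), so only newly generated text is pushed to the queue.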
56 | self.skip_special_tokens = skip_special_tokens 57 | self.tokens = [] 58 | self.text_queue = Queue() 59 | self.next_tokens_are_prompt = True 60 | 61 | def put(self, value): 62 | if self.skip_prompt and self.next_tokens_are_prompt: 63 | self.next_tokens_are_prompt = False 64 | else: 65 | if len(value.shape) > 1: 66 | value = value[0] 67 | self.tokens.extend(value.tolist()) 68 | self.text_queue.put( 69 | self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens)) 70 | 71 | def end(self): 72 | self.text_queue.put(None) 73 | 74 | def __iter__(self): 75 | return self 76 | 77 | def __next__(self): 78 | value = self.text_queue.get() 79 | if value is None: 80 | raise StopIteration() 81 | else: 82 | return value 83 | 84 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/xxpo/custom_callbacks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Customized Callbacks to use with the Trainer class and customize the training loop. 3 | """ 4 | 5 | import copy 6 | import dataclasses 7 | import json 8 | from dataclasses import dataclass 9 | from typing import Dict, List, Optional, Union 10 | 11 | import numpy as np 12 | from tqdm.auto import tqdm 13 | 14 | from transformers.trainer_utils import IntervalStrategy, has_length 15 | from transformers.training_args import TrainingArguments 16 | from transformers.utils import logging 17 | from transformers import TrainerCallback 18 | 19 | logger = logging.get_logger(__name__) 20 | 21 | 22 | class CustomProgressCallback(TrainerCallback): 23 | """ 24 | A [`TrainerCallback`] that displays the progress of training or evaluation. 25 | """ 26 | 27 | def __init__(self): 28 | self.training_bar = None 29 | self.prediction_bar = None 30 | 31 | def on_train_begin(self, args, state, control, **kwargs): 32 | if state.is_world_process_zero: 33 | self.training_bar = tqdm(total=state.max_steps, dynamic_ncols=True) 34 | self.current_step = 0 35 | 36 | def on_step_end(self, args, state, control, **kwargs): 37 | if state.is_world_process_zero and state.global_step % args.logging_steps == 0: 38 | self.training_bar.update(args.logging_steps) 39 | self.current_step = state.global_step 40 | # pass 41 | 42 | def on_prediction_step(self, args, state, control, eval_dataloader=None, **kwargs): 43 | # if state.is_world_process_zero and has_length(eval_dataloader): 44 | # if self.prediction_bar is None: 45 | # self.prediction_bar = tqdm( 46 | # total=len(eval_dataloader), leave=self.training_bar is None, dynamic_ncols=True 47 | # ) 48 | # self.prediction_bar.update(1) 49 | pass 50 | 51 | def on_evaluate(self, args, state, control, **kwargs): 52 | if state.is_world_process_zero: 53 | if self.prediction_bar is not None: 54 | self.prediction_bar.close() 55 | self.prediction_bar = None 56 | 57 | def on_predict(self, args, state, control, **kwargs): 58 | if state.is_world_process_zero: 59 | if self.prediction_bar is not None: 60 | self.prediction_bar.close() 61 | self.prediction_bar = None 62 | 63 | def on_log(self, args, state, control, logs=None, **kwargs): 64 | if state.is_world_process_zero and self.training_bar is not None: 65 | # avoid modifying the logs object as it is shared between callbacks 66 | logs = copy.deepcopy(logs) 67 | # _ = logs.pop("total_flos", None) 68 | # round numbers so that it looks better in console 69 | if "epoch" in logs: 70 | logs["epoch"] = round(logs["epoch"], 2) 71 | # self.training_bar.write(str(logs)) 72 | logger.info(logs) 73 | 74 | def 
on_train_end(self, args, state, control, **kwargs): 75 | if state.is_world_process_zero: 76 | self.training_bar.close() 77 | self.training_bar = None 78 | 79 | 80 | class PrinterCallback(TrainerCallback): 81 | """ 82 | A bare [`TrainerCallback`] that just prints the logs. 83 | """ 84 | 85 | def on_log(self, args, state, control, logs=None, **kwargs): 86 | _ = logs.pop("total_flos", None) 87 | if state.is_local_process_zero: 88 | print(logs) 89 | 90 | 91 | class LogCallback(TrainerCallback): 92 | """ 93 | A bare [`TrainerCallback`] that just prints the logs. 94 | """ 95 | 96 | def on_log(self, args, state, control, logs=None, **kwargs): 97 | _ = logs.pop("total_flos", None) 98 | if state.is_local_process_zero: 99 | logger.info(logs) -------------------------------------------------------------------------------- /mftcoder_accelerate/src/pefts/merge_base_and_lora_to_hf.py: -------------------------------------------------------------------------------- 1 | """ 2 | # @author Chaoyu Chen 3 | # @date 2023/10/19 4 | 5 | Merge base and lora adaptor 6 | """ 7 | 8 | import os 9 | import sys 10 | import time 11 | import shutil 12 | import argparse 13 | from typing import List 14 | import torch 15 | import transformers 16 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel 17 | from peft import LoraConfig, get_peft_model 18 | from peft import PeftModel 19 | 20 | # insert src as import path 21 | current_path = os.path.abspath(__file__) 22 | parent_dir = os.path.dirname(os.path.dirname(current_path)) 23 | sys.path.insert(0, parent_dir) 24 | print("In merge_base_and_lora_to_hf.py, sys path:", sys.path) 25 | 26 | from tokenizer import init_tokenizer 27 | 28 | 29 | def copy_tokenizer_files(mode_path: str, files_list: List[str], save_path: str): 30 | if not os.path.exists(save_path): 31 | os.makedirs(save_path) 32 | 33 | for filename in files_list: 34 | 35 | src_file = os.path.join(mode_path, filename) 36 | 37 | if os.path.exists(src_file): 38 | dest_file = os.path.join(save_path, filename) 39 | 40 | shutil.copy(src_file, dest_file) 41 | print(f"Copied {filename} to {save_path}") 42 | else: 43 | print(f"File {filename} does not exist in {mode_path}") 44 | 45 | 46 | if __name__ == "__main__": 47 | 48 | # arguments 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("--base_model_or_path", type=str, default=None) 51 | parser.add_argument("--adaptor_path", type=str, default=None) 52 | parser.add_argument("--model_type", type=str, default=None) 53 | parser.add_argument("--merged_output_path", type=str, default=None) 54 | args = parser.parse_args() 55 | 56 | model_path = args.base_model_or_path 57 | lora_adapter = args.adaptor_path 58 | model_type = args.model_type 59 | save_path = args.merged_output_path 60 | 61 | t0 = time.time() 62 | 63 | tokenizer = init_tokenizer(args.base_model_or_path) 64 | 65 | base_model = AutoModelForCausalLM.from_pretrained( 66 | model_path, 67 | trust_remote_code=True, 68 | torch_dtype=torch.bfloat16, 69 | # torch_dtype=torch.float32, 70 | return_dict=True, 71 | device_map="auto", 72 | ) 73 | print("--------------------------------------Base Model--------------------------------------------") 74 | print(base_model) 75 | print("--------------------------------------------------------------------------------------------") 76 | 77 | print("-----------------------------------Base Model Config----------------------------------------") 78 | print(base_model.config) 79 | 
print("--------------------------------------------------------------------------------------------") 80 | 81 | # merge, save model and tokenizer 82 | model_to_merge = PeftModel.from_pretrained(base_model, lora_adapter) 83 | merged_model = model_to_merge.merge_and_unload() 84 | # merged_model.to(torch.bfloat16) 85 | 86 | print("---------------------------------Merged Model Config----------------------------------------") 87 | print(merged_model.config) 88 | print("--------------------------------------------------------------------------------------------") 89 | merged_model.save_pretrained(save_path) 90 | 91 | print("-------------------------------------Tokenizer----------------------------------------------") 92 | print(tokenizer) 93 | print("--------------------------------------------------------------------------------------------") 94 | if model_type.lower() == "deepseek": 95 | copy_tokenizer_files( 96 | model_path, 97 | ["tokenizer.model", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"], 98 | save_path, 99 | ) 100 | else: 101 | tokenizer.save_pretrained(save_path) 102 | 103 | print(f"Merge finised: {save_path} saved, Cost {time.time() - t0:.2f}s") 104 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/utils/common_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import torch 4 | from packaging import version 5 | import importlib 6 | 7 | TASK2ID = {} 8 | ID2TASK = {} 9 | 10 | 11 | def is_flash_attn_2_available(): 12 | 13 | # Let's add an extra check to see if cuda is available 14 | 15 | if not torch.cuda.is_available(): 16 | return False 17 | 18 | if torch.version.cuda: 19 | return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0") 20 | elif torch.version.hip: 21 | # TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention 22 | return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.0.4") 23 | else: 24 | return False 25 | 26 | 27 | def print_rank_0(*message): 28 | """If distributed is initialized print only on rank 0.""" 29 | if torch.distributed.is_initialized(): 30 | if torch.distributed.get_rank() == 0: 31 | print(*message, flush=True) 32 | else: 33 | print(*message, flush=True) 34 | 35 | 36 | def wait_for_everyone(): 37 | torch.distributed.barrier() 38 | 39 | 40 | def _goes_first(is_main): 41 | if is_main is False: 42 | wait_for_everyone() 43 | yield 44 | if is_main is True: 45 | wait_for_everyone() 46 | 47 | 48 | def get_model_params_num(model): 49 | """ 50 | Get params number of the model 51 | Args: 52 | model: model(required) 53 | Returns: 54 | the number of parameters of model 55 | """ 56 | num = 0 57 | for _, param in model.named_parameters(): 58 | num += param.nelement() 59 | return num 60 | 61 | 62 | def unwrap_model(model): 63 | """ 64 | Recursively unwraps a model from potential containers (as used in distributed training). 65 | 66 | Args: 67 | model (`torch.nn.Module`): The model to unwrap. 
68 | """ 69 | # since there could be multiple levels of wrapping, unwrap recursively 70 | if hasattr(model, "module"): 71 | return unwrap_model(model.module) 72 | else: 73 | return model 74 | 75 | 76 | def honor_type(obj, generator): 77 | """ 78 | Cast a generator to the same type as obj (list, tuple or namedtuple) 79 | """ 80 | try: 81 | return type(obj)(generator) 82 | except TypeError: 83 | # Some objects may not be able to instantiate from a generator directly 84 | return type(obj)(*list(generator)) 85 | 86 | 87 | def get_computation_speed(batch_size_per_device, seq_len, step_time): 88 | 89 | return batch_size_per_device * seq_len / (step_time + 1e-12) 90 | 91 | 92 | def human_readable_flops(num): 93 | for unit in [ 94 | "", 95 | "KFLOPS", 96 | "MFLOPS", 97 | "GFLOPS", 98 | "TFLOPS", 99 | "PFLOPS", 100 | "EFLOPS", 101 | "ZFLOPS", 102 | ]: 103 | if abs(num) < 1000.0: 104 | return "%3.1f%s" % (num, unit) 105 | num /= 1000.0 106 | return "%.1f%s" % (num, "Yi") 107 | 108 | 109 | def get_tflops_new(args, batch_size, seq_len, step_time): 110 | sl = seq_len 111 | L = args.num_hidden_layers 112 | h = args.hidden_size 113 | V = args.vocab_size 114 | flops = 96 * batch_size * sl * L * h * h * (1 + sl / (6 * h) + V / (16 * L * h)) / step_time 115 | return human_readable_flops(flops) 116 | 117 | 118 | def get_tflops_megatron(total_model_param, hidden_size, num_hidden_layers, batch_size_per_device, seq_len, step_time): 119 | 120 | ff = total_model_param * 6 121 | attn = seq_len * hidden_size * num_hidden_layers * 60 122 | flops = batch_size_per_device * seq_len * (ff + attn) / step_time 123 | return human_readable_flops(flops) 124 | 125 | 126 | def generate_task_id(data_paths): 127 | data_prefixes = list(data_paths[1:-1].split(",")) 128 | print("data paths: ") 129 | print(data_prefixes) 130 | 131 | for i, prefix in enumerate(data_prefixes): 132 | task_name = prefix.split("/")[-1] 133 | TASK2ID[task_name] = i 134 | ID2TASK[i] = task_name 135 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/tokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | # @author Chaoyu Chen 3 | # @date 2023/6/19 4 | """ 5 | 6 | import numpy as np 7 | from typing import List, Union 8 | from utils.common_utils import print_rank_0 9 | from transformers import AutoTokenizer, AutoConfig 10 | from tokenizer.chat_template import MFTCoder_template 11 | 12 | 13 | def init_tokenizer(path): 14 | """ 15 | Init a Huggingface tokenizer, parsing eos_token from the tokenizer_config then config. 16 | Set pad_token same as eos_token for easy life. 
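    Typical use (the path below is illustrative): tokenizer = init_tokenizer("/path/to/your/base_model")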
17 | :param path: model path or tokenizer path 18 | :return: Tokenizer (TokenizerFast is preferred) 19 | """ 20 | # tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False, legacy=False) 21 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 22 | config, unused_kwargs = AutoConfig.from_pretrained(path, trust_remote_code=True, return_unused_kwargs=True) 23 | 24 | if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id: 25 | print(f"Initial eos_token_id {tokenizer.eos_token_id} from tokenizer") 26 | eos_token_id = tokenizer.eos_token_id 27 | eos_token = tokenizer.convert_ids_to_tokens(eos_token_id) 28 | elif hasattr(tokenizer, "eos_token") and tokenizer.eos_token: 29 | print(f"Initial eos_token {tokenizer.eos_token} from tokenizer") 30 | eos_token = tokenizer.eos_token 31 | eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token) 32 | elif hasattr(config, "eos_token_id") and config.eos_token_id: 33 | print(f"Initial eos_token_id {config.eos_token_id} from config.json") 34 | eos_token_id = config.eos_token_id 35 | eos_token = tokenizer.convert_ids_to_tokens(config.eos_token_id) 36 | elif hasattr(config, "eos_token") and config.eos_token: 37 | print(f"Initial eos_token {config.eos_token} from config.json") 38 | eos_token = config.eos_token 39 | eos_token_id = tokenizer.convert_tokens_to_ids(config.eos_token) 40 | else: 41 | raise ValueError( 42 | "No available eos_token or eos_token_id, please provide eos_token by params or eos_token_id by config.json" 43 | ) 44 | try: 45 | tokenizer.eos_token = eos_token 46 | tokenizer.eos_token_id = eos_token_id 47 | # set pad_token to be same as eos_token, it is OK because it will be masked out. 48 | tokenizer.pad_token = eos_token 49 | tokenizer.pad_token_id = eos_token_id 50 | except: 51 | print("[WARNING] Cannot set tokenizer.eos_token") 52 | 53 | tokenizer.add_bos_token = False 54 | tokenizer.add_eos_token = False 55 | tokenizer.chat_template = MFTCoder_template 56 | print_rank_0(f"Tokenizer: {type(tokenizer)}") 57 | print_rank_0(f"Length of tokenizer: {len(tokenizer)}") 58 | print_rank_0(f"build_tokenizer pad_token_id: {tokenizer.pad_token_id}, eos_token_id: {tokenizer.eos_token_id}") 59 | print_rank_0(f"build_tokenizer pad_token : {tokenizer.pad_token}, eos_token: {tokenizer.eos_token}") 60 | 61 | return tokenizer 62 | 63 | 64 | def build_tokenizer(args): 65 | """Initialize tokenizer.""" 66 | print_rank_0(f"> building {args.tokenizer_type} tokenizer ...") 67 | # Select and instantiate the tokenizer. 68 | if args.tokenizer_type.lower() == "AutoTokenizer".lower(): 69 | assert args.pretrained_model_path is not None 70 | tokenizer = init_tokenizer(args.pretrained_model_path) 71 | else: 72 | raise NotImplementedError(f"{args.tokenizer_type} tokenizer is not implemented.") 73 | 74 | # Add vocab size.
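    # Worked example of the padding below: with make_vocab_size_divisible_by=32 and model_parallel_size=1, an original vocab of 50257 is padded up to 50272 (the next multiple of 32); a size already divisible by 32 is left unchanged.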
75 | args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) 76 | 77 | return tokenizer 78 | 79 | 80 | def _vocab_size_with_padding(orig_vocab_size, args): 81 | """Pad vocab size thus it is divisible by model parallel size and 82 | still having GPU friendly size.""" 83 | 84 | after = orig_vocab_size 85 | multiple = args.make_vocab_size_divisible_by * args.model_parallel_size 86 | while (after % multiple) != 0: 87 | after += 1 88 | print_rank_0( 89 | " > padded vocab (size: {}) with {} dummy tokens " 90 | "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after) 91 | ) 92 | 93 | return after 94 | -------------------------------------------------------------------------------- /mftcoder_atorch/tokenizer/train_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Assumes a dataset of jsonl files in the same format as the neox training set. 17 | """ 18 | 19 | from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers 20 | from tokenizers.normalizers import NFKC 21 | 22 | from glob import glob 23 | import os 24 | import json 25 | import argparse 26 | 27 | 28 | def load_jsonl(input_path, quiet=True) -> list: 29 | """ 30 | Read list of objects from a JSON lines file. 31 | """ 32 | data = [] 33 | with open(input_path, "r", encoding="utf-8") as f: 34 | for line in f: 35 | data.append(json.loads(line.rstrip("\n|\r"))) 36 | if not quiet: 37 | print("Loaded {} records from {}".format(len(data), input_path)) 38 | return data 39 | 40 | 41 | def json_iterator(input_dir, text_key="text"): 42 | all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json") 43 | for j in all_jsonls: 44 | data = load_jsonl(j) 45 | for doc in data: 46 | yield doc[text_key] 47 | 48 | 49 | def train_tokenizer( 50 | input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000 51 | ): 52 | """ 53 | Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path` 54 | 55 | :param input_dir: input directory containing jsonl files 56 | :param save_path: path to save tokenizer to 57 | :param tokenizer_type: type of tokenizer to train. 
58 | :param vocab_size: int, size of tokenizer's vocab 59 | :return: 60 | """ 61 | 62 | if tokenizer_type == "BPE": 63 | model = models.BPE() 64 | else: 65 | raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented") 66 | tokenizer = Tokenizer(model) 67 | 68 | # Customize pre-tokenization and decoding 69 | tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True) 70 | tokenizer.decoder = decoders.ByteLevel() 71 | tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) 72 | tokenizer.normalizer = NFKC() 73 | 74 | # And then train 75 | trainer = trainers.BpeTrainer( 76 | vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"] 77 | ) 78 | tokenizer.train_from_iterator(json_iterator(input_dir), trainer) 79 | 80 | # And Save it 81 | tokenizer.save(save_path, pretty=True) 82 | print(f"Tokenizer saved at {save_path}") 83 | 84 | 85 | def parse_args(): 86 | parser = argparse.ArgumentParser( 87 | description="script for training a multilingual " 88 | "HF tokenizer on CC dumps with upweighting for low resource languages" 89 | ) 90 | parser.add_argument( 91 | "--json_input_dir", 92 | type=str, 93 | help="Path to folder containing tokenizer training data in jsonl format", 94 | ) 95 | parser.add_argument( 96 | "--tokenizer_output_path", 97 | type=str, 98 | help="Path to which your trained tokenizer will be saved (should end in .json)", 99 | ) 100 | parser.add_argument( 101 | "--tokenizer_type", 102 | type=str, 103 | help="type of tokenizer to train, currently only BPE is supported", 104 | choices=["BPE"], 105 | default=["BPE"], 106 | ) 107 | parser.add_argument( 108 | "-v", 109 | "--vocab_size", 110 | help="vocabulary size of tokenizer, default=52k", 111 | type=int, 112 | default=52000, 113 | ) 114 | return parser.parse_args() 115 | 116 | 117 | if __name__ == "__main__": 118 | 119 | args = parse_args() 120 | 121 | train_tokenizer( 122 | args.json_input_dir, 123 | save_path=args.tokenizer_output_path, 124 | tokenizer_type=args.tokenizer_type, 125 | vocab_size=args.vocab_size, 126 | ) 127 | -------------------------------------------------------------------------------- /mftcoder_atorch/train/run_train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import json 4 | import logging 5 | import math 6 | import os 7 | import numpy as np 8 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 9 | 10 | class cb: 11 | def __init__(self, path): 12 | self.path = path 13 | def __call__(self, s): 14 | with open(f"{self.path}/fsdp_mapping.html", "w") as f: 15 | f.write(s) 16 | 17 | # handle multi-processing writing 18 | os.environ["HF_MODULES_CACHE"] = os.path.join("/root/.cache/huggingface/modules", os.getenv("RANK", "")) 19 | import random # noqa: E402 20 | import datasets # noqa: E402 21 | import transformers # noqa: E402 22 | from torch.utils.data import DataLoader # noqa: E402 23 | from torch.utils.data.distributed import DistributedSampler # noqa: E402 24 | from transformers import ( # noqa: E402 25 | default_data_collator, 26 | # get_scheduler, 27 | set_seed, 28 | ) 29 | from transformers.utils.versions import require_version # noqa: E402 30 | from atorch.utils.meta_model_utils import init_empty_weights_with_disk_offload # noqa: E402 31 | 32 | from transformers import AutoTokenizer 33 | 34 | from torch.distributed.fsdp import ( 35 | FullyShardedDataParallel as FSDP, 36 | ) 37 | 38 | from utils.common_utils import ( 39 | is_local_main_process, generate_task_id, print_rank_0, 
is_old_version, 40 | atorch_init_distributed, atorch_reset_distributed, TASK2ID, ID2TASK, 41 | get_rank, get_world_size 42 | ) 43 | from utils.auto_accelerate_utils import DataCollatorForMFTDataset, loss_func_mft 44 | from arguments.get_arguments import parse_args 45 | from model.build_model import setup_model 46 | from data.gpt2_multi_task_dataset import load_dataset_from_jsonl 47 | from train.trainer.atorch_trainer import AtorchTrainer 48 | from pathlib import Path 49 | 50 | 51 | def main(): 52 | args = parse_args() 53 | 54 | # Make one log on every process with the configuration for debugging. 55 | logging.basicConfig( 56 | # format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 57 | format="%(asctime)s - %(name)s - %(message)s", 58 | datefmt="%m/%d/%Y %H:%M:%S", 59 | level=logging.INFO, 60 | ) 61 | logger = logging.getLogger(__name__) 62 | if is_local_main_process(): 63 | datasets.utils.logging.set_verbosity_warning() 64 | transformers.utils.logging.set_verbosity_info() 65 | else: 66 | datasets.utils.logging.set_verbosity_error() 67 | transformers.utils.logging.set_verbosity_error() 68 | 69 | # If passed along, set the training seed now. 70 | if args.seed is not None: 71 | set_seed(args.seed) 72 | 73 | generate_task_id(args.data_paths, args.train_mode) # generate TASK2ID, ID2TASK mapping 74 | print(TASK2ID) 75 | print(ID2TASK) 76 | 77 | model, model_config, tokenizer = setup_model(args, logger, use_cache=False) 78 | print(f'args.total_model_param: {args.total_model_param}') 79 | 80 | train_dataset, dataloader_args = None, None 81 | train_dataloader, valid_dataloader, test_dataloader = None, None, None 82 | 83 | args.world_size = get_world_size() 84 | global_rank = get_rank() 85 | print(f'world_size: {args.world_size}, global_rank: {global_rank}') 86 | args.per_device_train_batch_size = args.total_train_batch_size // args.world_size 87 | if args.load_raw_dataset: 88 | print_rank_0('load raw dataset') 89 | if args.model_type in ['gpt_neox']: 90 | train_dataset, valid_dataset = load_dataset_from_jsonl(args, tokenizer, shard_data=True, world_size=args.world_size, global_rank=global_rank) 91 | 92 | if train_dataset is not None: 93 | args.do_train = True 94 | if valid_dataset is not None: 95 | args.do_valid = True 96 | else: 97 | print_rank_0('please set load_raw_dataset to True and rerun') 98 | 99 | if args.resume_from_checkpoint == 'true': 100 | logger.info(f'Resume from {args.output_dir}') 101 | resume_from_checkpoint = True 102 | else: 103 | logger.info(f'Train from scratch') 104 | resume_from_checkpoint = False 105 | if args.model_type in ['gpt_neox']: 106 | gpt_data = True 107 | else: 108 | gpt_data = False 109 | data_collator = DataCollatorForMFTDataset(args.model_type, args.weighted_loss_mode, args.use_dynamic_padding) 110 | my_loss_function = loss_func_mft 111 | trainer = AtorchTrainer( 112 | model=model, 113 | args=args, 114 | train_dataset=train_dataset, 115 | valid_dataset=valid_dataset, 116 | tokenizer=tokenizer, 117 | # files_to_save=files_to_save, 118 | args_to_save={ 119 | # 'max_length': args.max_length, 120 | 'max_length': args.seq_length, 121 | 'peft_type': args.peft_type, 122 | 'gpt_model': gpt_data 123 | }, 124 | data_collator=data_collator, 125 | my_loss_func=my_loss_function, 126 | custom_lr_scheduler_type=args.custom_lr_scheduler_type, 127 | rank=global_rank 128 | ) 129 | if args.do_train: 130 | trainer.train(resume_from_checkpoint=resume_from_checkpoint) 131 | 132 | 133 | if __name__ == "__main__": 134 | atorch_init_distributed("nccl") 135 | main() 136 | 
atorch_reset_distributed() 137 | -------------------------------------------------------------------------------- /mftcoder_atorch/model/peft/utils/mapping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import sys 5 | sys.path.append("..") 6 | sys.path.append("../..") 7 | import torch 8 | from peft.utils import ( 9 | TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, 10 | TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING 11 | ) 12 | 13 | 14 | # needed for prefix-tuning of bloom model 15 | def bloom_model_postprocess_past_key_value(past_key_values): 16 | past_key_values = torch.cat(past_key_values) 17 | ( 18 | total_layers, 19 | batch_size, 20 | num_attention_heads, 21 | num_virtual_tokens, 22 | head_dim, 23 | ) = past_key_values.shape 24 | keys = past_key_values[: total_layers // 2] 25 | keys = keys.transpose(2, 3).reshape( 26 | total_layers // 2, 27 | batch_size * num_attention_heads, 28 | head_dim, 29 | num_virtual_tokens, 30 | ) 31 | values = past_key_values[total_layers // 2 :] 32 | values = values.reshape( 33 | total_layers // 2, 34 | batch_size * num_attention_heads, 35 | num_virtual_tokens, 36 | head_dim, 37 | ) 38 | 39 | return tuple(zip(keys, values)) 40 | 41 | 42 | NEW_TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = { 43 | "t5": ["q", "v"], 44 | "mt5": ["q", "v"], 45 | "bart": ["q_proj", "v_proj"], 46 | "gpt2": ["c_attn"], 47 | "bloom": ["query_key_value"], 48 | "bloomz": ["query_key_value"], 49 | "blip-2": ["q", "v", "q_proj", "v_proj"], 50 | "opt": ["q_proj", "v_proj"], 51 | "gptj": ["q_proj", "v_proj"], 52 | "gpt_neox": ["query_key_value"], 53 | "gpt_neo": ["q_proj", "v_proj"], 54 | "bert": ["query", "value"], 55 | "roberta": ["query", "value"], 56 | "xlm-roberta": ["query", "value"], 57 | "electra": ["query", "value"], 58 | "deberta-v2": ["query_proj", "value_proj"], 59 | "deberta": ["in_proj"], 60 | "layoutlm": ["query", "value"], 61 | "llama": ["q_proj", "v_proj"], 62 | "chatglm": ["query_key_value"], 63 | "glm": ["query_key_value"], 64 | } 65 | 66 | NEW_TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = { 67 | "t5": ["q", "k", "v", "o", "wi", "wo"], 68 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], 69 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 70 | # "gpt2": ["c_attn"], 71 | "bloom": ["query_key_value"], 72 | "bloomz": ["query_key_value"], 73 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 74 | # "gptj": ["q_proj", "v_proj"], 75 | # "gpt_neox": ["query_key_value"], 76 | # "gpt_neo": ["q_proj", "v_proj"], 77 | # "bert": ["query", "value"], 78 | "roberta": ["query", "key", "value", "dense"], 79 | # "xlm-roberta": ["query", "value"], 80 | # "electra": ["query", "value"], 81 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], 82 | "chatglm": ["query_key_value"], 83 | "glm": ["query_key_value"], 84 | # "deberta": ["in_proj"], 85 | # "layoutlm": ["query", "value"], 86 | } 87 | 88 | TRANSFORMERS_MODELS_TO_LORA_LAGE_TARGET_MODULES_MAPPING = { 89 | "t5": ["q", "k", "v", "o", "wi", "wo"], 90 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], 91 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 92 | # "gpt2": ["c_attn"], 93 | "bloom": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], 94 | "bloomz": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], 95 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 96 | # "gptj": ["q_proj", "v_proj"], 97 | # 
"gpt_neox": ["query_key_value"], 98 | # "gpt_neo": ["q_proj", "v_proj"], 99 | # "bert": ["query", "value"], 100 | "roberta": ["query", "key", "value", "dense"], 101 | # "xlm-roberta": ["query", "value"], 102 | # "electra": ["query", "value"], 103 | "llama": ["q_proj", "v_proj"], 104 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], 105 | "glm": ["query_key_value", "dense"] 106 | # "deberta": ["in_proj"], 107 | # "layoutlm": ["query", "value"], 108 | } 109 | 110 | TRANSFORMERS_MODELS_TO_ROUTELORA_TARGET_MODULES_MAPPING = { 111 | "t5": ["q", "k", "v", "o", "wi", "wo"], 112 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], 113 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 114 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 115 | "roberta": ["query", "key", "value", "dense"], 116 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], 117 | "chatglm": ["query_key_value"], 118 | "glm": ["query_key_value"] 119 | } 120 | 121 | TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING = { 122 | "glm": [0, 22], 123 | "bloom": [17, 22], 124 | "bloomz": [17, 22], 125 | } 126 | 127 | TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = { 128 | "bloom": bloom_model_postprocess_past_key_value, 129 | "bloomz": bloom_model_postprocess_past_key_value, 130 | } 131 | 132 | WEIGHTS_NAME = "adapter_model.bin" 133 | CONFIG_NAME = "adapter_config.json" 134 | 135 | 136 | TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.update( 137 | NEW_TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING 138 | ) 139 | TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING.update( 140 | NEW_TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING 141 | ) 142 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/xxpo/xxpo_arguments.py: -------------------------------------------------------------------------------- 1 | """ 2 | # @author Chaoyu Chen 3 | # @date 2023/10/19 4 | 5 | training arguments 6 | """ 7 | 8 | from dataclasses import dataclass, asdict 9 | from typing import List, Union 10 | 11 | 12 | @dataclass 13 | class XXPOTrainArgs: 14 | # train data paths on shared FS 15 | data_paths: Union[str, List[str]] 16 | 17 | # output dir for saving adaptors in peft or full ckpts in full-parameter training 18 | output_dir: str 19 | 20 | # tensorboard dir for saving tensorboard logs 21 | tb_dir: str 22 | 23 | # pretrained_model_path, on which is the model you want to train 24 | pretrained_model_path: str 25 | 26 | # model type of pretrained_model_path, support llama|qwen|starcoder|baichuan|chatglm2 27 | model_type: str 28 | 29 | # train/valid/test split 30 | data_split: str = "98,2,0" 31 | 32 | # lora or qlora or None(for full-parameter training) 33 | peft_type: Union[None, str] = "qlora" 34 | 35 | # if qlora, 4bit will be set, else None 36 | quantization: Union[None, str] = "4bit" 37 | 38 | # lora rank, the bigger, the more trainalbe parameters 39 | lora_rank: int = 96 40 | 41 | # lora alpha 42 | lora_alpha: int = 32 43 | 44 | # lora dropout 45 | lora_dropout: float = 0.05 46 | 47 | # lora targeting modules 48 | target_modules: Union[None, str, List[str]] = None 49 | 50 | # dpo or orpo 51 | xxpo: str = "dpo" 52 | 53 | # dpo/orpo beta 54 | beta: float = 0.1 55 | 56 | rpo_alpha: Union[None, float] = None 57 | 58 | # mircro train batch size 59 | per_device_train_batch_size: int = 8 60 | 61 | # micro eval batch size, always same as micro train batch size 62 | per_device_eval_batch_size: int = 8 63 | 64 | # HF 
AutoTokenizer is supported, maybe more types 65 | tokenizer_type: str = "AutoTokenizer" 66 | 67 | # initial lr 68 | learning_rate: float = 5e-5 69 | 70 | # minimum lr 71 | min_lr: float = 5e-6 72 | 73 | # weight decay 74 | weight_decay: float = 0.01 75 | 76 | # gradient_accumulation_steps 77 | gradient_accumulation_steps: int = 1 78 | 79 | # lr_scheduler_type 80 | lr_scheduler_type: str = "cosine" 81 | 82 | # optimizer_type 83 | optimizer_type: str = "adamw_torch" 84 | # optimizer_type: str = "paged_adamw_32bit" 85 | 86 | # gradient_checkpointing 87 | gradient_checkpointing: bool = True 88 | gradient_checkpointing_use_reentrant: bool = False 89 | 90 | # num of warmup_steps 91 | warmup_steps: Union[int, float] = 0.05 92 | 93 | # num_train_epochs 94 | num_train_epochs: int = 4 95 | 96 | # seed for reproducing 97 | seed: int = 1234 98 | 99 | # seq_length, context length 100 | seq_length: int = 4096 101 | 102 | save_only_model: bool = True 103 | 104 | # path of adaptor which is resumed from, None for not resuming training 105 | resume_from_checkpoint: Union[None, str] = None 106 | 107 | # auto resume from latest ckpt if job restarted 108 | auto_resume: bool = True 109 | 110 | # num of steps for logging training loss 111 | logging_steps: int = 10 112 | 113 | # num of steps for saving ckpt 114 | save_steps: int = 100 115 | 116 | # num of steps for evaluation(eval_loss), better same as checkpointing steps 117 | eval_steps: int = 100 118 | 119 | # max train steps, if None, depends on num_train_epochs 120 | max_steps: int = -1 121 | 122 | # if checkpointing every epoch, maybe True in sst 123 | epoch_checkpointing: bool = False 124 | 125 | # shuffle before train/valid split 126 | shuffle_before_split: bool = True 127 | 128 | # if early stop when eval loss is not converging in the past early_stopping_stall_num evaluation point 129 | early_stopping: bool = True 130 | early_stopping_stall_num: int = 5 131 | 132 | # limit num for saving ckpts, None for no limits. Used for full-parameter training to avoid exceeding disk quota. 133 | saving_limit: Union[None, int] = None 134 | 135 | # ATTENTION_CLASSES = { "eager": Normal Attention, "flash_attention_2": FlashAttention2} 136 | attn_implementation: str = "flash_attention_2" 137 | 138 | # tokenizer chat template, if None, will use MFTCoder template 139 | chat_template: Union[None, str] = None 140 | 141 | distributed_type: Union[None, str] = None 142 | 143 | init_timeout_seconds: Union[None, int] = 3600 144 | 145 | make_vocab_size_divisible_by: int = 32 146 | model_parallel_size: int = 1 147 | use_slow_tokenizer: bool = False 148 | world_size: int = 8 149 | 150 | # max prompt string length and whole str length 151 | max_prompt_length: Union[None, int] = 2048 152 | max_length: Union[None, int] = 4096 153 | 154 | # num of process processing dataset 155 | dataset_num_proc: int = 1 156 | 157 | # model_dtype[float16, bfloat16, float] for loading 158 | dtype: str = "bfloat16" 159 | 160 | # instrumentation 161 | disable_tqdm: bool = False 162 | sanity_check: bool = False 163 | 164 | # debug argument for distributed training 165 | # "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type,`inplace operation. 
See" 166 | # "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992" 167 | ignore_bias_buffers: bool = True 168 | 169 | def dict(self): 170 | return {k: str(v) for k, v in asdict(self).items()} 171 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/mpt/mpt_arguments.py: -------------------------------------------------------------------------------- 1 | """ 2 | # @author Chaoyu Chen 3 | # @date 2024/6/1 4 | 5 | MPT training arguments 6 | """ 7 | 8 | from dataclasses import dataclass, asdict 9 | from typing import List, Union 10 | 11 | 12 | @dataclass 13 | class MptTrainArgs: 14 | # train data paths on shared FS 15 | data_paths: Union[str, List[str]] 16 | 17 | # output dir for saving adaptors in peft or full ckpts in full-parameter training 18 | output_dir: str 19 | 20 | # tensorboard dir for saving tensorboard logs 21 | tb_dir: str 22 | 23 | # pretrained_model_path, on which is the model you want to train 24 | pretrained_model_path: str 25 | 26 | # model type of pretrained_model_path, support llama|qwen|starcoder|baichuan|chatglm2 27 | model_type: str 28 | 29 | # load from raw jsonl file or tokenized binary file 30 | load_raw_dataset: bool = True 31 | 32 | # weights of loss calculation for each task, None means equal weights 33 | task_weights: Union[None, str] = None 34 | 35 | # weights of data sampling, leave it None 36 | data_weights: Union[None, str] = None 37 | 38 | # hf loading model low_cpu_mem_usage 39 | low_cpu_mem_usage: bool = True 40 | 41 | # train/valid/test split 42 | data_split: str = "98,2,0" 43 | 44 | # padding or pack or concat 45 | padding_mode: str = "padding" 46 | 47 | # sft or sst 48 | tokenize_mode: str = "sft" 49 | 50 | # case3 or case4 51 | weighted_loss_mode: str = "case3" 52 | 53 | # mircro train batch size 54 | per_device_train_batch_size: int = 8 55 | 56 | # micro eval batch size, always same as micro train batch size 57 | per_device_eval_batch_size: int = 8 58 | 59 | # HF AutoTokenizer is supported, maybe more types 60 | tokenizer_type: str = "AutoTokenizer" 61 | 62 | # initial lr 63 | learning_rate: float = 5e-5 64 | 65 | # minimum lr 66 | min_lr: float = 5e-6 67 | 68 | # weight decay 69 | weight_decay: float = 0.01 70 | 71 | # gradient_accumulation_steps 72 | gradient_accumulation_steps: int = 1 73 | 74 | # lr_scheduler_type 75 | lr_scheduler_type: str = "cosine" 76 | 77 | # num_warmup_steps 78 | num_warmup_steps: Union[int, float] = 0.05 79 | 80 | # num_train_epochs 81 | num_train_epochs: int = 4 82 | 83 | # seed for reproducing 84 | seed: int = 1234 85 | 86 | # seq_length, context length 87 | seq_length: int = 4096 88 | 89 | # path of adaptor which is resumed from, None for not resuming training 90 | resume_from_checkpoint: Union[None, str] = None 91 | 92 | # auto resume from latest ckpt if job restarted 93 | auto_resume: bool = True 94 | 95 | # num of steps for logging training loss 96 | log_interval: int = 10 97 | 98 | # num of steps for saving ckpt 99 | checkpointing_steps: int = 100 100 | 101 | # num of steps for evaluation(eval_loss), better same as checkpointing steps 102 | evaluation_steps: int = 100 103 | 104 | # max train steps, if None, depends on num_train_epochs 105 | max_train_steps: Union[None, int] = None 106 | 107 | # if checkpointing every epoch, maybe True in sst 108 | epoch_checkpointing: bool = False 109 | 110 | # save transformers model(safetensors) 111 | save_transformers_model: bool = False 112 | 113 | # shuffle before train/valid split 114 | 
shuffle_before_split: bool = True 115 | 116 | # DDP random sampler 117 | use_random_sampler: bool = True 118 | 119 | # if early stop when eval loss is not converging in the past early_stopping_stall_num evaluation point 120 | early_stopping: bool = True 121 | early_stopping_stall_num: int = 5 122 | 123 | # limit num for saving ckpts, None for no limits. Used for full-parameter training to avoid exceeding disk quota. 124 | saving_limit: Union[None, int] = None 125 | 126 | # if dynamic padding 127 | use_dynamic_padding: bool = True 128 | 129 | # warm-up steps for CoBa, recommand the number of valid batches 130 | coba_warmup_steps: int = 100 131 | # history length of sample valid loss used to fit the slope curve in CoBa, recommand [2*coba_warmup_steps,5*coba_warmup_steps] 132 | coba_history_length: int = 200 133 | # temperature for divergence factor in CoBa 134 | coba_tau: int = 5 135 | # iteration interval of update per task train weight in CoBa 136 | coba_update_interval: int = 1 137 | # the number of mini valid batches sampled at each updated iteration interval 138 | coba_sample_valid_num: int = 1 139 | 140 | # ATTENTION_CLASSES = { "eager": Normal Attention, "flash_attention_2": FlashAttention2} 141 | attn_implementation: str = "flash_attention_2" 142 | 143 | # role markers, which are prompt template before each role: system, user and assistant 144 | # role_markers: {"system": "### System:\n", "user": "### Instruction:\n", "assistant": "### Response:\n"} 145 | role_markers: Union[None, dict] = None 146 | 147 | distributed_type: Union[None, str] = None 148 | 149 | init_timeout_seconds: Union[None, int] = 3600 150 | 151 | # legacy, leave them 152 | use_xformers: bool = True 153 | trust_remote_code: bool = True 154 | weight_by_num_documents: bool = True 155 | make_vocab_size_divisible_by: int = 32 156 | model_parallel_size: int = 1 157 | use_slow_tokenizer: bool = False 158 | world_size: int = 8 159 | 160 | def dict(self): 161 | return {k: str(v) for k, v in asdict(self).items()} 162 | -------------------------------------------------------------------------------- /mftcoder_atorch/model/peft/tuner/bitfit.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | sys.path.append("../..") 4 | import torch 5 | import importlib 6 | from enum import Enum 7 | from peft.utils import PeftType 8 | from dataclasses import dataclass, field, asdict 9 | from typing import Optional, List 10 | 11 | from .pe_base_model import PEBaseModel 12 | from model.peft.utils import PetuningConfig 13 | from model.peft.utils.others import _freeze_model 14 | 15 | 16 | def is_alps_available(): 17 | return importlib.util.find_spec("alps") is not None 18 | 19 | 20 | if is_alps_available(): 21 | from alps.util import logger 22 | else: 23 | import logging 24 | logger = logging.getLogger(__file__) 25 | 26 | 27 | class PEBitfitModel(PEBaseModel): 28 | """ 29 | 只训练模型bias:参考 https://arxiv.org/pdf/2106.10199.pdf 30 | model: huggingface transformers model 31 | tokenizer: huggingface transformers tokenizer 32 | """ 33 | 34 | def __init__(self, model): 35 | self.model = model 36 | 37 | def get_model(self): 38 | not_freeze_param_name = ["bias"] 39 | set_parameter_requires_grad(self.model, not_freeze_param_name) 40 | return self.model 41 | 42 | @classmethod 43 | def restore(self, model=None, path=None): 44 | logger.info("bitfit不需要额外加载参数") 45 | return model 46 | 47 | 48 | # 根据名称锁定参数层 49 | def set_parameter_requires_grad(model, freeze_param_name=[]): 50 | if not 
isinstance(freeze_param_name, list): 51 | freeze_param_name = [freeze_param_name] 52 | 53 | for idx, (name, param) in enumerate(model.named_parameters()): 54 | for p in freeze_param_name: 55 | if p not in name: 56 | param.requires_grad = False 57 | # Print the names of the parameter layers that remain trainable 58 | for idx, (name, param) in enumerate(model.named_parameters()): 59 | for p in freeze_param_name: 60 | if p in name: 61 | print("trainable parameter name is:") 62 | print(name) 63 | param.requires_grad = True 64 | 65 | 66 | @dataclass 67 | class PeftBitfitConfig(PetuningConfig): 68 | """ 69 | This is the configuration class to store the configuration of a [`PeftBitfitModel`]. 70 | 71 | Args: 72 | modules_to_save (`List[str]`): List of modules apart from LoRA layers to be set as trainable 73 | and saved in the final checkpoint. 74 | """ 75 | 76 | modules_to_save: Optional[List[str]] = field( 77 | default=None, 78 | metadata={ 79 | "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " 80 | "For example, in Sequence Classification or Token Classification tasks, " 81 | "the final layer `classifier/score` is randomly initialized and as such needs to be trainable and saved." 82 | }, 83 | ) 84 | 85 | def __post_init__(self): 86 | self.peft_type = PeftType.BITFIT 87 | 88 | 89 | class PeftBitfitModel(torch.nn.Module): 90 | """ 91 | Creates a Bitfit model for ant peft. 92 | 93 | Args: 94 | model ([`~transformers.PreTrainedModel`]): The model to be frozen except for its bias parameters. 95 | config ([`PeftBitfitConfig`]): The configuration of the Bitfit model. 96 | 97 | Returns: 98 | `torch.nn.Module`: The Bitfit model. 99 | 100 | **Attributes**: 101 | - **model** ([`~transformers.PreTrainedModel`]) -- The model to be frozen. 102 | - **peft_config** ([`PeftBitfitConfig`]): The configuration of the Bitfit model. 103 | """ 104 | 105 | def __init__(self, model, config, adapter_name): 106 | super().__init__() 107 | self.model = model 108 | 109 | self.forward = self.model.forward 110 | self.peft_config = config 111 | self.add_adapter(adapter_name, self.peft_config[adapter_name]) 112 | 113 | def add_adapter(self, adapter_name, config=None): 114 | if not isinstance(config, PeftBitfitConfig): 115 | raise ValueError( 116 | f"PeftBitfitModel needs a PeftBitfitConfig, but got {type(config)}." 117 | ) 118 | 119 | if config is not None: 120 | config = self._prepare_lora_config(config) 121 | self.peft_config[adapter_name] = config 122 | 123 | if len(self.peft_config) > 1: 124 | raise ValueError( 125 | "BitfitModel supports only 1 peft config or name, " 126 | "because it only adjusts which existing parameters are trainable without adding any new ones."
127 | ) 128 | 129 | self.model = PEBitfitModel(self.model).get_model() 130 | 131 | if self.peft_config[adapter_name].inference_mode: 132 | _freeze_model(self.model) 133 | 134 | @staticmethod 135 | def _prepare_lora_config(peft_config): 136 | if peft_config.inference_mode: 137 | peft_config.merge_weights = True 138 | return peft_config 139 | 140 | def __getattr__(self, name: str): 141 | """Forward missing attributes to the wrapped module.""" 142 | try: 143 | return super().__getattr__(name) # defer to nn.Module's logic 144 | except AttributeError: 145 | return getattr(self.model, name) 146 | 147 | def get_peft_config_as_dict(self, inference: bool = False): 148 | config_dict = {} 149 | for key, value in self.peft_config.items(): 150 | config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} 151 | if inference: 152 | config["inference_mode"] = True 153 | config_dict[key] = config 154 | return config -------------------------------------------------------------------------------- /mftcoder_atorch/utils/learning_rates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Learning rate decay functions.""" 19 | 20 | import math 21 | 22 | # from .common_utils import print_rank_0 23 | 24 | 25 | class AnnealingLR(object): 26 | """Anneals the learning rate.""" 27 | 28 | def __init__( 29 | self, 30 | optimizer, 31 | start_lr, 32 | warmup_iter, 33 | total_iters, 34 | decay_style, 35 | last_iter, 36 | min_lr=0.0, 37 | use_checkpoint_lr_scheduler=True, 38 | override_lr_scheduler=False, 39 | use_mup=False, 40 | ): 41 | 42 | # Class values. 43 | self.optimizer = optimizer 44 | self.start_lr = start_lr 45 | self.min_lr = min_lr 46 | self.warmup_iter = warmup_iter 47 | self.num_iters = last_iter 48 | self.end_iter = total_iters 49 | assert self.end_iter > 0 50 | self.decay_style = decay_style 51 | self.override_lr_scheduler = override_lr_scheduler 52 | self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler 53 | self.use_mup = use_mup 54 | if self.override_lr_scheduler: 55 | assert not self.use_checkpoint_lr_scheduler, ( 56 | "both override and " "use-checkpoint are set." 57 | ) 58 | # Set the learning rate 59 | self.step(self.num_iters) 60 | 61 | print("> learning rate decay style: {}".format(self.decay_style)) 62 | 63 | def update_lr(self, lr): 64 | self.start_lr = lr 65 | 66 | def get_lr(self): 67 | """Learning rate decay functions from: 68 | https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" 69 | 70 | num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter) 71 | # Warmup. 
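# During warmup (num_iters <= warmup_iter) the branch below ramps the lr linearly from 0 up to start_lr;
# e.g. with start_lr=5e-5 and warmup_iter=100, step 50 yields 2.5e-5. After warmup, num_iters_ is shifted
# to count post-warmup steps and feeds the decay branches that follow.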
72 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: 73 | return float(self.start_lr) * num_iters_ / self.warmup_iter 74 | 75 | num_iters_ = num_iters_ - self.warmup_iter 76 | if self.decay_style == "linear": 77 | lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter 78 | elif self.decay_style == "cosine": 79 | lr = ( 80 | self.start_lr 81 | / 2.0 82 | * (math.cos(math.pi * num_iters_ / self.end_iter) + 1) 83 | ) 84 | elif self.decay_style == "exponential": 85 | # exp(-0.693) = 1/2 86 | lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) 87 | else: 88 | lr = self.start_lr 89 | return max(lr, self.min_lr) 90 | 91 | def step(self, step_num=None): 92 | """Set lr for all parameters groups.""" 93 | if step_num is None: 94 | step_num = self.num_iters + 1 95 | self.num_iters = step_num 96 | new_lr = self.get_lr() 97 | for group in self.optimizer.param_groups: 98 | if self.use_mup and "width_mult" in group: 99 | group["lr"] = new_lr / group["width_mult"] 100 | else: 101 | group["lr"] = new_lr 102 | 103 | def state_dict(self): 104 | state_dict = { 105 | "start_lr": self.start_lr, 106 | "warmup_iter": self.warmup_iter, 107 | "num_iters": self.num_iters, 108 | "decay_style": self.decay_style, 109 | "end_iter": self.end_iter, 110 | "min_lr": self.min_lr, 111 | } 112 | return state_dict 113 | 114 | def _check_and_set(self, cls_value, sd_value, name): 115 | """Auxiliary function for checking the values in the checkpoint and 116 | setting them.""" 117 | if self.override_lr_scheduler: 118 | print_rank_0(" > overriding {} value to {}".format(name, cls_value)) 119 | return cls_value 120 | 121 | if not self.use_checkpoint_lr_scheduler: 122 | assert cls_value == sd_value, ( 123 | "AnnealingLR: class input value" 124 | "and checkpoint values for {} do not match".format(name) 125 | ) 126 | print_rank_0(" > using checkpoint value {} for {}".format(sd_value, name)) 127 | return sd_value 128 | 129 | def load_state_dict(self, sd): 130 | 131 | self.start_lr = self._check_and_set( 132 | self.start_lr, sd["start_lr"], "learning rate" 133 | ) 134 | self.min_lr = self._check_and_set( 135 | self.min_lr, sd["min_lr"], "minimum learning rate" 136 | ) 137 | self.warmup_iter = self._check_and_set( 138 | self.warmup_iter, sd["warmup_iter"], "warmup iterations" 139 | ) 140 | self.end_iter = self._check_and_set( 141 | self.end_iter, sd["end_iter"], "total number of iterations" 142 | ) 143 | self.decay_style = self._check_and_set( 144 | self.decay_style, sd["decay_style"], "decay style" 145 | ) 146 | 147 | self.num_iters = sd["num_iters"] 148 | self.step(self.num_iters) 149 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/pefts/mft_arguments.py: -------------------------------------------------------------------------------- 1 | """ 2 | # @author Chaoyu Chen 3 | # @date 2023/10/19 4 | 5 | training arguments 6 | """ 7 | 8 | from dataclasses import dataclass, asdict 9 | from typing import List, Union 10 | 11 | 12 | @dataclass 13 | class MftTrainArgs: 14 | # train data paths on shared FS 15 | data_paths: Union[str, List[str]] 16 | 17 | # output dir for saving adaptors in peft or full ckpts in full-parameter training 18 | output_dir: str 19 | 20 | # tensorboard dir for saving tensorboard logs 21 | tb_dir: str 22 | 23 | # pretrained_model_path, on which is the model you want to train 24 | pretrained_model_path: str 25 | 26 | # model type of pretrained_model_path, support llama|qwen|starcoder|baichuan|chatglm2 27 | model_type: str 28 
| 29 | # load from raw jsonl file or tokenized binary file 30 | load_raw_dataset: bool = True 31 | 32 | # weights of loss calculation for each task, None means equal weights 33 | task_weights: Union[None, str] = None 34 | 35 | # weights of data sampling, leave it None 36 | data_weights: Union[None, str] = None 37 | 38 | # hf loading model low_cpu_mem_usage 39 | low_cpu_mem_usage: bool = True 40 | 41 | # train/valid/test split 42 | data_split: str = "98,2,0" 43 | 44 | # padding or pack or concat 45 | padding_mode: str = "padding" 46 | 47 | # sft or sst 48 | tokenize_mode: str = "sft" 49 | 50 | # mft loss mode 51 | weighted_loss_mode: str = "case3" 52 | 53 | # lora or qlora or None(for full-parameter training) 54 | peft_type: Union[None, str] = "qlora" 55 | 56 | # if qlora, 4bit will be set, else None 57 | quantization: Union[None, str] = "4bit" 58 | 59 | # lora rank, the bigger, the more trainalbe parameters 60 | lora_rank: int = 96 61 | 62 | # lora alpha 63 | lora_alpha: int = 32 64 | 65 | # lora dropout 66 | lora_dropout: float = 0.05 67 | 68 | # lora targeting modules 69 | target_modules: Union[None, str, List[str]] = None 70 | 71 | # mircro train batch size 72 | per_device_train_batch_size: int = 8 73 | 74 | # micro eval batch size, always same as micro train batch size 75 | per_device_eval_batch_size: int = 8 76 | 77 | # HF AutoTokenizer is supported, maybe more types 78 | tokenizer_type: str = "AutoTokenizer" 79 | 80 | # initial lr 81 | learning_rate: float = 5e-5 82 | 83 | # minimum lr 84 | min_lr: float = 5e-6 85 | 86 | # weight decay 87 | weight_decay: float = 0.01 88 | 89 | # gradient_accumulation_steps 90 | gradient_accumulation_steps: int = 1 91 | 92 | # lr_scheduler_type 93 | lr_scheduler_type: str = "cosine" 94 | 95 | # num_warmup_steps 96 | num_warmup_steps: Union[int, float] = 0.05 97 | 98 | # num_train_epochs 99 | num_train_epochs: int = 4 100 | 101 | # seed for reproducing 102 | seed: int = 1234 103 | 104 | # seq_length, context length 105 | seq_length: int = 4096 106 | 107 | # path of adaptor which is resumed from, None for not resuming training 108 | resume_from_checkpoint: Union[None, str] = None 109 | 110 | # auto resume from latest ckpt if job restarted 111 | auto_resume: bool = True 112 | 113 | # num of steps for logging training loss 114 | log_interval: int = 10 115 | 116 | # num of steps for saving ckpt 117 | checkpointing_steps: int = 100 118 | 119 | # num of steps for evaluation(eval_loss), better same as checkpointing steps 120 | evaluation_steps: int = 100 121 | 122 | # max train steps, if None, depends on num_train_epochs 123 | max_train_steps: Union[None, int] = None 124 | 125 | # if checkpointing every epoch, maybe True in sst 126 | epoch_checkpointing: bool = False 127 | 128 | # shuffle before train/valid split 129 | shuffle_before_split: bool = True 130 | 131 | # DDP random sampler 132 | use_random_sampler: bool = True 133 | 134 | # if early stop when eval loss is not converging in the past early_stopping_stall_num evaluation point 135 | early_stopping: bool = True 136 | early_stopping_stall_num: int = 5 137 | 138 | # limit num for saving ckpts, None for no limits. Used for full-parameter training to avoid exceeding disk quota. 
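# (illustrative note, not part of the config schema: e.g. saving_limit=2 is expected to leave only the two most recent checkpoints on disk; the exact pruning behavior is decided by the training loop)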
139 | saving_limit: Union[None, int] = None 140 | 141 | # if dynamic padding 142 | use_dynamic_padding: bool = True 143 | 144 | # warm-up steps for CoBa, recommand the number of valid batches 145 | coba_warmup_steps: int = 100 146 | # history length of sample valid loss used to fit the slope curve in CoBa, recommand [2*coba_warmup_steps,5*coba_warmup_steps] 147 | coba_history_length: int = 200 148 | # temperature for divergence factor in CoBa 149 | coba_tau: int = 5 150 | # iteration interval of update per task train weight in CoBa 151 | coba_update_interval: int = 1 152 | # the number of mini valid batches sampled at each updated iteration interval 153 | coba_sample_valid_num: int = 1 154 | 155 | # ATTENTION_CLASSES = { "eager": Normal Attention, "flash_attention_2": FlashAttention2} 156 | attn_implementation: str = "flash_attention_2" 157 | 158 | # role markers, which are prompt template before each role: system, user and assistant 159 | # role_markers: {"system": "### System:\n", "user": "### Instruction:\n", "assistant": "### Response:\n"} 160 | role_markers: Union[None, dict] = None 161 | 162 | distributed_type: Union[None, str] = None 163 | 164 | init_timeout_seconds: Union[None, int] = 3600 165 | 166 | # legacy, leave them 167 | use_xformers: bool = True 168 | trust_remote_code: bool = True 169 | weight_by_num_documents: bool = True 170 | make_vocab_size_divisible_by: int = 32 171 | model_parallel_size: int = 1 172 | use_slow_tokenizer: bool = False 173 | world_size: int = 8 174 | 175 | def dict(self): 176 | return {k: str(v) for k, v in asdict(self).items()} 177 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/aquila2/configuration_aquila.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ Aquila model configuration""" 21 | 22 | from transformers import PretrainedConfig 23 | 24 | 25 | 26 | class AquilaConfig(PretrainedConfig): 27 | r""" 28 | This is the configuration class to store the configuration of a [`AquilaModel`]. It is used to instantiate an Aquila 29 | model according to the specified arguments, defining the model architecture. Instantiating a configuration with the 30 | defaults will yield a similar configuration to that of the Aquila-7B. 31 | 32 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 33 | documentation from [`PretrainedConfig`] for more information. 
34 | 35 | 36 | Args: 37 | vocab_size (`int`, *optional*, defaults to 32000): 38 | Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the 39 | `inputs_ids` passed when calling [`AquilaModel`] 40 | hidden_size (`int`, *optional*, defaults to 4096): 41 | Dimension of the hidden representations. 42 | intermediate_size (`int`, *optional*, defaults to 11008): 43 | Dimension of the MLP representations. 44 | num_hidden_layers (`int`, *optional*, defaults to 32): 45 | Number of hidden layers in the Transformer encoder. 46 | num_attention_heads (`int`, *optional*, defaults to 32): 47 | Number of attention heads for each attention layer in the Transformer encoder. 48 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 49 | The non-linear activation function (function or string) in the decoder. 50 | max_position_embeddings (`int`, *optional*, defaults to 2048): 51 | The maximum sequence length that this model might ever be used with. Typically set this to something large 52 | just in case (e.g., 512 or 1024 or 2048). 53 | initializer_range (`float`, *optional*, defaults to 0.02): 54 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 55 | rms_norm_eps (`float`, *optional*, defaults to 1e-12): 56 | The epsilon used by the rms normalization layers. 57 | use_cache (`bool`, *optional*, defaults to `True`): 58 | Whether or not the model should return the last key/values attentions (not used by all models). Only 59 | relevant if `config.is_decoder=True`. 60 | tie_word_embeddings(`bool`, *optional*, defaults to `False`): 61 | Whether to tie weight embeddings 62 | Example: 63 | 64 | ```python 65 | >>> from transformers import AquilaModel, AquilaConfig 66 | 67 | >>> # Initializing a Aquila aquila-7b style configuration 68 | >>> configuration = AquilaConfig() 69 | 70 | >>> # Initializing a model from the aquila-7b style configuration 71 | >>> model = AquilaModel(configuration) 72 | 73 | >>> # Accessing the model configuration 74 | >>> configuration = model.config 75 | ```""" 76 | model_type = "aquila" 77 | keys_to_ignore_at_inference = ["past_key_values"] 78 | 79 | def __init__( 80 | self, 81 | vocab_size=100008, 82 | hidden_size=4096, 83 | intermediate_size=11008, 84 | num_hidden_layers=32, 85 | num_attention_heads=32, 86 | num_key_value_heads=None, 87 | hidden_act="silu", 88 | max_position_embeddings=2048, 89 | initializer_range=0.02, 90 | rms_norm_eps=1e-6, 91 | use_cache=True, 92 | pad_token_id=0, 93 | bos_token_id=1, 94 | eos_token_id=2, 95 | pretraining_tp=1, 96 | tie_word_embeddings=False, 97 | rope_theta=10000.0, 98 | rope_scaling=None, 99 | use_xformers=True, 100 | **kwargs, 101 | ): 102 | self.vocab_size = vocab_size 103 | self.max_position_embeddings = max_position_embeddings 104 | self.hidden_size = hidden_size 105 | self.intermediate_size = intermediate_size 106 | self.num_hidden_layers = num_hidden_layers 107 | 108 | # for backward compatibility 109 | if num_key_value_heads is None: 110 | num_key_value_heads = num_attention_heads 111 | 112 | self.num_key_value_heads = num_key_value_heads 113 | 114 | self.num_attention_heads = num_attention_heads 115 | self.hidden_act = hidden_act 116 | self.initializer_range = initializer_range 117 | self.rms_norm_eps = rms_norm_eps 118 | self.pretraining_tp = pretraining_tp 119 | self.use_cache = use_cache 120 | self.rope_theta = rope_theta 121 | self.rope_scaling = rope_scaling 122 | self.use_xformers = use_xformers 123 | 124 | super().__init__( 
125 | pad_token_id=pad_token_id, 126 | bos_token_id=bos_token_id, 127 | eos_token_id=eos_token_id, 128 | tie_word_embeddings=tie_word_embeddings, 129 | **kwargs, 130 | ) 131 | 132 | -------------------------------------------------------------------------------- /mftcoder_atorch/model/gpt_neox/configuration_gpt_neox.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ GPTNeoX model configuration""" 16 | 17 | from transformers.configuration_utils import PretrainedConfig 18 | from transformers.utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = { 24 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json", 25 | # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox 26 | } 27 | 28 | 29 | class GPTNeoXConfig(PretrainedConfig): 30 | r""" 31 | This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an 32 | GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration 33 | with the defaults will yield a similar configuration to that of the GPTNeoX 34 | [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture. 35 | 36 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 37 | documentation from [`PretrainedConfig`] for more information. 38 | 39 | 40 | Args: 41 | vocab_size (`int`, *optional*, defaults to 50432): 42 | Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the 43 | `inputs_ids` passed when calling [`GPTNeoXModel`]. 44 | hidden_size (`int`, *optional*, defaults to 6144): 45 | Dimension of the encoder layers and the pooler layer. 46 | num_hidden_layers (`int`, *optional*, defaults to 44): 47 | Number of hidden layers in the Transformer encoder. 48 | num_attention_heads (`int`, *optional*, defaults to 64): 49 | Number of attention heads for each attention layer in the Transformer encoder. 50 | intermediate_size (`int`, *optional*, defaults to 24576): 51 | Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 52 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): 53 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 54 | `"relu"`, `"selu"` and `"gelu_new"` are supported. 
55 | rotary_pct (`float`, *optional*, defaults to 0.25): 56 | percentage of hidden dimensions to allocate to rotary embeddings 57 | rotary_emb_base (`int`, *optional*, defaults to 10000) 58 | base for computing rotary embeddings frequency 59 | max_position_embeddings (`int`, *optional*, defaults to 2048): 60 | The maximum sequence length that this model might ever be used with. Typically set this to something large 61 | just in case (e.g., 512 or 1024 or 2048). 62 | initializer_range (`float`, *optional*, defaults to 1e-5): 63 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 64 | layer_norm_eps (`float`, *optional*, defaults to 1e-12): 65 | The epsilon used by the layer normalization layers. 66 | use_cache (`bool`, *optional*, defaults to `True`): 67 | Whether or not the model should return the last key/values attentions (not used by all models). Only 68 | relevant if `config.is_decoder=True`. 69 | use_parallel_residual (`bool`, *optional*, defaults to `True`): 70 | Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training 71 | speedup at large scales (e.g. 20B). 72 | Example: 73 | 74 | ```python 75 | >>> from transformers import GPTNeoXConfig, GPTNeoXModel 76 | 77 | >>> # Initializing a GPTNeoX gpt-neox-20b style configuration 78 | >>> configuration = GPTNeoXConfig() 79 | 80 | >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration 81 | >>> model = GPTNeoXModel(configuration) # doctest: +SKIP 82 | 83 | >>> # Accessing the model configuration 84 | >>> configuration = model.config # doctest: +SKIP 85 | ```""" 86 | model_type = "gpt_neox" 87 | 88 | def __init__( 89 | self, 90 | vocab_size=50432, 91 | hidden_size=6144, 92 | num_hidden_layers=44, 93 | num_attention_heads=64, 94 | intermediate_size=24576, 95 | hidden_act="gelu", 96 | rotary_pct=0.25, 97 | rotary_emb_base=10000, 98 | max_position_embeddings=2048, 99 | initializer_range=0.02, 100 | layer_norm_eps=1e-5, 101 | use_cache=True, 102 | bos_token_id=0, 103 | eos_token_id=2, 104 | tie_word_embeddings=False, 105 | use_parallel_residual=True, 106 | **kwargs, 107 | ): 108 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 109 | self.vocab_size = vocab_size 110 | self.max_position_embeddings = max_position_embeddings 111 | self.hidden_size = hidden_size 112 | self.num_hidden_layers = num_hidden_layers 113 | self.num_attention_heads = num_attention_heads 114 | self.intermediate_size = intermediate_size 115 | self.hidden_act = hidden_act 116 | self.rotary_pct = rotary_pct 117 | self.rotary_emb_base = rotary_emb_base 118 | self.initializer_range = initializer_range 119 | self.layer_norm_eps = layer_norm_eps 120 | self.use_cache = use_cache 121 | self.tie_word_embeddings = tie_word_embeddings 122 | self.use_parallel_residual = use_parallel_residual -------------------------------------------------------------------------------- /mftcoder_atorch/model/gpt_neox/tokenization_gpt_neox_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for GPTNeoX.""" 16 | import json 17 | from typing import TYPE_CHECKING, List, Optional, Tuple 18 | 19 | from tokenizers import pre_tokenizers 20 | 21 | from transformers import PreTrainedTokenizerFast 22 | from transformers.utils import logging 23 | 24 | 25 | if TYPE_CHECKING: 26 | from transformers.pipelines.conversational import Conversation 27 | 28 | 29 | logger = logging.get_logger(__name__) 30 | 31 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} 32 | 33 | PRETRAINED_VOCAB_FILES_MAP = { 34 | "tokenizer_file": { 35 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/tokenizer.json", 36 | }, 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | "gpt-neox-20b": 2048, 41 | } 42 | 43 | 44 | class GPTNeoXTokenizerFast(PreTrainedTokenizerFast): 45 | """ 46 | Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 47 | Byte-Pair-Encoding. 48 | 49 | This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will 50 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 51 | 52 | ``` 53 | >>> from transformers import GPTNeoXTokenizerFast 54 | >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2") 55 | >>> tokenizer("Hello world")['input_ids'] 56 | [15496, 995] 57 | >>> tokenizer(" Hello world")['input_ids'] 58 | [18435, 995] 59 | ``` 60 | 61 | You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since 62 | the model was not pretrained this way, it might yield a decrease in performance. 63 | 64 | 65 | 66 | When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. 67 | 68 | 69 | 70 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 71 | refer to this superclass for more information regarding those methods. 72 | 73 | Args: 74 | vocab_file (`str`): 75 | Path to the vocabulary file. 76 | merges_file (`str`): 77 | Path to the merges file. 78 | errors (`str`, *optional*, defaults to `"replace"`): 79 | Paradigm to follow when decoding bytes to UTF-8. See 80 | [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. 81 | unk_token (`str`, *optional*, defaults to `<|endoftext|>`): 82 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 83 | token instead. 84 | bos_token (`str`, *optional*, defaults to `<|endoftext|>`): 85 | The beginning of sequence token. 86 | eos_token (`str`, *optional*, defaults to `<|endoftext|>`): 87 | The end of sequence token. 88 | add_prefix_space (`bool`, *optional*, defaults to `False`): 89 | Whether or not to add an initial space to the input. This allows to treat the leading word just as any 90 | other word. (GPTNeoX tokenizer detect beginning of words by the preceding space). 
91 | trim_offsets (`bool`, *optional*, defaults to `True`): 92 | Whether or not the post-processing step should trim offsets to avoid including whitespaces. 93 | """ 94 | 95 | vocab_files_names = VOCAB_FILES_NAMES 96 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 97 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 98 | model_input_names = ["input_ids", "attention_mask"] 99 | 100 | def __init__( 101 | self, 102 | vocab_file=None, 103 | merges_file=None, 104 | tokenizer_file=None, 105 | unk_token="<|endoftext|>", 106 | bos_token="<|endoftext|>", 107 | eos_token="<|endoftext|>", 108 | add_prefix_space=False, 109 | **kwargs, 110 | ): 111 | super().__init__( 112 | vocab_file, 113 | merges_file, 114 | tokenizer_file=tokenizer_file, 115 | unk_token=unk_token, 116 | bos_token=bos_token, 117 | eos_token=eos_token, 118 | add_prefix_space=add_prefix_space, 119 | **kwargs, 120 | ) 121 | 122 | pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) 123 | if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: 124 | pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) 125 | pre_tok_state["add_prefix_space"] = add_prefix_space 126 | self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) 127 | 128 | self.add_prefix_space = add_prefix_space 129 | 130 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 131 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 132 | return tuple(files) 133 | 134 | def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: 135 | """This corresponds to DialoGPT variants of models.""" 136 | input_ids = [] 137 | for is_user, text in conversation.iter_texts(): 138 | input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id]) 139 | 140 | if len(input_ids) > self.model_max_length: 141 | input_ids = input_ids[-self.model_max_length :] 142 | return input_ids -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/gpt_neox/configuration_gpt_neox.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
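# NOTE: this is a vendored copy of the Hugging Face GPTNeoX configuration; the package-relative imports have been
# swapped for absolute `transformers.*` imports (the originals remain commented out below), presumably so the file
# can be loaded outside the transformers source tree.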
15 | """ GPTNeoX model configuration""" 16 | 17 | from transformers.configuration_utils import PretrainedConfig 18 | from transformers.utils import logging 19 | # from ...configuration_utils import PretrainedConfig 20 | # from ...utils import logging 21 | 22 | 23 | logger = logging.get_logger(__name__) 24 | 25 | GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = { 26 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json", 27 | # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox 28 | } 29 | 30 | 31 | class GPTNeoXConfig(PretrainedConfig): 32 | r""" 33 | This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an 34 | GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration 35 | with the defaults will yield a similar configuration to that of the GPTNeoX 36 | [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture. 37 | 38 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 39 | documentation from [`PretrainedConfig`] for more information. 40 | 41 | 42 | Args: 43 | vocab_size (`int`, *optional*, defaults to 50432): 44 | Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the 45 | `inputs_ids` passed when calling [`GPTNeoXModel`]. 46 | hidden_size (`int`, *optional*, defaults to 6144): 47 | Dimension of the encoder layers and the pooler layer. 48 | num_hidden_layers (`int`, *optional*, defaults to 44): 49 | Number of hidden layers in the Transformer encoder. 50 | num_attention_heads (`int`, *optional*, defaults to 64): 51 | Number of attention heads for each attention layer in the Transformer encoder. 52 | intermediate_size (`int`, *optional*, defaults to 24576): 53 | Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 54 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): 55 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 56 | `"relu"`, `"selu"` and `"gelu_new"` are supported. 57 | rotary_pct (`float`, *optional*, defaults to 0.25): 58 | percentage of hidden dimensions to allocate to rotary embeddings 59 | rotary_emb_base (`int`, *optional*, defaults to 10000) 60 | base for computing rotary embeddings frequency 61 | max_position_embeddings (`int`, *optional*, defaults to 2048): 62 | The maximum sequence length that this model might ever be used with. Typically set this to something large 63 | just in case (e.g., 512 or 1024 or 2048). 64 | initializer_range (`float`, *optional*, defaults to 1e-5): 65 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 66 | layer_norm_eps (`float`, *optional*, defaults to 1e-12): 67 | The epsilon used by the layer normalization layers. 68 | use_cache (`bool`, *optional*, defaults to `True`): 69 | Whether or not the model should return the last key/values attentions (not used by all models). Only 70 | relevant if `config.is_decoder=True`. 71 | use_parallel_residual (`bool`, *optional*, defaults to `True`): 72 | Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training 73 | speedup at large scales (e.g. 20B). 
74 | Example: 75 | 76 | ```python 77 | >>> from transformers import GPTNeoXConfig, GPTNeoXModel 78 | 79 | >>> # Initializing a GPTNeoX gpt-neox-20b style configuration 80 | >>> configuration = GPTNeoXConfig() 81 | 82 | >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration 83 | >>> model = GPTNeoXModel(configuration) # doctest: +SKIP 84 | 85 | >>> # Accessing the model configuration 86 | >>> configuration = model.config # doctest: +SKIP 87 | ```""" 88 | model_type = "gpt_neox" 89 | 90 | def __init__( 91 | self, 92 | vocab_size=50432, 93 | hidden_size=6144, 94 | num_hidden_layers=44, 95 | num_attention_heads=64, 96 | intermediate_size=24576, 97 | hidden_act="gelu", 98 | rotary_pct=0.25, 99 | rotary_emb_base=10000, 100 | max_position_embeddings=2048, 101 | initializer_range=0.02, 102 | layer_norm_eps=1e-5, 103 | use_cache=True, 104 | bos_token_id=0, 105 | eos_token_id=2, 106 | tie_word_embeddings=False, 107 | use_parallel_residual=True, 108 | **kwargs, 109 | ): 110 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 111 | self.vocab_size = vocab_size 112 | self.max_position_embeddings = max_position_embeddings 113 | self.hidden_size = hidden_size 114 | self.num_hidden_layers = num_hidden_layers 115 | self.num_attention_heads = num_attention_heads 116 | self.intermediate_size = intermediate_size 117 | self.hidden_act = hidden_act 118 | self.rotary_pct = rotary_pct 119 | self.rotary_emb_base = rotary_emb_base 120 | self.initializer_range = initializer_range 121 | self.layer_norm_eps = layer_norm_eps 122 | self.use_cache = use_cache 123 | self.tie_word_embeddings = tie_word_embeddings 124 | self.use_parallel_residual = use_parallel_residual -------------------------------------------------------------------------------- /mftcoder_atorch/model/peft/tuner/roem.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | sys.path.append("../..") 4 | import torch 5 | import importlib 6 | from enum import Enum 7 | from peft.utils import PeftType 8 | from dataclasses import dataclass, field, asdict 9 | from typing import Optional, List, Union 10 | 11 | from .pe_base_model import PEBaseModel 12 | from model.peft.utils import ( 13 | PetuningConfig, 14 | TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING 15 | ) 16 | from model.peft.utils.others import _freeze_model 17 | 18 | 19 | def is_alps_available(): 20 | return importlib.util.find_spec("alps") is not None 21 | 22 | 23 | if is_alps_available(): 24 | from alps.util import logger 25 | else: 26 | import logging 27 | logger = logging.getLogger(__file__) 28 | 29 | 30 | class PEROEMModel(PEBaseModel): 31 | """ 32 | 只训练模型中间偏上层mlp:参考 https://arxiv.org/pdf/2202.05262.pdf ; https://arxiv.org/abs/2012.14913 33 | model: huggingface transformers model 34 | tokenizer: huggingface transformers tokenizer 35 | """ 36 | 37 | def __init__(self, model, model_name, task_type=None): 38 | self.model = model 39 | self.model_name = model_name 40 | 41 | def get_model(self): 42 | layer_mapping = TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING[self.model_name] 43 | assert len(layer_mapping) == 2 44 | not_freeze_param_name = [] 45 | for i in range(layer_mapping[0], layer_mapping[1]): 46 | no_freeze_name = str(i) + ".mlp" 47 | logger.info(f"Freeze the {no_freeze_name} layer of model") 48 | not_freeze_param_name.append(no_freeze_name) 49 | set_parameter_requires_grad(self.model, not_freeze_param_name) 50 | return self.model 51 | 52 | @classmethod 53 | 
def restore(self, model=None, path=None): 54 | logger.info("ROEM does not need to load any extra parameters") 55 | return model 56 | 57 | 58 | # Freeze or unfreeze parameter layers matched by name 59 | def set_parameter_requires_grad(model, freeze_param_name=[]): 60 | if not isinstance(freeze_param_name, list): 61 | freeze_param_name = [freeze_param_name] 62 | 63 | for idx, (name, param) in enumerate(model.named_parameters()): 64 | for p in freeze_param_name: 65 | if p not in name: 66 | param.requires_grad = False 67 | # Print the names of the parameter layers that remain trainable 68 | for idx, (name, param) in enumerate(model.named_parameters()): 69 | for p in freeze_param_name: 70 | if p in name: 71 | print("The name of the parameter used by ROEM is:") 72 | print(name) 73 | param.requires_grad = True 74 | 75 | 76 | @dataclass 77 | class PeftROEMConfig(PetuningConfig): 78 | """ 79 | This is the configuration class to store the configuration of a [`PeftROEMModel`]. 80 | 81 | Args: 82 | target_layers (`Union[List[int], int]`): The index range of the transformer layers whose MLP modules are kept trainable. 83 | """ 84 | 85 | target_layers: Optional[Union[List[int], int]] = field( 86 | default=None, 87 | metadata={ 88 | "help": "Range of layer indices of the model whose MLP parameters are kept trainable. " 89 | "For example, [20, 30] or '30' " 90 | }, 91 | ) 92 | 93 | def __post_init__(self): 94 | self.peft_type = PeftType.ROEM 95 | 96 | 97 | class PeftROEMModel(torch.nn.Module): 98 | """ 99 | Creates a ROEM model for ant peft. 100 | 101 | Args: 102 | model ([`~transformers.PreTrainedModel`]): The model to be frozen except for the selected MLP layers. 103 | config ([`PeftROEMConfig`]): The configuration of the ROEM model. 104 | 105 | Returns: 106 | `torch.nn.Module`: The ROEM model. 107 | 108 | **Attributes**: 109 | - **model** ([`~transformers.PreTrainedModel`]) -- The model to be frozen. 110 | - **peft_config** ([`PeftROEMConfig`]): The configuration of the ROEM model. 111 | """ 112 | 113 | def __init__(self, model, config, adapter_name): 114 | super().__init__() 115 | self.model = model 116 | 117 | self.forward = self.model.forward 118 | self.peft_config = config 119 | self.add_adapter(adapter_name, self.peft_config[adapter_name]) 120 | 121 | def add_adapter(self, adapter_name, config=None): 122 | if not isinstance(config, PeftROEMConfig): 123 | raise ValueError( 124 | f"PeftROEMModel needs a PeftROEMConfig, but got {type(config)}." 125 | ) 126 | 127 | model_config = self.model.config.to_dict() if hasattr(self.model.config, "to_dict") else self.model.config 128 | if config is not None: 129 | config = self._prepare_lora_config(config, model_config) 130 | self.peft_config[adapter_name] = config 131 | 132 | if len(self.peft_config) > 1: 133 | raise ValueError( 134 | "ROEMModel supports only 1 peft config or name, " 135 | "because it only adjusts which existing layers are trainable without adding any new parameters."
136 | ) 137 | 138 | model_name = model_config["model_type"] 139 | self.model = PEROEMModel(self.model, model_name).get_model() 140 | 141 | if self.peft_config[adapter_name].inference_mode: 142 | _freeze_model(self.model) 143 | 144 | @staticmethod 145 | def _prepare_lora_config(peft_config, model_config): 146 | if peft_config.target_layers is None: 147 | if model_config["model_type"] not in TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING: 148 | raise ValueError("Please specify `target_layers` in `peft_config`") 149 | peft_config.target_layers = TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING[model_config["model_type"]] 150 | if peft_config.inference_mode: 151 | peft_config.merge_weights = True 152 | return peft_config 153 | 154 | def __getattr__(self, name: str): 155 | """Forward missing attributes to the wrapped module.""" 156 | try: 157 | return super().__getattr__(name) # defer to nn.Module's logic 158 | except AttributeError: 159 | return getattr(self.model, name) 160 | 161 | def get_peft_config_as_dict(self, inference: bool = False): 162 | config_dict = {} 163 | for key, value in self.peft_config.items(): 164 | config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} 165 | if inference: 166 | config["inference_mode"] = True 167 | config_dict[key] = config 168 | return config -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/gpt_neox/tokenization_gpt_neox_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for GPTNeoX.""" 16 | import json 17 | from typing import TYPE_CHECKING, List, Optional, Tuple 18 | 19 | from tokenizers import pre_tokenizers 20 | 21 | from transformers import PreTrainedTokenizerFast 22 | # from ...tokenization_utils_fast import PreTrainedTokenizerFast 23 | from transformers.utils import logging 24 | # from ...utils import logging 25 | 26 | 27 | if TYPE_CHECKING: 28 | from transformers.pipelines.conversational import Conversation 29 | 30 | 31 | logger = logging.get_logger(__name__) 32 | 33 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} 34 | 35 | PRETRAINED_VOCAB_FILES_MAP = { 36 | "tokenizer_file": { 37 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/tokenizer.json", 38 | }, 39 | } 40 | 41 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 42 | "gpt-neox-20b": 2048, 43 | } 44 | 45 | 46 | class GPTNeoXTokenizerFast(PreTrainedTokenizerFast): 47 | """ 48 | Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 49 | Byte-Pair-Encoding. 
50 | 51 | This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will 52 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 53 | 54 | ``` 55 | >>> from transformers import GPTNeoXTokenizerFast 56 | >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2") 57 | >>> tokenizer("Hello world")['input_ids'] 58 | [15496, 995] 59 | >>> tokenizer(" Hello world")['input_ids'] 60 | [18435, 995] 61 | ``` 62 | 63 | You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since 64 | the model was not pretrained this way, it might yield a decrease in performance. 65 | 66 | 67 | 68 | When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. 69 | 70 | 71 | 72 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 73 | refer to this superclass for more information regarding those methods. 74 | 75 | Args: 76 | vocab_file (`str`): 77 | Path to the vocabulary file. 78 | merges_file (`str`): 79 | Path to the merges file. 80 | errors (`str`, *optional*, defaults to `"replace"`): 81 | Paradigm to follow when decoding bytes to UTF-8. See 82 | [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. 83 | unk_token (`str`, *optional*, defaults to `<|endoftext|>`): 84 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 85 | token instead. 86 | bos_token (`str`, *optional*, defaults to `<|endoftext|>`): 87 | The beginning of sequence token. 88 | eos_token (`str`, *optional*, defaults to `<|endoftext|>`): 89 | The end of sequence token. 90 | add_prefix_space (`bool`, *optional*, defaults to `False`): 91 | Whether or not to add an initial space to the input. This allows to treat the leading word just as any 92 | other word. (GPTNeoX tokenizer detect beginning of words by the preceding space). 93 | trim_offsets (`bool`, *optional*, defaults to `True`): 94 | Whether or not the post-processing step should trim offsets to avoid including whitespaces. 
95 | """ 96 | 97 | vocab_files_names = VOCAB_FILES_NAMES 98 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 99 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 100 | model_input_names = ["input_ids", "attention_mask"] 101 | 102 | def __init__( 103 | self, 104 | vocab_file=None, 105 | merges_file=None, 106 | tokenizer_file=None, 107 | unk_token="<|endoftext|>", 108 | bos_token="<|endoftext|>", 109 | eos_token="<|endoftext|>", 110 | add_prefix_space=False, 111 | **kwargs, 112 | ): 113 | super().__init__( 114 | vocab_file, 115 | merges_file, 116 | tokenizer_file=tokenizer_file, 117 | unk_token=unk_token, 118 | bos_token=bos_token, 119 | eos_token=eos_token, 120 | add_prefix_space=add_prefix_space, 121 | **kwargs, 122 | ) 123 | 124 | pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) 125 | if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: 126 | pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) 127 | pre_tok_state["add_prefix_space"] = add_prefix_space 128 | self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) 129 | 130 | self.add_prefix_space = add_prefix_space 131 | 132 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 133 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 134 | return tuple(files) 135 | 136 | def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: 137 | """This corresponds to DialoGPT variants of models.""" 138 | input_ids = [] 139 | for is_user, text in conversation.iter_texts(): 140 | input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id]) 141 | 142 | if len(input_ids) > self.model_max_length: 143 | input_ids = input_ids[-self.model_max_length :] 144 | return input_ids -------------------------------------------------------------------------------- /mftcoder_accelerate/src/utils/agd.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union 2 | 3 | import numpy as np 4 | import torch 5 | from torch import Tensor 6 | 7 | Params = Union[Iterable[Tensor], Iterable[Dict[str, Any]]] 8 | 9 | LossClosure = Callable[[], float] 10 | OptLossClosure = Optional[LossClosure] 11 | Betas2 = Tuple[float, float] 12 | State = Dict[str, Any] 13 | OptFloat = Optional[float] 14 | Nus2 = Tuple[float, float] 15 | 16 | __all__ = ("AGD",) 17 | 18 | 19 | class AGD(torch.optim.Optimizer): 20 | r"""AGD: an Auto-switchable Optimizer using Stepwise Gradient Difference as Preconditioning Matrix. 21 | Arguments: 22 | params (Params): Collection of parameters to be optimized, or an iterable of dictionaries specifying separate groups. 23 | lr (float, optional): The learning rate. Default is 1e-3. 24 | betas (tuple of 2 floats, optional): Coefficients used for computing running averages of gradient and its square. Default is (0.9, 0.999). 25 | delta (float, optional): Small constant for numerical stability to prevent division by zero. Default is 1e-5. 26 | weight_decay (float, optional): Weight decay coefficient. Default is 0.0. 27 | amsgrad (bool, optional): If set to True, applies the AMSGrad variant of the optimizer. Default is False. 28 | win (bool, optional): If set to True, applies the Win variant of the optimizer. Default is False. 29 | clip (bool, optional): Total update clip to prevent abnormal updates. Default is None. 
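Example:
A minimal usage sketch (assumes `model`, `data_loader`, and `loss_fn` are defined by the caller):

```python
optimizer = AGD(model.parameters(), lr=1e-3, weight_decay=0.01)
for inputs, targets in data_loader:
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    loss.backward()
    optimizer.step()
```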
30 | """ 31 | 32 | def __init__( 33 | self, 34 | params: Params, 35 | lr: float = 1e-3, 36 | betas: Betas2 = (0.9, 0.999), 37 | delta: float = 1e-5, 38 | weight_decay: float = 0.0, 39 | amsgrad: bool = False, 40 | win: bool = False, 41 | clip: float = None, 42 | ) -> None: 43 | if lr <= 0.0: 44 | raise ValueError("Invalid learning rate: {}".format(lr)) 45 | if delta < 0.0: 46 | raise ValueError("Invalid delta value: {}".format(delta)) 47 | if not 0.0 <= betas[0] < 1.0: 48 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) 49 | if not 0.0 <= betas[1] < 1.0: 50 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) 51 | if weight_decay < 0.0: 52 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) 53 | 54 | defaults = dict( 55 | lr=lr, 56 | betas=betas, 57 | delta=delta, 58 | weight_decay=weight_decay, 59 | amsgrad=amsgrad, 60 | win=win, 61 | clip=clip, 62 | ) 63 | super(AGD, self).__init__(params, defaults) 64 | 65 | def step(self, closure: OptLossClosure = None) -> OptFloat: 66 | loss = None 67 | if closure is not None: 68 | loss = closure() 69 | 70 | for group in self.param_groups: 71 | beta1, beta2 = group["betas"] 72 | 73 | for p in group["params"]: 74 | if p.grad is None: 75 | continue 76 | grad = p.grad.data 77 | if grad.is_sparse: 78 | msg = "AGD does not support sparse gradients." 79 | raise RuntimeError(msg) 80 | 81 | state = self.state[p] 82 | # Lazy state initialization 83 | if len(state) == 0: 84 | state["step"] = 0 85 | # Exponential moving average of gradient values 86 | state["exp_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format) 87 | # Exponential moving average of squared gradient values 88 | state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) 89 | if group["amsgrad"]: 90 | # Maintains max of all exp. moving avg. of sq. grad. 
values 91 | state["max_exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) 92 | if group["win"]: 93 | state["z"] = torch.zeros_like(p, memory_format=torch.preserve_format) 94 | 95 | exp_avg, exp_avg_sq = ( 96 | state["exp_avg"], 97 | state["exp_avg_sq"], 98 | ) 99 | 100 | state["step"] += 1 101 | exp_avg_old = exp_avg.detach().clone() 102 | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) 103 | bias_correction1_old = 1 - beta1 ** (state["step"] - 1) 104 | bias_correction1, bias_correction2 = ( 105 | 1 - beta1 ** state["step"], 106 | 1 - beta2 ** state["step"], 107 | ) 108 | update = ( 109 | exp_avg * (1 / bias_correction1) 110 | if state["step"] == 1 111 | else exp_avg * (1 / bias_correction1) - exp_avg_old * (1 / bias_correction1_old) 112 | ) 113 | exp_avg_sq.mul_(beta2).addcmul_(update, update, value=1 - beta2) 114 | 115 | if group["amsgrad"]: 116 | max_exp_avg_sq = state["max_exp_avg_sq"] 117 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) 118 | update = max_exp_avg_sq.sqrt() 119 | else: 120 | update = exp_avg_sq.sqrt() 121 | 122 | delta_adjust = group["delta"] * np.sqrt(bias_correction2) 123 | update.clamp_(min=delta_adjust) 124 | 125 | lr_adjust = group["lr"] * np.sqrt(bias_correction2) / bias_correction1 126 | update = exp_avg / update 127 | if group["clip"] is not None: 128 | update.clamp_(min=-group["clip"], max=group["clip"]) 129 | weight_decay = group["weight_decay"] 130 | if not group["win"]: 131 | p.data.mul_(1 - group["lr"] * weight_decay).add_(update, alpha=-lr_adjust) 132 | else: 133 | z = state["z"] 134 | z.data.add_(update, alpha=-lr_adjust).mul_(1.0 / (1.0 + weight_decay * lr_adjust)) 135 | lr_adjust2 = 2 * lr_adjust 136 | tao = 1.0 / (3.0 + lr_adjust2 * weight_decay) 137 | p.data.mul_(tao).add_(update, alpha=-tao * lr_adjust2).add_(z, alpha=2 * tao) 138 | return loss 139 | -------------------------------------------------------------------------------- /mftcoder_accelerate/inference/hf_inference.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @author Chaoyu Chen 3 | # @date 2024/1/4 4 | # @module hf_inference.py 5 | """ 6 | # @author qumu 7 | # @date 2023/9/19 8 | # @module hf_inference.py 9 | """ 10 | import os 11 | import sys 12 | import torch 13 | import textwrap 14 | from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList 15 | from peft import PeftModel 16 | 17 | 18 | def load_model_tokenizer( 19 | path, 20 | model_type=None, 21 | peft_path=None, 22 | torch_dtype=torch.bfloat16, 23 | quantization=None, 24 | eos_token=None, 25 | pad_token=None, 26 | batch_size=1, 27 | ): 28 | """ 29 | load model and tokenizer by transfromers 30 | """ 31 | 32 | # load tokenizer first 33 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True) 34 | tokenizer.padding_side = "left" 35 | 36 | config, unused_kwargs = AutoConfig.from_pretrained(path, trust_remote_code=True, return_unused_kwargs=True) 37 | print("unused_kwargs:", unused_kwargs) 38 | print("config input:\n", config) 39 | 40 | # eos token parsing 41 | if eos_token: 42 | eos_token = eos_token 43 | eos_token_id = tokenizer.convert_tokens_to_ids(eos_token) 44 | print(f"eos_token {eos_token} from user input") 45 | elif hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id: 46 | print(f"Initial eos_token_id {tokenizer.eos_token_id} from tokenizer") 47 | eos_token_id = tokenizer.eos_token_id 48 | eos_token = tokenizer.convert_ids_to_tokens(eos_token_id) 
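    # Fallback order below: tokenizer.eos_token, then eos_token_id / eos_token from config.json.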
49 | elif hasattr(tokenizer, "eos_token") and tokenizer.eos_token:
50 | print(f"Initial eos_token {tokenizer.eos_token} from tokenizer")
51 | eos_token = tokenizer.eos_token
52 | eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
53 | elif hasattr(config, "eos_token_id") and config.eos_token_id:
54 | print(f"Initial eos_token_id {config.eos_token_id} from config.json")
55 | eos_token_id = config.eos_token_id
56 | eos_token = tokenizer.convert_ids_to_tokens(config.eos_token_id)
57 | elif hasattr(config, "eos_token") and config.eos_token:
58 | print(f"Initial eos_token {config.eos_token} from config.json")
59 | eos_token = config.eos_token
60 | eos_token_id = tokenizer.convert_tokens_to_ids(config.eos_token)
61 | else:
62 | raise ValueError(
63 | "No available eos_token or eos_token_id, please provide eos_token by params or eos_token_id by config.json"
64 | )
65 | 
66 | try:
67 | tokenizer.eos_token = eos_token
68 | tokenizer.eos_token_id = eos_token_id
69 | # set pad_token to be the same as eos_token; this is OK because pad positions will be masked out.
70 | tokenizer.pad_token = eos_token
71 | tokenizer.pad_token_id = eos_token_id
72 | except Exception:
73 | print("[WARNING] Cannot set tokenizer.eos_token")
74 | 
75 | print(f"tokenizer's eos_token: {tokenizer.eos_token}, pad_token: {tokenizer.pad_token}")
76 | print(f"tokenizer's eos_token_id: {tokenizer.eos_token_id}, pad_token_id: {tokenizer.pad_token_id}")
77 | print(type(tokenizer))
78 | 
79 | base_model = AutoModelForCausalLM.from_pretrained(
80 | path,
81 | config=config,
82 | load_in_8bit=(quantization == "8bit"),
83 | load_in_4bit=(quantization == "4bit"),
84 | device_map="auto",
85 | torch_dtype=torch_dtype,
86 | trust_remote_code=True,
87 | low_cpu_mem_usage=True,
88 | )
89 | 
90 | if peft_path:
91 | print("Loading PEFT MODEL...")
92 | model = PeftModel.from_pretrained(base_model, peft_path)
93 | else:
94 | print("Loading Original MODEL...")
95 | model = base_model
96 | 
97 | model.eval()
98 | 
99 | print("=======================================MODEL Configs=====================================")
100 | print(model.config)
101 | print("=========================================================================================")
102 | print("=======================================MODEL Architecture================================")
103 | print(model)
104 | print("=========================================================================================")
105 | 
106 | return model, tokenizer
107 | 
108 | 
109 | def hf_inference(model, tokenizer, text_list, args=None, max_new_tokens=512, do_sample=True, **kwargs):
110 | """
111 | Batch inference for HuggingFace transformers models.
112 | """
113 | # text_list = [tokenizer.apply_chat_template([{"role": "user", "content": text}], tokenize=False) for text in text_list]
114 | inputs = tokenizer(text_list, return_tensors="pt", padding=True, add_special_tokens=False).to("cuda")
115 | # inputs["attention_mask"][0][:100] = 0
116 | # print(inputs)
117 | print("================================Prompts and Generations=============================")
118 | 
119 | outputs = model.generate(
120 | inputs=inputs["input_ids"],
121 | attention_mask=inputs["attention_mask"],
122 | max_new_tokens=max_new_tokens,
123 | do_sample=do_sample,
124 | eos_token_id=tokenizer.eos_token_id,
125 | pad_token_id=tokenizer.pad_token_id,
126 | **kwargs,
127 | )
128 | 
129 | gen_text = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
130 | for i in range(len(text_list)):
131 | print("=========" * 10)
132 | 
print(f"Prompt:\n{text_list[i]}") 133 | gen_text[i] = gen_text[i].replace(tokenizer.pad_token, "") 134 | print(f"Generation:\n{gen_text[i]}") 135 | # print(f"Outputs ids:\n{outputs[i]}") 136 | sys.stdout.flush() 137 | 138 | return gen_text 139 | 140 | 141 | if __name__ == "__main__": 142 | # Default template used in MFTCoder training 143 | HUMAN_ROLE_START_TAG = "human\n" 144 | BOT_ROLE_START_TAG = "bot\n" 145 | 146 | instruction = "Write quick sort function in python." 147 | 148 | prompts = [f"{HUMAN_ROLE_START_TAG}{instruction}\n{BOT_ROLE_START_TAG}"] 149 | 150 | # if you use base + adaptor for inference, provide peft_path or left it None for normal inference 151 | base_model = "path/to/basemodel" 152 | peft_path = None 153 | model, tokenizer = load_model_tokenizer( 154 | base_model, model_type="", peft_path=peft_path, eos_token="", pad_token="" 155 | ) 156 | 157 | # hf_inference(model, tokenizer, prompts, do_sample=False, num_beams=1, num_return_sequences=1) 158 | hf_inference(model, tokenizer, prompts, do_sample=True, temperature=0.8) 159 | -------------------------------------------------------------------------------- /mftcoder_atorch/model/build_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import sys 4 | sys.path.append("..") 5 | from utils.common_utils import get_model_params_num 6 | from transformers import ( # noqa: E402 7 | CONFIG_MAPPING, 8 | AutoConfig, 9 | AutoModelForCausalLM, 10 | AutoTokenizer, 11 | PreTrainedTokenizerFast 12 | ) 13 | from .gpt_neox.configuration_gpt_neox import GPTNeoXConfig 14 | from .gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM 15 | from .gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast 16 | 17 | from torch.distributed.fsdp import ( 18 | FullyShardedDataParallel as FSDP, 19 | StateDictType, 20 | ) 21 | from utils.common_utils import print_rank_0, is_old_version 22 | from tokenizer import build_tokenizer 23 | from tokenizer.tokenizer import HFTokenizer 24 | 25 | import peft 26 | from peft.tuners.lora import LoraLayer 27 | from model.peft.utils import prepare_model_for_kbit_training 28 | from peft import ( # noqa 29 | LoraConfig, 30 | PrefixTuningConfig, 31 | PromptEncoderConfig, 32 | PromptEncoderReparameterizationType, 33 | PromptTuningConfig, 34 | PromptTuningInit, 35 | TaskType, 36 | get_peft_model 37 | ) 38 | import model.peft.modeling_peft # noqa 39 | from model.peft.tuner import AdaLoraConfig 40 | 41 | try: 42 | from transformers import BitsAndBytesConfig 43 | except ImportError: 44 | BitsAndBytesConfig = None 45 | try: 46 | import bitsandbytes as bnb # noqa 47 | except ImportError: 48 | bnb = None 49 | from packaging import version 50 | 51 | 52 | def find_all_linear_names(args, model): 53 | cls = bnb.nn.Linear4bit if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear) 54 | lora_module_names = set() 55 | for name, module in model.named_modules(): 56 | if isinstance(module, cls): 57 | names = name.split('.') 58 | lora_module_names.add(names[0] if len(names) == 1 else names[-1]) 59 | if 'lm_head' in lora_module_names: # needed for 16-bit 60 | lora_module_names.remove('lm_head') 61 | return list(lora_module_names) 62 | 63 | 64 | def setup_model(args, logger, use_cache=False): 65 | # Load pretrained model and tokenizer 66 | 67 | if args.pretrained_model_path: 68 | if args.model_type == 'gpt_neox': 69 | tokenizer = GPTNeoXTokenizerFast.from_pretrained(args.pretrained_model_path) 70 | tokenizer.eod_token = "<|endoftext|>" 
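            # NOTE: "<|pad|>" is assumed to be present in the GPTNeoX vocabulary; "<|endoftext|>" is reused as the sop/eop marker below.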
71 | tokenizer.pad_token = "<|pad|>" 72 | tokenizer.sop_token = "<|endoftext|>" 73 | tokenizer.eop_token = "<|endoftext|>" 74 | tokenizer.eod_id = tokenizer.convert_tokens_to_ids(tokenizer.eod_token) 75 | tokenizer.pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token) 76 | 77 | print_rank_0(f'tokenizer {tokenizer.eod_token} id: {tokenizer.eod_id}') 78 | print_rank_0(f'tokenizer {tokenizer.pad_token} id: {tokenizer.pad_id}') 79 | else: 80 | raise ValueError( 81 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 82 | "You can do it from another script, save it, and load it from here, using --tokenizer_path." 83 | ) 84 | 85 | if args.model_type == 'gpt_neox': 86 | auto_config = GPTNeoXConfig 87 | auto_model_class = GPTNeoXForCausalLM 88 | else: 89 | auto_config = AutoConfig 90 | auto_model_class = AutoModelForCausalLM 91 | 92 | # with init_empty_weights_with_disk_offload(ignore_tie_weights=False): 93 | if args.pretrained_model_path: 94 | logger.info("Training model from checkpoint") 95 | config = auto_config.from_pretrained(args.pretrained_model_path) 96 | if args.peft_type != "qlora": 97 | model = auto_model_class.from_pretrained(args.pretrained_model_path, trust_remote_code=True).cuda() 98 | # TODO: qlora 99 | else: 100 | logger.info("Training model from scratch") 101 | if args.model_type == 'gpt_neox': 102 | config = GPTNeoXConfig.from_json_file(args.config_path + '/config.json') 103 | model = GPTNeoXForCausalLM._from_config(config) 104 | else: 105 | config = AutoConfig.from_json_file(args.config_path + '/config.json') 106 | model = AutoModelForCausalLM.from_config(config, trust_remote_code=args.trust_remote_code) 107 | 108 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch 109 | # on a small vocab and want a smaller embedding size, remove this test. 
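    # The embedding matrix is only grown (never shrunk) here, when the tokenizer vocabulary is larger than the checkpoint's.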
110 | embedding_size = model.get_input_embeddings().weight.shape[0] 111 | print_rank_0('embedding size: ' + str(embedding_size)) 112 | print_rank_0('vocab size: ' + str(tokenizer.vocab_size)) 113 | if tokenizer.vocab_size > embedding_size: 114 | model.resize_token_embeddings(tokenizer.vocab_size) 115 | print_rank_0('resize embedding size: ' + str(model.get_input_embeddings().weight.shape[0])) 116 | 117 | print_rank_0(config) 118 | num_params = get_model_params_num(model) 119 | print_rank_0("num_params of this model:", num_params) 120 | args.total_model_param = num_params 121 | args.hidden_size = config.hidden_size 122 | args.num_hidden_layers = config.num_hidden_layers 123 | args.vocab_size = tokenizer.vocab_size 124 | print_rank_0(f'hidden size: {args.hidden_size}') 125 | print_rank_0(f'num hidden layers: {args.num_hidden_layers}') 126 | print_rank_0(f'vocab size: {args.vocab_size}') 127 | 128 | if args.peft_type: 129 | if args.peft_type in ['lora', 'qlora']: 130 | target_modules = None 131 | # TODO: qlora 132 | target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"] 133 | print_rank_0(f'target modules: {target_modules}') 134 | peft_config = LoraConfig( 135 | task_type=TaskType.ANT_CAUSAL_LM, 136 | inference_mode=False, 137 | r=96, 138 | lora_alpha=32, 139 | lora_dropout=0.05, 140 | target_modules=target_modules, 141 | ) 142 | logger.info( 143 | f"Load Peft {args.peft_type} model ......") 144 | if args.checkpoint_activations and args.peft_type in ["lora", "qlora"]: 145 | # Make Lora and gradient checkpointing compatible 146 | # https://github.com/huggingface/peft/issues/137 147 | model.enable_input_require_grads() 148 | model = get_peft_model(model, peft_config) 149 | logger.info( 150 | f"Reduce trainalbe params:\n") 151 | model.print_trainable_parameters() 152 | 153 | return model, config, tokenizer 154 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/gpt_bigcode/configuration_gpt_bigcode.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 The BigCode team and HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ GPTBigCode configuration""" 16 | 17 | from transformers.configuration_utils import PretrainedConfig 18 | from transformers.utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP = { 24 | "bigcode/gpt_bigcode-santacoder": "https://huggingface.co/bigcode/gpt_bigcode-santacoder/resolve/main/config.json", 25 | } 26 | 27 | 28 | class GPTBigCodeConfig(PretrainedConfig): 29 | """ 30 | This is the configuration class to store the configuration of a [`GPTBigCodeModel`]. It is used to instantiate a 31 | GPTBigCode model according to the specified arguments, defining the model architecture. 
Instantiating a
32 | configuration with the defaults will yield a similar configuration to that of the GPTBigCode
33 | [gpt_bigcode](https://huggingface.co/gpt_bigcode) architecture.
34 | 
35 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
36 | documentation from [`PretrainedConfig`] for more information.
37 | 
38 | 
39 | Args:
40 | vocab_size (`int`, *optional*, defaults to 50257):
41 | Vocabulary size of the GPTBigCode model. Defines the number of different tokens that can be represented by the
42 | `input_ids` passed when calling [`GPTBigCodeModel`].
43 | n_positions (`int`, *optional*, defaults to 1024):
44 | The maximum sequence length that this model might ever be used with. Typically set this to something large
45 | just in case (e.g., 512 or 1024 or 2048).
46 | n_embd (`int`, *optional*, defaults to 768):
47 | Dimensionality of the embeddings and hidden states.
48 | n_layer (`int`, *optional*, defaults to 12):
49 | Number of hidden layers in the Transformer encoder.
50 | n_head (`int`, *optional*, defaults to 12):
51 | Number of attention heads for each attention layer in the Transformer encoder.
52 | n_inner (`int`, *optional*, defaults to None):
53 | Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
54 | activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
55 | Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new",
56 | "gelu_pytorch_tanh"]`.
57 | resid_pdrop (`float`, *optional*, defaults to 0.1):
58 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
59 | embd_pdrop (`float`, *optional*, defaults to 0.1):
60 | The dropout ratio for the embeddings.
61 | attn_pdrop (`float`, *optional*, defaults to 0.1):
62 | The dropout ratio for the attention.
63 | layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
64 | The epsilon to use in the layer normalization layers.
65 | initializer_range (`float`, *optional*, defaults to 0.02):
66 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
67 | scale_attn_weights (`bool`, *optional*, defaults to `True`):
68 | Scale attention weights by dividing by sqrt(hidden_size).
69 | use_cache (`bool`, *optional*, defaults to `True`):
70 | Whether or not the model should return the last key/values attentions (not used by all models).
71 | attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
72 | Whether to call the fused softmax in float32.
73 | scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
74 | Whether to scale the attention softmax in float32.
75 | multi_query (`bool`, *optional*, defaults to `True`):
76 | Whether to use Multi-Query Attention (`True`) or Multi-Head Attention (`False`).
77 | Example: 78 | 79 | ```python 80 | >>> from transformers import GPTBigCodeConfig, GPTBigCodeModel 81 | 82 | >>> # Initializing a GPTBigCode configuration 83 | >>> configuration = GPTBigCodeConfig() 84 | 85 | >>> # Initializing a model (with random weights) from the configuration 86 | >>> model = GPTBigCodeModel(configuration) 87 | 88 | >>> # Accessing the model configuration 89 | >>> configuration = model.config 90 | ```""" 91 | 92 | model_type = "gpt_bigcode" 93 | keys_to_ignore_at_inference = ["past_key_values"] 94 | attribute_map = { 95 | "hidden_size": "n_embd", 96 | "max_position_embeddings": "n_positions", 97 | "num_attention_heads": "n_head", 98 | "num_hidden_layers": "n_layer", 99 | } 100 | 101 | def __init__( 102 | self, 103 | vocab_size=50257, 104 | n_positions=1024, 105 | n_embd=768, 106 | n_layer=12, 107 | n_head=12, 108 | n_inner=None, 109 | activation_function="gelu_pytorch_tanh", 110 | resid_pdrop=0.1, 111 | embd_pdrop=0.1, 112 | attn_pdrop=0.1, 113 | layer_norm_epsilon=1e-5, 114 | initializer_range=0.02, 115 | scale_attn_weights=True, 116 | use_cache=True, 117 | bos_token_id=50256, 118 | eos_token_id=50256, 119 | attention_softmax_in_fp32=True, 120 | scale_attention_softmax_in_fp32=True, 121 | multi_query=True, 122 | **kwargs, 123 | ): 124 | self.vocab_size = vocab_size 125 | self.n_positions = n_positions 126 | self.n_embd = n_embd 127 | self.n_layer = n_layer 128 | self.n_head = n_head 129 | self.n_inner = n_inner 130 | self.activation_function = activation_function 131 | self.resid_pdrop = resid_pdrop 132 | self.embd_pdrop = embd_pdrop 133 | self.attn_pdrop = attn_pdrop 134 | self.layer_norm_epsilon = layer_norm_epsilon 135 | self.initializer_range = initializer_range 136 | self.scale_attn_weights = scale_attn_weights 137 | self.use_cache = use_cache 138 | self.attention_softmax_in_fp32 = attention_softmax_in_fp32 139 | self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32 140 | self.multi_query = multi_query 141 | 142 | self.bos_token_id = bos_token_id 143 | self.eos_token_id = eos_token_id 144 | 145 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 146 | -------------------------------------------------------------------------------- /mftcoder_atorch/README_cn.md: -------------------------------------------------------------------------------- 1 | # MFTCoder训练: Atorch框架篇 2 | [![Generic badge](https://img.shields.io/badge/🤗-Huggingface%20Repo-green.svg)](https://huggingface.co/codefuse-ai) 3 | 4 | GitHub 5 | 6 | 7 | [**中文**] [[English]](README.md) 8 | 9 | ## 1. 更新 10 | 11 | 🔥 MFTCoder在Atorch框架下支持GPTNeoX模型的微调; 12 | 13 | 🔥 MFTCoder支持全量的有监督微调; 14 | 15 | 🔥 MFTCoder支持LoRA微调; 16 | 17 | ## 2. 
数据格式 18 | 19 | ### 2.1 训练数据格式 20 | 训练数据为jsonl格式,每一行的数据格式如下,其中chat_rounds字段是必需的,可以根据实际需求添加或删除其他字段。 21 | 可以参考项目中的xxx.jsonl文件。 22 | ```json 23 | { 24 | "id":0, 25 | "data_name":"code-helper", 26 | "chat_rounds":[ 27 | { 28 | "role": "system", 29 | "content": "你是一个智能代码助手,可以回复用户与代码相关的问题", 30 | "chat_round_id": 0 31 | }, 32 | { 33 | "role": "human", 34 | "content": "写一个快速排序", 35 | "chat_round_id": 1 36 | }, 37 | { 38 | "role": "bot", 39 | "content": "以下是一个快速排序算法xxxxxx", 40 | "chat_round_id": 1 41 | }, 42 | { 43 | "role": "human", 44 | "content": "解释一下这段代码", 45 | "chat_round_id": 2 46 | }, 47 | { 48 | "role": "bot", 49 | "content": "好的,这段代码xxx", 50 | "chat_round_id": 2 51 | } 52 | ] 53 | } 54 | ``` 55 | 56 | ### 2.2 推理数据格式 57 | 推理数据格式为模型在训练数据格式下拼接的字符串形式,它也是推理时输入prompt拼接的方式: 58 | ```python 59 | """ 60 | <|role_start|>system<|role_end|>这是System指令 61 | <|role_start|>human<|role_end|>这是第1轮用户输入的问题 62 | <|role_start|>bot<|role_end|>这是第1轮模型生成的内容 63 | <|role_start|>human<|role_end|>这是第2轮用户输入的问题 64 | <|role_start|>bot<|role_end|>这是第2轮模型生成的内容 65 | ... 66 | ... 67 | ... 68 | <|role_start|>human<|role_end|>这是第n轮用户输入的问题 69 | <|role_start|>bot<|role_end|>{模型现在要生成的内容} 70 | """ 71 | ``` 72 | 73 | 74 | ## 3. 模型训练 75 | 目前 "MFTCoder/mft_atorch" 代码库支持全量参数指令微调和LoRA指令微调。 76 | 目前仅支持GPTNeoX模型的训练,理论上,HuggingFace上开源的GPTNeoX模型权重,均可使用本项目进行训练。 77 | 78 | 我们将训练中使用的各种组件抽取出来,以便后续的扩展和优化,详见主目录下的实现。微调训练的入口目录是```train/```, 训练入口文件是```train/run_train.py```, 参数配置存储在启动脚本```train/run_gpt_*.sh```等文件中,方便统一管理和更改。 79 | 80 | ### 3.1 数据格式 81 | 训练时,我们将多轮对话拼接成如下格式,然后进行tokenize。其中<|role_start|>human<|role_end|>表示human输入提示符,<|role_start|>bot<|role_end|>表示bot输出提示符,`````````` 表示eos_token。 82 | ``` 83 | "<|role_start|>human<|role_end|>input1target1input2target2... 84 | ``` 85 | 在计算loss时,我们通过mask的方式,input部分的loss不参与参数更新,只有“target”部分的loss参与参数更新。 86 | 这种方式充分利用了模型并行计算的优势,训练更加高效,且多轮对话中的每个target部分都参与了训练,训练更充分。 87 | 否则,就需要把一个n轮对话,拆分成n条数据,且只计算最后一个target的loss,大大降低了训练效率。 88 | 89 | ### 3.2 全量SFT 90 | 91 | 执行如下命令即可进行全量SFT: 92 | ```bash 93 | sh run_gpt_mft.sh 10 1 8 5 94 | ``` 95 | 96 | 需注意,启动脚本后的四个参数,分别是: 97 | - 第一个参数是总的per gpu batch size 98 | - 第二个参数是tensor parallel数(暂时只支持1) 99 | - 第三个参数是data parallel数,与所用GPU数保持一致 100 | - 第四个参数是训练epoch数 101 | 102 | 后面其他的训练方式启动脚本,也同样需要配置这四个参数 103 | 104 | ### 3.3 LoRA微调 105 | 106 | 执行如下命令即可进行Lora微调: 107 | ```bash 108 | sh run_gpt_mft_peft.sh 10 1 8 5 109 | ``` 110 | 111 | ### 3.4 启动脚本中主要参数说明 112 | ```train/run_gpt_*.sh```中的主要参数说明如下,以下参数可以根据需求进行修改,其他参数建议不做修改: 113 | - tokenize_mode: 目前仅支持"sft"。 114 | 115 | - train_mode: 目前仅支持"sft"。 116 | 117 | - load_raw_dataset: 需要保持"True",后续会支持其它模式数据,当前仅支持jsonl输入 118 | 119 | - data_paths: "[path1,path2,path3]" 输入数据地址,字符串,开头结尾用[],中间用```,```间隔不同path,每个path是一个目录,目录的最后一级名字作为任务名称,下面包含1到多个jsonl数据。 120 | 121 | - output_dir: 训练输出目录,存储checkpoint、lora_adaptor checkpoint等。 122 | 123 | - tensorboard_dir: 可以暂时忽略,实际tensorboard存储在output_dir的runs目录下。 124 | 125 | - model_type: 目前仅支持 gpt_neox。 126 | 127 | - peft_type: 目前仅支持 lora。 128 | 129 | - pretrained_model_path: 预训练模型的本地目录。 130 | 131 | - total_train_batch_size: 所有显卡train的batch size的总和,会根据启动脚本时输入的per gpu batch size自动计算。 132 | 133 | - per_device_valid_batch_size: 每张显卡eval的batch size,会根据启动脚本时输入的per gpu batch size自动计算。 134 | 135 | - gradient_accumulation_steps: 梯度累计步数。global batch=num_gpus * per_device_train_batch_size * gradient_accumulation_steps。 136 | 137 | - checkpoint_activations: 如果显存捉襟见肘,可以开启。以时间换空间,模型不缓存激活状态,会进行两次forward计算,以节省显存。 138 | 139 | - learning_rate: 学习率。全量参数微调的时候,建议小一些,1e-5或5e-6。qlora中的学习率设置更大一些,一般为1e-4、2e-4。 140 | 141 | - min_lr: 最低学习率, 
一般是learning_rate的十分之一。 142 | 143 | - seq_length: 训练时的最大长度。按照自己的设备进行设置,越长需要占用越多显存。 144 | 145 | - log_interval: 每隔多少步统计一次train loss。 146 | 147 | - checkpointing_steps: 每隔多少步保存一个模型。 148 | 149 | - evalation_steps: 每隔多少步在验证集上evaluate一次。 150 | 151 | - early_stopping_patience: 多少个eval point不继续收敛,则停止训练。 152 | 153 | - lr_scheduler_type: 学习率变化策略。 154 | 155 | - num_warmup_steps: warm up步数,学习率经过多少步,增长到指定的数值。 156 | 157 | - seed: 随机种子,用于复现实验结果。 158 | 159 | - train_iters: 可以暂时设为比较小的数,如10,实际上不会影响训练步数,留作后面拓展读取其他形式数据集的功能。 160 | 161 | - valid_iters: 可以暂时设为比较小的数,如10,实际上不会影响训练步数,留作后面拓展读取其他形式数据集的功能。 162 | 163 | - evaluation_strategy: 训练期间evaluate的策略,"steps"表示每隔"valid_interval"步做一次evaluate,"epoch"表示每隔一个epoch做一次evaluate,支持同时开启。 164 | 165 | - save_strategy: 训练期间保存模型权重的策略,"steps"表示每隔"checkpointing_steps"步保存一次。 166 | 167 | - extra_save_by_epoch: 每过一个epoch是否要保存一个epoch级别的checkpoint。 168 | 169 | - save_total_limit: 最多保留的模型checkpoint个数,一般设置为2,会保留valid loss最低,以及最新的checkpoint,注意epoch级别的checkpoint会一直保留,且不受限制。 170 | 171 | - weighted_loss_mode: 多任务训练的loss加权方式。 172 | 173 | 174 | ## 4. 模型使用 175 | 176 | ### 4.1 权重合并 177 | 如果使用LoRA进行训练,本项目仅保存adapter的权重和配置文件,需要将adapter权重与base model进行合并。脚本见```utils/merge_base_and_lora_to_hf.py``` 178 | 179 | ### 4.2 模型推理 180 | 我们提供了单轮对话和多轮对话的如下脚本,该脚本可同时兼容大部分huggingface格式的模型。 181 | ```python 182 | from transformers import ( 183 | AutoTokenizer, 184 | AutoModelForCausalLM, 185 | ) 186 | tokenizer = AutoTokenizer.from_pretrained(mode_name_or_path, trust_remote_code=True, use_fast=False, legacy=False) 187 | tokenizer.padding_side = "left" 188 | tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("") 189 | tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("") 190 | model = AutoModelForCausalLM.from_pretrained(mode_name_or_path, trust_remote_code=True) 191 | 192 | HUMAN_ROLE_START_TAG = "<|role_start|>human<|role_end|>" 193 | BOT_ROLE_START_TAG = "<|role_start|>bot<|role_end|>" 194 | texts = ["write a python function of quick sort."] 195 | texts = [f"{HUMAN_ROLE_START_TAG}{text}{BOT_ROLE_START_TAG}" for text in texts] 196 | 197 | inputs = tokenizer(texts, return_tensors='pt', padding=True, add_special_tokens=False).to("cuda") 198 | outputs = model.generate( 199 | inputs=inputs["input_ids"], 200 | attention_mask=inputs["attention_mask"], 201 | max_new_tokens=512, 202 | top_p=0.95, 203 | temperature=0.1, 204 | do_sample=True, 205 | eos_token_id=tokenizer.eos_token_id, 206 | pad_token_id=tokenizer.pad_token_id 207 | ) 208 | gen_text = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True) 209 | print(gen_text) 210 | ``` 211 | 212 | 生成脚本中的top_p、temperature、repetition_penalty、do_sample等参数对模型的生成效果影响较大,可按照自己的使用场景进行调试修改。 213 | 实践中,在代码生成场景中,如果采样模式,do_sample=True, top_p=0.95, temperature=0.1是pass@1指标的不错选择; 214 | 如果非采样模式, do_sample=False, beam_num=1或者3是不错的选择,其中beam_num=1即为greedy decoding。 215 | 216 | ## 5. FAQ 217 | #### 问题1:OOM如何解决? 
218 | 如果发生OOM,可以缩小per GPU batch size (启动训练脚本时的第一个参数)、seq_length等参数来缓解。也可以设gradient_checkpointing=true,可以大幅降低显存占用,但训练速度会变慢一些。 219 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/offline_tokenization/concat_sst_bin_tokenization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import argparse 4 | import multiprocessing 5 | import os 6 | import sys 7 | import random 8 | import time 9 | import tqdm 10 | import glob 11 | import json 12 | import numpy as np 13 | 14 | 15 | # 将父目录的父目录加入path 16 | current_path = os.path.abspath(__file__) 17 | parent_dir = os.path.dirname(os.path.dirname(current_path)) 18 | grandparent_dir = os.path.dirname(parent_dir) 19 | sys.path.append(grandparent_dir) 20 | 21 | from tokenizer import init_tokenizer 22 | from pack_encoder import PackSSTBinEncoder, load_tokenizer 23 | from data import indexed_dataset 24 | 25 | from threading import Semaphore 26 | from colorama import Fore 27 | import lm_fmt as lmd 28 | 29 | 30 | def yield_from_files(files: list, semaphore): 31 | """ 32 | Iterator over input documents 33 | 34 | :param fnames: list of filenames 35 | """ 36 | def yielder(fname, semaphore): 37 | with open(fname, 'r') as f: 38 | for line in f: 39 | semaphore.acquire() 40 | yield json.loads(line) 41 | 42 | for fname in files: 43 | semaphore.acquire() 44 | yield from yielder(fname, semaphore) 45 | 46 | def yield_from_files2(fnames: list, semaphore, sample_percent): 47 | """ 48 | Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / 49 | other compressed formats. Also filters out empty documents. 50 | 51 | :param fnames: list of filenames 52 | """ 53 | def yielder(fname, semaphore): 54 | try: 55 | sample_interval = int(1/sample_percent) 56 | for f in filter(lambda x: x, lmd.Reader(fname).stream_data(key=None)): 57 | rand_value = random.randint(1, sample_interval*100) 58 | if rand_value % sample_interval != 0: 59 | continue 60 | semaphore.acquire() 61 | 62 | #rand_value = random.randint(1, sample_interval*100) 63 | #if rand_value % sample_interval != 0: 64 | # yield None 65 | 66 | yield f 67 | except Exception as e: 68 | print('####Exception:', e.args) 69 | yield None 70 | 71 | for fname in fnames: 72 | semaphore.acquire() 73 | 74 | yield from yielder(fname, semaphore) 75 | 76 | 77 | def print_example_doc(input_ids, tokenizer): 78 | print(Fore.YELLOW + f'INPUT IDS len: {len(input_ids)}') 79 | print(Fore.BLUE + f'INPUT IDS:\n {input_ids}\n\n') 80 | 81 | print(Fore.RED + f'DETOKENIZED INPUT:\n{tokenizer.decode(input_ids)}') 82 | 83 | 84 | def core_process(encoded_docs, semaphore, seq_length, tokenizer, encoder, builder, output_idx_file): 85 | """ 86 | core of Data Pack SFT processing 87 | """ 88 | input_ids_key = 'input_ids' 89 | 90 | proc_start = time.time() 91 | total_bytes_processed = 0 92 | pbar = tqdm.tqdm() 93 | sentence_droped = 0 94 | loss_token_cnt = 0 95 | 96 | print("PRINT BEFORE STREAM PROCESS DATA") 97 | 98 | print_example_count = 0 99 | for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): 100 | total_bytes_processed += bytes_processed 101 | 102 | # release semaphore so `yield_from_files` can add another file to the buffer 103 | semaphore.release() 104 | 105 | # add each tokenized document / sentence, 106 | # For sft, each document has only one sample 107 | input_ids_sentence = doc[input_ids_key][0] 108 | if len(input_ids_sentence) < 1: 109 | sentence_droped += 1 110 | continue 111 | 
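        # Append the tokenized sample to the mmap-backed .bin shard; end_document() records the
        # sample boundary that is written to the .idx index when builder.finalize() runs below.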
112 | builder.add_item(np.array(input_ids_sentence, dtype=builder.dtype)) 113 | builder.end_document() 114 | #builder.finalize_without_close(output_idx_file) 115 | #builder.add_item_and_end_document_and_finalize(np.array(input_ids_sentence, dtype=builder.dtype), output_idx_file) 116 | 117 | # print the first packed sample as example 118 | if print_example_count < 1: 119 | print_example_doc(input_ids_sentence, tokenizer) 120 | print_example_count += 1 121 | 122 | # log progress 123 | if i % 100 == 0: 124 | current = time.time() 125 | elapsed = current - proc_start 126 | mbs = total_bytes_processed / elapsed / 1024 / 1024 127 | pbar.set_description( 128 | f"Processed {i} documents ({i / elapsed} docs/s, {mbs} MB/s)." 129 | ) 130 | if i != 0: 131 | pbar.update(100) 132 | 133 | # 尾部处理 134 | builder.finalize(output_idx_file) 135 | 136 | print(Fore.RED + "\ndroped docs: {}".format(sentence_droped)) 137 | 138 | 139 | def process_dataset(dataset_path, output_path, model_path, parallel_num, seq_length, dataset_name, sample_percent): 140 | """ 141 | Re-organize samples in the given data path into a Data Pack file. 142 | """ 143 | 144 | # get all jsonl files and corresponding reading handler 145 | files = glob.glob(os.path.join(dataset_path, '**/*.jsonl'), recursive=True) 146 | 147 | # build a semaphore object to stop `yield_from_files` from getting ahead 148 | # of encoder.encode and hence building up memory 149 | semaphore = Semaphore(1000 + parallel_num) 150 | 151 | # build sample iterator 152 | sample_iterator = yield_from_files2(files, semaphore, sample_percent) 153 | 154 | # load tokenizer 155 | # tokenizer = load_tokenizer(model_path, tokenizer_type) 156 | tokenizer = init_tokenizer(model_path) 157 | print('TOKEN of id=2:', tokenizer.convert_ids_to_tokens(2)) 158 | print('ID of :', tokenizer.convert_tokens_to_ids('')) 159 | print('TOKEN of id=0:', tokenizer.convert_ids_to_tokens(0)) 160 | print('ID of :', tokenizer.convert_tokens_to_ids('')) 161 | 162 | # init encoder 163 | encoder = PackSSTBinEncoder(seq_length, model_path) 164 | 165 | # create writer builder 166 | key = "input_ids" 167 | output_prefix = os.path.join(output_path, dataset_name) 168 | output_bin_file = "{}_{}.bin".format( 169 | output_prefix, key 170 | ) 171 | output_idx_file = "{}_{}.idx".format( 172 | output_prefix, key 173 | ) 174 | builder = indexed_dataset.make_builder( 175 | output_bin_file, 176 | impl="mmap", 177 | vocab_size=tokenizer.vocab_size, 178 | ) 179 | 180 | if parallel_num > 1: 181 | pool = multiprocessing.Pool(parallel_num, initializer=encoder.initializer) 182 | encoded_docs = pool.imap(encoder.encode, sample_iterator, chunksize=32) 183 | else: 184 | encoder.initializer() 185 | encoded_docs = (encoder.encode(doc) for doc in sample_iterator) 186 | 187 | if dataset_name is None: 188 | dataset_path = dataset_path[:-1] if dataset_path.endswith(os.path.sep) else dataset_path 189 | dataset_name = dataset_path.split(os.path.sep)[-1] 190 | 191 | core_process(encoded_docs, semaphore, seq_length, tokenizer, encoder, builder, output_idx_file) 192 | 193 | 194 | def main(data_path, output_path, model_path, parallel_num, seq_length, dataset_name, sample_percent): 195 | """ 196 | Entry 197 | """ 198 | 199 | process_dataset(data_path, output_path, model_path, parallel_num, seq_length, dataset_name, sample_percent) 200 | 201 | 202 | if __name__ == "__main__": 203 | parser = argparse.ArgumentParser(description="Generate a packed jsonl file in the Data Pack SFT way.") 204 | parser.add_argument('--model-path', type=str, 
help='Path of a pretrained model which contains tokenizer-related files.') 205 | parser.add_argument('--parallel', type=int, default=1, help='The num of parallel processing.') 206 | parser.add_argument('--output-path', type=str, help='Path to store the genered result file.') 207 | parser.add_argument('--data-path', type=str, default=None, help='Path of files to be processed') 208 | parser.add_argument('--seq-length', type=int, default=4096, help='The max input length (i.e. the max number of tokens in a sample)') 209 | # parser.add_argument('--eod-token-id', type=int, default=2, help='EOD token id') 210 | # parser.add_argument('--pad-token-id', type=int, default=0, help='PAD token id') 211 | # parser.add_argument('--tokenizer-type', type=str, choices=["LLAMATokenizer", None], default=None, help="What type of tokenizer to use. Default is None.") 212 | parser.add_argument('--dataset-name', type=str, default=None, help='The generated result dataset name. The folder name will be token by default.') 213 | parser.add_argument('--sample-percent', type=float, default=1.0, help='Sample percentage') 214 | 215 | args = parser.parse_args() 216 | print('ARGS\n', '\n'.join([str(key) + ':' + str(value) for key,value in vars(args).items()])) 217 | 218 | random.seed(9999) 219 | 220 | main(args.data_path, args.output_path, args.model_path, args.parallel, args.seq_length, args.dataset_name, args.sample_percent) 221 | -------------------------------------------------------------------------------- /mftcoder_accelerate/src/model/qwen/cache_autogptq_cuda_256.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // adapted from https://github.com/PanQiWei/AutoGPTQ/blob/main/autogptq_extension/cuda_256/autogptq_cuda_256.cpp 6 | void vecquant8matmul_cuda( 7 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 8 | torch::Tensor scales, torch::Tensor zeros, 9 | torch::Tensor g_idx 10 | ); 11 | 12 | void vecquant8matmul( 13 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 14 | torch::Tensor scales, torch::Tensor zeros, 15 | torch::Tensor g_idx 16 | ) { 17 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 18 | vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx); 19 | } 20 | 21 | void vecquant8matmul_batched_cuda( 22 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 23 | torch::Tensor scales, torch::Tensor zeros 24 | ); 25 | 26 | void vecquant8matmul_batched( 27 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 28 | torch::Tensor scales, torch::Tensor zeros 29 | ) { 30 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 31 | vecquant8matmul_batched_cuda(vec, mat, mul, scales, zeros); 32 | } 33 | 34 | void vecquant8matmul_batched_column_compression_cuda( 35 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 36 | torch::Tensor scales, torch::Tensor zeros 37 | ); 38 | 39 | void vecquant8matmul_batched_column_compression( 40 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 41 | torch::Tensor scales, torch::Tensor zeros 42 | ) { 43 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 44 | vecquant8matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros); 45 | } 46 | 47 | void vecquant4matmul_batched_cuda( 48 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 49 | torch::Tensor scales, torch::Tensor zeros 50 | ); 51 | 52 | void vecquant4matmul_batched( 53 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 54 | 
torch::Tensor scales, torch::Tensor zeros 55 | ) { 56 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 57 | vecquant4matmul_batched_cuda(vec, mat, mul, scales, zeros); 58 | } 59 | 60 | void vecquant4matmul_batched_column_compression_cuda( 61 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 62 | torch::Tensor scales, torch::Tensor zeros 63 | ); 64 | 65 | void vecquant4matmul_batched_column_compression( 66 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 67 | torch::Tensor scales, torch::Tensor zeros 68 | ) { 69 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 70 | vecquant4matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros); 71 | } 72 | 73 | void vecquant8matmul_batched_old_cuda( 74 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 75 | torch::Tensor scales, torch::Tensor zeros 76 | ); 77 | 78 | void vecquant8matmul_batched_old( 79 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 80 | torch::Tensor scales, torch::Tensor zeros 81 | ) { 82 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 83 | vecquant8matmul_batched_old_cuda(vec, mat, mul, scales, zeros); 84 | } 85 | 86 | 87 | void vecquant4matmul_batched_old_cuda( 88 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 89 | torch::Tensor scales, torch::Tensor zeros 90 | ); 91 | 92 | void vecquant4matmul_batched_old( 93 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 94 | torch::Tensor scales, torch::Tensor zeros 95 | ) { 96 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 97 | vecquant4matmul_batched_old_cuda(vec, mat, mul, scales, zeros); 98 | } 99 | 100 | void vecquant8matmul_batched_column_compression_old_cuda( 101 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 102 | torch::Tensor scales, torch::Tensor zeros 103 | ); 104 | 105 | void vecquant8matmul_batched_column_compression_old( 106 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 107 | torch::Tensor scales, torch::Tensor zeros 108 | ) { 109 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 110 | vecquant8matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros); 111 | } 112 | 113 | void vecquant4matmul_batched_column_compression_old_cuda( 114 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 115 | torch::Tensor scales, torch::Tensor zeros 116 | ); 117 | 118 | void vecquant4matmul_batched_column_compression_old( 119 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 120 | torch::Tensor scales, torch::Tensor zeros 121 | ) { 122 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 123 | vecquant4matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros); 124 | } 125 | 126 | 127 | 128 | void vecquant8matmul_batched_faster_cuda( 129 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 130 | torch::Tensor scales, torch::Tensor zeros 131 | ); 132 | 133 | void vecquant8matmul_batched_faster( 134 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 135 | torch::Tensor scales, torch::Tensor zeros 136 | ) { 137 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 138 | vecquant8matmul_batched_faster_cuda(vec, mat, mul, scales, zeros); 139 | } 140 | 141 | 142 | void vecquant8matmul_batched_faster_old_cuda( 143 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 144 | torch::Tensor scales, torch::Tensor zeros 145 | ); 146 | 147 | void vecquant8matmul_batched_faster_old( 148 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 
149 | torch::Tensor scales, torch::Tensor zeros 150 | ) { 151 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 152 | vecquant8matmul_batched_faster_old_cuda(vec, mat, mul, scales, zeros); 153 | } 154 | 155 | void vecquant8matmul_batched_column_compression_faster_cuda( 156 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 157 | torch::Tensor scales, torch::Tensor zeros 158 | ); 159 | 160 | void vecquant8matmul_batched_column_compression_faster( 161 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 162 | torch::Tensor scales, torch::Tensor zeros 163 | ) { 164 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 165 | vecquant8matmul_batched_column_compression_faster_cuda(vec, mat, mul, scales, zeros); 166 | } 167 | 168 | 169 | void vecquant8matmul_batched_column_compression_faster_old_cuda( 170 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 171 | torch::Tensor scales, torch::Tensor zeros 172 | ); 173 | 174 | void vecquant8matmul_batched_column_compression_faster_old( 175 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul, 176 | torch::Tensor scales, torch::Tensor zeros 177 | ) { 178 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec)); 179 | vecquant8matmul_batched_column_compression_faster_old_cuda(vec, mat, mul, scales, zeros); 180 | } 181 | 182 | 183 | 184 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 185 | m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)"); 186 | m.def("vecquant8matmul_batched", &vecquant8matmul_batched, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)"); 187 | m.def("vecquant8matmul_batched_old", &vecquant8matmul_batched_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)"); 188 | m.def("vecquant8matmul_batched_faster", &vecquant8matmul_batched_faster, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)"); 189 | m.def("vecquant8matmul_batched_faster_old", &vecquant8matmul_batched_faster_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)"); 190 | m.def("vecquant4matmul_batched_old", &vecquant4matmul_batched_old, "Vector 4-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)"); 191 | m.def("vecquant8matmul_batched_column_compression", &vecquant8matmul_batched_column_compression, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)"); 192 | m.def("vecquant8matmul_batched_column_compression_old", &vecquant8matmul_batched_column_compression_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)"); 193 | m.def("vecquant8matmul_batched_column_compression_faster", &vecquant8matmul_batched_column_compression_faster, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)"); 194 | m.def("vecquant8matmul_batched_column_compression_faster_old", &vecquant8matmul_batched_column_compression_faster_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)"); 195 | m.def("vecquant4matmul_batched_column_compression_old", &vecquant4matmul_batched_column_compression_old, "Vector old 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)"); 196 | m.def("vecquant4matmul_batched", &vecquant4matmul_batched, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)"); 197 | 
m.def("vecquant4matmul_batched_column_compression", &vecquant4matmul_batched_column_compression, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)"); 198 | } 199 | --------------------------------------------------------------------------------