├── mftcoder_accelerate
│ ├── src
│ │ ├── model
│ │ │ ├── __init__.py
│ │ │ ├── gpt_neox
│ │ │ │ ├── generation_config.json
│ │ │ │ ├── config.json
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configuration_gpt_neox.py
│ │ │ │ └── tokenization_gpt_neox_fast.py
│ │ │ ├── qwen
│ │ │ │ ├── tokenizer_config.json
│ │ │ │ ├── cpp_kernels.py
│ │ │ │ ├── configuration_qwen.py
│ │ │ │ └── cache_autogptq_cuda_256.cpp
│ │ │ ├── chatglm2
│ │ │ │ ├── tokenizer_config.json
│ │ │ │ ├── config.json
│ │ │ │ └── configuration_chatglm.py
│ │ │ ├── chatglm3
│ │ │ │ ├── config.json
│ │ │ │ └── configuration_chatglm.py
│ │ │ ├── deepseek_v2
│ │ │ │ └── tokenization_deepseek_fast.py
│ │ │ ├── baichuan2
│ │ │ │ ├── configuration_baichuan.py
│ │ │ │ └── generation_utils.py
│ │ │ ├── phi
│ │ │ │ └── configuration_mixformer_sequential.py
│ │ │ ├── code_llama
│ │ │ │ └── __init__.py
│ │ │ ├── gpt_bigcode
│ │ │ │ ├── __init__.py
│ │ │ │ └── configuration_gpt_bigcode.py
│ │ │ └── aquila2
│ │ │   └── configuration_aquila.py
│ │ ├── data
│ │ │ ├── __init__.py
│ │ │ ├── Makefile
│ │ │ └── blendable_dataset.py
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── model_mapping.py
│ │ │ ├── common_utils.py
│ │ │ └── agd.py
│ │ ├── tokenizer
│ │ │ ├── __init__.py
│ │ │ ├── chat_template.py
│ │ │ └── tokenizer.py
│ │ ├── run_offline_tokenization.sh
│ │ ├── accelerate_ds_config.yaml
│ │ ├── accelerate_fsdp_config.yaml
│ │ ├── configs
│ │ │ ├── dpo_train_config.json
│ │ │ ├── full_train_config.json
│ │ │ ├── lora_train_config.json
│ │ │ ├── qlora_train_config.json
│ │ │ └── coba_train_config.json
│ │ ├── ds_single_launch.sh
│ │ ├── ds_zero3_single_launch.sh
│ │ ├── offline_tokenization
│ │ │ ├── writer.py
│ │ │ └── concat_sst_bin_tokenization.py
│ │ ├── fsdp_single_launch.sh
│ │ ├── ds_multinode_launch.sh
│ │ ├── xxpo
│ │ │ ├── custom_callbacks.py
│ │ │ └── xxpo_arguments.py
│ │ ├── pefts
│ │ │ ├── merge_base_and_lora_to_hf.py
│ │ │ └── mft_arguments.py
│ │ └── mpt
│ │   └── mpt_arguments.py
│ └── inference
│   └── hf_inference.py
├── mftcoder_atorch
│ ├── data
│ │ ├── __init__.py
│ │ ├── helpers.cpython-38-x86_64-linux-gnu.so
│ │ └── Makefile
│ ├── train
│ │ ├── __init__.py
│ │ ├── run_gpt_mft.sh
│ │ ├── run_gpt_mft_peft.sh
│ │ └── run_train.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── merge_base_and_lora_to_hf.py
│ │ └── learning_rates.py
│ ├── .gitignore
│ ├── model
│ │ ├── gpt_neox
│ │ │ ├── generation_config.json
│ │ │ ├── config.json
│ │ │ ├── __init__.py
│ │ │ ├── configuration_gpt_neox.py
│ │ │ └── tokenization_gpt_neox_fast.py
│ │ ├── peft
│ │ │ ├── tuner
│ │ │ │ ├── pe_base_model.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bitfit.py
│ │ │ │ └── roem.py
│ │ │ ├── __init__.py
│ │ │ └── utils
│ │ │   ├── __init__.py
│ │ │   ├── config.py
│ │ │   └── mapping.py
│ │ ├── __init__.py
│ │ └── build_model.py
│ ├── tokenizer
│ │ ├── __init__.py
│ │ └── train_tokenizer.py
│ └── README_cn.md
├── .gitignore
├── assets
│ ├── img.jpg
│ ├── img_1.jpg
│ ├── CodeFuse-AI群.png
│ └── github-codefuse-logo-update.jpg
├── init_env.sh
├── requirements.txt
└── LEGAL.md
/mftcoder_accelerate/src/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mftcoder_atorch/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import *
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import *
--------------------------------------------------------------------------------
/mftcoder_atorch/train/__init__.py:
--------------------------------------------------------------------------------
1 | from .run_train import *
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | .DS_Store
3 | *.log
4 | */__pycache__/
5 | *.pyc
--------------------------------------------------------------------------------
/assets/img.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/assets/img.jpg
--------------------------------------------------------------------------------
/assets/img_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/assets/img_1.jpg
--------------------------------------------------------------------------------
/assets/CodeFuse-AI群.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/assets/CodeFuse-AI群.png
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .common_utils import *
2 | from .loss_utils import *
3 |
--------------------------------------------------------------------------------
/mftcoder_atorch/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .common_utils import *
2 | from .auto_accelerate_utils import *
--------------------------------------------------------------------------------
/mftcoder_atorch/.gitignore:
--------------------------------------------------------------------------------
1 | *.log
2 | */__pycache__/
3 | *.pyc
4 | *.ipynb
5 | .DS_Store
6 | .idea/
7 | evals/
--------------------------------------------------------------------------------
/assets/github-codefuse-logo-update.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/assets/github-codefuse-logo-update.jpg
--------------------------------------------------------------------------------
/init_env.sh:
--------------------------------------------------------------------------------
1 | pip install torch==2.1.0 && \
2 | pip install tensorboard==2.11.0 && \
3 | pip install packaging && \
4 | pip install -r requirements.txt
5 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | from .tokenizer import build_tokenizer
2 | from .tokenizer import init_tokenizer
3 | from .chat_template import MFTCoder_template
--------------------------------------------------------------------------------
/mftcoder_atorch/data/helpers.cpython-38-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/MFTCoder/HEAD/mftcoder_atorch/data/helpers.cpython-38-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/mftcoder_atorch/model/gpt_neox/generation_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "bos_token_id": 50256,
3 | "eos_token_id": 50256,
4 | "transformers_version": "4.26.0.dev0",
5 | "_from_model_config": true
6 | }
7 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/gpt_neox/generation_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "bos_token_id": 50256,
3 | "eos_token_id": 50256,
4 | "transformers_version": "4.26.0.dev0",
5 | "_from_model_config": true
6 | }
7 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/qwen/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "model_max_length": 8192,
3 | "tokenizer_class": "QWenTokenizer",
4 | "auto_map": {
5 | "AutoTokenizer": [
6 | "tokenization_qwen.QWenTokenizer",
7 | null
8 | ]
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/mftcoder_atorch/data/Makefile:
--------------------------------------------------------------------------------
1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
2 | CPPFLAGS += $(shell python3 -m pybind11 --includes)
3 | LIBNAME = helpers
4 | LIBEXT = $(shell python3-config --extension-suffix)
5 |
6 | default: $(LIBNAME)$(LIBEXT)
7 |
8 | %$(LIBEXT): %.cpp
9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
10 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/data/Makefile:
--------------------------------------------------------------------------------
1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
2 | CPPFLAGS += $(shell python3 -m pybind11 --includes)
3 | LIBNAME = helpers
4 | LIBEXT = $(shell python3-config --extension-suffix)
5 |
6 | default: $(LIBNAME)$(LIBEXT)
7 |
8 | %$(LIBEXT): %.cpp
9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
10 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/chatglm2/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "name_or_path": "THUDM/chatglm2-6b",
3 | "remove_space": false,
4 | "do_lower_case": false,
5 | "tokenizer_class": "ChatGLMTokenizer",
6 | "auto_map": {
7 | "AutoTokenizer": [
8 | "tokenization_chatglm.ChatGLMTokenizer",
9 | null
10 | ]
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.23.5
2 | pandas==2.2.1
3 | torch==2.1.0
4 | tensorboard==2.11.0
5 | deepspeed==0.14.0
6 | transformers==4.44.2
7 | accelerate==0.31.0
8 | peft==0.10.0
9 | BitsAndBytes==0.43.0
10 | xformers==0.0.22.post7
11 | datasets
12 | ftfy
13 | packaging
14 | einops
15 | sentencepiece
16 | ujson
17 | jsonlines
18 | tiktoken
19 | transformers_stream_generator
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/run_offline_tokenization.sh:
--------------------------------------------------------------------------------
1 | MODEL_PATH=
2 | DATA_PATH=
3 | DATASET_NAME=
4 | OUTPUT_PATH=
5 |
6 | python offline_tokenization/concat_sst_bin_tokenization.py \
7 | --model-path ${MODEL_PATH} \
8 | --data-path ${DATA_PATH} \
9 | --dataset-name ${DATASET_NAME} \
10 | --output-path ${OUTPUT_PATH} \
11 | --parallel 16 \
12 | --seq-length 4096 \
13 | --sample-percent 1.0
14 |
--------------------------------------------------------------------------------
/LEGAL.md:
--------------------------------------------------------------------------------
1 | Legal Disclaimer
2 |
3 | Within this source code, the comments in Chinese shall be the original, governing version. Any comments in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail.
4 |
5 | 法律免责声明
6 |
7 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。
--------------------------------------------------------------------------------
/mftcoder_atorch/model/peft/tuner/pe_base_model.py:
--------------------------------------------------------------------------------
1 | class PEBaseModel:
2 |     """Base class for PE-tuning (parameter-efficient tuning) models; defines the methods every PE-tuning model should implement."""
3 | 
4 |     def __init__(self):
5 |         return
6 | 
7 |     def get_model(self):
8 |         """Modify the model: freeze parameters or insert trainable modules."""
9 |         pass
10 | 
11 |     @classmethod
12 |     def restore(cls, model=None, path=None):
13 |         """Restore a PE model from `path`.
14 | 
15 |         Args:
16 |             model (_type_, optional): the original base model. Defaults to None.
17 |             path (_type_, optional): path to the adapter (delta) weights. Defaults to None.
18 |         """
19 |         pass
20 | 
--------------------------------------------------------------------------------
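Below is a hedged illustration (not code from the repo) of the PEBaseModel contract: a toy BitFit-style subclass whose get_model() freezes everything except bias terms. The class name and import-path assumption are marked in the comments; the repo's real BitFit tuner lives in bitfit.py alongside this file.

import torch.nn as nn

# Assumes the mftcoder_atorch directory is on sys.path so the package imports resolve.
from model.peft.tuner.pe_base_model import PEBaseModel


class PEBitfitSketch(PEBaseModel):
    """Toy BitFit-style tuner (illustrative only): keep just the bias parameters trainable."""

    def __init__(self, model: nn.Module):
        self.model = model

    def get_model(self):
        # Freeze all parameters, then re-enable gradients for bias terms only.
        for name, param in self.model.named_parameters():
            param.requires_grad = name.endswith("bias")
        return self.model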
/mftcoder_accelerate/src/accelerate_ds_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config:
3 | gradient_accumulation_steps: 1
4 | gradient_clipping: 1.0
5 | offload_optimizer_device: cpu
6 | offload_param_device: none
7 | zero3_init_flag: false
8 | zero3_save_16bit_model: true
9 | zero_stage: 2
10 | # steps_per_print: 1
11 | distributed_type: DEEPSPEED
12 | downcast_bf16: 'no'
13 | dynamo_backend: 'NO'
14 | fsdp_config: {}
15 | machine_rank: 0
16 | main_training_function: main
17 | megatron_lm_config: {}
18 | mixed_precision: 'bf16'
19 | num_machines: 1
20 | num_processes: 8
21 | rdzv_backend: static
22 | same_network: true
23 | use_cpu: false
--------------------------------------------------------------------------------
/mftcoder_atorch/model/peft/__init__.py:
--------------------------------------------------------------------------------
1 | """peft models interface."""
2 |
3 | from . import utils, tuner
4 | from peft.mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
5 | from peft.utils import TaskType
6 | from .modeling_peft import AntPeftForCausalLM, AntPeftForEmbedding
7 |
8 |
9 | SUPPORTED_PEFT_TYPES = ["prefix", "lora", "adalora", "bitfit", "roem", "unipelt", "prompt", "ptuning"]
10 |
11 | # Register the Ant Causal Language Model
12 | MODEL_TYPE_TO_PEFT_MODEL_MAPPING["ANT_CAUSAL_LM"] = AntPeftForCausalLM
13 | TaskType.ANT_CAUSAL_LM = "ANT_CAUSAL_LM"
14 |
15 | MODEL_TYPE_TO_PEFT_MODEL_MAPPING["ANT_EMBEDDING"] = AntPeftForEmbedding
16 | TaskType.ANT_EMBEDDING = "ANT_EMBEDDING"
17 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/accelerate_fsdp_config.yaml:
--------------------------------------------------------------------------------
1 | compute_environment: LOCAL_MACHINE
2 | deepspeed_config: {}
3 | distributed_type: FSDP
4 | downcast_bf16: 'no'
5 | dynamo_backend: 'NO'
6 | fsdp_config:
7 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
8 | fsdp_backward_prefetch_policy: BACKWARD_PRE
9 | fsdp_offload_params: false
10 | fsdp_sharding_strategy: 1
11 | fsdp_state_dict_type: FULL_STATE_DICT
12 | fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
13 | machine_rank: 0
14 | main_training_function: main
15 | megatron_lm_config: {}
16 | mixed_precision: bf16
17 | num_machines: 1
18 | num_processes: 2
19 | rdzv_backend: static
20 | same_network: true
21 | use_cpu: false
--------------------------------------------------------------------------------
/mftcoder_atorch/model/gpt_neox/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "GPTNeoXForCausalLM"
4 | ],
5 | "bos_token_id": 100256,
6 | "eos_token_id": 100256,
7 | "hidden_act": "gelu",
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-05,
12 | "max_position_embeddings": 4096,
13 | "model_type": "gpt_neox",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "rotary_emb_base": 10000,
17 | "rotary_pct": 1.0,
18 | "tie_word_embeddings": false,
19 | "torch_dtype": "float16",
20 | "transformers_version": "4.26.1",
21 | "use_cache": true,
22 | "use_parallel_residual": true,
23 | "vocab_size": 100864
24 | }
25 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/gpt_neox/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "GPTNeoXForCausalLM"
4 | ],
5 | "attention_probs_dropout_prob": 0,
6 | "bos_token_id": 0,
7 | "eos_token_id": 0,
8 | "hidden_act": "gelu_fast",
9 | "hidden_dropout_prob": 0,
10 | "hidden_size": 768,
11 | "initializer_range": 0.02,
12 | "intermediate_size": 3072,
13 | "layer_norm_eps": 1e-05,
14 | "max_position_embeddings": 2048,
15 | "model_type": "gpt_neox",
16 | "num_attention_heads": 12,
17 | "num_hidden_layers": 12,
18 | "rotary_emb_base": 10000,
19 | "rotary_pct": 0.25,
20 | "tie_word_embeddings": false,
21 | "transformers_version": "4.28.1",
22 | "use_cache": false,
23 | "vocab_size": 50432
24 | }
25 |
--------------------------------------------------------------------------------
/mftcoder_atorch/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from .tokenizer import build_tokenizer
17 |
--------------------------------------------------------------------------------
/mftcoder_atorch/model/peft/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """peft utils interface."""
2 |
3 | from .config import PeftConfig, PetuningConfig
4 |
5 | from .mapping import TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING
6 | from .mapping import TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING
7 | from .mapping import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
8 | from .mapping import TRANSFORMERS_MODELS_TO_LORA_LAGE_TARGET_MODULES_MAPPING
9 | from .mapping import TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING
10 | from .mapping import TRANSFORMERS_MODELS_TO_ROUTELORA_TARGET_MODULES_MAPPING
11 | from .mapping import WEIGHTS_NAME, CONFIG_NAME
12 | from .mapping import bloom_model_postprocess_past_key_value
13 |
14 | from .others import get_peft_model_state_dict, set_peft_model_state_dict, _freeze_model, prepare_model_for_kbit_training
--------------------------------------------------------------------------------
/mftcoder_atorch/model/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | # from .gpt2_model import GPT2ModelPipe
19 | # from .utils import get_params_for_weight_decay_optimization
20 | # from .word_embeddings import SoftEmbedding
21 |
--------------------------------------------------------------------------------
/mftcoder_atorch/model/peft/utils/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import sys
5 | sys.path.append("..")
6 | sys.path.append("../..")
7 | from typing import List, Optional
8 | from dataclasses import dataclass, field
9 | from peft.utils import PeftConfig
10 |
11 |
12 | @dataclass
13 | class PetuningConfig(PeftConfig):
14 | """
15 | This is the base configuration class to store the configuration of [`ROEM`], or [`BitFit`].
16 |
17 | Args:
18 |         modules_to_save (`List[str]`): List of modules apart from LoRA layers to be set as trainable
19 | and saved in the final checkpoint.
20 | """
21 |
22 | modules_to_save: Optional[List[str]] = field(
23 | default=None,
24 | metadata={
25 | "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. "
26 | "For example, in Sequence Classification or Token Classification tasks, "
27 |             "the final layer `classifier/score` is randomly initialized and as such needs to be trainable and saved."
28 | },
29 | )
--------------------------------------------------------------------------------
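A minimal hedged usage sketch (not from the repo): PetuningConfig only adds modules_to_save on top of peft's PeftConfig, and the concrete ROEM/BitFit configs under ../tuner can build on it. All field values below are placeholders for illustration.

# Assumes the mftcoder_atorch directory is on sys.path.
from model.peft.utils.config import PetuningConfig

cfg = PetuningConfig(
    peft_type="BITFIT",           # placeholder; concrete configs set their own PeftType
    task_type="ANT_CAUSAL_LM",    # task type registered in model/peft/__init__.py
    modules_to_save=["lm_head"],  # extra modules kept trainable and saved with the adapter
)
print(cfg.modules_to_save)  # ['lm_head']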
/mftcoder_accelerate/src/configs/dpo_train_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "xxpo": "dpo",
3 | "data_paths": "$DATA_PATHS",
4 | "output_dir": "$OUTPUT_DIR",
5 | "tb_dir": "$TensorBoard_DIR",
6 | "pretrained_model_path": "$MODEL_NAME_OR_PATH",
7 | "model_type": "$MODEL_TYPE",
8 | "data_split": "99,1",
9 | "attn_implementation": "flash_attention_2",
10 | "beta": 0.1,
11 | "rpo_alpha": 0.5,
12 | "peft_type": "lora",
13 | "lora_rank": 64,
14 | "lora_alpha": 128,
15 | "lora_dropout": 0.0,
16 | "per_device_train_batch_size": 1,
17 | "per_device_eval_batch_size": 1,
18 | "tokenizer_type": "AutoTokenizer",
19 | "dataset_num_proc": 1,
20 | "learning_rate": 5e-7,
21 | "weight_decay": 0.01,
22 | "gradient_accumulation_steps": 8,
23 | "lr_scheduler_type": "cosine",
24 | "warmup_steps": 100,
25 | "num_train_epochs": 2,
26 | "seed": 1105,
27 | "max_prompt_length": 2048,
28 | "max_length": 4096,
29 | "logging_steps": 20,
30 | "save_steps": 500,
31 | "eval_steps": 500,
32 | "epoch_checkpointing": false,
33 | "saving_limit": 5
34 | }
--------------------------------------------------------------------------------
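The "$..." values in this config (and in the other configs below) are placeholders to fill in before launching. One simple way to do that, sketched here as an assumption rather than something the repo prescribes, is environment-variable expansion at load time; the relative path assumes you run from mftcoder_accelerate/src.

import json
import os

# Expand $DATA_PATHS, $OUTPUT_DIR, $MODEL_NAME_OR_PATH, ... from the environment,
# then parse the expanded text as JSON. Unset variables are left as-is.
with open("configs/dpo_train_config.json", "r", encoding="utf-8") as f:
    cfg = json.loads(os.path.expandvars(f.read()))

print(cfg["pretrained_model_path"], cfg["output_dir"])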
/mftcoder_accelerate/src/ds_single_launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Author: Chaoyu Chen
3 | # Last Modified: 2023/12/11
4 | # Description: An alternative (command-line) way to launch DeepSpeed training
5 |
6 | # Launch script on single node
7 | N_GPU_PER_NODE=8
8 |
9 | # config path
10 | CONFIG="configs/xxx_train_config.json"
11 |
12 | # envs used inside training
13 | export OMP_NUM_THREADS=4
14 | export TOKENIZERS_PARALLELISM=False
15 |
16 | TODAY=$(date +%Y-%m%d-%H%M)
17 |
18 | # accelerate launch --config_file accelerate_ds_config.yaml \
19 | accelerate launch \
20 | --num_machines 1 \
21 | --num_processes $N_GPU_PER_NODE \
22 | --use_deepspeed \
23 | --zero_stage 2 \
24 | --offload_optimizer_device 'cpu' \
25 | --offload_param_device 'none' \
26 | --gradient_accumulation_steps 1 \
27 | --gradient_clipping 1.0 \
28 | --zero3_init_flag false \
29 | --zero3_save_16bit_model false \
30 | --main_training_function 'main' \
31 | --mixed_precision 'bf16' \
32 | --dynamo_backend 'no' \
33 | --same_network \
34 | --machine_rank 0 \
35 | --rdzv_backend 'static' \
36 | pefts/mft_accelerate.py --train_config "$CONFIG" \
37 | --distributed_type "deepspeed" \
38 | > MFTCoder-training-"$TODAY".log 2>&1 &
39 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/ds_zero3_single_launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Author: Chaoyu Chen
3 | # Last Modified: 2024/5/20
4 | # Description: An alternative (command-line) way to launch DeepSpeed training
5 |
6 | # Launch script on single node
7 | N_GPU_PER_NODE=8
8 |
9 | # config path
10 | CONFIG="configs/xxx_train_config.json"
11 |
12 | # envs used inside training
13 | export OMP_NUM_THREADS=4
14 | export TOKENIZERS_PARALLELISM=False
15 |
16 | TODAY=$(date +%Y-%m%d-%H%M)
17 |
18 | # accelerate launch --config_file accelerate_ds_config.yaml \
19 | accelerate launch \
20 | --num_machines 1 \
21 | --num_processes $N_GPU_PER_NODE \
22 | --use_deepspeed \
23 | --zero_stage 3 \
24 | --offload_optimizer_device 'cpu' \
25 | --offload_param_device 'cpu' \
26 | --gradient_accumulation_steps 1 \
27 | --gradient_clipping 1.0 \
28 | --zero3_init_flag true \
29 | --zero3_save_16bit_model true \
30 | --main_training_function 'main' \
31 | --mixed_precision 'bf16' \
32 | --dynamo_backend 'no' \
33 | --same_network \
34 | --machine_rank 0 \
35 | --rdzv_backend 'static' \
36 | pefts/mft_accelerate.py --train_config "$CONFIG" \
37 | --distributed_type "deepspeed" \
38 | > MFTCoder-training-"$TODAY".log 2>&1 &
39 |
--------------------------------------------------------------------------------
/mftcoder_atorch/utils/merge_base_and_lora_to_hf.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import shutil
5 | import torch
6 | import transformers
7 | sys.path.append("..")
8 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
9 | from peft import LoraConfig, get_peft_model
10 | from peft import PeftModel
11 | from model_mapping import MODEL_SPECIAL_TOKENS
12 |
13 |
14 | model_path='path to base model'
15 | lora_adapter='path to lora adaptor ckpt'
16 | save_path='path to new merged model'
17 | model_type = 'gpt_neox'
18 |
19 | t0 = time.time()
20 | config = {"model_type": model_type}
21 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
22 |
23 | base_model = AutoModelForCausalLM.from_pretrained(
24 | model_path,
25 | trust_remote_code=True,
26 | torch_dtype=torch.bfloat16,
27 | return_dict=True,
28 | device_map="auto"
29 | )
30 | print(base_model)
31 |
32 | # merge, save model and tokenizer
33 | model_to_merge = PeftModel.from_pretrained(base_model, lora_adapter)
34 | merged_model = model_to_merge.merge_and_unload()
35 | print(merged_model.config)
36 | merged_model.save_pretrained(save_path)
37 | tokenizer.save_pretrained(save_path)
38 | print(f"Merge finished: {save_path} saved, cost {time.time()-t0:.2f}s")
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/offline_tokenization/writer.py:
--------------------------------------------------------------------------------
1 |
2 | import threading
3 | import fcntl
4 | import json
5 |
6 | class JSONLWriter():
7 | """
8 | A writer used to save jsonl lines into a file.
9 | """
10 | def __init__(self, output_path, dataset_name):
11 | self.output_path = output_path
12 | self.out_file = open(output_path, 'w')
13 | self.cache = []
14 | self.cache_size = 4096
15 | self.dataset_name = dataset_name
16 | self.index = 0
17 |
18 | def pack_into_jsonl(self, line_text):
19 | new_item = {
20 | "data_name": self.dataset_name,
21 | "id": self.index,
22 | "content": line_text
23 | }
24 |
25 | return new_item
26 |
27 |
28 | def add_item(self, line_text):
29 | if len(self.cache) >= self.cache_size:
30 | self.flush()
31 |
32 | item = self.pack_into_jsonl(line_text)
33 | self.cache.append(json.dumps(item))
34 | self.index += 1
35 |
36 |
37 | def flush(self):
38 | content = '\n'.join(self.cache)
39 | fcntl.flock(self.out_file, fcntl.LOCK_EX)
40 | self.out_file.write(f'{content}\n')
41 | fcntl.flock(self.out_file, fcntl.LOCK_UN)
42 | self.cache = []
43 |
--------------------------------------------------------------------------------
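A short hedged usage sketch (not part of the repo) of JSONLWriter: each input line becomes one {"data_name", "id", "content"} record. corpus.txt, out.jsonl and the dataset name are placeholders.

# Assumes mftcoder_accelerate/src is on sys.path.
from offline_tokenization.writer import JSONLWriter

writer = JSONLWriter("out.jsonl", dataset_name="my_dataset")
with open("corpus.txt", "r", encoding="utf-8") as f:
    for line in f:
        writer.add_item(line.rstrip("\n"))  # buffered in memory, flushed once 4096 items accumulate

writer.flush()           # write whatever is still sitting in the buffer
writer.out_file.close()  # JSONLWriter does not close its file handle itself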
/mftcoder_accelerate/src/configs/full_train_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_paths": "$DATA_PATHS",
3 | "output_dir": "$OUTPUT_DIR",
4 | "tb_dir": "$TensorBoard_DIR",
5 | "pretrained_model_path": "$MODEL_NAME_OR_PATH",
6 | "model_type": "$MODEL_TYPE",
7 | "load_raw_dataset": true,
8 | "data_split": "98,2,0",
9 | "padding_mode": "padding",
10 | "use_dynamic_padding": true,
11 | "tokenize_mode": "sft",
12 | "tokenizer_type": "AutoTokenizer",
13 | "weighted_loss_mode": "case3",
14 | "attn_implementation": "flash_attention_2",
15 | "seq_length": 4096,
16 | "seed": 1234,
17 | "peft_type": null,
18 | "per_device_train_batch_size": 2,
19 | "per_device_eval_batch_size": 2,
20 | "learning_rate": 5e-5,
21 | "min_lr": 5e-6,
22 | "weight_decay": 0.1,
23 | "gradient_accumulation_steps": 1,
24 | "lr_scheduler_type": "cosine",
25 | "num_warmup_steps": 300,
26 | "num_train_epochs": 4,
27 | "resume_from_checkpoint": null,
28 | "log_interval": 10,
29 | "checkpointing_steps": 100,
30 | "evaluation_steps": 100,
31 | "max_train_steps": null,
32 | "epoch_checkpointing": true,
33 | "shuffle_before_split": true,
34 | "early_stopping": true,
35 | "early_stopping_stall_num": 5,
36 | "saving_limit": 3
37 | }
--------------------------------------------------------------------------------
/mftcoder_atorch/model/peft/tuner/__init__.py:
--------------------------------------------------------------------------------
1 | """peft tuner methods interface."""
2 |
3 | from peft.utils import PeftType
4 | from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING
5 | from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
6 |
7 | from .adalora import AdaLoraConfig, AdaLoraModel
8 | from .routelora import RouteLoraConfig, RouteLoraModel
9 | from .unipelt import UniPELTConfig, UniPELTModel, PEUniPELTModel
10 | from .pe_base_model import PEBaseModel
11 | from .bitfit import PeftBitfitConfig, PEBitfitModel, PeftBitfitModel
12 | from .roem import PeftROEMConfig, PEROEMModel, PeftROEMModel
13 |
14 | # Register new ant peft methods
15 | PeftType.ROUTELORA = "ROUTELORA"
16 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ROUTELORA] = RouteLoraModel
17 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ROUTELORA] = RouteLoraConfig
18 |
19 | PeftType.UNIPELT = "UNIPELT"
20 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.UNIPELT] = UniPELTModel
21 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.UNIPELT] = UniPELTConfig
22 |
23 | PeftType.ROEM = "ROEM"
24 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ROEM] = PeftROEMModel
25 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ROEM] = PeftROEMConfig
26 |
27 | PeftType.BITFIT = "BITFIT"
28 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.BITFIT] = PeftBitfitModel
29 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.BITFIT] = PeftBitfitConfig
--------------------------------------------------------------------------------
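A hedged sketch (not code from the repo) of what the registrations above enable: once model.peft.tuner has been imported for its side effects, the new adapter types resolve through peft's standard lookup tables, using the same mappings this __init__.py imports.

# Assumes the mftcoder_atorch directory and the pinned peft version are importable.
import model.peft.tuner  # noqa: F401  (side effect: registers ROEM, BITFIT, ROUTELORA, UNIPELT)

from peft.utils import PeftType
from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING

config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.BITFIT]  # -> PeftBitfitConfig
model_cls = PEFT_TYPE_TO_MODEL_MAPPING[PeftType.BITFIT]    # -> PeftBitfitModel
print(config_cls.__name__, model_cls.__name__)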
/mftcoder_accelerate/src/fsdp_single_launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Author: Chaoyu Chen
3 | # Last Modified: 2023/12/11
4 | # Description: An alternative (command-line) way to launch FSDP training
5 |
6 | # Launch script on single node
7 | N_GPU_PER_NODE=8
8 |
9 | # config path
10 | CONFIG="configs/xxx_train_config.json"
11 |
12 | # fsdp_transformer_layer_cls_to_wrap, choose the DecoderLayer
13 | WRAP_MODULE="LlamaDecoderLayer"
14 |
15 |
16 |
17 | # envs used inside training
18 | export OMP_NUM_THREADS=4
19 | export TOKENIZERS_PARALLELISM=False
20 |
21 | TODAY=$(date +%Y-%m%d-%H%M)
22 |
23 | # accelerate launch --config_file accelerate_fsdp_config.yaml \
24 | accelerate launch \
25 | --use_fsdp \
26 | --num_machines=1 \
27 | --num_processes=$N_GPU_PER_NODE \
28 | --fsdp_sharding_strategy=1 \
29 | --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \
30 | --fsdp_state_dict_type=FULL_STATE_DICT \
31 | --fsdp_backward_prefetch_policy=BACKWARD_PRE \
32 | --fsdp_transformer_layer_cls_to_wrap=$WRAP_MODULE \
33 | --fsdp_offload_params=false \
34 | --main_training_function=main \
35 | --mixed_precision=bf16 \
36 | --dynamo_backend=no \
37 | --same_network \
38 | --machine_rank=0 \
39 | --rdzv_backend=static \
40 | pefts/mft_accelerate.py --train_config "$CONFIG" \
41 | --distributed_type "fsdp" \
42 | > MFTCoder-training-"$TODAY".log 2>&1 &
43 |
44 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/configs/lora_train_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_paths": "$DATA_PATHS",
3 | "output_dir": "$OUTPUT_DIR",
4 | "tb_dir": "$TensorBoard_DIR",
5 | "pretrained_model_path": "$MODEL_NAME_OR_PATH",
6 | "model_type": "$MODEL_TYPE",
7 | "load_raw_dataset": true,
8 | "data_split": "98,2,0",
9 | "padding_mode": "padding",
10 | "use_dynamic_padding": true,
11 | "tokenize_mode": "sft",
12 | "tokenizer_type": "AutoTokenizer",
13 | "weighted_loss_mode": "case3",
14 | "attn_implementation": "flash_attention_2",
15 | "seq_length": 4096,
16 | "seed": 1234,
17 | "peft_type": "lora",
18 | "quantization": null,
19 | "lora_rank": 96,
20 | "lora_alpha": 32,
21 | "lora_dropout": 0.05,
22 | "per_device_train_batch_size": 2,
23 | "per_device_eval_batch_size": 2,
24 | "learning_rate": 5e-5,
25 | "min_lr": 5e-6,
26 | "weight_decay": 0.1,
27 | "gradient_accumulation_steps": 1,
28 | "lr_scheduler_type": "cosine",
29 | "num_warmup_steps": 300,
30 | "num_train_epochs": 4,
31 | "resume_from_checkpoint": null,
32 | "log_interval": 10,
33 | "checkpointing_steps": 100,
34 | "evaluation_steps": 100,
35 | "max_train_steps": null,
36 | "epoch_checkpointing": true,
37 | "shuffle_before_split": true,
38 | "early_stopping": true,
39 | "early_stopping_stall_num": 5,
40 | "saving_limit": null
41 | }
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/configs/qlora_train_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_paths": "$DATA_PATHS",
3 | "output_dir": "$OUTPUT_DIR",
4 | "tb_dir": "$TensorBoard_DIR",
5 | "pretrained_model_path": "$MODEL_NAME_OR_PATH",
6 | "model_type": "$MODEL_TYPE",
7 | "load_raw_dataset": true,
8 | "data_split": "98,2,0",
9 | "padding_mode": "padding",
10 | "use_dynamic_padding": true,
11 | "tokenize_mode": "sft",
12 | "tokenizer_type": "AutoTokenizer",
13 | "weighted_loss_mode": "case3",
14 | "attn_implementation": "flash_attention_2",
15 | "seq_length": 4096,
16 | "seed": 1234,
17 | "peft_type": "qlora",
18 | "quantization": "4bit",
19 | "lora_rank": 96,
20 | "lora_alpha": 32,
21 | "lora_dropout": 0.05,
22 | "per_device_train_batch_size": 2,
23 | "per_device_eval_batch_size": 2,
24 | "learning_rate": 5e-5,
25 | "min_lr": 5e-6,
26 | "weight_decay": 0.1,
27 | "gradient_accumulation_steps": 1,
28 | "lr_scheduler_type": "cosine",
29 | "num_warmup_steps": 300,
30 | "num_train_epochs": 4,
31 | "resume_from_checkpoint": null,
32 | "log_interval": 10,
33 | "checkpointing_steps": 100,
34 | "evaluation_steps": 100,
35 | "max_train_steps": null,
36 | "epoch_checkpointing": true,
37 | "shuffle_before_split": true,
38 | "early_stopping": true,
39 | "early_stopping_stall_num": 5,
40 | "saving_limit": null
41 | }
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/chatglm2/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "THUDM/chatglm2-6b",
3 | "model_type": "chatglm",
4 | "architectures": [
5 | "ChatGLMModel"
6 | ],
7 | "auto_map": {
8 | "AutoConfig": "configuration_chatglm.ChatGLMConfig",
9 | "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
10 | "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
11 | "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration"
12 | },
13 | "add_bias_linear": false,
14 | "add_qkv_bias": true,
15 | "apply_query_key_layer_scaling": true,
16 | "apply_residual_connection_post_layernorm": false,
17 | "attention_dropout": 0.0,
18 | "attention_softmax_in_fp32": true,
19 | "bias_dropout_fusion": true,
20 | "ffn_hidden_size": 13696,
21 | "fp32_residual_connection": false,
22 | "hidden_dropout": 0.0,
23 | "hidden_size": 4096,
24 | "kv_channels": 128,
25 | "layernorm_epsilon": 1e-05,
26 | "multi_query_attention": true,
27 | "multi_query_group_num": 2,
28 | "num_attention_heads": 32,
29 | "num_layers": 28,
30 | "original_rope": true,
31 | "padded_vocab_size": 65024,
32 | "post_layer_norm": true,
33 | "rmsnorm": true,
34 | "seq_length": 32768,
35 | "use_cache": true,
36 | "torch_dtype": "float16",
37 | "transformers_version": "4.27.1",
38 | "tie_word_embeddings": false,
39 | "eos_token_id": 2,
40 | "pad_token_id": 0
41 | }
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/chatglm3/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_name_or_path": "THUDM/chatglm3-6b",
3 | "model_type": "chatglm",
4 | "architectures": [
5 | "ChatGLMModel"
6 | ],
7 | "auto_map": {
8 | "AutoConfig": "configuration_chatglm.ChatGLMConfig",
9 | "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
10 | "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
11 | "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
12 | "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
13 | },
14 | "add_bias_linear": false,
15 | "add_qkv_bias": true,
16 | "apply_query_key_layer_scaling": true,
17 | "apply_residual_connection_post_layernorm": false,
18 | "attention_dropout": 0.0,
19 | "attention_softmax_in_fp32": true,
20 | "bias_dropout_fusion": true,
21 | "ffn_hidden_size": 13696,
22 | "fp32_residual_connection": false,
23 | "hidden_dropout": 0.0,
24 | "hidden_size": 4096,
25 | "kv_channels": 128,
26 | "layernorm_epsilon": 1e-05,
27 | "multi_query_attention": true,
28 | "multi_query_group_num": 2,
29 | "num_attention_heads": 32,
30 | "num_layers": 28,
31 | "original_rope": true,
32 | "padded_vocab_size": 65024,
33 | "post_layer_norm": true,
34 | "rmsnorm": true,
35 | "seq_length": 8192,
36 | "use_cache": true,
37 | "torch_dtype": "float16",
38 | "transformers_version": "4.30.2",
39 | "tie_word_embeddings": false,
40 | "eos_token_id": 2,
41 | "pad_token_id": 0
42 | }
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/ds_multinode_launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Author: Chaoyu Chen
3 | # Last Modified: 2024/5/20
4 | # Description: Launch script for multi-node DeepSpeed training
5 |
6 | # Run this script on all Nodes.
7 |
8 | # Set your number of nodes and number of GPUs per node first.
9 | N_NODE=4
10 | N_GPU_PER_NODE=8
11 |
12 | # $MACHINE_RANK, $MASTER_ADDR and $MASTER_PORT must be set on each node (typically exported by your scheduler or launcher).
13 |
14 | # config path
15 | CONFIG="configs/xxx_train_config.json"
16 |
17 | # envs used inside training
18 | export OMP_NUM_THREADS=4
19 | export TOKENIZERS_PARALLELISM=False
20 |
21 | TODAY=$(date +%Y-%m%d-%H%M)
22 |
23 | # accelerate launch --config_file accelerate_ds_config.yaml \
24 | accelerate launch \
25 | --num_machines $N_NODE \
26 | --num_processes $(($N_NODE*$N_GPU_PER_NODE)) \
27 | --use_deepspeed \
28 | --deepspeed_multinode_launcher 'standard' \
29 | --zero_stage 2 \
30 | --offload_optimizer_device 'cpu' \
31 | --offload_param_device 'none' \
32 | --gradient_accumulation_steps 1 \
33 | --gradient_clipping 1.0 \
34 | --zero3_init_flag false \
35 | --zero3_save_16bit_model false \
36 | --main_training_function 'main' \
37 | --mixed_precision 'bf16' \
38 | --dynamo_backend 'no' \
39 | --same_network \
40 | --machine_rank $MACHINE_RANK \
41 | --main_process_ip $MASTER_ADDR \
42 | --main_process_port $MASTER_PORT \
43 | --rdzv_backend 'static' \
44 | pefts/mft_accelerate.py --train_config "$CONFIG" --distributed_type "deepspeed"
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/configs/coba_train_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "data_paths": "$DATA_PATHS",
3 | "output_dir": "$OUTPUT_DIR",
4 | "tb_dir": "$TensorBoard_DIR",
5 | "pretrained_model_path": "$MODEL_NAME_OR_PATH",
6 | "model_type": "$MODEL_TYPE",
7 | "load_raw_dataset": true,
8 | "data_split": "95,5,0",
9 | "padding_mode": "padding",
10 | "use_dynamic_padding": true,
11 | "tokenize_mode": "sft",
12 | "tokenizer_type": "AutoTokenizer",
13 | "weighted_loss_mode": "coba",
14 | "coba_warmup_steps": 100,
15 | "coba_history_length": 200,
16 | "coba_tau": 5,
17 | "coba_update_interval": 1,
18 | "coba_sample_valid_num": 1,
19 | "attn_implementation": "flash_attention_2",
20 | "seq_length": 4096,
21 | "seed": 1234,
22 | "peft_type": "qlora",
23 | "quantization": "4bit",
24 | "lora_rank": 96,
25 | "lora_alpha": 32,
26 | "lora_dropout": 0.05,
27 | "per_device_train_batch_size": 8,
28 | "per_device_eval_batch_size": 8,
29 | "learning_rate": 5e-5,
30 | "min_lr": 5e-6,
31 | "weight_decay": 0.1,
32 | "gradient_accumulation_steps": 1,
33 | "lr_scheduler_type": "cosine",
34 | "num_warmup_steps": 300,
35 | "num_train_epochs": 4,
36 | "resume_from_checkpoint": null,
37 | "log_interval": 10,
38 | "checkpointing_steps": 100,
39 | "evaluation_steps": 100,
40 | "max_train_steps": null,
41 | "epoch_checkpointing": true,
42 | "shuffle_before_split": true,
43 | "early_stopping": true,
44 | "early_stopping_stall_num": 5,
45 | "saving_limit": null
46 | }
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/deepseek_v2/tokenization_deepseek_fast.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union
2 |
3 |
4 | from transformers.models.llama import LlamaTokenizerFast
5 |
6 |
7 | class DeepseekTokenizerFast(LlamaTokenizerFast):
8 |
9 | def convert_ids_to_tokens(
10 | self, ids: Union[int, List[int]], skip_special_tokens: bool = False
11 | ) -> Union[str, List[str]]:
12 | """
13 | Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
14 | added tokens.
15 |
16 | Args:
17 | ids (`int` or `List[int]`):
18 | The token id (or token ids) to convert to tokens.
19 | skip_special_tokens (`bool`, *optional*, defaults to `False`):
20 | Whether or not to remove special tokens in the decoding.
21 |
22 | Returns:
23 | `str` or `List[str]`: The decoded token(s).
24 | """
25 | if isinstance(ids, int):
26 | return self._convert_id_to_token(ids)
27 | tokens = []
28 | for index in ids:
29 | index = int(index)
30 | if skip_special_tokens and index in self.all_special_ids:
31 | continue
32 | token = self._tokenizer.id_to_token(index)
33 | tokens.append(token if token is not None else "")
34 | return tokens
35 |
36 | def _convert_id_to_token(self, index: int) -> Optional[str]:
37 | token = self._tokenizer.id_to_token(int(index))
38 | return token if token is not None else ""
39 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/baichuan2/configuration_baichuan.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
2 |
3 | from transformers.configuration_utils import PretrainedConfig
4 |
5 |
6 | class BaichuanConfig(PretrainedConfig):
7 | model_type = "baichuan"
8 | keys_to_ignore_at_inference = ["past_key_values"]
9 |
10 | def __init__(
11 | self,
12 | vocab_size=64000,
13 | hidden_size=5120,
14 | intermediate_size=13696,
15 | num_hidden_layers=40,
16 | num_attention_heads=40,
17 | hidden_act="silu",
18 | model_max_length=4096,
19 | initializer_range=0.02,
20 | rms_norm_eps=1e-6,
21 | use_cache=True,
22 | pad_token_id=0,
23 | bos_token_id=1,
24 | eos_token_id=2,
25 | tie_word_embeddings=False,
26 | gradient_checkpointing=False,
27 | z_loss_weight=0,
28 | **kwargs,
29 | ):
30 | self.vocab_size = vocab_size
31 | self.model_max_length = model_max_length
32 | self.hidden_size = hidden_size
33 | self.intermediate_size = intermediate_size
34 | self.num_hidden_layers = num_hidden_layers
35 | self.num_attention_heads = num_attention_heads
36 | self.hidden_act = hidden_act
37 | self.initializer_range = initializer_range
38 | self.rms_norm_eps = rms_norm_eps
39 | self.use_cache = use_cache
40 | self.z_loss_weight = z_loss_weight
41 |         self.gradient_checkpointing = gradient_checkpointing
42 | super().__init__(
43 | pad_token_id=pad_token_id,
44 | bos_token_id=bos_token_id,
45 | eos_token_id=eos_token_id,
46 | tie_word_embeddings=tie_word_embeddings,
47 | **kwargs,
48 | )
49 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/phi/configuration_mixformer_sequential.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import math
5 | from typing import Any, Dict, List, Optional, Union
6 |
7 | from transformers import PretrainedConfig
8 |
9 |
10 | class MixFormerSequentialConfig(PretrainedConfig):
11 | """MixFormer (sequential for DeepSpeed) configuration."""
12 |
13 | model_type = "mixformer-sequential"
14 |
15 | attribute_map = {
16 | "max_position_embeddings": "n_positions",
17 | "hidden_size": "n_embd",
18 | "num_attention_heads": "n_head",
19 | "num_hidden_layers": "n_layer",
20 | }
21 |
22 | def __init__(
23 | self,
24 | vocab_size: Optional[int] = 50304,
25 | n_positions: Optional[int] = 2048,
26 | n_embd: Optional[int] = 1024,
27 | n_layer: Optional[int] = 20,
28 | n_inner: Optional[int] = None,
29 | n_head: Optional[int] = 16,
30 | rotary_dim: Optional[int] = 32,
31 | activation_function: Optional[str] = "gelu_new",
32 | embd_pdrop: Optional[float] = 0.0,
33 | resid_pdrop: Optional[float] = 0.0,
34 | layer_norm_epsilon: Optional[float] = 1e-5,
35 | initializer_range: Optional[float] = 0.02,
36 | tie_word_embeddings: Optional[bool] = False,
37 | pad_vocab_size_multiple: Optional[int] = 64,
38 | **kwargs
39 | ) -> None:
40 | self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
41 | self.n_positions = n_positions
42 | self.n_embd = n_embd
43 | self.n_layer = n_layer
44 | self.n_inner = n_inner
45 | self.n_head = n_head
46 | self.rotary_dim = min(rotary_dim, n_embd // n_head)
47 | self.activation_function = activation_function
48 | self.embd_pdrop = embd_pdrop
49 | self.resid_pdrop = resid_pdrop
50 | self.layer_norm_epsilon = layer_norm_epsilon
51 | self.initializer_range = initializer_range
52 |
53 | super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
54 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/code_llama/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 MetaAI and The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import TYPE_CHECKING
15 |
16 | from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available
17 |
18 |
19 | _import_structure = {}
20 |
21 | try:
22 | if not is_sentencepiece_available():
23 | raise OptionalDependencyNotAvailable()
24 | except OptionalDependencyNotAvailable:
25 | pass
26 | else:
27 | _import_structure["tokenization_code_llama"] = ["CodeLlamaTokenizer"]
28 |
29 | try:
30 | if not is_tokenizers_available():
31 | raise OptionalDependencyNotAvailable()
32 | except OptionalDependencyNotAvailable:
33 | pass
34 | else:
35 | _import_structure["tokenization_code_llama_fast"] = ["CodeLlamaTokenizerFast"]
36 |
37 | if TYPE_CHECKING:
38 | try:
39 | if not is_sentencepiece_available():
40 | raise OptionalDependencyNotAvailable()
41 | except OptionalDependencyNotAvailable:
42 | pass
43 | else:
44 | from .tokenization_code_llama import CodeLlamaTokenizer
45 |
46 | try:
47 | if not is_tokenizers_available():
48 | raise OptionalDependencyNotAvailable()
49 | except OptionalDependencyNotAvailable:
50 | pass
51 | else:
52 | from .tokenization_code_llama_fast import CodeLlamaTokenizerFast
53 |
54 | else:
55 | import sys
56 |
57 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
58 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/qwen/cpp_kernels.py:
--------------------------------------------------------------------------------
1 | from torch.utils import cpp_extension
2 | import pathlib
3 | import os
4 | import subprocess
5 |
6 | def _get_cuda_bare_metal_version(cuda_dir):
7 | raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
8 | universal_newlines=True)
9 | output = raw_output.split()
10 | release_idx = output.index("release") + 1
11 | release = output[release_idx].split(".")
12 | bare_metal_major = release[0]
13 | bare_metal_minor = release[1][0]
14 |
15 | return raw_output, bare_metal_major, bare_metal_minor
16 |
17 | def _create_build_dir(buildpath):
18 | try:
19 | os.mkdir(buildpath)
20 | except OSError:
21 | if not os.path.isdir(buildpath):
22 | print(f"Creation of the build directory {buildpath} failed")
23 |
24 | # Check if cuda 11 is installed for compute capability 8.0
25 | cc_flag = []
26 | _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
27 | if int(bare_metal_major) >= 11:
28 | cc_flag.append('-gencode')
29 | cc_flag.append('arch=compute_80,code=sm_80')
30 | if int(bare_metal_minor) >= 7:
31 | cc_flag.append('-gencode')
32 | cc_flag.append('arch=compute_90,code=sm_90')
33 |
34 | # Build path
35 | srcpath = pathlib.Path(__file__).parent.absolute()
36 | buildpath = srcpath / 'build'
37 | _create_build_dir(buildpath)
38 |
39 | def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
40 | return cpp_extension.load(
41 | name=name,
42 | sources=sources,
43 | build_directory=buildpath,
44 | extra_cflags=['-O3', ],
45 | extra_cuda_cflags=['-O3',
46 | '-gencode', 'arch=compute_70,code=sm_70',
47 | '--use_fast_math'] + extra_cuda_flags + cc_flag,
48 | verbose=1
49 | )
50 |
51 | extra_flags = []
52 |
53 | cache_autogptq_cuda_256_sources = ["./cache_autogptq_cuda_256.cpp",
54 | "./cache_autogptq_cuda_kernel_256.cu"]
55 | cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags)
56 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/utils/model_mapping.py:
--------------------------------------------------------------------------------
1 | """
2 | @author qumu
3 | transformers==4.40 is stable now
4 | """
5 |
6 | # Models whose code and FlashAttention-2 support ship natively in transformers (requires flash_attn>=2.1.0)
7 | from transformers import (
8 | GPTNeoXForCausalLM,
9 | GPTBigCodeForCausalLM,
10 | LlamaForCausalLM,
11 | MistralForCausalLM,
12 | MixtralForCausalLM,
13 | PhiForCausalLM,
14 | GemmaForCausalLM,
15 | Qwen2ForCausalLM,
16 | Qwen2MoeForCausalLM,
17 | Starcoder2ForCausalLM,
18 | )
19 |
20 | # Model kept in the local model dir that supports transformers FA2
21 | from model.deepseek_v2.modeling_deepseek import DeepseekV2ForCausalLM
22 |
23 | # Models kept in the local model dir that are fully self-contained
24 | from model.aquila2.modeling_aquila import AquilaForCausalLM
25 | from model.baichuan2.modeling_baichuan import BaichuanForCausalLM
26 | from model.qwen.modeling_qwen import QWenLMHeadModel
27 | from model.chatglm2.modeling_chatglm import ChatGLMForConditionalGeneration as ChatGLMForConditionalGeneration2
28 | from model.chatglm3.modeling_chatglm import ChatGLMForConditionalGeneration as ChatGLMForConditionalGeneration3
29 |
30 | # from model.phi.modeling_mixformer_sequential import MixFormerSequentialForCausalLM
31 |
32 | MODEL_TYPES = {
33 | "aquila2": AquilaForCausalLM,
34 | "baichuan": BaichuanForCausalLM,
35 | "chatglm2": ChatGLMForConditionalGeneration2,
36 | "chatglm3": ChatGLMForConditionalGeneration3,
37 | "code_llama": LlamaForCausalLM,
38 | "deepseek": LlamaForCausalLM,
39 | "gpt_neox": GPTNeoXForCausalLM,
40 | "llama": LlamaForCausalLM,
41 | "mistral": MistralForCausalLM,
42 | "mixtral": MixtralForCausalLM,
43 | "phi": PhiForCausalLM,
44 | "qwen": QWenLMHeadModel,
45 | "starcoder": GPTBigCodeForCausalLM,
46 | "qwen2": Qwen2ForCausalLM,
47 | "gemma": GemmaForCausalLM,
48 | "qwen2_moe": Qwen2MoeForCausalLM,
49 | "starcoder2": Starcoder2ForCausalLM,
50 | "deepseek_v2": DeepseekV2ForCausalLM,
51 | }
52 |
53 | SUPPORT_IN_TRANSFORMERS = [
54 | "code_llama",
55 | "llama",
56 | "deepseek",
57 | "mistral",
58 | "mixtral",
59 | "gpt_neox",
60 | "phi",
61 | "starcoder",
62 | "qwen2",
63 | "qwen2_moe",
64 | "gemma",
65 | "starcoder2",
66 | "deepseek_v2",
67 | ]
68 |
--------------------------------------------------------------------------------
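A hedged sketch (not the repo's actual loading code) of how MODEL_TYPES and SUPPORT_IN_TRANSFORMERS might be used together: resolve the causal-LM class by model_type and only request flash_attention_2 for the types transformers supports natively. The helper name and its arguments are illustrative; they merely mirror keys that appear in the training configs.

import torch

# Assumes mftcoder_accelerate/src is on sys.path.
from utils.model_mapping import MODEL_TYPES, SUPPORT_IN_TRANSFORMERS


def load_causal_lm(model_type: str, pretrained_model_path: str):
    """Illustrative helper: pick the class from MODEL_TYPES and load weights with from_pretrained."""
    model_cls = MODEL_TYPES[model_type]
    kwargs = {"torch_dtype": torch.bfloat16}
    if model_type in SUPPORT_IN_TRANSFORMERS:
        # Only transformers-native model classes take attn_implementation here.
        kwargs["attn_implementation"] = "flash_attention_2"
    return model_cls.from_pretrained(pretrained_model_path, **kwargs)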
/mftcoder_accelerate/src/model/gpt_bigcode/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import TYPE_CHECKING
16 |
17 | from transformers.utils import (
18 | OptionalDependencyNotAvailable,
19 | _LazyModule,
20 | is_torch_available,
21 | )
22 |
23 |
24 | _import_structure = {
25 | "configuration_gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig"],
26 | }
27 |
28 | try:
29 | if not is_torch_available():
30 | raise OptionalDependencyNotAvailable()
31 | except OptionalDependencyNotAvailable:
32 | pass
33 | else:
34 | _import_structure["modeling_gpt_bigcode"] = [
35 | "GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST",
36 | "GPTBigCodeForSequenceClassification",
37 | "GPTBigCodeForTokenClassification",
38 | "GPTBigCodeForCausalLM",
39 | "GPTBigCodeModel",
40 | "GPTBigCodePreTrainedModel",
41 | ]
42 |
43 | if TYPE_CHECKING:
44 | from .configuration_gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig
45 |
46 | try:
47 | if not is_torch_available():
48 | raise OptionalDependencyNotAvailable()
49 | except OptionalDependencyNotAvailable:
50 | pass
51 | else:
52 | from .modeling_gpt_bigcode import (
53 | GPT_BIGCODE_PRETRAINED_MODEL_ARCHIVE_LIST,
54 | GPTBigCodeForCausalLM,
55 | GPTBigCodeForSequenceClassification,
56 | GPTBigCodeForTokenClassification,
57 | GPTBigCodeModel,
58 | GPTBigCodePreTrainedModel,
59 | )
60 |
61 |
62 | else:
63 | import sys
64 |
65 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
66 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/tokenizer/chat_template.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @author Chaoyu Chen
3 | # @date 2023/12/25
4 |
5 | # store possible chat_template for tokenizers to prepare input string
6 | # -------------------------------------------------- Import ------------------------------------------------------------
7 | """
8 | Usage:
9 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
10 | messages = [
11 | {"role": "system", "content": "Be smart"},
12 | {"role": "human", "content": "Hello, how are you?"},
13 | {"role": "bot", "content": "I'm doing great. How can I help you today?"},
14 | {"role": "human", "content": "I'd like to show off how chat templating works!"},
15 | ]
16 | prompts = tokenizer.apply_chat_template(messages, chat_template=MFTCoder_template, tokenize=False, add_generation_prompt=True)
17 | """
18 |
19 | MFTCoder_template = (
20 | "{% if messages[0]['role'] == 'system' %}"
21 | "{% set loop_messages = messages[1:] %}" # Extract system message if it's present
22 | "{% set system_message = messages[0]['content'] %}"
23 | "{% else %}"
24 | "{% set loop_messages = messages %}"
25 | "{% set system_message = false %}"
26 | "{% endif %}"
27 | "{% for message in loop_messages %}" # Loop over all non-system messages
28 | "{% if (message['role'] == 'user' or message['role'] == 'human') != (loop.index0 % 2 == 0) %}"
29 | "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
30 | "{% endif %}"
31 | "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message
32 | "{% set content = 'system\n' + system_message + '\n' %}"
33 | "{% else %}"
34 | "{% set content = '' %}"
35 | "{% endif %}"
36 | "{% if message['role'] == 'user' or message['role'] == 'human' %}"
37 | "{{ content + 'human\n' + message['content'] + '\n' }}"
38 | "{% elif message['role'] == 'assistant' or message['role'] == 'bot' %}"
39 | "{{ 'bot\n' + message['content'] + '\n' + eos_token + '\n'}}"
40 | "{% else %}"
41 | "{{ raise_exception('Only user/human and assistant/bot roles are supported!') }}"
42 | "{% endif %}"
43 | "{% endfor %}"
44 | "{% if add_generation_prompt %}"
45 | "{{ 'bot\n' }}"
46 | "{% endif %}"
47 | )
48 |
49 | if __name__ == "__main__":
50 | pass
51 |
--------------------------------------------------------------------------------
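
A minimal usage sketch for MFTCoder_template above: it renders one plain prompt string from a list of role/content messages. The model path is a placeholder, and the import assumes mftcoder_accelerate/src is on sys.path.

from transformers import AutoTokenizer

from tokenizer.chat_template import MFTCoder_template

# placeholder path; any HF tokenizer with an eos_token works
tok = AutoTokenizer.from_pretrained("path/to/your/model", trust_remote_code=True)
messages = [
    {"role": "system", "content": "Be smart"},
    {"role": "human", "content": "Write a hello-world in Python."},
]
prompt = tok.apply_chat_template(
    messages, chat_template=MFTCoder_template, tokenize=False, add_generation_prompt=True
)
# prompt == "system\nBe smart\nhuman\nWrite a hello-world in Python.\nbot\n"
print(prompt)
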
/mftcoder_accelerate/src/model/chatglm2/configuration_chatglm.py:
--------------------------------------------------------------------------------
1 | from transformers import PretrainedConfig
2 |
3 |
4 | class ChatGLMConfig(PretrainedConfig):
5 | model_type = "chatglm"
6 | def __init__(
7 | self,
8 | num_layers=28,
9 | padded_vocab_size=65024,
10 | hidden_size=4096,
11 | ffn_hidden_size=13696,
12 | kv_channels=128,
13 | num_attention_heads=32,
14 | seq_length=2048,
15 | hidden_dropout=0.0,
16 | attention_dropout=0.0,
17 | layernorm_epsilon=1e-5,
18 | rmsnorm=True,
19 | apply_residual_connection_post_layernorm=False,
20 | post_layer_norm=True,
21 | add_bias_linear=False,
22 | add_qkv_bias=False,
23 | bias_dropout_fusion=True,
24 | multi_query_attention=False,
25 | multi_query_group_num=1,
26 | apply_query_key_layer_scaling=True,
27 | attention_softmax_in_fp32=True,
28 | fp32_residual_connection=False,
29 | quantization_bit=0,
30 | pre_seq_len=None,
31 | prefix_projection=False,
32 | **kwargs
33 | ):
34 | self.num_layers = num_layers
35 | self.vocab_size = padded_vocab_size
36 | self.padded_vocab_size = padded_vocab_size
37 | self.hidden_size = hidden_size
38 | self.ffn_hidden_size = ffn_hidden_size
39 | self.kv_channels = kv_channels
40 | self.num_attention_heads = num_attention_heads
41 | self.seq_length = seq_length
42 | self.hidden_dropout = hidden_dropout
43 | self.attention_dropout = attention_dropout
44 | self.layernorm_epsilon = layernorm_epsilon
45 | self.rmsnorm = rmsnorm
46 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
47 | self.post_layer_norm = post_layer_norm
48 | self.add_bias_linear = add_bias_linear
49 | self.add_qkv_bias = add_qkv_bias
50 | self.bias_dropout_fusion = bias_dropout_fusion
51 | self.multi_query_attention = multi_query_attention
52 | self.multi_query_group_num = multi_query_group_num
53 | self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
54 | self.attention_softmax_in_fp32 = attention_softmax_in_fp32
55 | self.fp32_residual_connection = fp32_residual_connection
56 | self.quantization_bit = quantization_bit
57 | self.pre_seq_len = pre_seq_len
58 | self.prefix_projection = prefix_projection
59 | super().__init__(**kwargs)
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/chatglm3/configuration_chatglm.py:
--------------------------------------------------------------------------------
1 | from transformers import PretrainedConfig
2 |
3 |
4 | class ChatGLMConfig(PretrainedConfig):
5 | model_type = "chatglm"
6 | def __init__(
7 | self,
8 | num_layers=28,
9 | padded_vocab_size=65024,
10 | hidden_size=4096,
11 | ffn_hidden_size=13696,
12 | kv_channels=128,
13 | num_attention_heads=32,
14 | seq_length=2048,
15 | hidden_dropout=0.0,
16 | classifier_dropout=None,
17 | attention_dropout=0.0,
18 | layernorm_epsilon=1e-5,
19 | rmsnorm=True,
20 | apply_residual_connection_post_layernorm=False,
21 | post_layer_norm=True,
22 | add_bias_linear=False,
23 | add_qkv_bias=False,
24 | bias_dropout_fusion=True,
25 | multi_query_attention=False,
26 | multi_query_group_num=1,
27 | apply_query_key_layer_scaling=True,
28 | attention_softmax_in_fp32=True,
29 | fp32_residual_connection=False,
30 | quantization_bit=0,
31 | pre_seq_len=None,
32 | prefix_projection=False,
33 | **kwargs
34 | ):
35 | self.num_layers = num_layers
36 | self.vocab_size = padded_vocab_size
37 | self.padded_vocab_size = padded_vocab_size
38 | self.hidden_size = hidden_size
39 | self.ffn_hidden_size = ffn_hidden_size
40 | self.kv_channels = kv_channels
41 | self.num_attention_heads = num_attention_heads
42 | self.seq_length = seq_length
43 | self.hidden_dropout = hidden_dropout
44 | self.classifier_dropout = classifier_dropout
45 | self.attention_dropout = attention_dropout
46 | self.layernorm_epsilon = layernorm_epsilon
47 | self.rmsnorm = rmsnorm
48 | self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
49 | self.post_layer_norm = post_layer_norm
50 | self.add_bias_linear = add_bias_linear
51 | self.add_qkv_bias = add_qkv_bias
52 | self.bias_dropout_fusion = bias_dropout_fusion
53 | self.multi_query_attention = multi_query_attention
54 | self.multi_query_group_num = multi_query_group_num
55 | self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
56 | self.attention_softmax_in_fp32 = attention_softmax_in_fp32
57 | self.fp32_residual_connection = fp32_residual_connection
58 | self.quantization_bit = quantization_bit
59 | self.pre_seq_len = pre_seq_len
60 | self.prefix_projection = prefix_projection
61 | super().__init__(**kwargs)
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/qwen/configuration_qwen.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba Cloud.
2 | #
3 | # This source code is licensed under the license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | from transformers import PretrainedConfig
7 |
8 |
9 | class QWenConfig(PretrainedConfig):
10 | model_type = "qwen"
11 | keys_to_ignore_at_inference = ["past_key_values"]
12 |
13 | def __init__(
14 | self,
15 | vocab_size=151936,
16 | hidden_size=4096,
17 | num_hidden_layers=32,
18 | num_attention_heads=32,
19 | emb_dropout_prob=0.0,
20 | attn_dropout_prob=0.0,
21 | layer_norm_epsilon=1e-6,
22 | initializer_range=0.02,
23 | max_position_embeddings=8192,
24 | scale_attn_weights=True,
25 | use_cache=True,
26 | bf16=False,
27 | fp16=False,
28 | fp32=False,
29 | kv_channels=128,
30 | rotary_pct=1.0,
31 | rotary_emb_base=10000,
32 | use_dynamic_ntk=True,
33 | use_logn_attn=True,
34 | use_flash_attn="auto",
35 | intermediate_size=22016,
36 | no_bias=True,
37 | tie_word_embeddings=False,
38 | use_cache_quantization=False,
39 | use_cache_kernel=False,
40 | softmax_in_fp32=False,
41 | **kwargs,
42 | ):
43 | self.vocab_size = vocab_size
44 | self.hidden_size = hidden_size
45 | self.intermediate_size = intermediate_size
46 | self.num_hidden_layers = num_hidden_layers
47 | self.num_attention_heads = num_attention_heads
48 | self.emb_dropout_prob = emb_dropout_prob
49 | self.attn_dropout_prob = attn_dropout_prob
50 | self.layer_norm_epsilon = layer_norm_epsilon
51 | self.initializer_range = initializer_range
52 | self.scale_attn_weights = scale_attn_weights
53 | self.use_cache = use_cache
54 | self.max_position_embeddings = max_position_embeddings
55 | self.bf16 = bf16
56 | self.fp16 = fp16
57 | self.fp32 = fp32
58 | self.kv_channels = kv_channels
59 | self.rotary_pct = rotary_pct
60 | self.rotary_emb_base = rotary_emb_base
61 | self.use_dynamic_ntk = use_dynamic_ntk
62 | self.use_logn_attn = use_logn_attn
63 | self.use_flash_attn = use_flash_attn
64 | self.no_bias = no_bias
65 | self.use_cache_quantization = use_cache_quantization
66 | self.use_cache_kernel = use_cache_kernel
67 | self.softmax_in_fp32 = softmax_in_fp32
68 | super().__init__(
69 | tie_word_embeddings=tie_word_embeddings,
70 | **kwargs
71 | )
72 |
--------------------------------------------------------------------------------
/mftcoder_atorch/model/gpt_neox/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import TYPE_CHECKING
15 |
16 | from transformers.file_utils import _LazyModule, is_tokenizers_available, is_torch_available
17 | from transformers.utils import OptionalDependencyNotAvailable
18 | # from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
19 | # from ...utils import OptionalDependencyNotAvailable
20 |
21 |
22 | _import_structure = {"configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"]}
23 |
24 | try:
25 | if not is_tokenizers_available():
26 | raise OptionalDependencyNotAvailable()
27 | except OptionalDependencyNotAvailable:
28 | pass
29 | else:
30 | _import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"]
31 |
32 | try:
33 | if not is_torch_available():
34 | raise OptionalDependencyNotAvailable()
35 | except OptionalDependencyNotAvailable:
36 | pass
37 | else:
38 | _import_structure["modeling_gpt_neox"] = [
39 | "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST",
40 | "GPTNeoXForCausalLM",
41 | "GPTNeoXLayer",
42 | "GPTNeoXModel",
43 | "GPTNeoXPreTrainedModel",
44 | ]
45 |
46 |
47 | if TYPE_CHECKING:
48 | from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig
49 |
50 | try:
51 | if not is_tokenizers_available():
52 | raise OptionalDependencyNotAvailable()
53 | except OptionalDependencyNotAvailable:
54 | pass
55 | else:
56 | from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
57 |
58 | try:
59 | if not is_torch_available():
60 | raise OptionalDependencyNotAvailable()
61 | except OptionalDependencyNotAvailable:
62 | pass
63 | else:
64 | from .modeling_gpt_neox import (
65 | GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST,
66 | GPTNeoXForCausalLM,
67 | GPTNeoXLayer,
68 | GPTNeoXModel,
69 | GPTNeoXPreTrainedModel,
70 | )
71 |
72 |
73 | else:
74 | import sys
75 |
76 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/gpt_neox/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import TYPE_CHECKING
15 |
16 | from transformers.file_utils import _LazyModule, is_tokenizers_available, is_torch_available
17 | from transformers.utils import OptionalDependencyNotAvailable
18 | # from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
19 | # from ...utils import OptionalDependencyNotAvailable
20 |
21 |
22 | _import_structure = {"configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"]}
23 |
24 | try:
25 | if not is_tokenizers_available():
26 | raise OptionalDependencyNotAvailable()
27 | except OptionalDependencyNotAvailable:
28 | pass
29 | else:
30 | _import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"]
31 |
32 | try:
33 | if not is_torch_available():
34 | raise OptionalDependencyNotAvailable()
35 | except OptionalDependencyNotAvailable:
36 | pass
37 | else:
38 | _import_structure["modeling_gpt_neox"] = [
39 | "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST",
40 | "GPTNeoXForCausalLM",
41 | "GPTNeoXLayer",
42 | "GPTNeoXModel",
43 | "GPTNeoXPreTrainedModel",
44 | ]
45 |
46 |
47 | if TYPE_CHECKING:
48 | from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig
49 |
50 | try:
51 | if not is_tokenizers_available():
52 | raise OptionalDependencyNotAvailable()
53 | except OptionalDependencyNotAvailable:
54 | pass
55 | else:
56 | from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
57 |
58 | try:
59 | if not is_torch_available():
60 | raise OptionalDependencyNotAvailable()
61 | except OptionalDependencyNotAvailable:
62 | pass
63 | else:
64 | from .modeling_gpt_neox import (
65 | GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST,
66 | GPTNeoXForCausalLM,
67 | GPTNeoXLayer,
68 | GPTNeoXModel,
69 | GPTNeoXPreTrainedModel,
70 | )
71 |
72 |
73 | else:
74 | import sys
75 |
76 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
--------------------------------------------------------------------------------
/mftcoder_atorch/train/run_gpt_mft.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | LOAD_RAW_DATASET=True
3 | if [ ${LOAD_RAW_DATASET} = "True" ]; then
4 | LOAD_RAW_DATASET="--load_raw_dataset"
5 | DATA_PATHS="$DATA_PATHS"
6 | DATA_WEIGHTS="[1.,1.,...,1.]"
7 | DATA_SPLIT="95,5,0"
8 | SHUFFLE_BEFORE_SPLIT=""
9 | USE_RANDOM_SAMPLER=""
10 | USE_WEIGHTED_LOSS=""
11 | WEIGHT_BY_NUM_DOCUMENTS=""
12 | else
13 | LOAD_RAW_DATASET=""
14 | DATA_PATHS="$DATA_PATHS"
15 | DATA_WEIGHTS="[1.,1.,...,1.]"
16 | DATA_SPLIT="95,5,0"
17 | SHUFFLE_BEFORE_SPLIT="--shuffle_before_split"
18 | USE_RANDOM_SAMPLER="--use_random_sampler"
19 | USE_WEIGHTED_LOSS="--use_weighted_loss"
20 | WEIGHT_BY_NUM_DOCUMENTS="--weight_by_num_documents"
21 | fi
22 |
23 | VOCAB_FILE="../utils/vocab.json"
24 | MODEL_TYPE="gpt_neox"
25 |
26 | PRETRAINED_MODEL_PATH="$MODEL_NAME_OR_PATH"
27 | RESUME_FROM_CHECKPOINT="false"
28 |
29 | PER_DEVICE_BATCH_SIZE=$1
30 | TP=$2
31 | DP=$3
32 | EPOCH=$4
33 | TOTAL_TRAIN_BATCH_SIZE=$(($PER_DEVICE_BATCH_SIZE * $TP * $DP))
34 | GPU=$(($TP * $DP))
35 | OUTPUT="$OUTPUT_DIR"
36 | TENSORBOARD_PATH="$TensorBoard_DIR"
37 | PREFIX="master-0"
38 | mkdir -p $OUTPUT || true
39 | echo "output to $OUTPUT"
40 | mkdir -p $TENSORBOARD_PATH
41 | chmod 777 $OUTPUT
42 | chmod 777 $TENSORBOARD_PATH
43 |
44 | python -m atorch.distributed.launch \
45 | --nproc_per_node=$(nvidia-smi -L | wc -l) \
46 | run_train.py \
47 | ${LOAD_RAW_DATASET} \
48 | --tokenize_mode 'sft' \
49 | --train_mode 'sft' \
50 | --padding_mode 'padding' \
51 | --pretrained_model_path $PRETRAINED_MODEL_PATH \
52 | --vocab_file $VOCAB_FILE \
53 | --model_type $MODEL_TYPE \
54 | --padding \
55 | --data_paths $DATA_PATHS \
56 | --data_weights $DATA_WEIGHTS \
57 | --data_split $DATA_SPLIT \
58 | ${SHUFFLE_BEFORE_SPLIT} \
59 | ${USE_RANDOM_SAMPLER} \
60 | ${USE_WEIGHTED_LOSS} \
61 | ${WEIGHT_BY_NUM_DOCUMENTS} \
62 | --train_iters 100 \
63 | --num_warmup_steps 500 \
64 | --custom_lr_scheduler_type 'cosine' \
65 | --learning_rate 1.0e-4 \
66 | --min_lr 1.0e-5 \
67 | --valid_iters 10 \
68 | --valid_interval 2000 \
69 | --num_train_epochs $EPOCH \
70 | --seq_length 4096 \
71 | --total_train_batch_size $TOTAL_TRAIN_BATCH_SIZE \
72 | --per_device_valid_batch_size $PER_DEVICE_BATCH_SIZE \
73 | --seed 42 \
74 | --preprocessing_num_workers 6 \
75 | --num_workers 8 \
76 | --output_dir $OUTPUT \
77 | --tensorboard_dir $TENSORBOARD_PATH \
78 | --ignore_mismatched_sizes \
79 | --skip_atorch_autoacc_dryrun \
80 | --tp $TP \
81 | --dp $DP \
82 | --bf16 \
83 | --checkpointing_steps 2000 \
84 | --log_interval 10 \
85 | --make_vocab_size_divisible_by 128 \
86 | --weighted_loss_mode 'case3' \
87 | --checkpoint_activations \
88 | --resume_from_checkpoint $RESUME_FROM_CHECKPOINT \
89 | --max_grad_norm 1 \
90 | --evaluation_strategy "steps,epoch" \
91 | --save_strategy "steps" \
92 | --save_total_limit 2 \
93 | --extra_save_by_epoch \
94 | --metric_for_best_model 'loss' \
95 | --greater_is_better 'false' \
96 | --early_stopping_patience 10 2>&1 | tee $OUTPUT/$PREFIX-output.txt
--------------------------------------------------------------------------------
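
The script above is driven by four positional arguments: per-device batch size, tensor-parallel degree, data-parallel degree, and epoch count. A small sketch (illustrative values) of the derived quantities it computes before launching:

# mirrors TOTAL_TRAIN_BATCH_SIZE and GPU in run_gpt_mft.sh
per_device_batch_size = 2  # $1
tp = 2                     # $2, tensor parallel degree
dp = 4                     # $3, data parallel degree

total_train_batch_size = per_device_batch_size * tp * dp  # 16 samples per optimizer step
gpus_needed = tp * dp                                      # 8 devices
print(total_train_batch_size, gpus_needed)
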
/mftcoder_atorch/train/run_gpt_mft_peft.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | LOAD_RAW_DATASET=True
3 | if [ ${LOAD_RAW_DATASET} = "True" ]; then
4 | LOAD_RAW_DATASET="--load_raw_dataset"
5 | DATA_PATHS="$DATA_PATHS"
6 | DATA_WEIGHTS="[1.,1.,...,1.]"
7 | DATA_SPLIT="90,10,0"
8 | SHUFFLE_BEFORE_SPLIT=""
9 | USE_RANDOM_SAMPLER=""
10 | USE_WEIGHTED_LOSS=""
11 | WEIGHT_BY_NUM_DOCUMENTS=""
12 | else
13 | LOAD_RAW_DATASET=""
14 | DATA_PATHS="$DATA_PATHS"
15 | DATA_WEIGHTS="[1.,1.,...,1.]"
16 | DATA_SPLIT="95,5,0"
17 | SHUFFLE_BEFORE_SPLIT="--shuffle_before_split"
18 | USE_RANDOM_SAMPLER="--use_random_sampler"
19 | USE_WEIGHTED_LOSS="--use_weighted_loss"
20 | WEIGHT_BY_NUM_DOCUMENTS="--weight_by_num_documents"
21 | fi
22 |
23 | VOCAB_FILE="../utils/vocab.json"
24 | MODEL_TYPE="gpt_neox"
25 |
26 | PRETRAINED_MODEL_PATH="$MODEL_NAME_OR_PATH"
27 | RESUME_FROM_CHECKPOINT="false"
28 |
29 | PER_DEVICE_BATCH_SIZE=$1
30 | TP=$2
31 | DP=$3
32 | EPOCH=$4
33 | TOTAL_TRAIN_BATCH_SIZE=$(($PER_DEVICE_BATCH_SIZE * $TP * $DP))
34 | GPU=$(($TP * $DP))
35 | OUTPUT="$OUTPUT_DIR"
36 | TENSORBOARD_PATH="$TensorBoard_DIR"
37 | PREFIX="master-0"
38 | mkdir -p $OUTPUT || true
39 | echo "output to $OUTPUT"
40 | mkdir -p $TENSORBOARD_PATH
41 | chmod 777 $OUTPUT
42 | chmod 777 $TENSORBOARD_PATH
43 |
44 | python -m atorch.distributed.launch \
45 | --nproc_per_node=$(nvidia-smi -L | wc -l) \
46 | run_train.py \
47 | ${LOAD_RAW_DATASET} \
48 | --tokenize_mode 'sft' \
49 | --padding_mode 'padding' \
50 | --pretrained_model_path $PRETRAINED_MODEL_PATH \
51 | --vocab_file $VOCAB_FILE \
52 | --model_type $MODEL_TYPE \
53 | --padding \
54 | --data_paths $DATA_PATHS \
55 | --data_weights $DATA_WEIGHTS \
56 | --data_split $DATA_SPLIT \
57 | ${SHUFFLE_BEFORE_SPLIT} \
58 | ${USE_RANDOM_SAMPLER} \
59 | ${USE_WEIGHTED_LOSS} \
60 | ${WEIGHT_BY_NUM_DOCUMENTS} \
61 | --train_iters 100 \
62 | --num_warmup_steps 30 \
63 | --custom_lr_scheduler_type 'cosine' \
64 | --learning_rate 5.0e-5 \
65 | --min_lr 1.0e-6 \
66 | --valid_iters 400 \
67 | --valid_interval 500 \
68 | --num_train_epochs $EPOCH \
69 | --seq_length 4096 \
70 | --total_train_batch_size $TOTAL_TRAIN_BATCH_SIZE \
71 | --per_device_valid_batch_size $PER_DEVICE_BATCH_SIZE \
72 | --seed 42 \
73 | --preprocessing_num_workers 6 \
74 | --num_workers 8 \
75 | --output_dir $OUTPUT \
76 | --tensorboard_dir $TENSORBOARD_PATH \
77 | --ignore_mismatched_sizes \
78 | --skip_atorch_autoacc_dryrun \
79 | --tp $TP \
80 | --dp $DP \
81 | --bf16 \
82 | --checkpointing_steps 500 \
83 | --log_interval 10 \
84 | --make_vocab_size_divisible_by 128 \
85 | --weighted_loss_mode 'case3' \
86 | --peft_type 'lora' \
87 | --checkpoint_activations \
88 | --resume_from_checkpoint $RESUME_FROM_CHECKPOINT \
89 | --max_grad_norm 1 \
90 | --evaluation_strategy "steps,epoch" \
91 | --save_strategy "steps" \
92 | --save_total_limit 2 \
93 | --extra_save_by_epoch \
94 | --metric_for_best_model 'loss' \
95 | --greater_is_better 'false' \
96 | --early_stopping_patience 5 \
97 | --use_dynamic_padding 2>&1 | tee $OUTPUT/$PREFIX-output.txt
98 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/data/blendable_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, EleutherAI
2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """Blendable dataset."""
19 |
20 | import time
21 |
22 | import numpy as np
23 | import torch
24 |
25 | from utils.common_utils import print_rank_0
26 |
27 |
28 | class BlendableDataset(torch.utils.data.Dataset):
29 | def __init__(self, datasets, weights):
30 | self.datasets = datasets
31 | num_datasets = len(datasets)
32 | assert num_datasets == len(weights)
33 |
34 | self.size = 0
35 | for dataset in self.datasets:
36 | self.size += len(dataset)
37 |
38 | # Normalize weights.
39 | weights = np.array(weights, dtype=np.float64)
40 | sum_weights = np.sum(weights)
41 | assert sum_weights > 0.0
42 | weights /= sum_weights
43 |
44 | # recompute weights
45 | weights = self.calc_weights()
46 |
47 | # Build indices.
48 | start_time = time.time()
49 | assert num_datasets < 255
50 | self.dataset_index = np.zeros(self.size, dtype=np.uint8)
51 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
52 |
53 | from data import helpers
54 |
55 | helpers.build_blending_indices(
56 | self.dataset_index,
57 | self.dataset_sample_index,
58 | weights,
59 | num_datasets,
60 | self.size,
61 | torch.distributed.get_rank() == 0,
62 | )
63 |
64 | print(
65 | "> RANK {} elapsed time for building blendable dataset indices: "
66 | "{:.2f} (sec)".format(torch.distributed.get_rank(), time.time() - start_time)
67 | )
68 |
69 | def calc_weights(self):
70 | dataset_sample_cnt = [len(ds) for ds in self.datasets]
71 | total_cnt = sum(dataset_sample_cnt)
72 | weights = np.array([(cnt + 0.0) / total_cnt for cnt in dataset_sample_cnt], dtype=np.float64)
73 | return weights
74 |
75 | def __len__(self):
76 | return self.size
77 |
78 | def __getitem__(self, idx):
79 | try:
80 | dataset_idx = self.dataset_index[idx]
81 | sample_idx = self.dataset_sample_index[idx]
82 | return self.datasets[dataset_idx][sample_idx]
83 | except IndexError:
84 | new_idx = idx % len(self)
85 | print(
86 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})"
87 | )
88 | return self[new_idx]
89 |
--------------------------------------------------------------------------------
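
Note that BlendableDataset normalizes the user-supplied weights and then immediately overwrites them with calc_weights, so datasets end up being sampled in proportion to their lengths. A tiny sketch of that normalization with toy sizes:

import numpy as np

# calc_weights(): each dataset's weight is its sample count divided by the total
dataset_sample_cnt = [100, 300]
total_cnt = sum(dataset_sample_cnt)
weights = np.array([cnt / total_cnt for cnt in dataset_sample_cnt], dtype=np.float64)
print(weights)  # [0.25 0.75]
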
/mftcoder_accelerate/src/model/baichuan2/generation_utils.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from queue import Queue
3 |
4 | import torch
5 |
6 |
7 | def build_chat_input(model, tokenizer, messages: List[dict], max_new_tokens: int=0):
8 | def _parse_messages(messages, split_role="user"):
9 | system, rounds = "", []
10 | round = []
11 | for i, message in enumerate(messages):
12 | if message["role"] == "system":
13 | assert i == 0
14 | system = message["content"]
15 | continue
16 | if message["role"] == split_role and round:
17 | rounds.append(round)
18 | round = []
19 | round.append(message)
20 | if round:
21 | rounds.append(round)
22 | return system, rounds
23 |
24 | max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens
25 | max_input_tokens = model.config.model_max_length - max_new_tokens
26 | system, rounds = _parse_messages(messages, split_role="user")
27 | system_tokens = tokenizer.encode(system)
28 | max_history_tokens = max_input_tokens - len(system_tokens)
29 |
30 | history_tokens = []
31 | for round in rounds[::-1]:
32 | round_tokens = []
33 | for message in round:
34 | if message["role"] == "user":
35 | round_tokens.append(model.generation_config.user_token_id)
36 | else:
37 | round_tokens.append(model.generation_config.assistant_token_id)
38 | round_tokens.extend(tokenizer.encode(message["content"]))
39 | if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens:
40 | history_tokens = round_tokens + history_tokens # concat left
41 | if len(history_tokens) < max_history_tokens:
42 | continue
43 | break
44 |
45 | input_tokens = system_tokens + history_tokens
46 | if messages[-1]["role"] != "assistant":
47 | input_tokens.append(model.generation_config.assistant_token_id)
48 | input_tokens = input_tokens[-max_input_tokens:] # truncate left
49 | return torch.LongTensor([input_tokens]).to(model.device)
50 |
51 |
52 | class TextIterStreamer:
53 | def __init__(self, tokenizer, skip_prompt=False, skip_special_tokens=False):
54 | self.tokenizer = tokenizer
55 | self.skip_prompt = skip_prompt
56 | self.skip_special_tokens = skip_special_tokens
57 | self.tokens = []
58 | self.text_queue = Queue()
59 | self.next_tokens_are_prompt = True
60 |
61 | def put(self, value):
62 | if self.skip_prompt and self.next_tokens_are_prompt:
63 | self.next_tokens_are_prompt = False
64 | else:
65 | if len(value.shape) > 1:
66 | value = value[0]
67 | self.tokens.extend(value.tolist())
68 | self.text_queue.put(
69 | self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens))
70 |
71 | def end(self):
72 | self.text_queue.put(None)
73 |
74 | def __iter__(self):
75 | return self
76 |
77 | def __next__(self):
78 | value = self.text_queue.get()
79 | if value is None:
80 | raise StopIteration()
81 | else:
82 | return value
83 |
84 |
--------------------------------------------------------------------------------
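
A hedged sketch of how TextIterStreamer is typically consumed: run model.generate with the streamer in a background thread and iterate the streamer on the main thread. The checkpoint path is a placeholder, and the snippet assumes a Baichuan2-style chat checkpoint (build_chat_input relies on user_token_id/assistant_token_id in generation_config and model_max_length in config) with mftcoder_accelerate/src on sys.path.

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from model.baichuan2.generation_utils import TextIterStreamer, build_chat_input

path = "path/to/baichuan2-chat"  # placeholder checkpoint path
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.bfloat16)

messages = [{"role": "user", "content": "Write a quicksort in Python."}]
input_ids = build_chat_input(model, tokenizer, messages, max_new_tokens=256)

streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate, kwargs=dict(inputs=input_ids, max_new_tokens=256, streamer=streamer)).start()

generated = ""
for partial in streamer:
    generated = partial  # each item is the cumulative decoded text so far
print(generated)
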
/mftcoder_accelerate/src/xxpo/custom_callbacks.py:
--------------------------------------------------------------------------------
1 | """
2 | Customized Callbacks to use with the Trainer class and customize the training loop.
3 | """
4 |
5 | import copy
6 | import dataclasses
7 | import json
8 | from dataclasses import dataclass
9 | from typing import Dict, List, Optional, Union
10 |
11 | import numpy as np
12 | from tqdm.auto import tqdm
13 |
14 | from transformers.trainer_utils import IntervalStrategy, has_length
15 | from transformers.training_args import TrainingArguments
16 | from transformers.utils import logging
17 | from transformers import TrainerCallback
18 |
19 | logger = logging.get_logger(__name__)
20 |
21 |
22 | class CustomProgressCallback(TrainerCallback):
23 | """
24 | A [`TrainerCallback`] that displays the progress of training or evaluation.
25 | """
26 |
27 | def __init__(self):
28 | self.training_bar = None
29 | self.prediction_bar = None
30 |
31 | def on_train_begin(self, args, state, control, **kwargs):
32 | if state.is_world_process_zero:
33 | self.training_bar = tqdm(total=state.max_steps, dynamic_ncols=True)
34 | self.current_step = 0
35 |
36 | def on_step_end(self, args, state, control, **kwargs):
37 | if state.is_world_process_zero and state.global_step % args.logging_steps == 0:
38 | self.training_bar.update(args.logging_steps)
39 | self.current_step = state.global_step
40 | # pass
41 |
42 | def on_prediction_step(self, args, state, control, eval_dataloader=None, **kwargs):
43 | # if state.is_world_process_zero and has_length(eval_dataloader):
44 | # if self.prediction_bar is None:
45 | # self.prediction_bar = tqdm(
46 | # total=len(eval_dataloader), leave=self.training_bar is None, dynamic_ncols=True
47 | # )
48 | # self.prediction_bar.update(1)
49 | pass
50 |
51 | def on_evaluate(self, args, state, control, **kwargs):
52 | if state.is_world_process_zero:
53 | if self.prediction_bar is not None:
54 | self.prediction_bar.close()
55 | self.prediction_bar = None
56 |
57 | def on_predict(self, args, state, control, **kwargs):
58 | if state.is_world_process_zero:
59 | if self.prediction_bar is not None:
60 | self.prediction_bar.close()
61 | self.prediction_bar = None
62 |
63 | def on_log(self, args, state, control, logs=None, **kwargs):
64 | if state.is_world_process_zero and self.training_bar is not None:
65 | # avoid modifying the logs object as it is shared between callbacks
66 | logs = copy.deepcopy(logs)
67 | # _ = logs.pop("total_flos", None)
68 | # round numbers so that it looks better in console
69 | if "epoch" in logs:
70 | logs["epoch"] = round(logs["epoch"], 2)
71 | # self.training_bar.write(str(logs))
72 | logger.info(logs)
73 |
74 | def on_train_end(self, args, state, control, **kwargs):
75 | if state.is_world_process_zero:
76 | self.training_bar.close()
77 | self.training_bar = None
78 |
79 |
80 | class PrinterCallback(TrainerCallback):
81 | """
82 | A bare [`TrainerCallback`] that just prints the logs.
83 | """
84 |
85 | def on_log(self, args, state, control, logs=None, **kwargs):
86 | _ = logs.pop("total_flos", None)
87 | if state.is_local_process_zero:
88 | print(logs)
89 |
90 |
91 | class LogCallback(TrainerCallback):
92 | """
93 |     A bare [`TrainerCallback`] that forwards the logs to the module logger.
94 | """
95 |
96 | def on_log(self, args, state, control, logs=None, **kwargs):
97 | _ = logs.pop("total_flos", None)
98 | if state.is_local_process_zero:
99 | logger.info(logs)
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/pefts/merge_base_and_lora_to_hf.py:
--------------------------------------------------------------------------------
1 | """
2 | # @author Chaoyu Chen
3 | # @date 2023/10/19
4 |
5 | Merge base and lora adaptor
6 | """
7 |
8 | import os
9 | import sys
10 | import time
11 | import shutil
12 | import argparse
13 | from typing import List
14 | import torch
15 | import transformers
16 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
17 | from peft import LoraConfig, get_peft_model
18 | from peft import PeftModel
19 |
20 | # insert src as import path
21 | current_path = os.path.abspath(__file__)
22 | parent_dir = os.path.dirname(os.path.dirname(current_path))
23 | sys.path.insert(0, parent_dir)
24 | print("In merge_base_and_lora_to_hf.py, sys path:", sys.path)
25 |
26 | from tokenizer import init_tokenizer
27 |
28 |
29 | def copy_tokenizer_files(model_path: str, files_list: List[str], save_path: str):
30 | if not os.path.exists(save_path):
31 | os.makedirs(save_path)
32 |
33 | for filename in files_list:
34 |
35 |         src_file = os.path.join(model_path, filename)
36 |
37 | if os.path.exists(src_file):
38 | dest_file = os.path.join(save_path, filename)
39 |
40 | shutil.copy(src_file, dest_file)
41 | print(f"Copied {filename} to {save_path}")
42 | else:
43 |             print(f"File {filename} does not exist in {model_path}")
44 |
45 |
46 | if __name__ == "__main__":
47 |
48 | # arguments
49 | parser = argparse.ArgumentParser()
50 | parser.add_argument("--base_model_or_path", type=str, default=None)
51 | parser.add_argument("--adaptor_path", type=str, default=None)
52 | parser.add_argument("--model_type", type=str, default=None)
53 | parser.add_argument("--merged_output_path", type=str, default=None)
54 | args = parser.parse_args()
55 |
56 | model_path = args.base_model_or_path
57 | lora_adapter = args.adaptor_path
58 | model_type = args.model_type
59 | save_path = args.merged_output_path
60 |
61 | t0 = time.time()
62 |
63 | tokenizer = init_tokenizer(args.base_model_or_path)
64 |
65 | base_model = AutoModelForCausalLM.from_pretrained(
66 | model_path,
67 | trust_remote_code=True,
68 | torch_dtype=torch.bfloat16,
69 | # torch_dtype=torch.float32,
70 | return_dict=True,
71 | device_map="auto",
72 | )
73 | print("--------------------------------------Base Model--------------------------------------------")
74 | print(base_model)
75 | print("--------------------------------------------------------------------------------------------")
76 |
77 | print("-----------------------------------Base Model Config----------------------------------------")
78 | print(base_model.config)
79 | print("--------------------------------------------------------------------------------------------")
80 |
81 | # merge, save model and tokenizer
82 | model_to_merge = PeftModel.from_pretrained(base_model, lora_adapter)
83 | merged_model = model_to_merge.merge_and_unload()
84 | # merged_model.to(torch.bfloat16)
85 |
86 | print("---------------------------------Merged Model Config----------------------------------------")
87 | print(merged_model.config)
88 | print("--------------------------------------------------------------------------------------------")
89 | merged_model.save_pretrained(save_path)
90 |
91 | print("-------------------------------------Tokenizer----------------------------------------------")
92 | print(tokenizer)
93 | print("--------------------------------------------------------------------------------------------")
94 | if model_type.lower() == "deepseek":
95 | copy_tokenizer_files(
96 | model_path,
97 | ["tokenizer.model", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"],
98 | save_path,
99 | )
100 | else:
101 | tokenizer.save_pretrained(save_path)
102 |
103 |     print(f"Merge finished: {save_path} saved, cost {time.time() - t0:.2f}s")
104 |
--------------------------------------------------------------------------------
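
The merge itself reduces to the PEFT calls used in the script above; a minimal sketch of the same merge-and-save flow with placeholder paths (in practice the script is driven by --base_model_or_path, --adaptor_path, --model_type and --merged_output_path):

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_path = "path/to/base_model"       # placeholder
adaptor_path = "path/to/lora_adaptor"  # placeholder
save_path = "path/to/merged_output"    # placeholder

base_model = AutoModelForCausalLM.from_pretrained(
    base_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
)
# attach the LoRA adaptor, fold its weights into the base model, then save a plain HF checkpoint
merged = PeftModel.from_pretrained(base_model, adaptor_path).merge_and_unload()
merged.save_pretrained(save_path)
AutoTokenizer.from_pretrained(base_path, trust_remote_code=True).save_pretrained(save_path)
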
/mftcoder_accelerate/src/utils/common_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import torch
4 | from packaging import version
5 | import importlib
6 |
7 | TASK2ID = {}
8 | ID2TASK = {}
9 |
10 |
11 | def is_flash_attn_2_available():
12 |
13 | # Let's add an extra check to see if cuda is available
14 |
15 | if not torch.cuda.is_available():
16 | return False
17 |
18 | if torch.version.cuda:
19 | return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0")
20 | elif torch.version.hip:
21 | # TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention
22 | return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.0.4")
23 | else:
24 | return False
25 |
26 |
27 | def print_rank_0(*message):
28 | """If distributed is initialized print only on rank 0."""
29 | if torch.distributed.is_initialized():
30 | if torch.distributed.get_rank() == 0:
31 | print(*message, flush=True)
32 | else:
33 | print(*message, flush=True)
34 |
35 |
36 | def wait_for_everyone():
37 | torch.distributed.barrier()
38 |
39 |
40 | def _goes_first(is_main):
41 | if is_main is False:
42 | wait_for_everyone()
43 | yield
44 | if is_main is True:
45 | wait_for_everyone()
46 |
47 |
48 | def get_model_params_num(model):
49 | """
50 |     Get the number of parameters of the model
51 | Args:
52 | model: model(required)
53 | Returns:
54 | the number of parameters of model
55 | """
56 | num = 0
57 | for _, param in model.named_parameters():
58 | num += param.nelement()
59 | return num
60 |
61 |
62 | def unwrap_model(model):
63 | """
64 | Recursively unwraps a model from potential containers (as used in distributed training).
65 |
66 | Args:
67 | model (`torch.nn.Module`): The model to unwrap.
68 | """
69 | # since there could be multiple levels of wrapping, unwrap recursively
70 | if hasattr(model, "module"):
71 | return unwrap_model(model.module)
72 | else:
73 | return model
74 |
75 |
76 | def honor_type(obj, generator):
77 | """
78 | Cast a generator to the same type as obj (list, tuple or namedtuple)
79 | """
80 | try:
81 | return type(obj)(generator)
82 | except TypeError:
83 | # Some objects may not be able to instantiate from a generator directly
84 | return type(obj)(*list(generator))
85 |
86 |
87 | def get_computation_speed(batch_size_per_device, seq_len, step_time):
88 |
89 | return batch_size_per_device * seq_len / (step_time + 1e-12)
90 |
91 |
92 | def human_readable_flops(num):
93 | for unit in [
94 | "",
95 | "KFLOPS",
96 | "MFLOPS",
97 | "GFLOPS",
98 | "TFLOPS",
99 | "PFLOPS",
100 | "EFLOPS",
101 | "ZFLOPS",
102 | ]:
103 | if abs(num) < 1000.0:
104 | return "%3.1f%s" % (num, unit)
105 | num /= 1000.0
106 | return "%.1f%s" % (num, "Yi")
107 |
108 |
109 | def get_tflops_new(args, batch_size, seq_len, step_time):
110 | sl = seq_len
111 | L = args.num_hidden_layers
112 | h = args.hidden_size
113 | V = args.vocab_size
114 | flops = 96 * batch_size * sl * L * h * h * (1 + sl / (6 * h) + V / (16 * L * h)) / step_time
115 | return human_readable_flops(flops)
116 |
117 |
118 | def get_tflops_megatron(total_model_param, hidden_size, num_hidden_layers, batch_size_per_device, seq_len, step_time):
119 |
120 | ff = total_model_param * 6
121 | attn = seq_len * hidden_size * num_hidden_layers * 60
122 | flops = batch_size_per_device * seq_len * (ff + attn) / step_time
123 | return human_readable_flops(flops)
124 |
125 |
126 | def generate_task_id(data_paths):
127 | data_prefixes = list(data_paths[1:-1].split(","))
128 | print("data paths: ")
129 | print(data_prefixes)
130 |
131 | for i, prefix in enumerate(data_prefixes):
132 | task_name = prefix.split("/")[-1]
133 | TASK2ID[task_name] = i
134 | ID2TASK[i] = task_name
135 |
--------------------------------------------------------------------------------
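
generate_task_id assumes data_paths is wrapped in brackets (it strips the first and last character before splitting on commas) and keys each task by the last path component. A tiny worked example, assuming mftcoder_accelerate/src is importable:

from utils.common_utils import TASK2ID, ID2TASK, generate_task_id

generate_task_id("[/data/code_exercise,/data/code_qa]")
print(TASK2ID)  # {'code_exercise': 0, 'code_qa': 1}
print(ID2TASK)  # {0: 'code_exercise', 1: 'code_qa'}
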
/mftcoder_accelerate/src/tokenizer/tokenizer.py:
--------------------------------------------------------------------------------
1 | """
2 | # @author Chaoyu Chen
3 | # @date 2023/6/19
4 | """
5 |
6 | import numpy as np
7 | from typing import List, Union
8 | from utils.common_utils import print_rank_0
9 | from transformers import AutoTokenizer, AutoConfig
10 | from tokenizer.chat_template import MFTCoder_template
11 |
12 |
13 | def init_tokenizer(path):
14 | """
15 |     Init a Huggingface tokenizer, parsing eos_token from the tokenizer_config first and falling back to the model config.
16 |     Set pad_token to the same value as eos_token for simplicity.
17 | :param path: model path or tokenizer path
18 | :return: Tokenizer (TokenizerFast is preferred)
19 | """
20 | # tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False, legacy=False)
21 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
22 | config, unused_kwargs = AutoConfig.from_pretrained(path, trust_remote_code=True, return_unused_kwargs=True)
23 |
24 | if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id:
25 | print(f"Initial eos_token_id {tokenizer.eos_token_id} from tokenizer")
26 | eos_token_id = tokenizer.eos_token_id
27 | eos_token = tokenizer.convert_ids_to_tokens(eos_token_id)
28 | elif hasattr(tokenizer, "eos_token") and tokenizer.eos_token:
29 | print(f"Initial eos_token {tokenizer.eos_token} from tokenizer")
30 | eos_token = tokenizer.eos_token
31 | eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
32 | elif hasattr(config, "eos_token_id") and config.eos_token_id:
33 | print(f"Initial eos_token_id {config.eos_token_id} from config.json")
34 | eos_token_id = config.eos_token_id
35 | eos_token = tokenizer.convert_ids_to_tokens(config.eos_token_id)
36 | elif hasattr(config, "eos_token") and config.eos_token:
37 | print(f"Initial eos_token {config.eos_token} from config.json")
38 | eos_token = config.eos_token
39 | eos_token_id = tokenizer.convert_tokens_to_ids(config.eos_token)
40 | else:
41 | raise ValueError(
42 | "No available eos_token or eos_token_id, please provide eos_token by params or eos_token_id by config.json"
43 | )
44 | try:
45 | tokenizer.eos_token = eos_token
46 | tokenizer.eos_token_id = eos_token_id
47 |         # set pad_token to the same value as eos_token; this is fine because pad tokens will be masked out.
48 | tokenizer.pad_token = eos_token
49 | tokenizer.pad_token_id = eos_token_id
50 |     except Exception:
51 |         print("[WARNING] Cannot set tokenizer.eos_token")
52 |
53 | tokenizer.add_bos_token = False
54 | tokenizer.add_eos_token = False
55 | tokenizer.chat_template = MFTCoder_template
56 | print_rank_0(f"Tokenizer: {type(tokenizer)}")
57 | print_rank_0(f"Length of tokenizer: {len(tokenizer)}")
58 | print_rank_0(f"build_tokenizer pad_token_id: {tokenizer.pad_token_id}, eos_token_id: {tokenizer.eos_token_id}")
59 | print_rank_0(f"build_tokenizer pad_token : {tokenizer.pad_token}, eos_token: {tokenizer.eos_token}")
60 |
61 | return tokenizer
62 |
63 |
64 | def build_tokenizer(args):
65 | """Initialize tokenizer."""
66 | print_rank_0(f"> building {args.tokenizer_type} tokenizer ...")
67 | # Select and instantiate the tokenizer.
68 | if args.tokenizer_type.lower() == "AutoTokenizer".lower():
69 | assert args.pretrained_model_path is not None
70 | tokenizer = init_tokenizer(args.pretrained_model_path)
71 | else:
72 | raise NotImplementedError(f"{args.tokenizer_type} tokenizer is not implemented.")
73 |
74 | # Add vocab size.
75 | args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)
76 |
77 | return tokenizer
78 |
79 |
80 | def _vocab_size_with_padding(orig_vocab_size, args):
81 | """Pad vocab size thus it is divisible by model parallel size and
82 | still having GPU friendly size."""
83 |
84 | after = orig_vocab_size
85 | multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
86 | while (after % multiple) != 0:
87 | after += 1
88 | print_rank_0(
89 | " > padded vocab (size: {}) with {} dummy tokens "
90 | "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after)
91 | )
92 |
93 | return after
94 |
--------------------------------------------------------------------------------
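
A quick worked example of the padding arithmetic in _vocab_size_with_padding: with make_vocab_size_divisible_by=128 and model_parallel_size=1, the multiple is 128, so a 32,100-token vocab gets 28 dummy tokens.

orig_vocab_size = 32100
multiple = 128 * 1  # make_vocab_size_divisible_by * model_parallel_size

after = orig_vocab_size
while after % multiple != 0:
    after += 1
print(after)  # 32128, i.e. 28 dummy tokens added
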
/mftcoder_atorch/tokenizer/train_tokenizer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, EleutherAI
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Assumes a dataset of jsonl files in the same format as the neox training set.
17 | """
18 |
19 | from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
20 | from tokenizers.normalizers import NFKC
21 |
22 | from glob import glob
23 | import os
24 | import json
25 | import argparse
26 |
27 |
28 | def load_jsonl(input_path, quiet=True) -> list:
29 | """
30 | Read list of objects from a JSON lines file.
31 | """
32 | data = []
33 | with open(input_path, "r", encoding="utf-8") as f:
34 | for line in f:
35 | data.append(json.loads(line.rstrip("\n|\r")))
36 | if not quiet:
37 | print("Loaded {} records from {}".format(len(data), input_path))
38 | return data
39 |
40 |
41 | def json_iterator(input_dir, text_key="text"):
42 | all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json")
43 | for j in all_jsonls:
44 | data = load_jsonl(j)
45 | for doc in data:
46 | yield doc[text_key]
47 |
48 |
49 | def train_tokenizer(
50 | input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000
51 | ):
52 | """
53 | Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`
54 |
55 | :param input_dir: input directory containing jsonl files
56 | :param save_path: path to save tokenizer to
57 | :param tokenizer_type: type of tokenizer to train.
58 | :param vocab_size: int, size of tokenizer's vocab
59 | :return:
60 | """
61 |
62 | if tokenizer_type == "BPE":
63 | model = models.BPE()
64 | else:
65 | raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")
66 | tokenizer = Tokenizer(model)
67 |
68 | # Customize pre-tokenization and decoding
69 | tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
70 | tokenizer.decoder = decoders.ByteLevel()
71 | tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
72 | tokenizer.normalizer = NFKC()
73 |
74 | # And then train
75 | trainer = trainers.BpeTrainer(
76 | vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"]
77 | )
78 | tokenizer.train_from_iterator(json_iterator(input_dir), trainer)
79 |
80 | # And Save it
81 | tokenizer.save(save_path, pretty=True)
82 | print(f"Tokenizer saved at {save_path}")
83 |
84 |
85 | def parse_args():
86 | parser = argparse.ArgumentParser(
87 | description="script for training a multilingual "
88 | "HF tokenizer on CC dumps with upweighting for low resource languages"
89 | )
90 | parser.add_argument(
91 | "--json_input_dir",
92 | type=str,
93 | help="Path to folder containing tokenizer training data in jsonl format",
94 | )
95 | parser.add_argument(
96 | "--tokenizer_output_path",
97 | type=str,
98 | help="Path to which your trained tokenizer will be saved (should end in .json)",
99 | )
100 | parser.add_argument(
101 | "--tokenizer_type",
102 | type=str,
103 | help="type of tokenizer to train, currently only BPE is supported",
104 | choices=["BPE"],
105 |         default="BPE",
106 | )
107 | parser.add_argument(
108 | "-v",
109 | "--vocab_size",
110 | help="vocabulary size of tokenizer, default=52k",
111 | type=int,
112 | default=52000,
113 | )
114 | return parser.parse_args()
115 |
116 |
117 | if __name__ == "__main__":
118 |
119 | args = parse_args()
120 |
121 | train_tokenizer(
122 | args.json_input_dir,
123 | save_path=args.tokenizer_output_path,
124 | tokenizer_type=args.tokenizer_type,
125 | vocab_size=args.vocab_size,
126 | )
127 |
--------------------------------------------------------------------------------
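
train_tokenizer can also be called directly from Python instead of through the CLI; a minimal sketch with placeholder paths, assuming mftcoder_atorch is on sys.path (the save path should end in .json):

from tokenizer.train_tokenizer import train_tokenizer

train_tokenizer(
    input_dir="path/to/jsonl_corpus",       # directory of .jsonl/.json files with a "text" field
    save_path="path/to/new_tokenizer.json",
    tokenizer_type="BPE",
    vocab_size=52000,
)
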
/mftcoder_atorch/train/run_train.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
3 | import json
4 | import logging
5 | import math
6 | import os
7 | import numpy as np
8 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
9 |
10 | class cb:
11 | def __init__(self, path):
12 | self.path = path
13 | def __call__(self, s):
14 | with open(f"{self.path}/fsdp_mapping.html", "w") as f:
15 | f.write(s)
16 |
17 | # handle multi-processing writing
18 | os.environ["HF_MODULES_CACHE"] = os.path.join("/root/.cache/huggingface/modules", os.getenv("RANK", ""))
19 | import random # noqa: E402
20 | import datasets # noqa: E402
21 | import transformers # noqa: E402
22 | from torch.utils.data import DataLoader # noqa: E402
23 | from torch.utils.data.distributed import DistributedSampler # noqa: E402
24 | from transformers import ( # noqa: E402
25 | default_data_collator,
26 | # get_scheduler,
27 | set_seed,
28 | )
29 | from transformers.utils.versions import require_version # noqa: E402
30 | from atorch.utils.meta_model_utils import init_empty_weights_with_disk_offload # noqa: E402
31 |
32 | from transformers import AutoTokenizer
33 |
34 | from torch.distributed.fsdp import (
35 | FullyShardedDataParallel as FSDP,
36 | )
37 |
38 | from utils.common_utils import (
39 | is_local_main_process, generate_task_id, print_rank_0, is_old_version,
40 | atorch_init_distributed, atorch_reset_distributed, TASK2ID, ID2TASK,
41 | get_rank, get_world_size
42 | )
43 | from utils.auto_accelerate_utils import DataCollatorForMFTDataset, loss_func_mft
44 | from arguments.get_arguments import parse_args
45 | from model.build_model import setup_model
46 | from data.gpt2_multi_task_dataset import load_dataset_from_jsonl
47 | from train.trainer.atorch_trainer import AtorchTrainer
48 | from pathlib import Path
49 |
50 |
51 | def main():
52 | args = parse_args()
53 |
54 | # Make one log on every process with the configuration for debugging.
55 | logging.basicConfig(
56 | # format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
57 | format="%(asctime)s - %(name)s - %(message)s",
58 | datefmt="%m/%d/%Y %H:%M:%S",
59 | level=logging.INFO,
60 | )
61 | logger = logging.getLogger(__name__)
62 | if is_local_main_process():
63 | datasets.utils.logging.set_verbosity_warning()
64 | transformers.utils.logging.set_verbosity_info()
65 | else:
66 | datasets.utils.logging.set_verbosity_error()
67 | transformers.utils.logging.set_verbosity_error()
68 |
69 | # If passed along, set the training seed now.
70 | if args.seed is not None:
71 | set_seed(args.seed)
72 |
73 | generate_task_id(args.data_paths, args.train_mode) # generate TASK2ID, ID2TASK mapping
74 | print(TASK2ID)
75 | print(ID2TASK)
76 |
77 | model, model_config, tokenizer = setup_model(args, logger, use_cache=False)
78 | print(f'args.total_model_param: {args.total_model_param}')
79 |
80 | train_dataset, dataloader_args = None, None
81 | train_dataloader, valid_dataloader, test_dataloader = None, None, None
82 |
83 | args.world_size = get_world_size()
84 | global_rank = get_rank()
85 | print(f'world_size: {args.world_size}, global_rank: {global_rank}')
86 | args.per_device_train_batch_size = args.total_train_batch_size // args.world_size
87 | if args.load_raw_dataset:
88 | print_rank_0('load raw dataset')
89 | if args.model_type in ['gpt_neox']:
90 | train_dataset, valid_dataset = load_dataset_from_jsonl(args, tokenizer, shard_data=True, world_size=args.world_size, global_rank=global_rank)
91 |
92 | if train_dataset is not None:
93 | args.do_train = True
94 | if valid_dataset is not None:
95 | args.do_valid = True
96 | else:
97 | print_rank_0('please set load_raw_dataset to True and rerun')
98 |
99 | if args.resume_from_checkpoint == 'true':
100 | logger.info(f'Resume from {args.output_dir}')
101 | resume_from_checkpoint = True
102 | else:
103 | logger.info(f'Train from scratch')
104 | resume_from_checkpoint = False
105 | if args.model_type in ['gpt_neox']:
106 | gpt_data = True
107 | else:
108 | gpt_data = False
109 | data_collator = DataCollatorForMFTDataset(args.model_type, args.weighted_loss_mode, args.use_dynamic_padding)
110 | my_loss_function = loss_func_mft
111 | trainer = AtorchTrainer(
112 | model=model,
113 | args=args,
114 | train_dataset=train_dataset,
115 | valid_dataset=valid_dataset,
116 | tokenizer=tokenizer,
117 | # files_to_save=files_to_save,
118 | args_to_save={
119 | # 'max_length': args.max_length,
120 | 'max_length': args.seq_length,
121 | 'peft_type': args.peft_type,
122 | 'gpt_model': gpt_data
123 | },
124 | data_collator=data_collator,
125 | my_loss_func=my_loss_function,
126 | custom_lr_scheduler_type=args.custom_lr_scheduler_type,
127 | rank=global_rank
128 | )
129 | if args.do_train:
130 | trainer.train(resume_from_checkpoint=resume_from_checkpoint)
131 |
132 |
133 | if __name__ == "__main__":
134 | atorch_init_distributed("nccl")
135 | main()
136 | atorch_reset_distributed()
137 |
--------------------------------------------------------------------------------
/mftcoder_atorch/model/peft/utils/mapping.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import sys
5 | sys.path.append("..")
6 | sys.path.append("../..")
7 | import torch
8 | from peft.utils import (
9 | TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
10 | TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING
11 | )
12 |
13 |
14 | # needed for prefix-tuning of bloom model
15 | def bloom_model_postprocess_past_key_value(past_key_values):
16 | past_key_values = torch.cat(past_key_values)
17 | (
18 | total_layers,
19 | batch_size,
20 | num_attention_heads,
21 | num_virtual_tokens,
22 | head_dim,
23 | ) = past_key_values.shape
24 | keys = past_key_values[: total_layers // 2]
25 | keys = keys.transpose(2, 3).reshape(
26 | total_layers // 2,
27 | batch_size * num_attention_heads,
28 | head_dim,
29 | num_virtual_tokens,
30 | )
31 | values = past_key_values[total_layers // 2 :]
32 | values = values.reshape(
33 | total_layers // 2,
34 | batch_size * num_attention_heads,
35 | num_virtual_tokens,
36 | head_dim,
37 | )
38 |
39 | return tuple(zip(keys, values))
40 |
41 |
42 | NEW_TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = {
43 | "t5": ["q", "v"],
44 | "mt5": ["q", "v"],
45 | "bart": ["q_proj", "v_proj"],
46 | "gpt2": ["c_attn"],
47 | "bloom": ["query_key_value"],
48 | "bloomz": ["query_key_value"],
49 | "blip-2": ["q", "v", "q_proj", "v_proj"],
50 | "opt": ["q_proj", "v_proj"],
51 | "gptj": ["q_proj", "v_proj"],
52 | "gpt_neox": ["query_key_value"],
53 | "gpt_neo": ["q_proj", "v_proj"],
54 | "bert": ["query", "value"],
55 | "roberta": ["query", "value"],
56 | "xlm-roberta": ["query", "value"],
57 | "electra": ["query", "value"],
58 | "deberta-v2": ["query_proj", "value_proj"],
59 | "deberta": ["in_proj"],
60 | "layoutlm": ["query", "value"],
61 | "llama": ["q_proj", "v_proj"],
62 | "chatglm": ["query_key_value"],
63 | "glm": ["query_key_value"],
64 | }
65 |
66 | NEW_TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = {
67 | "t5": ["q", "k", "v", "o", "wi", "wo"],
68 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
69 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
70 | # "gpt2": ["c_attn"],
71 | "bloom": ["query_key_value"],
72 | "bloomz": ["query_key_value"],
73 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
74 | # "gptj": ["q_proj", "v_proj"],
75 | # "gpt_neox": ["query_key_value"],
76 | # "gpt_neo": ["q_proj", "v_proj"],
77 | # "bert": ["query", "value"],
78 | "roberta": ["query", "key", "value", "dense"],
79 | # "xlm-roberta": ["query", "value"],
80 | # "electra": ["query", "value"],
81 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"],
82 | "chatglm": ["query_key_value"],
83 | "glm": ["query_key_value"],
84 | # "deberta": ["in_proj"],
85 | # "layoutlm": ["query", "value"],
86 | }
87 |
88 | TRANSFORMERS_MODELS_TO_LORA_LAGE_TARGET_MODULES_MAPPING = {
89 | "t5": ["q", "k", "v", "o", "wi", "wo"],
90 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
91 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
92 | # "gpt2": ["c_attn"],
93 | "bloom": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
94 | "bloomz": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
95 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
96 | # "gptj": ["q_proj", "v_proj"],
97 | # "gpt_neox": ["query_key_value"],
98 | # "gpt_neo": ["q_proj", "v_proj"],
99 | # "bert": ["query", "value"],
100 | "roberta": ["query", "key", "value", "dense"],
101 | # "xlm-roberta": ["query", "value"],
102 | # "electra": ["query", "value"],
103 | "llama": ["q_proj", "v_proj"],
104 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"],
105 | "glm": ["query_key_value", "dense"]
106 | # "deberta": ["in_proj"],
107 | # "layoutlm": ["query", "value"],
108 | }
109 |
110 | TRANSFORMERS_MODELS_TO_ROUTELORA_TARGET_MODULES_MAPPING = {
111 | "t5": ["q", "k", "v", "o", "wi", "wo"],
112 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
113 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
114 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
115 | "roberta": ["query", "key", "value", "dense"],
116 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"],
117 | "chatglm": ["query_key_value"],
118 | "glm": ["query_key_value"]
119 | }
120 |
121 | TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING = {
122 | "glm": [0, 22],
123 | "bloom": [17, 22],
124 | "bloomz": [17, 22],
125 | }
126 |
127 | TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = {
128 | "bloom": bloom_model_postprocess_past_key_value,
129 | "bloomz": bloom_model_postprocess_past_key_value,
130 | }
131 |
132 | WEIGHTS_NAME = "adapter_model.bin"
133 | CONFIG_NAME = "adapter_config.json"
134 |
135 |
136 | TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.update(
137 | NEW_TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
138 | )
139 | TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING.update(
140 | NEW_TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING
141 | )
142 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/xxpo/xxpo_arguments.py:
--------------------------------------------------------------------------------
1 | """
2 | # @author Chaoyu Chen
3 | # @date 2023/10/19
4 |
5 | training arguments
6 | """
7 |
8 | from dataclasses import dataclass, asdict
9 | from typing import List, Union
10 |
11 |
12 | @dataclass
13 | class XXPOTrainArgs:
14 | # train data paths on shared FS
15 | data_paths: Union[str, List[str]]
16 |
17 | # output dir for saving adaptors in peft or full ckpts in full-parameter training
18 | output_dir: str
19 |
20 | # tensorboard dir for saving tensorboard logs
21 | tb_dir: str
22 |
23 |     # pretrained_model_path: path to the base model you want to train on
24 | pretrained_model_path: str
25 |
26 | # model type of pretrained_model_path, support llama|qwen|starcoder|baichuan|chatglm2
27 | model_type: str
28 |
29 | # train/valid/test split
30 | data_split: str = "98,2,0"
31 |
32 | # lora or qlora or None(for full-parameter training)
33 | peft_type: Union[None, str] = "qlora"
34 |
35 | # if qlora, 4bit will be set, else None
36 | quantization: Union[None, str] = "4bit"
37 |
38 |     # lora rank; the bigger, the more trainable parameters
39 | lora_rank: int = 96
40 |
41 | # lora alpha
42 | lora_alpha: int = 32
43 |
44 | # lora dropout
45 | lora_dropout: float = 0.05
46 |
47 | # lora targeting modules
48 | target_modules: Union[None, str, List[str]] = None
49 |
50 | # dpo or orpo
51 | xxpo: str = "dpo"
52 |
53 | # dpo/orpo beta
54 | beta: float = 0.1
55 |
56 | rpo_alpha: Union[None, float] = None
57 |
58 |     # micro train batch size
59 | per_device_train_batch_size: int = 8
60 |
61 | # micro eval batch size, always same as micro train batch size
62 | per_device_eval_batch_size: int = 8
63 |
64 | # HF AutoTokenizer is supported, maybe more types
65 | tokenizer_type: str = "AutoTokenizer"
66 |
67 | # initial lr
68 | learning_rate: float = 5e-5
69 |
70 | # minimum lr
71 | min_lr: float = 5e-6
72 |
73 | # weight decay
74 | weight_decay: float = 0.01
75 |
76 | # gradient_accumulation_steps
77 | gradient_accumulation_steps: int = 1
78 |
79 | # lr_scheduler_type
80 | lr_scheduler_type: str = "cosine"
81 |
82 | # optimizer_type
83 | optimizer_type: str = "adamw_torch"
84 | # optimizer_type: str = "paged_adamw_32bit"
85 |
86 | # gradient_checkpointing
87 | gradient_checkpointing: bool = True
88 | gradient_checkpointing_use_reentrant: bool = False
89 |
90 | # num of warmup_steps
91 | warmup_steps: Union[int, float] = 0.05
92 |
93 | # num_train_epochs
94 | num_train_epochs: int = 4
95 |
96 | # seed for reproducing
97 | seed: int = 1234
98 |
99 | # seq_length, context length
100 | seq_length: int = 4096
101 |
102 | save_only_model: bool = True
103 |
104 |     # path of the adapter to resume from; None means not resuming training
105 | resume_from_checkpoint: Union[None, str] = None
106 |
107 | # auto resume from latest ckpt if job restarted
108 | auto_resume: bool = True
109 |
110 | # num of steps for logging training loss
111 | logging_steps: int = 10
112 |
113 | # num of steps for saving ckpt
114 | save_steps: int = 100
115 |
116 |     # num of steps between evaluations (eval_loss); better kept the same as the checkpointing steps
117 | eval_steps: int = 100
118 |
119 |     # max train steps; if -1, determined by num_train_epochs
120 | max_steps: int = -1
121 |
122 |     # whether to checkpoint at the end of every epoch; may be set True in sst
123 | epoch_checkpointing: bool = False
124 |
125 | # shuffle before train/valid split
126 | shuffle_before_split: bool = True
127 |
128 |     # whether to stop early when eval loss has not improved over the past early_stopping_stall_num evaluation points
129 | early_stopping: bool = True
130 | early_stopping_stall_num: int = 5
131 |
132 | # limit num for saving ckpts, None for no limits. Used for full-parameter training to avoid exceeding disk quota.
133 | saving_limit: Union[None, int] = None
134 |
135 | # ATTENTION_CLASSES = { "eager": Normal Attention, "flash_attention_2": FlashAttention2}
136 | attn_implementation: str = "flash_attention_2"
137 |
138 | # tokenizer chat template, if None, will use MFTCoder template
139 | chat_template: Union[None, str] = None
140 |
141 | distributed_type: Union[None, str] = None
142 |
143 | init_timeout_seconds: Union[None, int] = 3600
144 |
145 | make_vocab_size_divisible_by: int = 32
146 | model_parallel_size: int = 1
147 | use_slow_tokenizer: bool = False
148 | world_size: int = 8
149 |
150 |     # maximum prompt length and maximum total sequence length
151 | max_prompt_length: Union[None, int] = 2048
152 | max_length: Union[None, int] = 4096
153 |
154 |     # number of processes used to process the dataset
155 | dataset_num_proc: int = 1
156 |
157 | # model_dtype[float16, bfloat16, float] for loading
158 | dtype: str = "bfloat16"
159 |
160 | # instrumentation
161 | disable_tqdm: bool = False
162 | sanity_check: bool = False
163 |
164 | # debug argument for distributed training
165 |     # "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. See"
166 | # "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992"
167 | ignore_bias_buffers: bool = True
168 |
169 | def dict(self):
170 | return {k: str(v) for k, v in asdict(self).items()}
171 |
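172 | 
173 | if __name__ == "__main__":
174 |     # Minimal sketch for illustration only: the fields without defaults must be
175 |     # provided explicitly; every path below is a placeholder, not a real location.
176 |     example_args = XXPOTrainArgs(
177 |         data_paths="/shared/fs/dpo_data.jsonl",
178 |         output_dir="/shared/fs/output",
179 |         tb_dir="/shared/fs/tensorboard",
180 |         pretrained_model_path="/shared/fs/models/base-model",
181 |         model_type="llama",
182 |         xxpo="dpo",  # or "orpo"
183 |         peft_type="qlora",
184 |     )
185 |     print(example_args.dict())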
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/mpt/mpt_arguments.py:
--------------------------------------------------------------------------------
1 | """
2 | # @author Chaoyu Chen
3 | # @date 2024/6/1
4 |
5 | MPT training arguments
6 | """
7 |
8 | from dataclasses import dataclass, asdict
9 | from typing import List, Union
10 |
11 |
12 | @dataclass
13 | class MptTrainArgs:
14 | # train data paths on shared FS
15 | data_paths: Union[str, List[str]]
16 |
17 | # output dir for saving adaptors in peft or full ckpts in full-parameter training
18 | output_dir: str
19 |
20 | # tensorboard dir for saving tensorboard logs
21 | tb_dir: str
22 |
23 |     # pretrained_model_path: path to the pretrained model to start training from
24 | pretrained_model_path: str
25 |
26 |     # model type of pretrained_model_path; supported: llama|qwen|starcoder|baichuan|chatglm2
27 | model_type: str
28 |
29 | # load from raw jsonl file or tokenized binary file
30 | load_raw_dataset: bool = True
31 |
32 | # weights of loss calculation for each task, None means equal weights
33 | task_weights: Union[None, str] = None
34 |
35 | # weights of data sampling, leave it None
36 | data_weights: Union[None, str] = None
37 |
38 | # hf loading model low_cpu_mem_usage
39 | low_cpu_mem_usage: bool = True
40 |
41 | # train/valid/test split
42 | data_split: str = "98,2,0"
43 |
44 | # padding or pack or concat
45 | padding_mode: str = "padding"
46 |
47 | # sft or sst
48 | tokenize_mode: str = "sft"
49 |
50 | # case3 or case4
51 | weighted_loss_mode: str = "case3"
52 |
53 |     # micro train batch size
54 | per_device_train_batch_size: int = 8
55 |
56 | # micro eval batch size, always same as micro train batch size
57 | per_device_eval_batch_size: int = 8
58 |
59 | # HF AutoTokenizer is supported, maybe more types
60 | tokenizer_type: str = "AutoTokenizer"
61 |
62 | # initial lr
63 | learning_rate: float = 5e-5
64 |
65 | # minimum lr
66 | min_lr: float = 5e-6
67 |
68 | # weight decay
69 | weight_decay: float = 0.01
70 |
71 | # gradient_accumulation_steps
72 | gradient_accumulation_steps: int = 1
73 |
74 | # lr_scheduler_type
75 | lr_scheduler_type: str = "cosine"
76 |
77 | # num_warmup_steps
78 | num_warmup_steps: Union[int, float] = 0.05
79 |
80 | # num_train_epochs
81 | num_train_epochs: int = 4
82 |
83 | # seed for reproducing
84 | seed: int = 1234
85 |
86 | # seq_length, context length
87 | seq_length: int = 4096
88 |
89 |     # path of the adapter to resume from; None means not resuming training
90 | resume_from_checkpoint: Union[None, str] = None
91 |
92 | # auto resume from latest ckpt if job restarted
93 | auto_resume: bool = True
94 |
95 | # num of steps for logging training loss
96 | log_interval: int = 10
97 |
98 | # num of steps for saving ckpt
99 | checkpointing_steps: int = 100
100 |
101 |     # num of steps between evaluations (eval_loss); better kept the same as the checkpointing steps
102 | evaluation_steps: int = 100
103 |
104 | # max train steps, if None, depends on num_train_epochs
105 | max_train_steps: Union[None, int] = None
106 |
107 |     # whether to checkpoint at the end of every epoch; may be set True in sst
108 | epoch_checkpointing: bool = False
109 |
110 | # save transformers model(safetensors)
111 | save_transformers_model: bool = False
112 |
113 | # shuffle before train/valid split
114 | shuffle_before_split: bool = True
115 |
116 | # DDP random sampler
117 | use_random_sampler: bool = True
118 |
119 |     # whether to stop early when eval loss has not improved over the past early_stopping_stall_num evaluation points
120 | early_stopping: bool = True
121 | early_stopping_stall_num: int = 5
122 |
123 | # limit num for saving ckpts, None for no limits. Used for full-parameter training to avoid exceeding disk quota.
124 | saving_limit: Union[None, int] = None
125 |
126 | # if dynamic padding
127 | use_dynamic_padding: bool = True
128 |
129 |     # warm-up steps for CoBa; the number of valid batches is recommended
130 |     coba_warmup_steps: int = 100
131 |     # history length of sampled valid losses used to fit the slope curve in CoBa; recommended range [2*coba_warmup_steps, 5*coba_warmup_steps]
132 |     coba_history_length: int = 200
133 |     # temperature for the divergence factor in CoBa
134 |     coba_tau: int = 5
135 |     # iteration interval for updating per-task training weights in CoBa
136 |     coba_update_interval: int = 1
137 |     # number of mini valid batches sampled at each update interval
138 | coba_sample_valid_num: int = 1
139 |
140 | # ATTENTION_CLASSES = { "eager": Normal Attention, "flash_attention_2": FlashAttention2}
141 | attn_implementation: str = "flash_attention_2"
142 |
143 |     # role markers: prompt templates placed before each role (system, user, assistant)
144 | # role_markers: {"system": "### System:\n", "user": "### Instruction:\n", "assistant": "### Response:\n"}
145 | role_markers: Union[None, dict] = None
146 |
147 | distributed_type: Union[None, str] = None
148 |
149 | init_timeout_seconds: Union[None, int] = 3600
150 |
151 | # legacy, leave them
152 | use_xformers: bool = True
153 | trust_remote_code: bool = True
154 | weight_by_num_documents: bool = True
155 | make_vocab_size_divisible_by: int = 32
156 | model_parallel_size: int = 1
157 | use_slow_tokenizer: bool = False
158 | world_size: int = 8
159 |
160 | def dict(self):
161 | return {k: str(v) for k, v in asdict(self).items()}
162 |
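163 | 
164 | if __name__ == "__main__":
165 |     # Minimal sketch for illustration only: checks the CoBa knob relationship noted
166 |     # in the comments above (coba_history_length within 2x-5x of coba_warmup_steps).
167 |     # Paths are placeholders, not real locations.
168 |     example_args = MptTrainArgs(
169 |         data_paths=["/shared/fs/task_a", "/shared/fs/task_b"],
170 |         output_dir="/shared/fs/output",
171 |         tb_dir="/shared/fs/tensorboard",
172 |         pretrained_model_path="/shared/fs/models/base-model",
173 |         model_type="qwen",
174 |         coba_warmup_steps=100,
175 |         coba_history_length=300,
176 |     )
177 |     assert 2 * example_args.coba_warmup_steps <= example_args.coba_history_length <= 5 * example_args.coba_warmup_steps
178 |     print(example_args.dict())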
--------------------------------------------------------------------------------
/mftcoder_atorch/model/peft/tuner/bitfit.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
3 | sys.path.append("../..")
4 | import torch
5 | import importlib
6 | from enum import Enum
7 | from peft.utils import PeftType
8 | from dataclasses import dataclass, field, asdict
9 | from typing import Optional, List
10 |
11 | from .pe_base_model import PEBaseModel
12 | from model.peft.utils import PetuningConfig
13 | from model.peft.utils.others import _freeze_model
14 |
15 |
16 | def is_alps_available():
17 | return importlib.util.find_spec("alps") is not None
18 |
19 |
20 | if is_alps_available():
21 | from alps.util import logger
22 | else:
23 | import logging
24 | logger = logging.getLogger(__file__)
25 |
26 |
27 | class PEBitfitModel(PEBaseModel):
28 | """
29 |     Train only the model's bias parameters; see https://arxiv.org/pdf/2106.10199.pdf
30 | model: huggingface transformers model
31 | tokenizer: huggingface transformers tokenizer
32 | """
33 |
34 | def __init__(self, model):
35 | self.model = model
36 |
37 | def get_model(self):
38 | not_freeze_param_name = ["bias"]
39 | set_parameter_requires_grad(self.model, not_freeze_param_name)
40 | return self.model
41 |
42 | @classmethod
43 | def restore(self, model=None, path=None):
44 |         logger.info("BitFit does not need to load any extra parameters")
45 | return model
46 |
47 |
48 | # Freeze parameters by name: only parameters whose names contain one of the given strings stay trainable
49 | def set_parameter_requires_grad(model, freeze_param_name=[]):
50 | if not isinstance(freeze_param_name, list):
51 | freeze_param_name = [freeze_param_name]
52 |
53 | for idx, (name, param) in enumerate(model.named_parameters()):
54 | for p in freeze_param_name:
55 | if p not in name:
56 | param.requires_grad = False
57 |     # Print the parameters kept trainable and ensure requires_grad is True for them
58 | for idx, (name, param) in enumerate(model.named_parameters()):
59 | for p in freeze_param_name:
60 | if p in name:
61 | print("trainable parameter name is:")
62 | print(name)
63 | param.requires_grad = True
64 |
65 |
66 | @dataclass
67 | class PeftBitfitConfig(PetuningConfig):
68 | """
69 | This is the configuration class to store the configuration of a [`PeftBitfitModel`].
70 |
71 | Args:
72 | modules_to_save (`List[str]`):List of modules apart from LoRA layers to be set as trainable
73 | and saved in the final checkpoint.
74 | """
75 |
76 | modules_to_save: Optional[List[str]] = field(
77 | default=None,
78 | metadata={
79 | "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. "
80 | "For example, in Sequence Classification or Token Classification tasks, "
81 | "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
82 | },
83 | )
84 |
85 | def __post_init__(self):
86 | self.peft_type = PeftType.BITFIT
87 |
88 |
89 | class PeftBitfitModel(torch.nn.Module):
90 | """
91 | Creates Bitfit model for ant peft.
92 |
93 | Args:
94 |         model ([`~transformers.PreTrainedModel`]): The model to be partially frozen.
95 | config ([`PeftBitfitConfig`]): The configuration of the Bitfit model.
96 |
97 | Returns:
98 | `torch.nn.Module`: The Bitfit model.
99 |
100 | **Attributes**:
101 |         - **model** ([`~transformers.PreTrainedModel`]) -- The model to be frozen.
102 | - **peft_config** ([`PeftBitfitConfig`]): The configuration of the Bitfit model.
103 | """
104 |
105 | def __init__(self, model, config, adapter_name):
106 | super().__init__()
107 | self.model = model
108 |
109 | self.forward = self.model.forward
110 | self.peft_config = config
111 | self.add_adapter(adapter_name, self.peft_config[adapter_name])
112 |
113 | def add_adapter(self, adapter_name, config=None):
114 | if not isinstance(config, PeftBitfitConfig):
115 | raise ValueError(
116 |                 f"PeftBitfitModel needs a PeftBitfitConfig, but got {type(config)}."
117 | )
118 |
119 | if config is not None:
120 | config = self._prepare_lora_config(config)
121 | self.peft_config[adapter_name] = config
122 |
123 | if len(self.peft_config) > 1:
124 | raise ValueError(
125 |                     "BitfitModel supports only 1 peft config or name, "
126 |                     "because it only changes requires_grad on existing parameters without adding any new ones."
127 | )
128 |
129 | self.model = PEBitfitModel(self.model).get_model()
130 |
131 | if self.peft_config[adapter_name].inference_mode:
132 | _freeze_model(self.model)
133 |
134 | @staticmethod
135 | def _prepare_lora_config(peft_config):
136 | if peft_config.inference_mode:
137 | peft_config.merge_weights = True
138 | return peft_config
139 |
140 | def __getattr__(self, name: str):
141 | """Forward missing attributes to the wrapped module."""
142 | try:
143 | return super().__getattr__(name) # defer to nn.Module's logic
144 | except AttributeError:
145 | return getattr(self.model, name)
146 |
147 | def get_peft_config_as_dict(self, inference: bool = False):
148 | config_dict = {}
149 | for key, value in self.peft_config.items():
150 | config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
151 | if inference:
152 | config["inference_mode"] = True
153 | config_dict[key] = config
154 | return config
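155 | 
156 | 
157 | if __name__ == "__main__":
158 |     # Minimal sketch for illustration only (run as a module, e.g. `python -m ...`,
159 |     # because of the relative imports above): wrap a toy torch model so that only
160 |     # its bias parameters stay trainable, which is all BitFit does.
161 |     toy = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 2))
162 |     tuned = PEBitfitModel(toy).get_model()
163 |     trainable = [n for n, p in tuned.named_parameters() if p.requires_grad]
164 |     print("Trainable parameters:", trainable)  # expected: only the ".bias" entries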
--------------------------------------------------------------------------------
/mftcoder_atorch/utils/learning_rates.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, EleutherAI
2 | # This file is based on code by the authors denoted below and has been modified from its original version.
3 | #
4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | """Learning rate decay functions."""
19 |
20 | import math
21 |
22 | # from .common_utils import print_rank_0
23 |
24 |
25 | class AnnealingLR(object):
26 | """Anneals the learning rate."""
27 |
28 | def __init__(
29 | self,
30 | optimizer,
31 | start_lr,
32 | warmup_iter,
33 | total_iters,
34 | decay_style,
35 | last_iter,
36 | min_lr=0.0,
37 | use_checkpoint_lr_scheduler=True,
38 | override_lr_scheduler=False,
39 | use_mup=False,
40 | ):
41 |
42 | # Class values.
43 | self.optimizer = optimizer
44 | self.start_lr = start_lr
45 | self.min_lr = min_lr
46 | self.warmup_iter = warmup_iter
47 | self.num_iters = last_iter
48 | self.end_iter = total_iters
49 | assert self.end_iter > 0
50 | self.decay_style = decay_style
51 | self.override_lr_scheduler = override_lr_scheduler
52 | self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
53 | self.use_mup = use_mup
54 | if self.override_lr_scheduler:
55 | assert not self.use_checkpoint_lr_scheduler, (
56 | "both override and " "use-checkpoint are set."
57 | )
58 | # Set the learning rate
59 | self.step(self.num_iters)
60 |
61 | print("> learning rate decay style: {}".format(self.decay_style))
62 |
63 | def update_lr(self, lr):
64 | self.start_lr = lr
65 |
66 | def get_lr(self):
67 | """Learning rate decay functions from:
68 | https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
69 |
70 | num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter)
71 | # Warmup.
72 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
73 | return float(self.start_lr) * num_iters_ / self.warmup_iter
74 |
75 | num_iters_ = num_iters_ - self.warmup_iter
76 | if self.decay_style == "linear":
77 | lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter
78 | elif self.decay_style == "cosine":
79 | lr = (
80 | self.start_lr
81 | / 2.0
82 | * (math.cos(math.pi * num_iters_ / self.end_iter) + 1)
83 | )
84 | elif self.decay_style == "exponential":
85 | # exp(-0.693) = 1/2
86 | lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter)
87 | else:
88 | lr = self.start_lr
89 | return max(lr, self.min_lr)
90 |
91 | def step(self, step_num=None):
92 | """Set lr for all parameters groups."""
93 | if step_num is None:
94 | step_num = self.num_iters + 1
95 | self.num_iters = step_num
96 | new_lr = self.get_lr()
97 | for group in self.optimizer.param_groups:
98 | if self.use_mup and "width_mult" in group:
99 | group["lr"] = new_lr / group["width_mult"]
100 | else:
101 | group["lr"] = new_lr
102 |
103 | def state_dict(self):
104 | state_dict = {
105 | "start_lr": self.start_lr,
106 | "warmup_iter": self.warmup_iter,
107 | "num_iters": self.num_iters,
108 | "decay_style": self.decay_style,
109 | "end_iter": self.end_iter,
110 | "min_lr": self.min_lr,
111 | }
112 | return state_dict
113 |
114 | def _check_and_set(self, cls_value, sd_value, name):
115 | """Auxiliary function for checking the values in the checkpoint and
116 | setting them."""
117 | if self.override_lr_scheduler:
118 |             print(" > overriding {} value to {}".format(name, cls_value))
119 | return cls_value
120 |
121 | if not self.use_checkpoint_lr_scheduler:
122 | assert cls_value == sd_value, (
123 |                 "AnnealingLR: class input value "
124 |                 "and checkpoint values for {} do not match".format(name)
125 |             )
126 |         print(" > using checkpoint value {} for {}".format(sd_value, name))
127 | return sd_value
128 |
129 | def load_state_dict(self, sd):
130 |
131 | self.start_lr = self._check_and_set(
132 | self.start_lr, sd["start_lr"], "learning rate"
133 | )
134 | self.min_lr = self._check_and_set(
135 | self.min_lr, sd["min_lr"], "minimum learning rate"
136 | )
137 | self.warmup_iter = self._check_and_set(
138 | self.warmup_iter, sd["warmup_iter"], "warmup iterations"
139 | )
140 | self.end_iter = self._check_and_set(
141 | self.end_iter, sd["end_iter"], "total number of iterations"
142 | )
143 | self.decay_style = self._check_and_set(
144 | self.decay_style, sd["decay_style"], "decay style"
145 | )
146 |
147 | self.num_iters = sd["num_iters"]
148 | self.step(self.num_iters)
149 |
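150 | 
151 | if __name__ == "__main__":
152 |     # Minimal sketch for illustration only: drive the scheduler with a dummy SGD
153 |     # optimizer and print the lr at a few iterations (warmup, then cosine decay).
154 |     import torch
155 | 
156 |     optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1.0)
157 |     scheduler = AnnealingLR(
158 |         optimizer,
159 |         start_lr=1.0,
160 |         warmup_iter=10,
161 |         total_iters=100,
162 |         decay_style="cosine",
163 |         last_iter=0,
164 |         min_lr=0.1,
165 |     )
166 |     for it in (1, 10, 50, 100):
167 |         scheduler.step(it)
168 |         print(it, optimizer.param_groups[0]["lr"])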
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/pefts/mft_arguments.py:
--------------------------------------------------------------------------------
1 | """
2 | # @author Chaoyu Chen
3 | # @date 2023/10/19
4 |
5 | training arguments
6 | """
7 |
8 | from dataclasses import dataclass, asdict
9 | from typing import List, Union
10 |
11 |
12 | @dataclass
13 | class MftTrainArgs:
14 | # train data paths on shared FS
15 | data_paths: Union[str, List[str]]
16 |
17 | # output dir for saving adaptors in peft or full ckpts in full-parameter training
18 | output_dir: str
19 |
20 | # tensorboard dir for saving tensorboard logs
21 | tb_dir: str
22 |
23 |     # pretrained_model_path: path to the pretrained model to start training from
24 | pretrained_model_path: str
25 |
26 |     # model type of pretrained_model_path; supported: llama|qwen|starcoder|baichuan|chatglm2
27 | model_type: str
28 |
29 | # load from raw jsonl file or tokenized binary file
30 | load_raw_dataset: bool = True
31 |
32 | # weights of loss calculation for each task, None means equal weights
33 | task_weights: Union[None, str] = None
34 |
35 | # weights of data sampling, leave it None
36 | data_weights: Union[None, str] = None
37 |
38 | # hf loading model low_cpu_mem_usage
39 | low_cpu_mem_usage: bool = True
40 |
41 | # train/valid/test split
42 | data_split: str = "98,2,0"
43 |
44 | # padding or pack or concat
45 | padding_mode: str = "padding"
46 |
47 | # sft or sst
48 | tokenize_mode: str = "sft"
49 |
50 | # mft loss mode
51 | weighted_loss_mode: str = "case3"
52 |
53 | # lora or qlora or None(for full-parameter training)
54 | peft_type: Union[None, str] = "qlora"
55 |
56 | # if qlora, 4bit will be set, else None
57 | quantization: Union[None, str] = "4bit"
58 |
59 |     # lora rank; the bigger, the more trainable parameters
60 | lora_rank: int = 96
61 |
62 | # lora alpha
63 | lora_alpha: int = 32
64 |
65 | # lora dropout
66 | lora_dropout: float = 0.05
67 |
68 | # lora targeting modules
69 | target_modules: Union[None, str, List[str]] = None
70 |
71 |     # micro train batch size
72 | per_device_train_batch_size: int = 8
73 |
74 | # micro eval batch size, always same as micro train batch size
75 | per_device_eval_batch_size: int = 8
76 |
77 | # HF AutoTokenizer is supported, maybe more types
78 | tokenizer_type: str = "AutoTokenizer"
79 |
80 | # initial lr
81 | learning_rate: float = 5e-5
82 |
83 | # minimum lr
84 | min_lr: float = 5e-6
85 |
86 | # weight decay
87 | weight_decay: float = 0.01
88 |
89 | # gradient_accumulation_steps
90 | gradient_accumulation_steps: int = 1
91 |
92 | # lr_scheduler_type
93 | lr_scheduler_type: str = "cosine"
94 |
95 | # num_warmup_steps
96 | num_warmup_steps: Union[int, float] = 0.05
97 |
98 | # num_train_epochs
99 | num_train_epochs: int = 4
100 |
101 | # seed for reproducing
102 | seed: int = 1234
103 |
104 | # seq_length, context length
105 | seq_length: int = 4096
106 |
107 |     # path of the adapter to resume from; None means not resuming training
108 | resume_from_checkpoint: Union[None, str] = None
109 |
110 | # auto resume from latest ckpt if job restarted
111 | auto_resume: bool = True
112 |
113 | # num of steps for logging training loss
114 | log_interval: int = 10
115 |
116 | # num of steps for saving ckpt
117 | checkpointing_steps: int = 100
118 |
119 |     # num of steps between evaluations (eval_loss); better kept the same as the checkpointing steps
120 | evaluation_steps: int = 100
121 |
122 | # max train steps, if None, depends on num_train_epochs
123 | max_train_steps: Union[None, int] = None
124 |
125 |     # whether to checkpoint at the end of every epoch; may be set True in sst
126 | epoch_checkpointing: bool = False
127 |
128 | # shuffle before train/valid split
129 | shuffle_before_split: bool = True
130 |
131 | # DDP random sampler
132 | use_random_sampler: bool = True
133 |
134 |     # whether to stop early when eval loss has not improved over the past early_stopping_stall_num evaluation points
135 | early_stopping: bool = True
136 | early_stopping_stall_num: int = 5
137 |
138 | # limit num for saving ckpts, None for no limits. Used for full-parameter training to avoid exceeding disk quota.
139 | saving_limit: Union[None, int] = None
140 |
141 | # if dynamic padding
142 | use_dynamic_padding: bool = True
143 |
144 |     # warm-up steps for CoBa; the number of valid batches is recommended
145 |     coba_warmup_steps: int = 100
146 |     # history length of sampled valid losses used to fit the slope curve in CoBa; recommended range [2*coba_warmup_steps, 5*coba_warmup_steps]
147 |     coba_history_length: int = 200
148 |     # temperature for the divergence factor in CoBa
149 |     coba_tau: int = 5
150 |     # iteration interval for updating per-task training weights in CoBa
151 |     coba_update_interval: int = 1
152 |     # number of mini valid batches sampled at each update interval
153 | coba_sample_valid_num: int = 1
154 |
155 | # ATTENTION_CLASSES = { "eager": Normal Attention, "flash_attention_2": FlashAttention2}
156 | attn_implementation: str = "flash_attention_2"
157 |
158 |     # role markers: prompt templates placed before each role (system, user, assistant)
159 | # role_markers: {"system": "### System:\n", "user": "### Instruction:\n", "assistant": "### Response:\n"}
160 | role_markers: Union[None, dict] = None
161 |
162 | distributed_type: Union[None, str] = None
163 |
164 | init_timeout_seconds: Union[None, int] = 3600
165 |
166 | # legacy, leave them
167 | use_xformers: bool = True
168 | trust_remote_code: bool = True
169 | weight_by_num_documents: bool = True
170 | make_vocab_size_divisible_by: int = 32
171 | model_parallel_size: int = 1
172 | use_slow_tokenizer: bool = False
173 | world_size: int = 8
174 |
175 | def dict(self):
176 | return {k: str(v) for k, v in asdict(self).items()}
177 |
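178 | 
179 | if __name__ == "__main__":
180 |     # Minimal sketch for illustration only: in practice these arguments are populated
181 |     # from a JSON config such as configs/lora_train_config.json (path relative to
182 |     # mftcoder_accelerate/src). Dropping unknown keys below is an assumption, not
183 |     # necessarily what the launcher does.
184 |     import json
185 | 
186 |     with open("configs/lora_train_config.json", "r") as f:
187 |         cfg = json.load(f)
188 |     known = {k: v for k, v in cfg.items() if k in MftTrainArgs.__dataclass_fields__}
189 |     example_args = MftTrainArgs(**known)
190 |     print(example_args.peft_type, example_args.lora_rank, example_args.lora_alpha)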
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/aquila2/configuration_aquila.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3 | #
4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5 | # and OPT implementations in this library. It has been modified from its
6 | # original forms to accommodate minor architectural differences compared
7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License");
10 | # you may not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | """ Aquila model configuration"""
21 |
22 | from transformers import PretrainedConfig
23 |
24 |
25 |
26 | class AquilaConfig(PretrainedConfig):
27 | r"""
28 | This is the configuration class to store the configuration of a [`AquilaModel`]. It is used to instantiate an Aquila
29 | model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
30 | defaults will yield a similar configuration to that of the Aquila-7B.
31 |
32 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33 | documentation from [`PretrainedConfig`] for more information.
34 |
35 |
36 | Args:
37 |         vocab_size (`int`, *optional*, defaults to 100008):
38 | Vocabulary size of the Aquila model. Defines the number of different tokens that can be represented by the
39 | `inputs_ids` passed when calling [`AquilaModel`]
40 | hidden_size (`int`, *optional*, defaults to 4096):
41 | Dimension of the hidden representations.
42 | intermediate_size (`int`, *optional*, defaults to 11008):
43 | Dimension of the MLP representations.
44 | num_hidden_layers (`int`, *optional*, defaults to 32):
45 | Number of hidden layers in the Transformer encoder.
46 | num_attention_heads (`int`, *optional*, defaults to 32):
47 | Number of attention heads for each attention layer in the Transformer encoder.
48 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
49 | The non-linear activation function (function or string) in the decoder.
50 | max_position_embeddings (`int`, *optional*, defaults to 2048):
51 | The maximum sequence length that this model might ever be used with. Typically set this to something large
52 | just in case (e.g., 512 or 1024 or 2048).
53 | initializer_range (`float`, *optional*, defaults to 0.02):
54 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
55 |         rms_norm_eps (`float`, *optional*, defaults to 1e-6):
56 | The epsilon used by the rms normalization layers.
57 | use_cache (`bool`, *optional*, defaults to `True`):
58 | Whether or not the model should return the last key/values attentions (not used by all models). Only
59 | relevant if `config.is_decoder=True`.
60 | tie_word_embeddings(`bool`, *optional*, defaults to `False`):
61 | Whether to tie weight embeddings
62 | Example:
63 |
64 | ```python
65 | >>> from transformers import AquilaModel, AquilaConfig
66 |
67 | >>> # Initializing a Aquila aquila-7b style configuration
68 | >>> configuration = AquilaConfig()
69 |
70 | >>> # Initializing a model from the aquila-7b style configuration
71 | >>> model = AquilaModel(configuration)
72 |
73 | >>> # Accessing the model configuration
74 | >>> configuration = model.config
75 | ```"""
76 | model_type = "aquila"
77 | keys_to_ignore_at_inference = ["past_key_values"]
78 |
79 | def __init__(
80 | self,
81 | vocab_size=100008,
82 | hidden_size=4096,
83 | intermediate_size=11008,
84 | num_hidden_layers=32,
85 | num_attention_heads=32,
86 | num_key_value_heads=None,
87 | hidden_act="silu",
88 | max_position_embeddings=2048,
89 | initializer_range=0.02,
90 | rms_norm_eps=1e-6,
91 | use_cache=True,
92 | pad_token_id=0,
93 | bos_token_id=1,
94 | eos_token_id=2,
95 | pretraining_tp=1,
96 | tie_word_embeddings=False,
97 | rope_theta=10000.0,
98 | rope_scaling=None,
99 | use_xformers=True,
100 | **kwargs,
101 | ):
102 | self.vocab_size = vocab_size
103 | self.max_position_embeddings = max_position_embeddings
104 | self.hidden_size = hidden_size
105 | self.intermediate_size = intermediate_size
106 | self.num_hidden_layers = num_hidden_layers
107 |
108 | # for backward compatibility
109 | if num_key_value_heads is None:
110 | num_key_value_heads = num_attention_heads
111 |
112 | self.num_key_value_heads = num_key_value_heads
113 |
114 | self.num_attention_heads = num_attention_heads
115 | self.hidden_act = hidden_act
116 | self.initializer_range = initializer_range
117 | self.rms_norm_eps = rms_norm_eps
118 | self.pretraining_tp = pretraining_tp
119 | self.use_cache = use_cache
120 | self.rope_theta = rope_theta
121 | self.rope_scaling = rope_scaling
122 | self.use_xformers = use_xformers
123 |
124 | super().__init__(
125 | pad_token_id=pad_token_id,
126 | bos_token_id=bos_token_id,
127 | eos_token_id=eos_token_id,
128 | tie_word_embeddings=tie_word_embeddings,
129 | **kwargs,
130 | )
131 |
132 |
--------------------------------------------------------------------------------
/mftcoder_atorch/model/gpt_neox/configuration_gpt_neox.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ GPTNeoX model configuration"""
16 |
17 | from transformers.configuration_utils import PretrainedConfig
18 | from transformers.utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 | GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json",
25 | # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox
26 | }
27 |
28 |
29 | class GPTNeoXConfig(PretrainedConfig):
30 | r"""
31 | This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an
32 | GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration
33 | with the defaults will yield a similar configuration to that of the GPTNeoX
34 | [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture.
35 |
36 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
37 | documentation from [`PretrainedConfig`] for more information.
38 |
39 |
40 | Args:
41 | vocab_size (`int`, *optional*, defaults to 50432):
42 | Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the
43 | `inputs_ids` passed when calling [`GPTNeoXModel`].
44 | hidden_size (`int`, *optional*, defaults to 6144):
45 | Dimension of the encoder layers and the pooler layer.
46 | num_hidden_layers (`int`, *optional*, defaults to 44):
47 | Number of hidden layers in the Transformer encoder.
48 | num_attention_heads (`int`, *optional*, defaults to 64):
49 | Number of attention heads for each attention layer in the Transformer encoder.
50 | intermediate_size (`int`, *optional*, defaults to 24576):
51 | Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
52 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
53 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
54 | `"relu"`, `"selu"` and `"gelu_new"` are supported.
55 | rotary_pct (`float`, *optional*, defaults to 0.25):
56 | percentage of hidden dimensions to allocate to rotary embeddings
57 | rotary_emb_base (`int`, *optional*, defaults to 10000)
58 | base for computing rotary embeddings frequency
59 | max_position_embeddings (`int`, *optional*, defaults to 2048):
60 | The maximum sequence length that this model might ever be used with. Typically set this to something large
61 | just in case (e.g., 512 or 1024 or 2048).
62 |         initializer_range (`float`, *optional*, defaults to 0.02):
63 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
64 |         layer_norm_eps (`float`, *optional*, defaults to 1e-5):
65 | The epsilon used by the layer normalization layers.
66 | use_cache (`bool`, *optional*, defaults to `True`):
67 | Whether or not the model should return the last key/values attentions (not used by all models). Only
68 | relevant if `config.is_decoder=True`.
69 | use_parallel_residual (`bool`, *optional*, defaults to `True`):
70 | Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training
71 | speedup at large scales (e.g. 20B).
72 | Example:
73 |
74 | ```python
75 | >>> from transformers import GPTNeoXConfig, GPTNeoXModel
76 |
77 | >>> # Initializing a GPTNeoX gpt-neox-20b style configuration
78 | >>> configuration = GPTNeoXConfig()
79 |
80 | >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration
81 | >>> model = GPTNeoXModel(configuration) # doctest: +SKIP
82 |
83 | >>> # Accessing the model configuration
84 | >>> configuration = model.config # doctest: +SKIP
85 | ```"""
86 | model_type = "gpt_neox"
87 |
88 | def __init__(
89 | self,
90 | vocab_size=50432,
91 | hidden_size=6144,
92 | num_hidden_layers=44,
93 | num_attention_heads=64,
94 | intermediate_size=24576,
95 | hidden_act="gelu",
96 | rotary_pct=0.25,
97 | rotary_emb_base=10000,
98 | max_position_embeddings=2048,
99 | initializer_range=0.02,
100 | layer_norm_eps=1e-5,
101 | use_cache=True,
102 | bos_token_id=0,
103 | eos_token_id=2,
104 | tie_word_embeddings=False,
105 | use_parallel_residual=True,
106 | **kwargs,
107 | ):
108 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
109 | self.vocab_size = vocab_size
110 | self.max_position_embeddings = max_position_embeddings
111 | self.hidden_size = hidden_size
112 | self.num_hidden_layers = num_hidden_layers
113 | self.num_attention_heads = num_attention_heads
114 | self.intermediate_size = intermediate_size
115 | self.hidden_act = hidden_act
116 | self.rotary_pct = rotary_pct
117 | self.rotary_emb_base = rotary_emb_base
118 | self.initializer_range = initializer_range
119 | self.layer_norm_eps = layer_norm_eps
120 | self.use_cache = use_cache
121 | self.tie_word_embeddings = tie_word_embeddings
122 | self.use_parallel_residual = use_parallel_residual
--------------------------------------------------------------------------------
/mftcoder_atorch/model/gpt_neox/tokenization_gpt_neox_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for GPTNeoX."""
16 | import json
17 | from typing import TYPE_CHECKING, List, Optional, Tuple
18 |
19 | from tokenizers import pre_tokenizers
20 |
21 | from transformers import PreTrainedTokenizerFast
22 | from transformers.utils import logging
23 |
24 |
25 | if TYPE_CHECKING:
26 | from transformers.pipelines.conversational import Conversation
27 |
28 |
29 | logger = logging.get_logger(__name__)
30 |
31 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
32 |
33 | PRETRAINED_VOCAB_FILES_MAP = {
34 | "tokenizer_file": {
35 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/tokenizer.json",
36 | },
37 | }
38 |
39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
40 | "gpt-neox-20b": 2048,
41 | }
42 |
43 |
44 | class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
45 | """
46 | Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
47 | Byte-Pair-Encoding.
48 |
49 | This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
50 | be encoded differently whether it is at the beginning of the sentence (without space) or not:
51 |
52 | ```
53 | >>> from transformers import GPTNeoXTokenizerFast
54 | >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2")
55 | >>> tokenizer("Hello world")['input_ids']
56 | [15496, 995]
57 | >>> tokenizer(" Hello world")['input_ids']
58 | [18435, 995]
59 | ```
60 |
61 | You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
62 | the model was not pretrained this way, it might yield a decrease in performance.
63 |
64 |
65 |
66 | When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
67 |
68 |
69 |
70 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
71 | refer to this superclass for more information regarding those methods.
72 |
73 | Args:
74 | vocab_file (`str`):
75 | Path to the vocabulary file.
76 | merges_file (`str`):
77 | Path to the merges file.
78 | errors (`str`, *optional*, defaults to `"replace"`):
79 | Paradigm to follow when decoding bytes to UTF-8. See
80 | [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
81 | unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
82 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
83 | token instead.
84 | bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
85 | The beginning of sequence token.
86 | eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
87 | The end of sequence token.
88 | add_prefix_space (`bool`, *optional*, defaults to `False`):
89 |             Whether or not to add an initial space to the input. This allows treating the leading word just as any
90 |             other word. (The GPTNeoX tokenizer detects the beginning of words by the preceding space.)
91 | trim_offsets (`bool`, *optional*, defaults to `True`):
92 | Whether or not the post-processing step should trim offsets to avoid including whitespaces.
93 | """
94 |
95 | vocab_files_names = VOCAB_FILES_NAMES
96 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
97 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
98 | model_input_names = ["input_ids", "attention_mask"]
99 |
100 | def __init__(
101 | self,
102 | vocab_file=None,
103 | merges_file=None,
104 | tokenizer_file=None,
105 | unk_token="<|endoftext|>",
106 | bos_token="<|endoftext|>",
107 | eos_token="<|endoftext|>",
108 | add_prefix_space=False,
109 | **kwargs,
110 | ):
111 | super().__init__(
112 | vocab_file,
113 | merges_file,
114 | tokenizer_file=tokenizer_file,
115 | unk_token=unk_token,
116 | bos_token=bos_token,
117 | eos_token=eos_token,
118 | add_prefix_space=add_prefix_space,
119 | **kwargs,
120 | )
121 |
122 | pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
123 | if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
124 | pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
125 | pre_tok_state["add_prefix_space"] = add_prefix_space
126 | self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
127 |
128 | self.add_prefix_space = add_prefix_space
129 |
130 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
131 | files = self._tokenizer.model.save(save_directory, name=filename_prefix)
132 | return tuple(files)
133 |
134 | def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
135 | """This corresponds to DialoGPT variants of models."""
136 | input_ids = []
137 | for is_user, text in conversation.iter_texts():
138 | input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
139 |
140 | if len(input_ids) > self.model_max_length:
141 | input_ids = input_ids[-self.model_max_length :]
142 | return input_ids
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/gpt_neox/configuration_gpt_neox.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ GPTNeoX model configuration"""
16 |
17 | from transformers.configuration_utils import PretrainedConfig
18 | from transformers.utils import logging
19 | # from ...configuration_utils import PretrainedConfig
20 | # from ...utils import logging
21 |
22 |
23 | logger = logging.get_logger(__name__)
24 |
25 | GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json",
27 | # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox
28 | }
29 |
30 |
31 | class GPTNeoXConfig(PretrainedConfig):
32 | r"""
33 | This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an
34 | GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration
35 | with the defaults will yield a similar configuration to that of the GPTNeoX
36 | [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture.
37 |
38 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39 | documentation from [`PretrainedConfig`] for more information.
40 |
41 |
42 | Args:
43 | vocab_size (`int`, *optional*, defaults to 50432):
44 | Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the
45 | `inputs_ids` passed when calling [`GPTNeoXModel`].
46 | hidden_size (`int`, *optional*, defaults to 6144):
47 | Dimension of the encoder layers and the pooler layer.
48 | num_hidden_layers (`int`, *optional*, defaults to 44):
49 | Number of hidden layers in the Transformer encoder.
50 | num_attention_heads (`int`, *optional*, defaults to 64):
51 | Number of attention heads for each attention layer in the Transformer encoder.
52 | intermediate_size (`int`, *optional*, defaults to 24576):
53 | Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
54 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
55 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
56 | `"relu"`, `"selu"` and `"gelu_new"` are supported.
57 | rotary_pct (`float`, *optional*, defaults to 0.25):
58 | percentage of hidden dimensions to allocate to rotary embeddings
59 | rotary_emb_base (`int`, *optional*, defaults to 10000)
60 | base for computing rotary embeddings frequency
61 | max_position_embeddings (`int`, *optional*, defaults to 2048):
62 | The maximum sequence length that this model might ever be used with. Typically set this to something large
63 | just in case (e.g., 512 or 1024 or 2048).
64 |         initializer_range (`float`, *optional*, defaults to 0.02):
65 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
66 |         layer_norm_eps (`float`, *optional*, defaults to 1e-5):
67 | The epsilon used by the layer normalization layers.
68 | use_cache (`bool`, *optional*, defaults to `True`):
69 | Whether or not the model should return the last key/values attentions (not used by all models). Only
70 | relevant if `config.is_decoder=True`.
71 | use_parallel_residual (`bool`, *optional*, defaults to `True`):
72 | Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training
73 | speedup at large scales (e.g. 20B).
74 | Example:
75 |
76 | ```python
77 | >>> from transformers import GPTNeoXConfig, GPTNeoXModel
78 |
79 | >>> # Initializing a GPTNeoX gpt-neox-20b style configuration
80 | >>> configuration = GPTNeoXConfig()
81 |
82 | >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration
83 | >>> model = GPTNeoXModel(configuration) # doctest: +SKIP
84 |
85 | >>> # Accessing the model configuration
86 | >>> configuration = model.config # doctest: +SKIP
87 | ```"""
88 | model_type = "gpt_neox"
89 |
90 | def __init__(
91 | self,
92 | vocab_size=50432,
93 | hidden_size=6144,
94 | num_hidden_layers=44,
95 | num_attention_heads=64,
96 | intermediate_size=24576,
97 | hidden_act="gelu",
98 | rotary_pct=0.25,
99 | rotary_emb_base=10000,
100 | max_position_embeddings=2048,
101 | initializer_range=0.02,
102 | layer_norm_eps=1e-5,
103 | use_cache=True,
104 | bos_token_id=0,
105 | eos_token_id=2,
106 | tie_word_embeddings=False,
107 | use_parallel_residual=True,
108 | **kwargs,
109 | ):
110 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
111 | self.vocab_size = vocab_size
112 | self.max_position_embeddings = max_position_embeddings
113 | self.hidden_size = hidden_size
114 | self.num_hidden_layers = num_hidden_layers
115 | self.num_attention_heads = num_attention_heads
116 | self.intermediate_size = intermediate_size
117 | self.hidden_act = hidden_act
118 | self.rotary_pct = rotary_pct
119 | self.rotary_emb_base = rotary_emb_base
120 | self.initializer_range = initializer_range
121 | self.layer_norm_eps = layer_norm_eps
122 | self.use_cache = use_cache
123 | self.tie_word_embeddings = tie_word_embeddings
124 | self.use_parallel_residual = use_parallel_residual
--------------------------------------------------------------------------------
/mftcoder_atorch/model/peft/tuner/roem.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
3 | sys.path.append("../..")
4 | import torch
5 | import importlib
6 | from enum import Enum
7 | from peft.utils import PeftType
8 | from dataclasses import dataclass, field, asdict
9 | from typing import Optional, List, Union
10 |
11 | from .pe_base_model import PEBaseModel
12 | from model.peft.utils import (
13 | PetuningConfig,
14 | TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING
15 | )
16 | from model.peft.utils.others import _freeze_model
17 |
18 |
19 | def is_alps_available():
20 | return importlib.util.find_spec("alps") is not None
21 |
22 |
23 | if is_alps_available():
24 | from alps.util import logger
25 | else:
26 | import logging
27 | logger = logging.getLogger(__file__)
28 |
29 |
30 | class PEROEMModel(PEBaseModel):
31 | """
32 |     Train only the MLPs in the middle-to-upper layers of the model; see https://arxiv.org/pdf/2202.05262.pdf and https://arxiv.org/abs/2012.14913
33 | model: huggingface transformers model
34 | tokenizer: huggingface transformers tokenizer
35 | """
36 |
37 | def __init__(self, model, model_name, task_type=None):
38 | self.model = model
39 | self.model_name = model_name
40 |
41 | def get_model(self):
42 | layer_mapping = TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING[self.model_name]
43 | assert len(layer_mapping) == 2
44 | not_freeze_param_name = []
45 | for i in range(layer_mapping[0], layer_mapping[1]):
46 | no_freeze_name = str(i) + ".mlp"
47 |             logger.info(f"Keep the {no_freeze_name} layer of the model trainable")
48 | not_freeze_param_name.append(no_freeze_name)
49 | set_parameter_requires_grad(self.model, not_freeze_param_name)
50 | return self.model
51 |
52 | @classmethod
53 | def restore(self, model=None, path=None):
54 |         logger.info("ROEM does not need to load any extra parameters")
55 | return model
56 |
57 |
58 | # Freeze parameters by name: only parameters whose names contain one of the given strings stay trainable
59 | def set_parameter_requires_grad(model, freeze_param_name=[]):
60 | if not isinstance(freeze_param_name, list):
61 | freeze_param_name = [freeze_param_name]
62 |
63 | for idx, (name, param) in enumerate(model.named_parameters()):
64 | for p in freeze_param_name:
65 | if p not in name:
66 | param.requires_grad = False
67 |     # Print the parameters kept trainable and ensure requires_grad is True for them
68 | for idx, (name, param) in enumerate(model.named_parameters()):
69 | for p in freeze_param_name:
70 | if p in name:
71 |                 print("Trainable parameter name used by ROEM:")
72 | print(name)
73 | param.requires_grad = True
74 |
75 |
76 | @dataclass
77 | class PeftROEMConfig(PetuningConfig):
78 | """
79 | This is the configuration class to store the configuration of a [`PeftROEMModel`].
80 |
81 | Args:
82 |         target_layers (`Union[List[int], int]`): The layer index range whose MLP parameters remain trainable.
83 | """
84 |
85 | target_layers: Optional[Union[List[int], int]] = field(
86 | default=None,
87 | metadata={
88 |             "help": "Range of layer indices whose MLP parameters remain trainable. "
89 | "For example, [20, 30] or '30' "
90 | },
91 | )
92 |
93 | def __post_init__(self):
94 | self.peft_type = PeftType.ROEM
95 |
96 |
97 | class PeftROEMModel(torch.nn.Module):
98 | """
99 | Creates ROEM model for ant peft.
100 |
101 | Args:
102 |         model ([`~transformers.PreTrainedModel`]): The model to be partially frozen.
103 | config ([`PeftROEMConfig`]): The configuration of the ROEM model.
104 |
105 | Returns:
106 | `torch.nn.Module`: The ROEM model.
107 |
108 | **Attributes**:
109 |         - **model** ([`~transformers.PreTrainedModel`]) -- The model to be frozen.
110 | - **peft_config** ([`PeftROEMConfig`]): The configuration of the ROEM model.
111 | """
112 |
113 | def __init__(self, model, config, adapter_name):
114 | super().__init__()
115 | self.model = model
116 |
117 | self.forward = self.model.forward
118 | self.peft_config = config
119 | self.add_adapter(adapter_name, self.peft_config[adapter_name])
120 |
121 | def add_adapter(self, adapter_name, config=None):
122 | if not isinstance(config, PeftROEMConfig):
123 | raise ValueError(
124 |                 f"PeftROEMModel needs a PeftROEMConfig, but got {type(config)}."
125 | )
126 |
127 | model_config = self.model.config.to_dict() if hasattr(self.model.config, "to_dict") else self.model.config
128 | if config is not None:
129 | config = self._prepare_lora_config(config, model_config)
130 | self.peft_config[adapter_name] = config
131 |
132 | if len(self.peft_config) > 1:
133 | raise ValueError(
134 |                     "ROEMModel supports only 1 peft config or name, "
135 |                     "because it only changes requires_grad on existing parameters without adding any new ones."
136 | )
137 |
138 | model_name = model_config["model_type"]
139 | self.model = PEROEMModel(self.model, model_name).get_model()
140 |
141 | if self.peft_config[adapter_name].inference_mode:
142 | _freeze_model(self.model)
143 |
144 | @staticmethod
145 | def _prepare_lora_config(peft_config, model_config):
146 | if peft_config.target_layers is None:
147 | if model_config["model_type"] not in TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING:
148 | raise ValueError("Please specify `target_layers` in `peft_config`")
149 | peft_config.target_layers = TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING[model_config["model_type"]]
150 | if peft_config.inference_mode:
151 | peft_config.merge_weights = True
152 | return peft_config
153 |
154 | def __getattr__(self, name: str):
155 | """Forward missing attributes to the wrapped module."""
156 | try:
157 | return super().__getattr__(name) # defer to nn.Module's logic
158 | except AttributeError:
159 | return getattr(self.model, name)
160 |
161 | def get_peft_config_as_dict(self, inference: bool = False):
162 | config_dict = {}
163 | for key, value in self.peft_config.items():
164 | config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
165 | if inference:
166 | config["inference_mode"] = True
167 | config_dict[key] = config
168 | return config
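169 | 
170 | 
171 | if __name__ == "__main__":
172 |     # Minimal sketch for illustration only (run as a module, e.g. `python -m ...`,
173 |     # because of the relative imports above): build a toy module tree whose parameter
174 |     # names look like "<layer_idx>.mlp", then check that only the MLPs in the layer
175 |     # range registered for "bloom" (17..21) stay trainable.
176 |     blocks = torch.nn.ModuleList(
177 |         [torch.nn.ModuleDict({"mlp": torch.nn.Linear(4, 4), "attn": torch.nn.Linear(4, 4)}) for _ in range(24)]
178 |     )
179 |     toy = torch.nn.Module()
180 |     toy.h = blocks  # parameter names become e.g. "h.17.mlp.weight"
181 |     tuned = PEROEMModel(toy, model_name="bloom").get_model()
182 |     trainable = sorted({n.rsplit(".", 1)[0] for n, p in tuned.named_parameters() if p.requires_grad})
183 |     print("Trainable submodules:", trainable)  # expected: h.17.mlp ... h.21.mlp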
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/gpt_neox/tokenization_gpt_neox_fast.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes for GPTNeoX."""
16 | import json
17 | from typing import TYPE_CHECKING, List, Optional, Tuple
18 |
19 | from tokenizers import pre_tokenizers
20 |
21 | from transformers import PreTrainedTokenizerFast
22 | # from ...tokenization_utils_fast import PreTrainedTokenizerFast
23 | from transformers.utils import logging
24 | # from ...utils import logging
25 |
26 |
27 | if TYPE_CHECKING:
28 | from transformers.pipelines.conversational import Conversation
29 |
30 |
31 | logger = logging.get_logger(__name__)
32 |
33 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
34 |
35 | PRETRAINED_VOCAB_FILES_MAP = {
36 | "tokenizer_file": {
37 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/tokenizer.json",
38 | },
39 | }
40 |
41 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
42 | "gpt-neox-20b": 2048,
43 | }
44 |
45 |
46 | class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
47 | """
48 | Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
49 | Byte-Pair-Encoding.
50 |
51 | This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
52 | be encoded differently whether it is at the beginning of the sentence (without space) or not:
53 |
54 | ```
55 | >>> from transformers import GPTNeoXTokenizerFast
56 | >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2")
57 | >>> tokenizer("Hello world")['input_ids']
58 | [15496, 995]
59 | >>> tokenizer(" Hello world")['input_ids']
60 | [18435, 995]
61 | ```
62 |
63 | You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
64 | the model was not pretrained this way, it might yield a decrease in performance.
65 |
66 |
67 |
68 | When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
69 |
70 |
71 |
72 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
73 | refer to this superclass for more information regarding those methods.
74 |
75 | Args:
76 | vocab_file (`str`):
77 | Path to the vocabulary file.
78 | merges_file (`str`):
79 | Path to the merges file.
80 | errors (`str`, *optional*, defaults to `"replace"`):
81 | Paradigm to follow when decoding bytes to UTF-8. See
82 | [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
83 | unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
84 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
85 | token instead.
86 | bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
87 | The beginning of sequence token.
88 | eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
89 | The end of sequence token.
90 | add_prefix_space (`bool`, *optional*, defaults to `False`):
91 |             Whether or not to add an initial space to the input. This allows treating the leading word just like
92 |             any other word. (The GPTNeoX tokenizer detects the beginning of a word by the preceding space.)
93 | trim_offsets (`bool`, *optional*, defaults to `True`):
94 | Whether or not the post-processing step should trim offsets to avoid including whitespaces.
95 | """
96 |
97 | vocab_files_names = VOCAB_FILES_NAMES
98 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
99 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
100 | model_input_names = ["input_ids", "attention_mask"]
101 |
102 | def __init__(
103 | self,
104 | vocab_file=None,
105 | merges_file=None,
106 | tokenizer_file=None,
107 | unk_token="<|endoftext|>",
108 | bos_token="<|endoftext|>",
109 | eos_token="<|endoftext|>",
110 | add_prefix_space=False,
111 | **kwargs,
112 | ):
113 | super().__init__(
114 | vocab_file,
115 | merges_file,
116 | tokenizer_file=tokenizer_file,
117 | unk_token=unk_token,
118 | bos_token=bos_token,
119 | eos_token=eos_token,
120 | add_prefix_space=add_prefix_space,
121 | **kwargs,
122 | )
123 |
124 | pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
125 | if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
126 | pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
127 | pre_tok_state["add_prefix_space"] = add_prefix_space
128 | self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
129 |
130 | self.add_prefix_space = add_prefix_space
131 |
132 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
133 | files = self._tokenizer.model.save(save_directory, name=filename_prefix)
134 | return tuple(files)
135 |
136 | def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
137 | """This corresponds to DialoGPT variants of models."""
138 | input_ids = []
139 | for is_user, text in conversation.iter_texts():
140 | input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
141 |
142 | if len(input_ids) > self.model_max_length:
143 | input_ids = input_ids[-self.model_max_length :]
144 | return input_ids
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/utils/agd.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union
2 |
3 | import numpy as np
4 | import torch
5 | from torch import Tensor
6 |
7 | Params = Union[Iterable[Tensor], Iterable[Dict[str, Any]]]
8 |
9 | LossClosure = Callable[[], float]
10 | OptLossClosure = Optional[LossClosure]
11 | Betas2 = Tuple[float, float]
12 | State = Dict[str, Any]
13 | OptFloat = Optional[float]
14 | Nus2 = Tuple[float, float]
15 |
16 | __all__ = ("AGD",)
17 |
18 |
19 | class AGD(torch.optim.Optimizer):
20 | r"""AGD: an Auto-switchable Optimizer using Stepwise Gradient Difference as Preconditioning Matrix.
21 | Arguments:
22 | params (Params): Collection of parameters to be optimized, or an iterable of dictionaries specifying separate groups.
23 | lr (float, optional): The learning rate. Default is 1e-3.
24 | betas (tuple of 2 floats, optional): Coefficients used for computing running averages of gradient and its square. Default is (0.9, 0.999).
25 | delta (float, optional): Small constant for numerical stability to prevent division by zero. Default is 1e-5.
26 | weight_decay (float, optional): Weight decay coefficient. Default is 0.0.
27 | amsgrad (bool, optional): If set to True, applies the AMSGrad variant of the optimizer. Default is False.
28 | win (bool, optional): If set to True, applies the Win variant of the optimizer. Default is False.
29 |         clip (float, optional): If set, clip the total update to the range [-clip, clip] to prevent abnormal updates. Default is None (no clipping).
30 | """
31 |
32 | def __init__(
33 | self,
34 | params: Params,
35 | lr: float = 1e-3,
36 | betas: Betas2 = (0.9, 0.999),
37 | delta: float = 1e-5,
38 | weight_decay: float = 0.0,
39 | amsgrad: bool = False,
40 | win: bool = False,
41 | clip: float = None,
42 | ) -> None:
43 | if lr <= 0.0:
44 | raise ValueError("Invalid learning rate: {}".format(lr))
45 | if delta < 0.0:
46 | raise ValueError("Invalid delta value: {}".format(delta))
47 | if not 0.0 <= betas[0] < 1.0:
48 | raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
49 | if not 0.0 <= betas[1] < 1.0:
50 | raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
51 | if weight_decay < 0.0:
52 | raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
53 |
54 | defaults = dict(
55 | lr=lr,
56 | betas=betas,
57 | delta=delta,
58 | weight_decay=weight_decay,
59 | amsgrad=amsgrad,
60 | win=win,
61 | clip=clip,
62 | )
63 | super(AGD, self).__init__(params, defaults)
64 |
65 | def step(self, closure: OptLossClosure = None) -> OptFloat:
66 | loss = None
67 | if closure is not None:
68 | loss = closure()
69 |
70 | for group in self.param_groups:
71 | beta1, beta2 = group["betas"]
72 |
73 | for p in group["params"]:
74 | if p.grad is None:
75 | continue
76 | grad = p.grad.data
77 | if grad.is_sparse:
78 | msg = "AGD does not support sparse gradients."
79 | raise RuntimeError(msg)
80 |
81 | state = self.state[p]
82 | # Lazy state initialization
83 | if len(state) == 0:
84 | state["step"] = 0
85 | # Exponential moving average of gradient values
86 | state["exp_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format)
87 | # Exponential moving average of squared gradient values
88 | state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)
89 | if group["amsgrad"]:
90 | # Maintains max of all exp. moving avg. of sq. grad. values
91 | state["max_exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format)
92 | if group["win"]:
93 | state["z"] = torch.zeros_like(p, memory_format=torch.preserve_format)
94 |
95 | exp_avg, exp_avg_sq = (
96 | state["exp_avg"],
97 | state["exp_avg_sq"],
98 | )
99 |
100 | state["step"] += 1
101 | exp_avg_old = exp_avg.detach().clone()
102 | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
103 | bias_correction1_old = 1 - beta1 ** (state["step"] - 1)
104 | bias_correction1, bias_correction2 = (
105 | 1 - beta1 ** state["step"],
106 | 1 - beta2 ** state["step"],
107 | )
108 | update = (
109 | exp_avg * (1 / bias_correction1)
110 | if state["step"] == 1
111 | else exp_avg * (1 / bias_correction1) - exp_avg_old * (1 / bias_correction1_old)
112 | )
113 | exp_avg_sq.mul_(beta2).addcmul_(update, update, value=1 - beta2)
114 |
115 | if group["amsgrad"]:
116 | max_exp_avg_sq = state["max_exp_avg_sq"]
117 | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
118 | update = max_exp_avg_sq.sqrt()
119 | else:
120 | update = exp_avg_sq.sqrt()
121 |
122 | delta_adjust = group["delta"] * np.sqrt(bias_correction2)
123 | update.clamp_(min=delta_adjust)
124 |
125 | lr_adjust = group["lr"] * np.sqrt(bias_correction2) / bias_correction1
126 | update = exp_avg / update
127 | if group["clip"] is not None:
128 | update.clamp_(min=-group["clip"], max=group["clip"])
129 | weight_decay = group["weight_decay"]
130 | if not group["win"]:
131 | p.data.mul_(1 - group["lr"] * weight_decay).add_(update, alpha=-lr_adjust)
132 | else:
133 | z = state["z"]
134 | z.data.add_(update, alpha=-lr_adjust).mul_(1.0 / (1.0 + weight_decay * lr_adjust))
135 | lr_adjust2 = 2 * lr_adjust
136 | tao = 1.0 / (3.0 + lr_adjust2 * weight_decay)
137 | p.data.mul_(tao).add_(update, alpha=-tao * lr_adjust2).add_(z, alpha=2 * tao)
138 | return loss
139 |
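
# --- Usage sketch (illustrative addition, not part of the original optimizer code) ---
# AGD subclasses torch.optim.Optimizer, so it is used like any other PyTorch optimizer.
# The demo below relies only on torch, which is already imported above.
if __name__ == "__main__":
    model = torch.nn.Linear(16, 4)
    optimizer = AGD(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.01)
    x, y = torch.randn(8, 16), torch.randn(8, 4)
    for _ in range(3):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()
    print("final loss:", loss.item())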
--------------------------------------------------------------------------------
/mftcoder_accelerate/inference/hf_inference.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @author Chaoyu Chen
3 | # @date 2024/1/4
4 | # @module hf_inference.py
5 | """
6 | # @author qumu
7 | # @date 2023/9/19
8 | # @module hf_inference.py
9 | """
10 | import os
11 | import sys
12 | import torch
13 | import textwrap
14 | from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
15 | from peft import PeftModel
16 |
17 |
18 | def load_model_tokenizer(
19 | path,
20 | model_type=None,
21 | peft_path=None,
22 | torch_dtype=torch.bfloat16,
23 | quantization=None,
24 | eos_token=None,
25 | pad_token=None,
26 | batch_size=1,
27 | ):
28 | """
29 |     Load model and tokenizer with transformers.
30 | """
31 |
32 | # load tokenizer first
33 | tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
34 | tokenizer.padding_side = "left"
35 |
36 | config, unused_kwargs = AutoConfig.from_pretrained(path, trust_remote_code=True, return_unused_kwargs=True)
37 | print("unused_kwargs:", unused_kwargs)
38 | print("config input:\n", config)
39 |
40 | # eos token parsing
41 | if eos_token:
42 | eos_token = eos_token
43 | eos_token_id = tokenizer.convert_tokens_to_ids(eos_token)
44 | print(f"eos_token {eos_token} from user input")
45 | elif hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id:
46 | print(f"Initial eos_token_id {tokenizer.eos_token_id} from tokenizer")
47 | eos_token_id = tokenizer.eos_token_id
48 | eos_token = tokenizer.convert_ids_to_tokens(eos_token_id)
49 | elif hasattr(tokenizer, "eos_token") and tokenizer.eos_token:
50 | print(f"Initial eos_token {tokenizer.eos_token} from tokenizer")
51 | eos_token = tokenizer.eos_token
52 | eos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
53 | elif hasattr(config, "eos_token_id") and config.eos_token_id:
54 | print(f"Initial eos_token_id {config.eos_token_id} from config.json")
55 | eos_token_id = config.eos_token_id
56 | eos_token = tokenizer.convert_ids_to_tokens(config.eos_token_id)
57 | elif hasattr(config, "eos_token") and config.eos_token:
58 | print(f"Initial eos_token {config.eos_token} from config.json")
59 | eos_token = config.eos_token
60 | eos_token_id = tokenizer.convert_tokens_to_ids(config.eos_token)
61 | else:
62 | raise ValueError(
63 | "No available eos_token or eos_token_id, please provide eos_token by params or eos_token_id by config.json"
64 | )
65 |
66 | try:
67 | tokenizer.eos_token = eos_token
68 | tokenizer.eos_token_id = eos_token_id
69 |         # set pad_token to be the same as eos_token; this is OK because it will be masked out.
70 | tokenizer.pad_token = eos_token
71 | tokenizer.pad_token_id = eos_token_id
72 |     except Exception:
73 |         print("[WARNING] Cannot set tokenizer.eos_token")
74 |
75 | print(f"tokenizer's eos_token: {tokenizer.eos_token}, pad_token: {tokenizer.pad_token}")
76 | print(f"tokenizer's eos_token_id: {tokenizer.eos_token_id}, pad_token_id: {tokenizer.pad_token_id}")
77 | print(type(tokenizer))
78 |
79 | base_model = AutoModelForCausalLM.from_pretrained(
80 | path,
81 | config=config,
82 | load_in_8bit=(quantization == "8bit"),
83 | load_in_4bit=(quantization == "4bit"),
84 | device_map="auto",
85 | torch_dtype=torch_dtype,
86 | trust_remote_code=True,
87 | low_cpu_mem_usage=True,
88 | )
89 |
90 | if peft_path:
91 | print("Loading PEFT MODEL...")
92 | model = PeftModel.from_pretrained(base_model, peft_path)
93 | else:
94 | print("Loading Original MODEL...")
95 | model = base_model
96 |
97 | model.eval()
98 |
99 | print("=======================================MODEL Configs=====================================")
100 | print(model.config)
101 | print("=========================================================================================")
102 |     print("=======================================MODEL Architecture================================")
103 | print(model)
104 | print("=========================================================================================")
105 |
106 | return model, tokenizer
107 |
108 |
109 | def hf_inference(model, tokenizer, text_list, args=None, max_new_tokens=512, do_sample=True, **kwargs):
110 | """
111 |     Batch inference for huggingface transformers models.
112 | """
113 | # text_list = [tokenizer.apply_chat_template([{"role": "user", "content": text}], tokenize=False) for text in text_list]
114 | inputs = tokenizer(text_list, return_tensors="pt", padding=True, add_special_tokens=False).to("cuda")
115 | # inputs["attention_mask"][0][:100] = 0
116 | # print(inputs)
117 | print("================================Prompts and Generations=============================")
118 |
119 | outputs = model.generate(
120 | inputs=inputs["input_ids"],
121 | attention_mask=inputs["attention_mask"],
122 | max_new_tokens=max_new_tokens,
123 | do_sample=do_sample,
124 | eos_token_id=tokenizer.eos_token_id,
125 | pad_token_id=tokenizer.pad_token_id,
126 | **kwargs,
127 | )
128 |
129 | gen_text = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
130 | for i in range(len(text_list)):
131 | print("=========" * 10)
132 | print(f"Prompt:\n{text_list[i]}")
133 | gen_text[i] = gen_text[i].replace(tokenizer.pad_token, "")
134 | print(f"Generation:\n{gen_text[i]}")
135 | # print(f"Outputs ids:\n{outputs[i]}")
136 | sys.stdout.flush()
137 |
138 | return gen_text
139 |
140 |
141 | if __name__ == "__main__":
142 | # Default template used in MFTCoder training
143 | HUMAN_ROLE_START_TAG = "human\n"
144 | BOT_ROLE_START_TAG = "bot\n"
145 |
146 | instruction = "Write quick sort function in python."
147 |
148 | prompts = [f"{HUMAN_ROLE_START_TAG}{instruction}\n{BOT_ROLE_START_TAG}"]
149 |
150 |     # if you use base + adapter for inference, provide peft_path, or leave it None for plain base-model inference
151 | base_model = "path/to/basemodel"
152 | peft_path = None
153 | model, tokenizer = load_model_tokenizer(
154 | base_model, model_type="", peft_path=peft_path, eos_token="", pad_token=""
155 | )
156 |
157 | # hf_inference(model, tokenizer, prompts, do_sample=False, num_beams=1, num_return_sequences=1)
158 | hf_inference(model, tokenizer, prompts, do_sample=True, temperature=0.8)
159 |
--------------------------------------------------------------------------------
/mftcoder_atorch/model/build_model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import sys
4 | sys.path.append("..")
5 | from utils.common_utils import get_model_params_num
6 | from transformers import ( # noqa: E402
7 | CONFIG_MAPPING,
8 | AutoConfig,
9 | AutoModelForCausalLM,
10 | AutoTokenizer,
11 | PreTrainedTokenizerFast
12 | )
13 | from .gpt_neox.configuration_gpt_neox import GPTNeoXConfig
14 | from .gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
15 | from .gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
16 |
17 | from torch.distributed.fsdp import (
18 | FullyShardedDataParallel as FSDP,
19 | StateDictType,
20 | )
21 | from utils.common_utils import print_rank_0, is_old_version
22 | from tokenizer import build_tokenizer
23 | from tokenizer.tokenizer import HFTokenizer
24 |
25 | import peft
26 | from peft.tuners.lora import LoraLayer
27 | from model.peft.utils import prepare_model_for_kbit_training
28 | from peft import ( # noqa
29 | LoraConfig,
30 | PrefixTuningConfig,
31 | PromptEncoderConfig,
32 | PromptEncoderReparameterizationType,
33 | PromptTuningConfig,
34 | PromptTuningInit,
35 | TaskType,
36 | get_peft_model
37 | )
38 | import model.peft.modeling_peft # noqa
39 | from model.peft.tuner import AdaLoraConfig
40 |
41 | try:
42 | from transformers import BitsAndBytesConfig
43 | except ImportError:
44 | BitsAndBytesConfig = None
45 | try:
46 | import bitsandbytes as bnb # noqa
47 | except ImportError:
48 | bnb = None
49 | from packaging import version
50 |
51 |
52 | def find_all_linear_names(args, model):
53 | cls = bnb.nn.Linear4bit if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
54 | lora_module_names = set()
55 | for name, module in model.named_modules():
56 | if isinstance(module, cls):
57 | names = name.split('.')
58 | lora_module_names.add(names[0] if len(names) == 1 else names[-1])
59 | if 'lm_head' in lora_module_names: # needed for 16-bit
60 | lora_module_names.remove('lm_head')
61 | return list(lora_module_names)
62 |
63 |
64 | def setup_model(args, logger, use_cache=False):
65 | # Load pretrained model and tokenizer
66 |
67 | if args.pretrained_model_path:
68 | if args.model_type == 'gpt_neox':
69 | tokenizer = GPTNeoXTokenizerFast.from_pretrained(args.pretrained_model_path)
70 | tokenizer.eod_token = "<|endoftext|>"
71 | tokenizer.pad_token = "<|pad|>"
72 | tokenizer.sop_token = "<|endoftext|>"
73 | tokenizer.eop_token = "<|endoftext|>"
74 | tokenizer.eod_id = tokenizer.convert_tokens_to_ids(tokenizer.eod_token)
75 | tokenizer.pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
76 |
77 | print_rank_0(f'tokenizer {tokenizer.eod_token} id: {tokenizer.eod_id}')
78 | print_rank_0(f'tokenizer {tokenizer.pad_token} id: {tokenizer.pad_id}')
79 | else:
80 | raise ValueError(
81 | "You are instantiating a new tokenizer from scratch. This is not supported by this script."
82 | "You can do it from another script, save it, and load it from here, using --tokenizer_path."
83 | )
84 |
85 | if args.model_type == 'gpt_neox':
86 | auto_config = GPTNeoXConfig
87 | auto_model_class = GPTNeoXForCausalLM
88 | else:
89 | auto_config = AutoConfig
90 | auto_model_class = AutoModelForCausalLM
91 |
92 | # with init_empty_weights_with_disk_offload(ignore_tie_weights=False):
93 | if args.pretrained_model_path:
94 | logger.info("Training model from checkpoint")
95 | config = auto_config.from_pretrained(args.pretrained_model_path)
96 | if args.peft_type != "qlora":
97 | model = auto_model_class.from_pretrained(args.pretrained_model_path, trust_remote_code=True).cuda()
98 | # TODO: qlora
99 | else:
100 | logger.info("Training model from scratch")
101 | if args.model_type == 'gpt_neox':
102 | config = GPTNeoXConfig.from_json_file(args.config_path + '/config.json')
103 | model = GPTNeoXForCausalLM._from_config(config)
104 | else:
105 | config = AutoConfig.from_json_file(args.config_path + '/config.json')
106 | model = AutoModelForCausalLM.from_config(config, trust_remote_code=args.trust_remote_code)
107 |
108 | # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
109 | # on a small vocab and want a smaller embedding size, remove this test.
110 | embedding_size = model.get_input_embeddings().weight.shape[0]
111 | print_rank_0('embedding size: ' + str(embedding_size))
112 | print_rank_0('vocab size: ' + str(tokenizer.vocab_size))
113 | if tokenizer.vocab_size > embedding_size:
114 | model.resize_token_embeddings(tokenizer.vocab_size)
115 | print_rank_0('resize embedding size: ' + str(model.get_input_embeddings().weight.shape[0]))
116 |
117 | print_rank_0(config)
118 | num_params = get_model_params_num(model)
119 | print_rank_0("num_params of this model:", num_params)
120 | args.total_model_param = num_params
121 | args.hidden_size = config.hidden_size
122 | args.num_hidden_layers = config.num_hidden_layers
123 | args.vocab_size = tokenizer.vocab_size
124 | print_rank_0(f'hidden size: {args.hidden_size}')
125 | print_rank_0(f'num hidden layers: {args.num_hidden_layers}')
126 | print_rank_0(f'vocab size: {args.vocab_size}')
127 |
128 | if args.peft_type:
129 | if args.peft_type in ['lora', 'qlora']:
130 | target_modules = None
131 | # TODO: qlora
132 | target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
133 | print_rank_0(f'target modules: {target_modules}')
134 | peft_config = LoraConfig(
135 | task_type=TaskType.ANT_CAUSAL_LM,
136 | inference_mode=False,
137 | r=96,
138 | lora_alpha=32,
139 | lora_dropout=0.05,
140 | target_modules=target_modules,
141 | )
142 | logger.info(
143 | f"Load Peft {args.peft_type} model ......")
144 | if args.checkpoint_activations and args.peft_type in ["lora", "qlora"]:
145 | # Make Lora and gradient checkpointing compatible
146 | # https://github.com/huggingface/peft/issues/137
147 | model.enable_input_require_grads()
148 | model = get_peft_model(model, peft_config)
149 | logger.info(
150 |             f"Reduced trainable params:\n")
151 | model.print_trainable_parameters()
152 |
153 | return model, config, tokenizer
154 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/gpt_bigcode/configuration_gpt_bigcode.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2023 The BigCode team and HuggingFace Inc. team.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ GPTBigCode configuration"""
16 |
17 | from transformers.configuration_utils import PretrainedConfig
18 | from transformers.utils import logging
19 |
20 |
21 | logger = logging.get_logger(__name__)
22 |
23 | GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
24 | "bigcode/gpt_bigcode-santacoder": "https://huggingface.co/bigcode/gpt_bigcode-santacoder/resolve/main/config.json",
25 | }
26 |
27 |
28 | class GPTBigCodeConfig(PretrainedConfig):
29 | """
30 | This is the configuration class to store the configuration of a [`GPTBigCodeModel`]. It is used to instantiate a
31 | GPTBigCode model according to the specified arguments, defining the model architecture. Instantiating a
32 | configuration with the defaults will yield a similar configuration to that of the GPTBigCode
33 | [gpt_bigcode](https://huggingface.co/gpt_bigcode) architecture.
34 |
35 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
36 | documentation from [`PretrainedConfig`] for more information.
37 |
38 |
39 | Args:
40 | vocab_size (`int`, *optional*, defaults to 50257):
41 | Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
42 | `inputs_ids` passed when calling [`GPTBigCodeModel`].
43 | n_positions (`int`, *optional*, defaults to 1024):
44 | The maximum sequence length that this model might ever be used with. Typically set this to something large
45 | just in case (e.g., 512 or 1024 or 2048).
46 | n_embd (`int`, *optional*, defaults to 768):
47 | Dimensionality of the embeddings and hidden states.
48 | n_layer (`int`, *optional*, defaults to 12):
49 | Number of hidden layers in the Transformer encoder.
50 | n_head (`int`, *optional*, defaults to 12):
51 | Number of attention heads for each attention layer in the Transformer encoder.
52 | n_inner (`int`, *optional*, defaults to None):
53 | Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
54 | activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
55 | Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new",
56 | "gelu_pytorch_tanh"]`.
57 | resid_pdrop (`float`, *optional*, defaults to 0.1):
58 | The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
59 | embd_pdrop (`float`, *optional*, defaults to 0.1):
60 | The dropout ratio for the embeddings.
61 | attn_pdrop (`float`, *optional*, defaults to 0.1):
62 | The dropout ratio for the attention.
63 | layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
64 | The epsilon to use in the layer normalization layers.
65 | initializer_range (`float`, *optional*, defaults to 0.02):
66 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
67 | scale_attn_weights (`bool`, *optional*, defaults to `True`):
68 |             Scale attention weights by dividing by sqrt(hidden_size).
69 | use_cache (`bool`, *optional*, defaults to `True`):
70 | Whether or not the model should return the last key/values attentions (not used by all models).
71 | attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
72 | Whether to call the fused softmax in float32.
73 | scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
74 | Whether to scale the attention softmax in float32.
75 |         multi_query (`bool`, *optional*, defaults to `True`):
76 |             Whether to use Multi-Query Attention (`True`) or Multi-Head Attention (`False`).
77 | Example:
78 |
79 | ```python
80 | >>> from transformers import GPTBigCodeConfig, GPTBigCodeModel
81 |
82 | >>> # Initializing a GPTBigCode configuration
83 | >>> configuration = GPTBigCodeConfig()
84 |
85 | >>> # Initializing a model (with random weights) from the configuration
86 | >>> model = GPTBigCodeModel(configuration)
87 |
88 | >>> # Accessing the model configuration
89 | >>> configuration = model.config
90 | ```"""
91 |
92 | model_type = "gpt_bigcode"
93 | keys_to_ignore_at_inference = ["past_key_values"]
94 | attribute_map = {
95 | "hidden_size": "n_embd",
96 | "max_position_embeddings": "n_positions",
97 | "num_attention_heads": "n_head",
98 | "num_hidden_layers": "n_layer",
99 | }
100 |
101 | def __init__(
102 | self,
103 | vocab_size=50257,
104 | n_positions=1024,
105 | n_embd=768,
106 | n_layer=12,
107 | n_head=12,
108 | n_inner=None,
109 | activation_function="gelu_pytorch_tanh",
110 | resid_pdrop=0.1,
111 | embd_pdrop=0.1,
112 | attn_pdrop=0.1,
113 | layer_norm_epsilon=1e-5,
114 | initializer_range=0.02,
115 | scale_attn_weights=True,
116 | use_cache=True,
117 | bos_token_id=50256,
118 | eos_token_id=50256,
119 | attention_softmax_in_fp32=True,
120 | scale_attention_softmax_in_fp32=True,
121 | multi_query=True,
122 | **kwargs,
123 | ):
124 | self.vocab_size = vocab_size
125 | self.n_positions = n_positions
126 | self.n_embd = n_embd
127 | self.n_layer = n_layer
128 | self.n_head = n_head
129 | self.n_inner = n_inner
130 | self.activation_function = activation_function
131 | self.resid_pdrop = resid_pdrop
132 | self.embd_pdrop = embd_pdrop
133 | self.attn_pdrop = attn_pdrop
134 | self.layer_norm_epsilon = layer_norm_epsilon
135 | self.initializer_range = initializer_range
136 | self.scale_attn_weights = scale_attn_weights
137 | self.use_cache = use_cache
138 | self.attention_softmax_in_fp32 = attention_softmax_in_fp32
139 | self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
140 | self.multi_query = multi_query
141 |
142 | self.bos_token_id = bos_token_id
143 | self.eos_token_id = eos_token_id
144 |
145 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
146 |
--------------------------------------------------------------------------------
/mftcoder_atorch/README_cn.md:
--------------------------------------------------------------------------------
1 | # MFTCoder Training: Atorch Framework
2 | [](https://huggingface.co/codefuse-ai)
3 |
4 |
5 |
6 |
7 | [**Chinese**] [[English]](README.md)
8 |
9 | ## 1. Updates
10 |
11 | 🔥 MFTCoder supports fine-tuning of GPTNeoX models under the Atorch framework;
12 |
13 | 🔥 MFTCoder supports full-parameter supervised fine-tuning;
14 |
15 | 🔥 MFTCoder supports LoRA fine-tuning;
16 |
17 | ## 2. Data Format
18 |
19 | ### 2.1 Training Data Format
20 | The training data is in jsonl format. Each line follows the format below; the chat_rounds field is required, while other fields can be added or removed as needed.
21 | You can refer to the xxx.jsonl file in the project.
22 | ```json
23 | {
24 | "id":0,
25 | "data_name":"code-helper",
26 | "chat_rounds":[
27 | {
28 | "role": "system",
29 |             "content": "You are an intelligent code assistant who can answer users' code-related questions",
30 | "chat_round_id": 0
31 | },
32 | {
33 | "role": "human",
34 |             "content": "Write a quick sort",
35 | "chat_round_id": 1
36 | },
37 | {
38 | "role": "bot",
39 |             "content": "Here is a quick sort algorithm xxxxxx",
40 | "chat_round_id": 1
41 | },
42 | {
43 | "role": "human",
44 |             "content": "Explain this code",
45 | "chat_round_id": 2
46 | },
47 | {
48 | "role": "bot",
49 |             "content": "Sure, this code xxx",
50 | "chat_round_id": 2
51 | }
52 | ]
53 | }
54 | ```
55 |
56 | ### 2.2 Inference Data Format
57 | The inference data is the string obtained by concatenating a sample in the training data format above; it is also how the input prompt is assembled at inference time:
58 | ```python
59 | """
60 | <|role_start|>system<|role_end|>This is the System instruction
61 | <|role_start|>human<|role_end|>This is the user input of round 1
62 | <|role_start|>bot<|role_end|>This is the model output of round 1
63 | <|role_start|>human<|role_end|>This is the user input of round 2
64 | <|role_start|>bot<|role_end|>This is the model output of round 2
65 | ...
66 | ...
67 | ...
68 | <|role_start|>human<|role_end|>This is the user input of round n
69 | <|role_start|>bot<|role_end|>{what the model should generate now}
70 | """
71 | ```
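
To make the mapping from a `chat_rounds` list (Section 2.1) to this prompt string concrete, here is a minimal sketch (the `build_prompt` helper is illustrative and not part of this repository; turns are simply concatenated as in Section 3.1, and eos handling is left to the tokenizer/training code):

```python
def build_prompt(chat_rounds):
    """Illustrative only: concatenate multi-turn messages into the inference prompt format above."""
    def tag(role):
        return f"<|role_start|>{role}<|role_end|>"

    prompt = "".join(tag(r["role"]) + r["content"] for r in chat_rounds)
    # End with the bot tag so the model generates the next bot reply.
    return prompt + tag("bot")


rounds = [{"role": "human", "content": "Write a quick sort function in python.", "chat_round_id": 0}]
print(build_prompt(rounds))
```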
72 |
73 |
74 | ## 3. Model Training
75 | Currently the "MFTCoder/mft_atorch" codebase supports full-parameter instruction fine-tuning and LoRA instruction fine-tuning.
76 | Only GPTNeoX models are supported for training at the moment; in principle, any open-source GPTNeoX weights on HuggingFace can be trained with this project.
77 |
78 | We have factored out the components used in training to ease later extension and optimization; see the implementation under the main directory. The entry directory for fine-tuning is ```train/```, the training entry file is ```train/run_train.py```, and parameter configurations are stored in launch scripts such as ```train/run_gpt_*.sh``` for unified management and modification.
79 |
80 | ### 3.1 Data Format
81 | During training, multi-turn dialogues are concatenated into the following format and then tokenized, where <|role_start|>human<|role_end|> marks the human input prompt, <|role_start|>bot<|role_end|> marks the bot output prompt, and `````````` stands for the eos_token.
82 | ```
83 | "<|role_start|>human<|role_end|>input1target1input2target2...
84 | ```
85 | When computing the loss, we mask the input parts so that their loss does not contribute to the parameter update; only the loss on the "target" parts updates the parameters.
86 | This makes full use of the model's parallel computation, so training is more efficient, and every target part of a multi-turn dialogue takes part in training, making training more thorough.
87 | Otherwise, an n-round dialogue would have to be split into n samples, each computing the loss only on its last target, which greatly reduces training efficiency.
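
To make the masking concrete, here is a minimal sketch (not the repository's actual dataloader) of how per-token labels can be built so that only target tokens contribute to the loss; whether the eos token after each target is also supervised is an implementation detail, and this sketch supervises it:

```python
IGNORE_INDEX = -100  # label value ignored by torch.nn.CrossEntropyLoss

def build_labels(rounds, eos_id):
    """rounds: list of (input_ids, target_ids) pairs, one per dialogue round (already tokenized)."""
    input_ids, labels = [], []
    for inp, tgt in rounds:
        input_ids += inp + tgt + [eos_id]
        # Input tokens are masked out; only target tokens (and eos) carry loss.
        labels += [IGNORE_INDEX] * len(inp) + tgt + [eos_id]
    return input_ids, labels

# toy example with fake token ids
ids, labels = build_labels([([1, 2, 3], [10, 11]), ([4, 5], [12, 13, 14])], eos_id=0)
print(ids)     # [1, 2, 3, 10, 11, 0, 4, 5, 12, 13, 14, 0]
print(labels)  # [-100, -100, -100, 10, 11, 0, -100, -100, 12, 13, 14, 0]
```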
88 |
89 | ### 3.2 Full-Parameter SFT
90 |
91 | Run the following command to start full-parameter SFT:
92 | ```bash
93 | sh run_gpt_mft.sh 10 1 8 5
94 | ```
95 |
96 | Note that the four arguments after the launch script are, in order:
97 | - the first argument is the per-GPU batch size
98 | - the second argument is the tensor parallel degree (only 1 is supported for now)
99 | - the third argument is the data parallel degree, which should match the number of GPUs used
100 | - the fourth argument is the number of training epochs
101 |
102 | The launch scripts of the other training modes below take the same four arguments.
103 |
104 | ### 3.3 LoRA Fine-Tuning
105 |
106 | Run the following command to start LoRA fine-tuning:
107 | ```bash
108 | sh run_gpt_mft_peft.sh 10 1 8 5
109 | ```
110 |
111 | ### 3.4 Main Parameters in the Launch Scripts
112 | The main parameters in ```train/run_gpt_*.sh``` are described below. The following parameters can be modified as needed; it is recommended to leave the others unchanged:
113 | - tokenize_mode: currently only "sft" is supported.
114 |
115 | - train_mode: currently only "sft" is supported.
116 |
117 | - load_raw_dataset: must stay "True"; other data modes will be supported later, and currently only jsonl input is supported.
118 |
119 | - data_paths: "[path1,path2,path3]", the input data locations as a string wrapped in [] with different paths separated by ```,```. Each path is a directory whose last-level folder name is used as the task name and which contains one or more jsonl files.
120 |
121 | - output_dir: the training output directory, which stores checkpoints, lora_adaptor checkpoints, etc.
122 |
123 | - tensorboard_dir: can be ignored for now; the actual tensorboard files are stored under the runs directory inside output_dir.
124 |
125 | - model_type: currently only gpt_neox is supported.
126 |
127 | - peft_type: currently only lora is supported.
128 |
129 | - pretrained_model_path: the local directory of the pretrained model.
130 |
131 | - total_train_batch_size: the sum of the training batch sizes across all GPUs, computed automatically from the per-GPU batch size passed to the launch script.
132 |
133 | - per_device_valid_batch_size: the eval batch size per GPU, computed automatically from the per-GPU batch size passed to the launch script.
134 |
135 | - gradient_accumulation_steps: the number of gradient accumulation steps. global batch = num_gpus * per_device_train_batch_size * gradient_accumulation_steps.
136 |
137 | - checkpoint_activations: enable this when GPU memory is tight. It trades time for memory: activations are not cached and the forward computation is performed twice, reducing memory usage.
138 |
139 | - learning_rate: the learning rate. For full-parameter fine-tuning a smaller value such as 1e-5 or 5e-6 is recommended; for qlora a larger learning rate, typically 1e-4 or 2e-4, is used.
140 |
141 | - min_lr: the minimum learning rate, usually one tenth of learning_rate.
142 |
143 | - seq_length: the maximum sequence length during training. Set it according to your hardware; longer sequences require more GPU memory.
144 |
145 | - log_interval: how many steps between logging the train loss.
146 |
147 | - checkpointing_steps: how many steps between saving a model checkpoint.
148 |
149 | - evalation_steps: how many steps between evaluations on the validation set.
150 |
151 | - early_stopping_patience: stop training after this many eval points without further improvement.
152 |
153 | - lr_scheduler_type: the learning-rate schedule.
154 |
155 | - num_warmup_steps: the number of warm-up steps over which the learning rate grows to the specified value.
156 |
157 | - seed: the random seed, used to reproduce experimental results.
158 |
159 | - train_iters: can temporarily be set to a small number such as 10; it does not actually affect the number of training steps and is reserved for future support of other dataset formats.
160 |
161 | - valid_iters: can temporarily be set to a small number such as 10; it does not actually affect the number of training steps and is reserved for future support of other dataset formats.
162 |
163 | - evaluation_strategy: the evaluation strategy during training. "steps" evaluates every "valid_interval" steps, and "epoch" evaluates once per epoch; both can be enabled at the same time.
164 |
165 | - save_strategy: the strategy for saving model weights during training; "steps" saves every "checkpointing_steps" steps.
166 |
167 | - extra_save_by_epoch: whether to save an epoch-level checkpoint after each epoch.
168 |
169 | - save_total_limit: the maximum number of checkpoints to keep, usually set to 2, keeping the checkpoint with the lowest valid loss and the most recent one. Note that epoch-level checkpoints are always kept and are not subject to this limit.
170 |
171 | - weighted_loss_mode: the loss weighting scheme for multi-task training.
172 |
173 |
174 | ## 4. Using the Model
175 |
176 | ### 4.1 Merging Weights
177 | If you train with LoRA, this project only saves the adapter weights and configuration files, so the adapter weights need to be merged into the base model. See the script ```utils/merge_base_and_lora_to_hf.py```.
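
The provided script is the supported path; purely as a rough illustration of what the merge does, a generic PEFT-based sketch looks like the following (all paths are placeholders, and this is not the project's `merge_base_and_lora_to_hf.py`):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_path = "path/to/base_model"       # placeholder
adapter_path = "path/to/lora_adapter"  # placeholder
save_path = "path/to/merged_model"     # placeholder

base = AutoModelForCausalLM.from_pretrained(base_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
model = PeftModel.from_pretrained(base, adapter_path)
merged = model.merge_and_unload()      # folds the LoRA weights into the base weights
merged.save_pretrained(save_path)
AutoTokenizer.from_pretrained(base_path, trust_remote_code=True).save_pretrained(save_path)
```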
178 |
179 | ### 4.2 Model Inference
180 | We provide the following script for single-turn and multi-turn dialogue; it is compatible with most models in huggingface format.
181 | ```python
182 | from transformers import (
183 | AutoTokenizer,
184 | AutoModelForCausalLM,
185 | )
186 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True, use_fast=False, legacy=False)
187 | tokenizer.padding_side = "left"
188 | tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("")
189 | tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("")
190 | model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
191 |
192 | HUMAN_ROLE_START_TAG = "<|role_start|>human<|role_end|>"
193 | BOT_ROLE_START_TAG = "<|role_start|>bot<|role_end|>"
194 | texts = ["write a python function of quick sort."]
195 | texts = [f"{HUMAN_ROLE_START_TAG}{text}{BOT_ROLE_START_TAG}" for text in texts]
196 |
197 | inputs = tokenizer(texts, return_tensors='pt', padding=True, add_special_tokens=False).to("cuda")
198 | outputs = model.generate(
199 | inputs=inputs["input_ids"],
200 | attention_mask=inputs["attention_mask"],
201 | max_new_tokens=512,
202 | top_p=0.95,
203 | temperature=0.1,
204 | do_sample=True,
205 | eos_token_id=tokenizer.eos_token_id,
206 | pad_token_id=tokenizer.pad_token_id
207 | )
208 | gen_text = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
209 | print(gen_text)
210 | ```
211 |
212 | Generation parameters such as top_p, temperature, repetition_penalty, and do_sample have a large impact on generation quality; tune them for your own use case.
213 | In practice, for code generation, sampling with do_sample=True, top_p=0.95, temperature=0.1 is a good choice for the pass@1 metric;
214 | in non-sampling mode, do_sample=False with beam_num=1 or 3 works well, where beam_num=1 is greedy decoding.
215 |
216 | ## 5. FAQ
217 | #### Q1: How do I resolve OOM?
218 | If OOM occurs, you can reduce the per-GPU batch size (the first argument of the launch script), seq_length, and similar settings. You can also set gradient_checkpointing=true, which greatly reduces GPU memory usage at the cost of somewhat slower training.
219 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/offline_tokenization/concat_sst_bin_tokenization.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import argparse
4 | import multiprocessing
5 | import os
6 | import sys
7 | import random
8 | import time
9 | import tqdm
10 | import glob
11 | import json
12 | import numpy as np
13 |
14 |
15 | # add the parent directory's parent directory to the path
16 | current_path = os.path.abspath(__file__)
17 | parent_dir = os.path.dirname(os.path.dirname(current_path))
18 | grandparent_dir = os.path.dirname(parent_dir)
19 | sys.path.append(grandparent_dir)
20 |
21 | from tokenizer import init_tokenizer
22 | from pack_encoder import PackSSTBinEncoder, load_tokenizer
23 | from data import indexed_dataset
24 |
25 | from threading import Semaphore
26 | from colorama import Fore
27 | import lm_fmt as lmd
28 |
29 |
30 | def yield_from_files(files: list, semaphore):
31 | """
32 | Iterator over input documents
33 |
34 | :param fnames: list of filenames
35 | """
36 | def yielder(fname, semaphore):
37 | with open(fname, 'r') as f:
38 | for line in f:
39 | semaphore.acquire()
40 | yield json.loads(line)
41 |
42 | for fname in files:
43 | semaphore.acquire()
44 | yield from yielder(fname, semaphore)
45 |
46 | def yield_from_files2(fnames: list, semaphore, sample_percent):
47 | """
48 | Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts /
49 | other compressed formats. Also filters out empty documents.
50 |
51 | :param fnames: list of filenames
52 | """
53 | def yielder(fname, semaphore):
54 | try:
55 | sample_interval = int(1/sample_percent)
56 | for f in filter(lambda x: x, lmd.Reader(fname).stream_data(key=None)):
57 | rand_value = random.randint(1, sample_interval*100)
58 | if rand_value % sample_interval != 0:
59 | continue
60 | semaphore.acquire()
61 |
62 | #rand_value = random.randint(1, sample_interval*100)
63 | #if rand_value % sample_interval != 0:
64 | # yield None
65 |
66 | yield f
67 | except Exception as e:
68 | print('####Exception:', e.args)
69 | yield None
70 |
71 | for fname in fnames:
72 | semaphore.acquire()
73 |
74 | yield from yielder(fname, semaphore)
75 |
76 |
77 | def print_example_doc(input_ids, tokenizer):
78 | print(Fore.YELLOW + f'INPUT IDS len: {len(input_ids)}')
79 | print(Fore.BLUE + f'INPUT IDS:\n {input_ids}\n\n')
80 |
81 | print(Fore.RED + f'DETOKENIZED INPUT:\n{tokenizer.decode(input_ids)}')
82 |
83 |
84 | def core_process(encoded_docs, semaphore, seq_length, tokenizer, encoder, builder, output_idx_file):
85 | """
86 | core of Data Pack SFT processing
87 | """
88 | input_ids_key = 'input_ids'
89 |
90 | proc_start = time.time()
91 | total_bytes_processed = 0
92 | pbar = tqdm.tqdm()
93 | sentence_droped = 0
94 | loss_token_cnt = 0
95 |
96 | print("PRINT BEFORE STREAM PROCESS DATA")
97 |
98 | print_example_count = 0
99 | for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
100 | total_bytes_processed += bytes_processed
101 |
102 | # release semaphore so `yield_from_files` can add another file to the buffer
103 | semaphore.release()
104 |
105 | # add each tokenized document / sentence,
106 | # For sft, each document has only one sample
107 | input_ids_sentence = doc[input_ids_key][0]
108 | if len(input_ids_sentence) < 1:
109 | sentence_droped += 1
110 | continue
111 |
112 | builder.add_item(np.array(input_ids_sentence, dtype=builder.dtype))
113 | builder.end_document()
114 | #builder.finalize_without_close(output_idx_file)
115 | #builder.add_item_and_end_document_and_finalize(np.array(input_ids_sentence, dtype=builder.dtype), output_idx_file)
116 |
117 | # print the first packed sample as example
118 | if print_example_count < 1:
119 | print_example_doc(input_ids_sentence, tokenizer)
120 | print_example_count += 1
121 |
122 | # log progress
123 | if i % 100 == 0:
124 | current = time.time()
125 | elapsed = current - proc_start
126 | mbs = total_bytes_processed / elapsed / 1024 / 1024
127 | pbar.set_description(
128 | f"Processed {i} documents ({i / elapsed} docs/s, {mbs} MB/s)."
129 | )
130 | if i != 0:
131 | pbar.update(100)
132 |
133 |     # finalize at the end
134 | builder.finalize(output_idx_file)
135 |
136 |     print(Fore.RED + "\ndropped docs: {}".format(sentence_droped))
137 |
138 |
139 | def process_dataset(dataset_path, output_path, model_path, parallel_num, seq_length, dataset_name, sample_percent):
140 | """
141 | Re-organize samples in the given data path into a Data Pack file.
142 | """
143 |
144 | # get all jsonl files and corresponding reading handler
145 | files = glob.glob(os.path.join(dataset_path, '**/*.jsonl'), recursive=True)
146 |
147 | # build a semaphore object to stop `yield_from_files` from getting ahead
148 | # of encoder.encode and hence building up memory
149 | semaphore = Semaphore(1000 + parallel_num)
150 |
151 | # build sample iterator
152 | sample_iterator = yield_from_files2(files, semaphore, sample_percent)
153 |
154 | # load tokenizer
155 | # tokenizer = load_tokenizer(model_path, tokenizer_type)
156 | tokenizer = init_tokenizer(model_path)
157 | print('TOKEN of id=2:', tokenizer.convert_ids_to_tokens(2))
158 | print('ID of :', tokenizer.convert_tokens_to_ids(''))
159 | print('TOKEN of id=0:', tokenizer.convert_ids_to_tokens(0))
160 | print('ID of :', tokenizer.convert_tokens_to_ids(''))
161 |
162 | # init encoder
163 | encoder = PackSSTBinEncoder(seq_length, model_path)
164 |
165 | # create writer builder
166 | key = "input_ids"
167 |     output_prefix = os.path.join(output_path, dataset_name or os.path.basename(os.path.normpath(dataset_path)))  # fall back to the data folder name when --dataset-name is not given
168 | output_bin_file = "{}_{}.bin".format(
169 | output_prefix, key
170 | )
171 | output_idx_file = "{}_{}.idx".format(
172 | output_prefix, key
173 | )
174 | builder = indexed_dataset.make_builder(
175 | output_bin_file,
176 | impl="mmap",
177 | vocab_size=tokenizer.vocab_size,
178 | )
179 |
180 | if parallel_num > 1:
181 | pool = multiprocessing.Pool(parallel_num, initializer=encoder.initializer)
182 | encoded_docs = pool.imap(encoder.encode, sample_iterator, chunksize=32)
183 | else:
184 | encoder.initializer()
185 | encoded_docs = (encoder.encode(doc) for doc in sample_iterator)
186 |
187 | if dataset_name is None:
188 | dataset_path = dataset_path[:-1] if dataset_path.endswith(os.path.sep) else dataset_path
189 | dataset_name = dataset_path.split(os.path.sep)[-1]
190 |
191 | core_process(encoded_docs, semaphore, seq_length, tokenizer, encoder, builder, output_idx_file)
192 |
193 |
194 | def main(data_path, output_path, model_path, parallel_num, seq_length, dataset_name, sample_percent):
195 | """
196 | Entry
197 | """
198 |
199 | process_dataset(data_path, output_path, model_path, parallel_num, seq_length, dataset_name, sample_percent)
200 |
201 |
202 | if __name__ == "__main__":
203 | parser = argparse.ArgumentParser(description="Generate a packed jsonl file in the Data Pack SFT way.")
204 | parser.add_argument('--model-path', type=str, help='Path of a pretrained model which contains tokenizer-related files.')
205 | parser.add_argument('--parallel', type=int, default=1, help='The num of parallel processing.')
206 |     parser.add_argument('--output-path', type=str, help='Path to store the generated result file.')
207 | parser.add_argument('--data-path', type=str, default=None, help='Path of files to be processed')
208 | parser.add_argument('--seq-length', type=int, default=4096, help='The max input length (i.e. the max number of tokens in a sample)')
209 | # parser.add_argument('--eod-token-id', type=int, default=2, help='EOD token id')
210 | # parser.add_argument('--pad-token-id', type=int, default=0, help='PAD token id')
211 | # parser.add_argument('--tokenizer-type', type=str, choices=["LLAMATokenizer", None], default=None, help="What type of tokenizer to use. Default is None.")
212 |     parser.add_argument('--dataset-name', type=str, default=None, help='The generated result dataset name. The folder name will be used by default.')
213 | parser.add_argument('--sample-percent', type=float, default=1.0, help='Sample percentage')
214 |
215 | args = parser.parse_args()
216 | print('ARGS\n', '\n'.join([str(key) + ':' + str(value) for key,value in vars(args).items()]))
217 |
218 | random.seed(9999)
219 |
220 | main(args.data_path, args.output_path, args.model_path, args.parallel, args.seq_length, args.dataset_name, args.sample_percent)
221 |
--------------------------------------------------------------------------------
/mftcoder_accelerate/src/model/qwen/cache_autogptq_cuda_256.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/all.h>
2 | #include <torch/python.h>
3 | #include <c10/cuda/CUDAGuard.h>
4 |
5 | // adapted from https://github.com/PanQiWei/AutoGPTQ/blob/main/autogptq_extension/cuda_256/autogptq_cuda_256.cpp
6 | void vecquant8matmul_cuda(
7 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
8 | torch::Tensor scales, torch::Tensor zeros,
9 | torch::Tensor g_idx
10 | );
11 |
12 | void vecquant8matmul(
13 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
14 | torch::Tensor scales, torch::Tensor zeros,
15 | torch::Tensor g_idx
16 | ) {
17 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
18 | vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
19 | }
20 |
21 | void vecquant8matmul_batched_cuda(
22 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
23 | torch::Tensor scales, torch::Tensor zeros
24 | );
25 |
26 | void vecquant8matmul_batched(
27 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
28 | torch::Tensor scales, torch::Tensor zeros
29 | ) {
30 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
31 | vecquant8matmul_batched_cuda(vec, mat, mul, scales, zeros);
32 | }
33 |
34 | void vecquant8matmul_batched_column_compression_cuda(
35 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
36 | torch::Tensor scales, torch::Tensor zeros
37 | );
38 |
39 | void vecquant8matmul_batched_column_compression(
40 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
41 | torch::Tensor scales, torch::Tensor zeros
42 | ) {
43 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
44 | vecquant8matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
45 | }
46 |
47 | void vecquant4matmul_batched_cuda(
48 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
49 | torch::Tensor scales, torch::Tensor zeros
50 | );
51 |
52 | void vecquant4matmul_batched(
53 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
54 | torch::Tensor scales, torch::Tensor zeros
55 | ) {
56 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
57 | vecquant4matmul_batched_cuda(vec, mat, mul, scales, zeros);
58 | }
59 |
60 | void vecquant4matmul_batched_column_compression_cuda(
61 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
62 | torch::Tensor scales, torch::Tensor zeros
63 | );
64 |
65 | void vecquant4matmul_batched_column_compression(
66 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
67 | torch::Tensor scales, torch::Tensor zeros
68 | ) {
69 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
70 | vecquant4matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
71 | }
72 |
73 | void vecquant8matmul_batched_old_cuda(
74 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
75 | torch::Tensor scales, torch::Tensor zeros
76 | );
77 |
78 | void vecquant8matmul_batched_old(
79 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
80 | torch::Tensor scales, torch::Tensor zeros
81 | ) {
82 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
83 | vecquant8matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
84 | }
85 |
86 |
87 | void vecquant4matmul_batched_old_cuda(
88 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
89 | torch::Tensor scales, torch::Tensor zeros
90 | );
91 |
92 | void vecquant4matmul_batched_old(
93 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
94 | torch::Tensor scales, torch::Tensor zeros
95 | ) {
96 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
97 | vecquant4matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
98 | }
99 |
100 | void vecquant8matmul_batched_column_compression_old_cuda(
101 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
102 | torch::Tensor scales, torch::Tensor zeros
103 | );
104 |
105 | void vecquant8matmul_batched_column_compression_old(
106 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
107 | torch::Tensor scales, torch::Tensor zeros
108 | ) {
109 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
110 | vecquant8matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros);
111 | }
112 |
113 | void vecquant4matmul_batched_column_compression_old_cuda(
114 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
115 | torch::Tensor scales, torch::Tensor zeros
116 | );
117 |
118 | void vecquant4matmul_batched_column_compression_old(
119 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
120 | torch::Tensor scales, torch::Tensor zeros
121 | ) {
122 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
123 | vecquant4matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros);
124 | }
125 |
126 |
127 |
128 | void vecquant8matmul_batched_faster_cuda(
129 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
130 | torch::Tensor scales, torch::Tensor zeros
131 | );
132 |
133 | void vecquant8matmul_batched_faster(
134 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
135 | torch::Tensor scales, torch::Tensor zeros
136 | ) {
137 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
138 | vecquant8matmul_batched_faster_cuda(vec, mat, mul, scales, zeros);
139 | }
140 |
141 |
142 | void vecquant8matmul_batched_faster_old_cuda(
143 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
144 | torch::Tensor scales, torch::Tensor zeros
145 | );
146 |
147 | void vecquant8matmul_batched_faster_old(
148 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
149 | torch::Tensor scales, torch::Tensor zeros
150 | ) {
151 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
152 | vecquant8matmul_batched_faster_old_cuda(vec, mat, mul, scales, zeros);
153 | }
154 |
155 | void vecquant8matmul_batched_column_compression_faster_cuda(
156 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
157 | torch::Tensor scales, torch::Tensor zeros
158 | );
159 |
160 | void vecquant8matmul_batched_column_compression_faster(
161 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
162 | torch::Tensor scales, torch::Tensor zeros
163 | ) {
164 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
165 | vecquant8matmul_batched_column_compression_faster_cuda(vec, mat, mul, scales, zeros);
166 | }
167 |
168 |
169 | void vecquant8matmul_batched_column_compression_faster_old_cuda(
170 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
171 | torch::Tensor scales, torch::Tensor zeros
172 | );
173 |
174 | void vecquant8matmul_batched_column_compression_faster_old(
175 | torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
176 | torch::Tensor scales, torch::Tensor zeros
177 | ) {
178 | const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
179 | vecquant8matmul_batched_column_compression_faster_old_cuda(vec, mat, mul, scales, zeros);
180 | }
181 |
182 |
183 |
184 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
185 | m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
186 | m.def("vecquant8matmul_batched", &vecquant8matmul_batched, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
187 | m.def("vecquant8matmul_batched_old", &vecquant8matmul_batched_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
188 | m.def("vecquant8matmul_batched_faster", &vecquant8matmul_batched_faster, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
189 | m.def("vecquant8matmul_batched_faster_old", &vecquant8matmul_batched_faster_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
190 | m.def("vecquant4matmul_batched_old", &vecquant4matmul_batched_old, "Vector 4-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
191 | m.def("vecquant8matmul_batched_column_compression", &vecquant8matmul_batched_column_compression, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
192 | m.def("vecquant8matmul_batched_column_compression_old", &vecquant8matmul_batched_column_compression_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
193 | m.def("vecquant8matmul_batched_column_compression_faster", &vecquant8matmul_batched_column_compression_faster, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
194 | m.def("vecquant8matmul_batched_column_compression_faster_old", &vecquant8matmul_batched_column_compression_faster_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
195 | m.def("vecquant4matmul_batched_column_compression_old", &vecquant4matmul_batched_column_compression_old, "Vector old 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
196 | m.def("vecquant4matmul_batched", &vecquant4matmul_batched, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
197 | m.def("vecquant4matmul_batched_column_compression", &vecquant4matmul_batched_column_compression, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
198 | }
199 |
--------------------------------------------------------------------------------