├── data
│   ├── __init__.py
│   ├── helpers.cpython-38-x86_64-linux-gnu.so
│   ├── Makefile
│   ├── tokenization
│   │   ├── generate_dataset.py
│   │   └── preprocess_data.py
│   ├── blendable_dataset.py
│   ├── samplers.py
│   └── get_data_from_hf.py
├── .DS_Store
├── assets
│   ├── .DS_Store
│   ├── PPL.png
│   ├── logo.png
│   ├── model.png
│   └── passkey.png
├── model
│   ├── .DS_Store
│   ├── gpt_neox
│   │   ├── .DS_Store
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── modeling_gpt_neox.cpython-38.pyc
│   │   │   └── configuration_gpt_neox.cpython-38.pyc
│   │   ├── generation_config.json
│   │   ├── config.json
│   │   ├── __init__.py
│   │   ├── tokenization_gpt_neox_fast.py
│   │   └── configuration_gpt_neox.py
│   ├── llama
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-38.pyc
│   │   │   ├── modeling_llama.cpython-38.pyc
│   │   │   └── configuration_llama.cpython-38.pyc
│   │   ├── config_7b.json
│   │   ├── config.json
│   │   ├── __init__.py
│   │   ├── configuration_llama.py
│   │   ├── tokenization_llama_fast.py
│   │   └── convert_llama_weights_to_hf.py
│   ├── __init__.py
│   └── peft
│       ├── tuner
│       │   ├── pe_base_model.py
│       │   ├── __init__.py
│       │   ├── bitfit.py
│       │   └── roem.py
│       ├── __init__.py
│       └── utils
│           ├── __init__.py
│           ├── config.py
│           ├── mapping.py
│           └── others.py
├── tools
│   ├── .DS_Store
│   └── analysis
│       ├── MMapIndexDatasetParser.py
│       ├── post_tokenization_check.py
│       └── MMapTokenIdsBinChecker.py
├── train
│   ├── .DS_Store
│   ├── __init__.py
│   ├── run_coca_llama.sh
│   └── run_coca_neox.sh
├── inference
│   ├── .DS_Store
│   └── generate.py
├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── Collinear-Constrained-Attention.iml
│   └── modules.xml
├── LEGAL.md
├── tokenizer
│   ├── __init__.py
│   ├── train_tokenizer.py
│   └── tokenizer.py
├── utils
│   ├── __init__.py
│   ├── merge_base_and_lora_to_hf.py
│   ├── hselect.py
│   ├── learning_rates.py
│   └── common_utils.py
├── dockerfile
└── README.md

/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .
import * -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/.DS_Store -------------------------------------------------------------------------------- /assets/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/.DS_Store -------------------------------------------------------------------------------- /assets/PPL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/PPL.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/logo.png -------------------------------------------------------------------------------- /assets/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/model.png -------------------------------------------------------------------------------- /model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/.DS_Store -------------------------------------------------------------------------------- /tools/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/tools/.DS_Store -------------------------------------------------------------------------------- /train/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/train/.DS_Store -------------------------------------------------------------------------------- /assets/passkey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/passkey.png -------------------------------------------------------------------------------- /inference/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/inference/.DS_Store -------------------------------------------------------------------------------- /model/gpt_neox/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/gpt_neox/.DS_Store -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | # 数据源本地存储已忽略文件 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # 基于编辑器的 HTTP 客户端请求 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- 
/data/helpers.cpython-38-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/data/helpers.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /model/llama/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/llama/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /model/gpt_neox/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/gpt_neox/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /model/gpt_neox/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token_id": 100256, 3 | "eos_token_id": 100256, 4 | "transformers_version": "4.26.0.dev0", 5 | "_from_model_config": true 6 | } 7 | -------------------------------------------------------------------------------- /model/llama/__pycache__/modeling_llama.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/llama/__pycache__/modeling_llama.cpython-38.pyc -------------------------------------------------------------------------------- /model/llama/__pycache__/configuration_llama.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/llama/__pycache__/configuration_llama.cpython-38.pyc -------------------------------------------------------------------------------- /model/gpt_neox/__pycache__/modeling_gpt_neox.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/gpt_neox/__pycache__/modeling_gpt_neox.cpython-38.pyc -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /model/gpt_neox/__pycache__/configuration_gpt_neox.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/gpt_neox/__pycache__/configuration_gpt_neox.cpython-38.pyc -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /data/Makefile: 
-------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /.idea/Collinear-Constrained-Attention.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | Legal Disclaimer 2 | 3 | Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. 4 | 5 | 法律免责声明 6 | 7 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 -------------------------------------------------------------------------------- /model/llama/config_7b.json: -------------------------------------------------------------------------------- 1 | {"architectures": ["LLaMAForCausalLM"], "bos_token_id": 0, "eos_token_id": 1, "hidden_act": "silu", "hidden_size": 4096, "intermediate_size": 11008, "initializer_range": 0.02, "max_sequence_length": 2048, "model_type": "llama", "num_attention_heads": 32, "num_hidden_layers": 32, "pad_token_id": -1, "rms_norm_eps": 1e-06, "torch_dtype": "float16", "transformers_version": "4.27.0.dev0", "use_cache": true, "vocab_size": 32000} -------------------------------------------------------------------------------- /model/llama/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": ["LLaMAForCausalLM"], 3 | "bos_token_id": 100256, 4 | "eos_token_id": 100256, 5 | "hidden_act": "silu", 6 | "hidden_size": 1024, 7 | "intermediate_size": 4096, 8 | "initializer_range": 0.02, 9 | "max_sequence_length": 512, 10 | "model_type": "llama", 11 | "num_attention_heads": 16, 12 | "num_hidden_layers": 24, 13 | "pad_token_id": 100737, 14 | "rms_norm_eps": 1e-06, 15 | "torch_dtype": "float16", 16 | "transformers_version": "4.27.0.dev0", 17 | "use_cache": true, 18 | "vocab_size": 100864, 19 | "use_xformers": false 20 | } -------------------------------------------------------------------------------- /train/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .run_train import * -------------------------------------------------------------------------------- /model/gpt_neox/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "GPTNeoXForCausalLM" 4 | ], 5 | "bos_token_id": 100256, 6 | "eos_token_id": 100256, 7 | "hidden_act": "gelu", 8 | "hidden_size": 1024, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 4096, 11 | "layer_norm_eps": 1e-05, 12 | "max_position_embeddings": 512, 13 | "model_type": "gpt_neox", 14 | "num_attention_heads": 16, 15 | "num_hidden_layers": 24, 16 | "rope_scaling": null, 17 | "rotary_emb_base": 10000, 18 | "rotary_pct": 1.0, 19 | "tie_word_embeddings": false, 20 | "torch_dtype": "float16", 21 | "transformers_version": "4.26.1", 22 | "use_cache": true, 23 | "use_parallel_residual": true, 24 | "vocab_size": 100864 25 | } -------------------------------------------------------------------------------- /tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from .tokenizer import build_tokenizer 17 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .common_utils import * 17 | from .auto_accelerate_utils import * -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. 
3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | # from .gpt2_model import GPT2ModelPipe 19 | # from .utils import get_params_for_weight_decay_optimization 20 | # from .word_embeddings import SoftEmbedding 21 | -------------------------------------------------------------------------------- /model/peft/tuner/pe_base_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | class PEBaseModel: 17 | """PEtuning的基类模型,定义了PEtuning模型都该有的方法""" 18 | 19 | def __init__(): 20 | return 21 | 22 | def get_model(self): 23 | """对模型进行修改,冻结参数或者插入可训模块""" 24 | pass 25 | 26 | @classmethod 27 | def restore(self, model=None, path=None): 28 | """从path恢复PE模型 29 | 30 | Args: 31 | model (_type_, optional): 原始模型. Defaults to None. 32 | path (_type_, optional): 增量路径. Defaults to None. 33 | """ 34 | pass 35 | -------------------------------------------------------------------------------- /model/peft/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """peft models interface.""" 17 | 18 | from . 
import utils, tuner 19 | from peft.mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING 20 | from peft.utils import TaskType 21 | from .modeling_peft import AntPeftForCausalLM, AntPeftForEmbedding 22 | 23 | 24 | SUPPORTED_PEFT_TYPES = ["prefix", "lora", "adalora", "bitfit", "roem", "unipelt", "prompt", "ptuning"] 25 | 26 | # Register the Ant Causal Language Model 27 | MODEL_TYPE_TO_PEFT_MODEL_MAPPING["ANT_CAUSAL_LM"] = AntPeftForCausalLM 28 | TaskType.ANT_CAUSAL_LM = "ANT_CAUSAL_LM" 29 | 30 | MODEL_TYPE_TO_PEFT_MODEL_MAPPING["ANT_EMBEDDING"] = AntPeftForEmbedding 31 | TaskType.ANT_EMBEDDING = "ANT_EMBEDDING" 32 | -------------------------------------------------------------------------------- /model/peft/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """peft utils interface.""" 17 | 18 | from .config import PeftConfig, PetuningConfig 19 | 20 | from .mapping import TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING 21 | from .mapping import TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING 22 | from .mapping import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING 23 | from .mapping import TRANSFORMERS_MODELS_TO_LORA_LAGE_TARGET_MODULES_MAPPING 24 | from .mapping import TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING 25 | from .mapping import TRANSFORMERS_MODELS_TO_ROUTELORA_TARGET_MODULES_MAPPING 26 | from .mapping import WEIGHTS_NAME, CONFIG_NAME 27 | from .mapping import bloom_model_postprocess_past_key_value 28 | 29 | from .others import get_peft_model_state_dict, set_peft_model_state_dict, _freeze_model, prepare_model_for_kbit_training -------------------------------------------------------------------------------- /inference/generate.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
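
The registrations in model/peft/__init__.py above extend peft's global dispatch tables, so the custom "ANT_CAUSAL_LM" task type can be used the same way as the built-in ones. A minimal sketch of how that is typically consumed, assuming model.peft and its peft dependency import cleanly in your environment; the base-model path and LoRA hyper-parameters are illustrative placeholders, not values from this repository:

import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import model.peft  # noqa: F401  importing it runs the ANT_CAUSAL_LM registration shown above

base = AutoModelForCausalLM.from_pretrained("/path/to/base_model", torch_dtype=torch.float16)
# "ANT_CAUSAL_LM" now resolves to AntPeftForCausalLM through MODEL_TYPE_TO_PEFT_MODEL_MAPPING.
lora_config = LoraConfig(task_type="ANT_CAUSAL_LM", r=8, lora_alpha=16, lora_dropout=0.05)
peft_model = get_peft_model(base, lora_config)
peft_model.print_trainable_parameters()
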
15 | 16 | import os 17 | import re 18 | import time 19 | import json 20 | import torch 21 | import random 22 | import argparse 23 | import jsonlines 24 | import numpy as np 25 | from copy import deepcopy 26 | from transformers import AutoModelForCausalLM, AutoTokenizer 27 | 28 | 29 | def load_model(args): 30 | st = time.time() 31 | checkpoint = args.model_dir 32 | print('LOAD CKPT: {}'.format(checkpoint)) 33 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left") 34 | tokenizer.add_special_tokens({'eos_token': "<|endoftext|>"}) 35 | tokenizer.add_special_tokens({'pad_token': "<|pad|>"}) 36 | 37 | model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16) 38 | print('Model load spend: {:.4f}s'.format(time.time() - st)) 39 | return tokenizer, model -------------------------------------------------------------------------------- /model/peft/utils/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright (c) 2023 Ant Group. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import sys 18 | sys.path.append("..") 19 | sys.path.append("../..") 20 | from typing import List, Optional 21 | from dataclasses import dataclass, field 22 | from peft.utils import PeftConfig 23 | 24 | 25 | @dataclass 26 | class PetuningConfig(PeftConfig): 27 | """ 28 | This is the base configuration class to store the configuration of [`ROEM`], or [`BitFit`]. 29 | 30 | Args: 31 | modules_to_save (`List[str]`):List of modules apart from LoRA layers to be set as trainable 32 | and saved in the final checkpoint. 33 | """ 34 | 35 | modules_to_save: Optional[List[str]] = field( 36 | default=None, 37 | metadata={ 38 | "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " 39 | "For example, in Sequence Classification or Token Classification tasks, " 40 | "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 41 | }, 42 | ) -------------------------------------------------------------------------------- /utils/merge_base_and_lora_to_hf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
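
load_model() in inference/generate.py above only reads args.model_dir and returns the tokenizer/model pair. A hedged driver sketch, assuming the file's dependencies (transformers, jsonlines, a GPU-capable torch) are installed; the checkpoint path, prompt, and generation settings are illustrative:

import argparse
import torch
from inference.generate import load_model

args = argparse.Namespace(model_dir="/path/to/hf_checkpoint")  # the only field load_model() uses
tokenizer, model = load_model(args)

prompt = "def quick_sort(arr):"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(out[0], skip_special_tokens=True))
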
15 | 16 | import os 17 | import sys 18 | import torch 19 | import transformers 20 | from transformers import AutoModelForCausalLM, AutoTokenizer 21 | from peft import LoraConfig, get_peft_model 22 | from peft import PeftModelForCausalLM 23 | # from transformers import BitsAndBytesConfig 24 | # from peft import prepare_model_for_kbit_training 25 | 26 | model_path='/output/checkpoint/mpt-fsdp-tp-bm-512-tp-1-dp-8-gpu-8-bin/checkpoint-320000' 27 | lora_adapter='/output/checkpoint/mpt-fsdp-tp-bm-512-tp-1-dp-8-gpu-8-bin-lora/checkpoint-20000' 28 | save_path='/output/checkpoint/mpt-fsdp-tp-bm-512-tp-1-dp-8-gpu-8-bin-lora/checkpoint-20000-merge' 29 | 30 | base_model = AutoModelForCausalLM.from_pretrained( 31 | model_path, 32 | trust_remote_code=True, 33 | torch_dtype=torch.float16, 34 | return_dict=True, 35 | device_map="auto" 36 | ) 37 | print(base_model) 38 | model_to_merge = PeftModelForCausalLM.from_pretrained(base_model, lora_adapter) 39 | merged_model = model_to_merge.merge_and_unload() 40 | 41 | tokenizer = AutoTokenizer.from_pretrained(model_path) 42 | 43 | merged_model.save_pretrained(save_path) 44 | tokenizer.save_pretrained(save_path) 45 | print(f"Merge finised: {save_path} saved") -------------------------------------------------------------------------------- /dockerfile: -------------------------------------------------------------------------------- 1 | FROM reg.docker.alibaba-inc.com/atorch/atorch-dev:20230808torch210dev20230731cu118nlp 2 | 3 | USER root 4 | WORKDIR /root 5 | 6 | ENV BASH_ENV /root/.bashrc 7 | ENV LANGUAGE zh_cn 8 | ENV LC_ALL zh_CN.UTF-8 9 | ENV SHELL /bin/bash 10 | SHELL ["/bin/bash","-c"] 11 | 12 | ADD lib /root/builder 13 | 14 | RUN rm -rf /pai-extension && mv ~/builder/pai-extension /pai-extension && chmod 777 -R /pai-extension 15 | RUN mv ~/builder/theia-ide/.theia ~/.theia && rm -rf ~/.aistudio/hooks/* 16 | 17 | RUN sh ~/builder/script/install-dumb-init.sh 18 | 19 | RUN sh ~/builder/script/install-node.sh v5.20.3 \ 20 | && rm -rf ~/.aistudio && mkdir -p ~/.aistudio \ 21 | && echo 'export npm_config_user=root' >> ~/.bashrc \ 22 | && mv ~/builder/theia-ide/.aistudio/* ~/.aistudio \ 23 | && mv ~/builder/theia-ide/.aistudio/.[^.]* ~/.aistudio \ 24 | && gcc --version 25 | 26 | RUN sh ~/builder/script/install-third-common.sh 27 | RUN sh ~/builder/script/python/install-sdk.sh || echo "install sdk failed" 28 | RUN sh ~/builder/script/matplot/installer.sh 29 | RUN pip install -I urllib3==1.26.4 && sh ~/builder/script/python/install-jupyter.sh 30 | RUN sh ~/builder/script/setup-base.sh 31 | RUN pip install jinja2==2.11.3 --no-deps 32 | RUN pip install markupsafe==1.1.1 --no-deps 33 | 34 | # git lfs 35 | RUN pip install gradio==3.20.1 36 | 37 | RUN npm i -g @alipay/aistudio-bootstrap \ 38 | && npm i -g @alipay/aistudio-installer-cli \ 39 | && ais-installer install full \ 40 | && ais-installer collect --version=${IMAGEVERSION} --type=${IMAGETYPE} 41 | 42 | RUN pip install -U transformers==4.30.1 43 | RUN pip install -U bitsandbytes==0.39.0 44 | RUN pip install -U accelerate==0.20.3 45 | RUN pip install peft==0.4.0 46 | RUN pip uninstall flash_attn -y 47 | RUN pip install xformers --no-deps 48 | RUN pip install -U atorch==0.1.7rc17 --no-deps 49 | RUN pip install zstandard 50 | RUN pip install ujson 51 | RUN pip install jsonlines -------------------------------------------------------------------------------- /model/peft/tuner/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant 
Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """peft tuner methods interface.""" 17 | 18 | from peft.utils import PeftType 19 | from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING 20 | from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING 21 | 22 | from .adalora import AdaLoraConfig, AdaLoraModel 23 | from .routelora import RouteLoraConfig, RouteLoraModel 24 | from .unipelt import UniPELTConfig, UniPELTModel, PEUniPELTModel 25 | from .pe_base_model import PEBaseModel 26 | from .bitfit import PeftBitfitConfig, PEBitfitModel, PeftBitfitModel 27 | from .roem import PeftROEMConfig, PEROEMModel, PeftROEMModel 28 | 29 | # Register new ant peft methods 30 | PeftType.ROUTELORA = "ROUTELORA" 31 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ROUTELORA] = RouteLoraModel 32 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ROUTELORA] = RouteLoraConfig 33 | 34 | PeftType.UNIPELT = "UNIPELT" 35 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.UNIPELT] = UniPELTModel 36 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.UNIPELT] = UniPELTConfig 37 | 38 | PeftType.ROEM = "ROEM" 39 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ROEM] = PeftROEMModel 40 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ROEM] = PeftROEMConfig 41 | 42 | PeftType.BITFIT = "BITFIT" 43 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.BITFIT] = PeftBitfitModel 44 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.BITFIT] = PeftBitfitConfig -------------------------------------------------------------------------------- /model/gpt_neox/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.file_utils import _LazyModule, is_tokenizers_available, is_torch_available 17 | from transformers.utils import OptionalDependencyNotAvailable 18 | # from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available 19 | # from ...utils import OptionalDependencyNotAvailable 20 | 21 | 22 | _import_structure = {"configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"]} 23 | 24 | try: 25 | if not is_tokenizers_available(): 26 | raise OptionalDependencyNotAvailable() 27 | except OptionalDependencyNotAvailable: 28 | pass 29 | else: 30 | _import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"] 31 | 32 | try: 33 | if not is_torch_available(): 34 | raise OptionalDependencyNotAvailable() 35 | except OptionalDependencyNotAvailable: 36 | pass 37 | else: 38 | _import_structure["modeling_gpt_neox"] = [ 39 | "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST", 40 | "GPTNeoXForCausalLM", 41 | "GPTNeoXLayer", 42 | "GPTNeoXModel", 43 | "GPTNeoXPreTrainedModel", 44 | ] 45 | 46 | 47 | if TYPE_CHECKING: 48 | from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig 49 | 50 | try: 51 | if not is_tokenizers_available(): 52 | raise OptionalDependencyNotAvailable() 53 | except OptionalDependencyNotAvailable: 54 | pass 55 | else: 56 | from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast 57 | 58 | try: 59 | if not is_torch_available(): 60 | raise OptionalDependencyNotAvailable() 61 | except OptionalDependencyNotAvailable: 62 | pass 63 | else: 64 | from .modeling_gpt_neox import ( 65 | GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST, 66 | GPTNeoXForCausalLM, 67 | GPTNeoXLayer, 68 | GPTNeoXModel, 69 | GPTNeoXPreTrainedModel, 70 | ) 71 | 72 | 73 | else: 74 | import sys 75 | 76 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -------------------------------------------------------------------------------- /data/tokenization/generate_dataset.py: -------------------------------------------------------------------------------- 1 | # origin: 使用GPT-Neox原生的Encoder:key为text,只生成input_ids,训练时document首尾相连按窗口去取 2 | # Prompt_padding:使用UniformEncoder:key为input_ids和loss_mask; loss_mask保证只训练Target部分的Loss;每条样本Padding到seq_length,避免了一个Sample里包含多个样本的问题,但缺点是比较浪费计算资源 3 | prompt_padding_cmd = "python preprocess_data.py \ 4 | --input {input} \ 5 | --jsonl-keys input_ids loss_mask \ 6 | --output-prefix {output_prefix} \ 7 | --vocab ../../tokenizer-ant-v3.json \ 8 | --dataset-impl mmap \ 9 | --tokenizer-type HFTokenizer \ 10 | --workers {worker} \ 11 | --encoder UniformEncoder \ 12 | --seq-length {seq_length} \ 13 | --mode sft \ 14 | --padding" 15 | 16 | align_padding_cmd = "python preprocess_data_align.py \ 17 | --input {input} \ 18 | --jsonl-keys w_input_ids w_loss_mask l_input_ids l_loss_mask \ 19 | --output-prefix {output_prefix} \ 20 | --vocab ../../tokenizer-ant-v3.json \ 21 | --dataset-impl mmap \ 22 | --tokenizer-type HFTokenizer \ 23 | --workers {worker} \ 24 | --encoder UniformEncoder \ 25 | --seq-length {seq_length} \ 26 | --mode align \ 27 | --padding" 28 | 29 | origin_cmd = "python preprocess_data.py \ 30 | --input {input} \ 31 | --output-prefix {output_prefix} \ 32 | --vocab ../../tokenizer-ant-v3.json \ 33 | --dataset-impl mmap \ 34 | --tokenizer-type HFTokenizer \ 35 | --workers {worker} \ 36 | --encoder OriginEncoder \ 37 | --append-eod" 38 | 39 | convert_dict = { 40 | "origin":{ 41 | "output_path":"xxx", 42 | "cmd": 
origin_cmd 43 | }, 44 | "prompt_padding":{ 45 | "output_path":"/ossfs/workspace/coh_tokenization", 46 | "cmd": prompt_padding_cmd 47 | }, 48 | "align_padding":{ 49 | "output_path":"/ossfs/workspace/alignment_tokenization", 50 | "cmd": align_padding_cmd 51 | } 52 | } 53 | 54 | input_dict = { 55 | 'dataset1': "/path/dataset1_path", 56 | 'dataset2': "/path/dataset2_path" 57 | } 58 | # conver_type_list = ["align_padding"] 59 | conver_type_list = ["prompt_padding"] 60 | output_name = "coh" 61 | 62 | if __name__ == "__main__": 63 | import os 64 | seq_length = 2048 65 | worker = 16 66 | 67 | for convert_type in conver_type_list: 68 | convert_info = convert_dict[convert_type] 69 | output_path = convert_info["output_path"] 70 | convert_cmd = convert_info["cmd"] 71 | output_prefix = os.path.join(output_path, output_name) 72 | 73 | input_ = ",".join(input_dict.values()) 74 | print(input_) 75 | cmd = convert_cmd.replace("{input}", input_).replace("{output_prefix}", output_prefix).replace("{seq_length}", str(seq_length)).replace("{worker}", str(worker)) 76 | os.system(cmd) -------------------------------------------------------------------------------- /model/llama/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
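
The __main__ block in data/tokenization/generate_dataset.py above fills the {input}, {output_prefix}, {seq_length} and {worker} placeholders by plain string replacement before handing the command to os.system. A hedged sketch of the same substitution for a single dataset (the dataset path is a placeholder; the output prefix follows the script's own prompt_padding settings):

from data.tokenization.generate_dataset import prompt_padding_cmd

cmd = (
    prompt_padding_cmd
    .replace("{input}", "/path/dataset1_path")
    .replace("{output_prefix}", "/ossfs/workspace/coh_tokenization/coh")
    .replace("{seq_length}", "2048")
    .replace("{worker}", "16")
)
print(cmd)  # the exact preprocess_data.py invocation that os.system would run
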
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_sentencepiece_available, 20 | is_tokenizers_available, 21 | is_torch_available, 22 | ) 23 | 24 | 25 | _import_structure = { 26 | "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"], 27 | } 28 | 29 | try: 30 | if not is_sentencepiece_available(): 31 | raise OptionalDependencyNotAvailable() 32 | except OptionalDependencyNotAvailable: 33 | pass 34 | else: 35 | _import_structure["tokenization_llama"] = ["LlamaTokenizer"] 36 | 37 | try: 38 | if not is_tokenizers_available(): 39 | raise OptionalDependencyNotAvailable() 40 | except OptionalDependencyNotAvailable: 41 | pass 42 | else: 43 | _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"] 44 | 45 | try: 46 | if not is_torch_available(): 47 | raise OptionalDependencyNotAvailable() 48 | except OptionalDependencyNotAvailable: 49 | pass 50 | else: 51 | _import_structure["modeling_llama"] = [ 52 | "LlamaForCausalLM", 53 | "LlamaModel", 54 | "LlamaPreTrainedModel", 55 | "LlamaForSequenceClassification", 56 | ] 57 | 58 | 59 | if TYPE_CHECKING: 60 | from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig 61 | 62 | try: 63 | if not is_sentencepiece_available(): 64 | raise OptionalDependencyNotAvailable() 65 | except OptionalDependencyNotAvailable: 66 | pass 67 | else: 68 | from .tokenization_llama import LlamaTokenizer 69 | 70 | try: 71 | if not is_tokenizers_available(): 72 | raise OptionalDependencyNotAvailable() 73 | except OptionalDependencyNotAvailable: 74 | pass 75 | else: 76 | from .tokenization_llama_fast import LlamaTokenizerFast 77 | 78 | try: 79 | if not is_torch_available(): 80 | raise OptionalDependencyNotAvailable() 81 | except OptionalDependencyNotAvailable: 82 | pass 83 | else: 84 | from .modeling_llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel 85 | 86 | 87 | else: 88 | import sys 89 | 90 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) -------------------------------------------------------------------------------- /data/blendable_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
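
Because model/llama/__init__.py above mirrors the upstream transformers lazy-module layout, the locally modified Llama classes are imported the same way as the stock ones. A small sketch that instantiates the small config shipped in model/llama/config.json; the path is relative to the repository root and the random initialisation is only for illustration:

from model.llama import LlamaConfig, LlamaForCausalLM

config = LlamaConfig.from_json_file("model/llama/config.json")
model = LlamaForCausalLM(config)  # randomly initialised, no pretrained weights
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")
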
17 | 18 | """Blendable dataset.""" 19 | 20 | import time 21 | 22 | import numpy as np 23 | import torch 24 | 25 | from utils.common_utils import print_rank_0 26 | 27 | 28 | class BlendableDataset(torch.utils.data.Dataset): 29 | def __init__(self, datasets, weights): 30 | self.datasets = datasets 31 | num_datasets = len(datasets) 32 | assert num_datasets == len(weights) 33 | 34 | self.size = 0 35 | for dataset in self.datasets: 36 | self.size += len(dataset) 37 | 38 | # Normalize weights. 39 | weights = np.array(weights, dtype=np.float64) 40 | sum_weights = np.sum(weights) 41 | assert sum_weights > 0.0 42 | weights /= sum_weights 43 | 44 | # recompute weights 45 | weights = self.calc_weights() 46 | 47 | # Build indices. 48 | start_time = time.time() 49 | assert num_datasets < 255 50 | self.dataset_index = np.zeros(self.size, dtype=np.uint8) 51 | self.dataset_sample_index = np.zeros(self.size, dtype=np.int64) 52 | 53 | from data import helpers 54 | 55 | helpers.build_blending_indices( 56 | self.dataset_index, 57 | self.dataset_sample_index, 58 | weights, 59 | num_datasets, 60 | self.size, 61 | torch.distributed.get_rank() == 0, 62 | ) 63 | 64 | print( 65 | "> RANK {} elapsed time for building blendable dataset indices: " 66 | "{:.2f} (sec)".format( 67 | torch.distributed.get_rank(), time.time() - start_time 68 | ) 69 | ) 70 | 71 | def calc_weights(self): 72 | dataset_sample_cnt = [len(ds) for ds in self.datasets] 73 | total_cnt = sum(dataset_sample_cnt) 74 | weights = np.array([(cnt + 0.0) / total_cnt for cnt in dataset_sample_cnt], dtype=np.float64) 75 | return weights 76 | 77 | def __len__(self): 78 | return self.size 79 | 80 | def __getitem__(self, idx): 81 | try: 82 | dataset_idx = self.dataset_index[idx] 83 | sample_idx = self.dataset_sample_index[idx] 84 | return self.datasets[dataset_idx][sample_idx] 85 | except IndexError: 86 | new_idx = idx % len(self) 87 | print( 88 | f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})" 89 | ) 90 | return self[new_idx] 91 | -------------------------------------------------------------------------------- /tools/analysis/MMapIndexDatasetParser.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import struct 17 | import os 18 | import numpy as np 19 | 20 | 21 | dtypes = { 22 | 1: np.uint8, 23 | 2: np.int8, 24 | 3: np.int16, 25 | 4: np.int32, 26 | 5: np.int64, 27 | 6: np.float32, 28 | 7: np.float64, 29 | 8: np.uint16, 30 | } 31 | 32 | 33 | class MMapIndexDataset: 34 | """ 35 | 描述GPT-Neox mmap实现方式获得的*.idx文件对应的数据集,即Tokenization索引数据集 36 | """ 37 | 38 | # magic code 39 | _HDR_MAGIC = b"MMIDIDX\x00\x00" 40 | 41 | _VERSION = 1 42 | 43 | def __init__(self, index_dataset_file_path): 44 | """ 45 | 对于给定的GPT-Neox mmap实现方式的索引文件生成对应的数据集描述 46 | """ 47 | assert os.path.exists, ( 48 | "给定的路径不存在" 49 | "请确保给定的.idx文件路径是存在的" 50 | ) 51 | assert os.path.isfile, ( 52 | "给定的路径不是一个文件" 53 | "请确保给定的是一个.idx文件的路径" 54 | ) 55 | 56 | self.path = index_dataset_file_path 57 | 58 | with open(self.path, 'rb') as fb: 59 | magic = fb.read(9) 60 | assert magic == self._HDR_MAGIC, ( 61 | "Magic Code与期望格式不匹配" 62 | "请确保提供的是GPT Neox MMAP方式生成的.idx文件" 63 | ) 64 | 65 | version = struct.unpack('&1 | tee $OUTPUT/$PREFIX-output.txt -------------------------------------------------------------------------------- /train/run_coca_neox.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | LOAD_RAW_DATASET=False 3 | if [ ${LOAD_RAW_DATASET} = "True" ]; then 4 | LOAD_RAW_DATASET="--load_raw_dataset" 5 | DATA_PATHS="[/path/dataset1,/path/dataset2]" 6 | DATA_WEIGHTS="[1.,1.]" 7 | DATA_SPLIT="90,10,0" 8 | SHUFFLE_BEFORE_SPLIT="" 9 | USE_RANDOM_SAMPLER="" 10 | USE_WEIGHTED_LOSS="" 11 | WEIGHT_BY_NUM_DOCUMENTS="" 12 | else 13 | LOAD_RAW_DATASET="" 14 | DATA_PATHS="[/path/dataset1,/path/dataset2]" 15 | DATA_WEIGHTS="[1.,1.]" 16 | DATA_SPLIT="90,10,0" 17 | SHUFFLE_BEFORE_SPLIT="--shuffle_before_split" 18 | USE_RANDOM_SAMPLER="--use_random_sampler" 19 | USE_WEIGHTED_LOSS="--use_weighted_loss" 20 | WEIGHT_BY_NUM_DOCUMENTS="--weight_by_num_documents" 21 | fi 22 | 23 | #USE_XFORMERS=True 24 | #if [ ${USE_XFORMERS} = "True" ]; then 25 | # USE_XFORMERS="--use_xformers" 26 | #else 27 | # USE_XFORMERS="" 28 | #fi 29 | 30 | VOCAB_FILE="../tools/codegpt-13b-tokenizer.json" 31 | TOKENIZER_TYPE="HFTokenizer" 32 | MODEL_TYPE="gpt_neox" 33 | MODEL_CONFIG_PATH="../model/${MODEL_TYPE}" 34 | 35 | RESUME_FROM_CHECKPOINT="false" 36 | 37 | PER_DEVICE_BATCH_SIZE=$1 38 | TP=$2 39 | DP=$3 40 | EPOCH=$4 41 | TOTAL_TRAIN_BATCH_SIZE=$(($PER_DEVICE_BATCH_SIZE * $TP * $DP)) 42 | 43 | GPU=$(($TP * $DP)) 44 | OUTPUT="/output/checkpoint/mpt-fsdp-tp-bm-${TOTAL_TRAIN_BATCH_SIZE}-tp-${TP}-dp-${DP}-gpu-${GPU}-bin" 45 | TENSORBOARD_PATH="/output/tensorboard/mpt-fsdp-tp-bm-${TOTAL_TRAIN_BATCH_SIZE}-tp-${TP}-dp-${DP}-gpu-${GPU}-bin" 46 | 47 | PREFIX="master-0" 48 | mkdir -p $OUTPUT || true 49 | echo "output to $OUTPUT" 50 | mkdir -p $TENSORBOARD_PATH 51 | chmod 777 $OUTPUT 52 | chmod 777 $TENSORBOARD_PATH 53 | 54 | # atorch environment maybe not available yet, opensource soon 55 | pip install -U --no-deps atorch==0.1.7rc9 56 | pip install tensorboard==2.3.0 57 | pip install peft==0.3.0 --no-dependencies 58 | pip install zstandard 59 | pip install ujson 60 | pip install jsonlines 61 | pip list 62 | 63 | python -m atorch.distributed.launch \ 64 | --nproc_per_node=$(nvidia-smi -L | wc -l) \ 65 | run_train.py \ 66 | ${LOAD_RAW_DATASET} \ 67 | ${SPLIT_BEFORE_READ} \ 68 | --tokenize_mode 'pretrain' \ 69 | --train_mode 'sst' \ 70 | --config_path $MODEL_CONFIG_PATH \ 71 | --tokenizer_type $TOKENIZER_TYPE \ 72 | --vocab_file $VOCAB_FILE \ 73 | --model_type $MODEL_TYPE \ 74 | --padding \ 75 | --data_paths $DATA_PATHS \ 76 | 
--data_weights $DATA_WEIGHTS \ 77 | --data_split $DATA_SPLIT \ 78 | ${SHUFFLE_BEFORE_SPLIT} \ 79 | ${USE_RANDOM_SAMPLER} \ 80 | ${USE_WEIGHTED_LOSS} \ 81 | ${WEIGHT_BY_NUM_DOCUMENTS} \ 82 | --train_iters 100 \ 83 | --num_warmup_steps 6000 \ 84 | --custom_lr_scheduler_type 'cosine' \ 85 | --learning_rate 1.0e-4 \ 86 | --min_lr 1.0e-5 \ 87 | --valid_iters 500 \ 88 | --valid_interval 2000 \ 89 | --num_train_epochs $EPOCH \ 90 | --seq_length 512 \ 91 | --total_train_batch_size $TOTAL_TRAIN_BATCH_SIZE \ 92 | --per_device_valid_batch_size $PER_DEVICE_BATCH_SIZE \ 93 | --seed 42 \ 94 | --preprocessing_num_workers 6 \ 95 | --num_workers 8 \ 96 | --output_dir $OUTPUT \ 97 | --tensorboard_dir $TENSORBOARD_PATH \ 98 | --ignore_mismatched_sizes \ 99 | --skip_atorch_autoacc_dryrun \ 100 | --tp $TP \ 101 | --dp $DP \ 102 | --bf16 \ 103 | --pipe_parallel_size 0 \ 104 | --model_parallel_size 1 \ 105 | --checkpointing_steps 5000 \ 106 | --log_interval 10 \ 107 | --make_vocab_size_divisible_by 128 \ 108 | --weighted_loss_mode 'case3' \ 109 | --checkpoint_activations \ 110 | --resume_from_checkpoint $RESUME_FROM_CHECKPOINT \ 111 | --max_grad_norm 1 \ 112 | --evaluation_strategy "steps,epoch" \ 113 | --save_strategy "steps" \ 114 | --save_total_limit 10 \ 115 | --extra_save_by_epoch \ 116 | --metric_for_best_model 'loss' \ 117 | --greater_is_better 'false' \ 118 | --zero_opt_level zero3 2>&1 | tee $OUTPUT/$PREFIX-output.txt -------------------------------------------------------------------------------- /tokenizer/train_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Assumes a dataset of jsonl files in the same format as the neox training set. 17 | """ 18 | 19 | from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers 20 | from tokenizers.normalizers import NFKC 21 | 22 | from glob import glob 23 | import os 24 | import json 25 | import argparse 26 | 27 | 28 | def load_jsonl(input_path, quiet=True) -> list: 29 | """ 30 | Read list of objects from a JSON lines file. 
31 | """ 32 | data = [] 33 | with open(input_path, "r", encoding="utf-8") as f: 34 | for line in f: 35 | data.append(json.loads(line.rstrip("\n|\r"))) 36 | if not quiet: 37 | print("Loaded {} records from {}".format(len(data), input_path)) 38 | return data 39 | 40 | 41 | def json_iterator(input_dir, text_key="text"): 42 | all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json") 43 | for j in all_jsonls: 44 | data = load_jsonl(j) 45 | for doc in data: 46 | yield doc[text_key] 47 | 48 | 49 | def train_tokenizer( 50 | input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000 51 | ): 52 | """ 53 | Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path` 54 | 55 | :param input_dir: input directory containing jsonl files 56 | :param save_path: path to save tokenizer to 57 | :param tokenizer_type: type of tokenizer to train. 58 | :param vocab_size: int, size of tokenizer's vocab 59 | :return: 60 | """ 61 | 62 | if tokenizer_type == "BPE": 63 | model = models.BPE() 64 | else: 65 | raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented") 66 | tokenizer = Tokenizer(model) 67 | 68 | # Customize pre-tokenization and decoding 69 | tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True) 70 | tokenizer.decoder = decoders.ByteLevel() 71 | tokenizer.post_processor = processors.ByteLevel(trim_offsets=True) 72 | tokenizer.normalizer = NFKC() 73 | 74 | # And then train 75 | trainer = trainers.BpeTrainer( 76 | vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"] 77 | ) 78 | tokenizer.train_from_iterator(json_iterator(input_dir), trainer) 79 | 80 | # And Save it 81 | tokenizer.save(save_path, pretty=True) 82 | print(f"Tokenizer saved at {save_path}") 83 | 84 | 85 | def parse_args(): 86 | parser = argparse.ArgumentParser( 87 | description="script for training a multilingual " 88 | "HF tokenizer on CC dumps with upweighting for low resource languages" 89 | ) 90 | parser.add_argument( 91 | "--json_input_dir", 92 | type=str, 93 | help="Path to folder containing tokenizer training data in jsonl format", 94 | ) 95 | parser.add_argument( 96 | "--tokenizer_output_path", 97 | type=str, 98 | help="Path to which your trained tokenizer will be saved (should end in .json)", 99 | ) 100 | parser.add_argument( 101 | "--tokenizer_type", 102 | type=str, 103 | help="type of tokenizer to train, currently only BPE is supported", 104 | choices=["BPE"], 105 | default=["BPE"], 106 | ) 107 | parser.add_argument( 108 | "-v", 109 | "--vocab_size", 110 | help="vocabulary size of tokenizer, default=52k", 111 | type=int, 112 | default=52000, 113 | ) 114 | return parser.parse_args() 115 | 116 | 117 | if __name__ == "__main__": 118 | 119 | args = parse_args() 120 | 121 | train_tokenizer( 122 | args.json_input_dir, 123 | save_path=args.tokenizer_output_path, 124 | tokenizer_type=args.tokenizer_type, 125 | vocab_size=args.vocab_size, 126 | ) 127 | -------------------------------------------------------------------------------- /tools/analysis/post_tokenization_check.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import MMapIndexDatasetParser 17 | import MMapTokenIdsBinChecker 18 | import argparse 19 | import os 20 | 21 | 22 | def process_dataset(index_dataset_path, input_ids_bin_path, loss_mask_bin_path, tokenizer_path, detokenization_output_path, seq_len, random_sampling_num): 23 | # 检查index dataset IDX文件 24 | mmap_index_dataset_checker = MMapIndexDatasetParser.MMapIndexDatasetChecker(index_dataset_path) 25 | if not os.path.exists(input_ids_bin_path) or not os.path.isfile(input_ids_bin_path): 26 | print(f"给定的input_ids.bin路径不存在或不是文件:{input_ids_bin_path}") 27 | return False 28 | 29 | check_result = mmap_index_dataset_checker.check(bin_bytes_size=os.path.getsize(input_ids_bin_path), seq_len=seq_len) 30 | if not check_result: 31 | print('!!!'*40) 32 | print(f"!\033[1;31;47mIDX检查未通过 {index_dataset_path}\033[0m") 33 | print('!!!'*40) 34 | return False 35 | 36 | # 检查input ids BIN文件 37 | mmap_token_ids_bin_checker = MMapTokenIdsBinChecker.MMapTokenIdsBinChecker(input_ids_bin_path=input_ids_bin_path, 38 | loss_mask_bin_path=loss_mask_bin_path, 39 | tokenizer_path=tokenizer_path, 40 | detokenize_output_path=detokenization_output_path, 41 | seq_len=seq_len, 42 | element_size=mmap_index_dataset_checker.mmap_index_dataset._dtype_size, 43 | dtype=mmap_index_dataset_checker.mmap_index_dataset._dtype, 44 | sample_total=mmap_index_dataset_checker.mmap_index_dataset._len, 45 | ramdom_sampling_num=random_sampling_num) 46 | 47 | check_result = mmap_token_ids_bin_checker.check() 48 | if not check_result: 49 | print('!!!'*40) 50 | print(f'!\033[1;31;47m【ERROR】数据集{loss_mask_bin_path}抽检未通过\033[0m') 51 | print('!!!'*40) 52 | return False 53 | else: 54 | print('###'*40) 55 | print(f'#\033[1;32;47m【OK】数据集{loss_mask_bin_path}抽检通过\033[0m') 56 | print('###'*40) 57 | return True 58 | 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser(description="") 63 | parser.add_argument('datasets_dir', help="GPT Neox MMAP方式生成的数据集目录路径,要求该路径下每个子目录对应一个数据集") 64 | parser.add_argument('tokenizer_path', help="词表文件路径") 65 | parser.add_argument('detokenization_output_dir', help='存储detokenization结果文件的目录路径') 66 | parser.add_argument('seq_len', help="每个样本token数量") 67 | parser.add_argument('--random_sampling_num', '-rsn', type=int,default=5, help='要随机抽样检查的数量,默认是100') 68 | 69 | args = parser.parse_args() 70 | 71 | for dir in os.listdir(args.datasets_dir): 72 | #if dir != 'codecompletion': 73 | # continue 74 | for file in os.listdir(os.path.join(args.datasets_dir, dir)): 75 | if file.endswith('_input_ids.idx'): 76 | mmap_index_dataset_path = os.path.join(args.datasets_dir, dir, file) 77 | elif file.endswith('_input_ids.bin'): 78 | mmap_input_ids_bin_path = os.path.join(args.datasets_dir, dir, file) 79 | elif file.endswith('loss_mask.bin'): 80 | mmap_loss_mask_bin_path = os.path.join(args.datasets_dir, dir, file) 81 | 82 | detokenization_output_path = os.path.join(args.detokenization_output_dir, f"{dir}.txt") 83 | print(f'\n\n开始检查数据集{dir}....') 84 | process_dataset(mmap_index_dataset_path, mmap_input_ids_bin_path, mmap_loss_mask_bin_path, args.tokenizer_path, 
detokenization_output_path, int(args.seq_len), args.random_sampling_num) -------------------------------------------------------------------------------- /utils/hselect.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Functions ported from the R package sm. 3 | 4 | Implements different bandwidth selection methods, including: 5 | - Scott's rule of thumb 6 | - Silverman's rule of thumb 7 | - Sheather-Jones estimator 8 | ''' 9 | 10 | import numpy as np 11 | # import distributions as distr 12 | 13 | 14 | __all__ = ['wmean', 15 | 'wvar', 16 | 'dnorm', 17 | 'hsilverman', 18 | 'hscott', 19 | 'hnorm', 20 | 'hsj'] 21 | 22 | 23 | def wmean(x, w): 24 | ''' 25 | Weighted mean 26 | ''' 27 | return sum(x * w) / float(sum(w)) 28 | 29 | 30 | def wvar(x, w): 31 | ''' 32 | Weighted variance 33 | ''' 34 | return sum(w * (x - wmean(x, w)) ** 2) / float(sum(w) - 1) 35 | 36 | 37 | def dnorm(x): 38 | return distr.normal.pdf(x, 0.0, 1.0) 39 | 40 | 41 | def bowman(x): 42 | pass 43 | # TODO: implement? 44 | #hx = median(abs(x - median(x))) / 0.6745 * (4 / 3 / r.n) ^ 0.2 45 | #hy = median(abs(y - median(y))) / 0.6745 * (4 / 3 / r.n) ^ 0.2 46 | #h = sqrt(hy * hx) 47 | 48 | 49 | def hsilverman(x, weights=None): 50 | IQR = np.percentile(x, 75) - np.percentile(x, 25) 51 | A = min(np.std(x, ddof=1), IQR / 1.349) 52 | 53 | if weights is None: 54 | weights = np.ones(len(x)) 55 | n = float(sum(weights)) 56 | 57 | return 0.9 * A * n ** (-0.2) 58 | 59 | 60 | def hscott(x, weights=None): 61 | 62 | IQR = np.percentile(x, 75) - np.percentile(x, 25) 63 | A = min(np.std(x, ddof=1), IQR / 1.349) 64 | 65 | if weights is None: 66 | weights = np.ones(len(x)) 67 | n = float(sum(weights)) 68 | 69 | return 1.059 * A * n ** (-0.2) 70 | 71 | 72 | def hnorm(x, weights=None): 73 | ''' 74 | Bandwidth estimate assuming f is normal. See paragraph 2.4.2 of 75 | Bowman and Azzalini[1]_ for details. 76 | 77 | References 78 | ---------- 79 | .. [1] Applied Smoothing Techniques for Data Analysis: the 80 | Kernel Approach with S-Plus Illustrations. 81 | Bowman, A.W. and Azzalini, A. (1997). 82 | Oxford University Press, Oxford 83 | ''' 84 | 85 | x = np.asarray(x) 86 | 87 | if weights is None: 88 | weights = np.ones(len(x)) 89 | 90 | n = float(sum(weights)) 91 | 92 | if len(x.shape) == 1: 93 | sd = np.sqrt(wvar(x, weights)) 94 | return sd * (4 / (3 * n)) ** (1 / 5.0) 95 | 96 | # TODO: make this work for more dimensions 97 | # ((4 / (p + 2) * n)^(1 / (p+4)) * sigma_i 98 | if len(x.shape) == 2: 99 | ndim = x.shape[1] 100 | sd = np.sqrt(np.apply_along_axis(wvar, 1, x, weights)) 101 | return (4.0 / ((ndim + 2.0) * n) ** (1.0 / (ndim + 4.0))) * sd 102 | 103 | 104 | def hsj(x, weights=None): 105 | ''' 106 | Sheather-Jones bandwidth estimator [1]_. 107 | 108 | References 109 | ---------- 110 | .. [1] A reliable data-based bandwidth selection method for kernel 111 | density estimation. Simon J. Sheather and Michael C. Jones. 112 | Journal of the Royal Statistical Society, Series B. 1991 113 | ''' 114 | 115 | h0 = hnorm(x) 116 | v0 = sj(x, h0) 117 | 118 | if v0 > 0: 119 | hstep = 1.1 120 | else: 121 | hstep = 0.9 122 | 123 | h1 = h0 * hstep 124 | v1 = sj(x, h1) 125 | 126 | while v1 * v0 > 0: 127 | h0 = h1 128 | v0 = v1 129 | h1 = h0 * hstep 130 | v1 = sj(x, h1) 131 | 132 | return h0 + (h1 - h0) * abs(v0) / (abs(v0) + abs(v1)) 133 | 134 | 135 | def sj(x, h): 136 | ''' 137 | Equation 12 of Sheather and Jones [1]_ 138 | 139 | References 140 | ---------- 141 | .. 
[1] A reliable data-based bandwidth selection method for kernel 142 | density estimation. Simon J. Sheather and Michael C. Jones. 143 | Journal of the Royal Statistical Society, Series B. 1991 144 | ''' 145 | phi6 = lambda x: (x ** 6 - 15 * x ** 4 + 45 * x ** 2 - 15) * dnorm(x) 146 | phi4 = lambda x: (x ** 4 - 6 * x ** 2 + 3) * dnorm(x) 147 | 148 | n = len(x) 149 | one = np.ones((1, n)) 150 | 151 | lam = np.percentile(x, 75) - np.percentile(x, 25) 152 | a = 0.92 * lam * n ** (-1 / 7.0) 153 | b = 0.912 * lam * n ** (-1 / 9.0) 154 | 155 | W = np.tile(x, (n, 1)) 156 | W = W - W.T 157 | 158 | W1 = phi6(W / b) 159 | tdb = np.dot(np.dot(one, W1), one.T) 160 | tdb = -tdb / (n * (n - 1) * b ** 7) 161 | 162 | W1 = phi4(W / a) 163 | sda = np.dot(np.dot(one, W1), one.T) 164 | sda = sda / (n * (n - 1) * a ** 5) 165 | 166 | alpha2 = 1.357 * (abs(sda / tdb)) ** (1 / 7.0) * h ** (5 / 7.0) 167 | 168 | W1 = phi4(W / alpha2) 169 | sdalpha2 = np.dot(np.dot(one, W1), one.T) 170 | sdalpha2 = sdalpha2 / (n * (n - 1) * alpha2 ** 5) 171 | 172 | return (distr.normal.pdf(0, 0, np.sqrt(2)) / 173 | (n * abs(sdalpha2[0, 0]))) ** 0.2 - h -------------------------------------------------------------------------------- /utils/learning_rates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Learning rate decay functions.""" 19 | 20 | import math 21 | 22 | # from .common_utils import print_rank_0 23 | 24 | 25 | class AnnealingLR(object): 26 | """Anneals the learning rate.""" 27 | 28 | def __init__( 29 | self, 30 | optimizer, 31 | start_lr, 32 | warmup_iter, 33 | total_iters, 34 | decay_style, 35 | last_iter, 36 | min_lr=0.0, 37 | use_checkpoint_lr_scheduler=True, 38 | override_lr_scheduler=False, 39 | use_mup=False, 40 | ): 41 | 42 | # Class values. 43 | self.optimizer = optimizer 44 | self.start_lr = start_lr 45 | self.min_lr = min_lr 46 | self.warmup_iter = warmup_iter 47 | self.num_iters = last_iter 48 | self.end_iter = total_iters 49 | assert self.end_iter > 0 50 | self.decay_style = decay_style 51 | self.override_lr_scheduler = override_lr_scheduler 52 | self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler 53 | self.use_mup = use_mup 54 | if self.override_lr_scheduler: 55 | assert not self.use_checkpoint_lr_scheduler, ( 56 | "both override and " "use-checkpoint are set." 57 | ) 58 | # Set the learning rate 59 | self.step(self.num_iters) 60 | 61 | print("> learning rate decay style: {}".format(self.decay_style)) 62 | 63 | def update_lr(self, lr): 64 | self.start_lr = lr 65 | 66 | def get_lr(self): 67 | """Learning rate decay functions from: 68 | https://openreview.net/pdf?id=BJYwwY9ll pg. 
4""" 69 | 70 | num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter) 71 | # Warmup. 72 | if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter: 73 | return float(self.start_lr) * num_iters_ / self.warmup_iter 74 | 75 | num_iters_ = num_iters_ - self.warmup_iter 76 | if self.decay_style == "linear": 77 | lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter 78 | elif self.decay_style == "cosine": 79 | lr = ( 80 | self.start_lr 81 | / 2.0 82 | * (math.cos(math.pi * num_iters_ / self.end_iter) + 1) 83 | ) 84 | elif self.decay_style == "exponential": 85 | # exp(-0.693) = 1/2 86 | lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter) 87 | else: 88 | lr = self.start_lr 89 | return max(lr, self.min_lr) 90 | 91 | def step(self, step_num=None): 92 | """Set lr for all parameters groups.""" 93 | if step_num is None: 94 | step_num = self.num_iters + 1 95 | self.num_iters = step_num 96 | new_lr = self.get_lr() 97 | for group in self.optimizer.param_groups: 98 | if self.use_mup and "width_mult" in group: 99 | group["lr"] = new_lr / group["width_mult"] 100 | else: 101 | group["lr"] = new_lr 102 | 103 | def state_dict(self): 104 | state_dict = { 105 | "start_lr": self.start_lr, 106 | "warmup_iter": self.warmup_iter, 107 | "num_iters": self.num_iters, 108 | "decay_style": self.decay_style, 109 | "end_iter": self.end_iter, 110 | "min_lr": self.min_lr, 111 | } 112 | return state_dict 113 | 114 | def _check_and_set(self, cls_value, sd_value, name): 115 | """Auxiliary function for checking the values in the checkpoint and 116 | setting them.""" 117 | if self.override_lr_scheduler: 118 | print_rank_0(" > overriding {} value to {}".format(name, cls_value)) 119 | return cls_value 120 | 121 | if not self.use_checkpoint_lr_scheduler: 122 | assert cls_value == sd_value, ( 123 | "AnnealingLR: class input value" 124 | "and checkpoint values for {} do not match".format(name) 125 | ) 126 | print_rank_0(" > using checkpoint value {} for {}".format(sd_value, name)) 127 | return sd_value 128 | 129 | def load_state_dict(self, sd): 130 | 131 | self.start_lr = self._check_and_set( 132 | self.start_lr, sd["start_lr"], "learning rate" 133 | ) 134 | self.min_lr = self._check_and_set( 135 | self.min_lr, sd["min_lr"], "minimum learning rate" 136 | ) 137 | self.warmup_iter = self._check_and_set( 138 | self.warmup_iter, sd["warmup_iter"], "warmup iterations" 139 | ) 140 | self.end_iter = self._check_and_set( 141 | self.end_iter, sd["end_iter"], "total number of iterations" 142 | ) 143 | self.decay_style = self._check_and_set( 144 | self.decay_style, sd["decay_style"], "decay style" 145 | ) 146 | 147 | self.num_iters = sd["num_iters"] 148 | self.step(self.num_iters) 149 | -------------------------------------------------------------------------------- /model/peft/utils/mapping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright (c) 2023 Ant Group. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import sys 18 | sys.path.append("..") 19 | sys.path.append("../..") 20 | import torch 21 | from peft.utils import ( 22 | TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, 23 | TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING 24 | ) 25 | 26 | 27 | # needed for prefix-tuning of bloom model 28 | def bloom_model_postprocess_past_key_value(past_key_values): 29 | past_key_values = torch.cat(past_key_values) 30 | ( 31 | total_layers, 32 | batch_size, 33 | num_attention_heads, 34 | num_virtual_tokens, 35 | head_dim, 36 | ) = past_key_values.shape 37 | keys = past_key_values[: total_layers // 2] 38 | keys = keys.transpose(2, 3).reshape( 39 | total_layers // 2, 40 | batch_size * num_attention_heads, 41 | head_dim, 42 | num_virtual_tokens, 43 | ) 44 | values = past_key_values[total_layers // 2 :] 45 | values = values.reshape( 46 | total_layers // 2, 47 | batch_size * num_attention_heads, 48 | num_virtual_tokens, 49 | head_dim, 50 | ) 51 | 52 | return tuple(zip(keys, values)) 53 | 54 | 55 | NEW_TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = { 56 | "t5": ["q", "v"], 57 | "mt5": ["q", "v"], 58 | "bart": ["q_proj", "v_proj"], 59 | "gpt2": ["c_attn"], 60 | "bloom": ["query_key_value"], 61 | "bloomz": ["query_key_value"], 62 | "blip-2": ["q", "v", "q_proj", "v_proj"], 63 | "opt": ["q_proj", "v_proj"], 64 | "gptj": ["q_proj", "v_proj"], 65 | "gpt_neox": ["query_key_value"], 66 | "gpt_neo": ["q_proj", "v_proj"], 67 | "bert": ["query", "value"], 68 | "roberta": ["query", "value"], 69 | "xlm-roberta": ["query", "value"], 70 | "electra": ["query", "value"], 71 | "deberta-v2": ["query_proj", "value_proj"], 72 | "deberta": ["in_proj"], 73 | "layoutlm": ["query", "value"], 74 | "llama": ["q_proj", "v_proj"], 75 | "chatglm": ["query_key_value"], 76 | "antglm": ["query_key_value"], 77 | "glm": ["query_key_value"], 78 | } 79 | 80 | NEW_TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = { 81 | "t5": ["q", "k", "v", "o", "wi", "wo"], 82 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], 83 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 84 | # "gpt2": ["c_attn"], 85 | "bloom": ["query_key_value"], 86 | "bloomz": ["query_key_value"], 87 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 88 | # "gptj": ["q_proj", "v_proj"], 89 | # "gpt_neox": ["query_key_value"], 90 | # "gpt_neo": ["q_proj", "v_proj"], 91 | # "bert": ["query", "value"], 92 | "roberta": ["query", "key", "value", "dense"], 93 | # "xlm-roberta": ["query", "value"], 94 | # "electra": ["query", "value"], 95 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], 96 | "chatglm": ["query_key_value"], 97 | "antglm": ["query_key_value"], 98 | "glm": ["query_key_value"], 99 | # "deberta": ["in_proj"], 100 | # "layoutlm": ["query", "value"], 101 | } 102 | 103 | TRANSFORMERS_MODELS_TO_LORA_LAGE_TARGET_MODULES_MAPPING = { 104 | "t5": ["q", "k", "v", "o", "wi", "wo"], 105 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], 106 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 107 | # "gpt2": ["c_attn"], 108 | "bloom": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], 109 | "bloomz": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], 110 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 111 | # "gptj": ["q_proj", "v_proj"], 112 | # "gpt_neox": ["query_key_value"], 113 | # "gpt_neo": ["q_proj", "v_proj"], 114 | # "bert": 
["query", "value"], 115 | "roberta": ["query", "key", "value", "dense"], 116 | # "xlm-roberta": ["query", "value"], 117 | # "electra": ["query", "value"], 118 | "llama": ["q_proj", "v_proj"], 119 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], 120 | "antglm": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], 121 | "glm": ["query_key_value", "dense"] 122 | # "deberta": ["in_proj"], 123 | # "layoutlm": ["query", "value"], 124 | } 125 | 126 | TRANSFORMERS_MODELS_TO_ROUTELORA_TARGET_MODULES_MAPPING = { 127 | "t5": ["q", "k", "v", "o", "wi", "wo"], 128 | "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], 129 | "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 130 | "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], 131 | "roberta": ["query", "key", "value", "dense"], 132 | "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], 133 | "chatglm": ["query_key_value"], 134 | "glm": ["query_key_value"] 135 | } 136 | 137 | TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING = { 138 | "glm": [0, 22], 139 | "antglm": [17, 22], 140 | "bloom": [17, 22], 141 | "bloomz": [17, 22], 142 | } 143 | 144 | TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = { 145 | "bloom": bloom_model_postprocess_past_key_value, 146 | "bloomz": bloom_model_postprocess_past_key_value, 147 | } 148 | 149 | WEIGHTS_NAME = "adapter_model.bin" 150 | CONFIG_NAME = "adapter_config.json" 151 | 152 | 153 | TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.update( 154 | NEW_TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING 155 | ) 156 | TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING.update( 157 | NEW_TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING 158 | ) 159 | -------------------------------------------------------------------------------- /model/gpt_neox/tokenization_gpt_neox_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group 3 | # This file is based on code by the authors denoted below and has been modified from its original version. 4 | # 5 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | """Tokenization classes for GPTNeoX.""" 19 | import json 20 | from typing import TYPE_CHECKING, List, Optional, Tuple 21 | 22 | from tokenizers import pre_tokenizers 23 | 24 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 25 | from transformers.utils import logging 26 | 27 | 28 | if TYPE_CHECKING: 29 | from transformers.pipelines.conversational import Conversation 30 | 31 | 32 | logger = logging.get_logger(__name__) 33 | 34 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} 35 | 36 | PRETRAINED_VOCAB_FILES_MAP = { 37 | "tokenizer_file": { 38 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/tokenizer.json", 39 | }, 40 | } 41 | 42 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 43 | "gpt-neox-20b": 2048, 44 | } 45 | 46 | 47 | class GPTNeoXTokenizerFast(PreTrainedTokenizerFast): 48 | """ 49 | Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level 50 | Byte-Pair-Encoding. 51 | 52 | This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will 53 | be encoded differently whether it is at the beginning of the sentence (without space) or not: 54 | 55 | ```python 56 | >>> from transformers import GPTNeoXTokenizerFast 57 | 58 | >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2") 59 | >>> tokenizer("Hello world")["input_ids"] 60 | [15496, 995] 61 | 62 | >>> tokenizer(" Hello world")["input_ids"] 63 | [18435, 995] 64 | ``` 65 | 66 | You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since 67 | the model was not pretrained this way, it might yield a decrease in performance. 68 | 69 | 70 | 71 | When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. 72 | 73 | 74 | 75 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 76 | refer to this superclass for more information regarding those methods. 77 | 78 | Args: 79 | vocab_file (`str`): 80 | Path to the vocabulary file. 81 | merges_file (`str`): 82 | Path to the merges file. 83 | errors (`str`, *optional*, defaults to `"replace"`): 84 | Paradigm to follow when decoding bytes to UTF-8. See 85 | [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. 86 | unk_token (`str`, *optional*, defaults to `<|endoftext|>`): 87 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 88 | token instead. 89 | bos_token (`str`, *optional*, defaults to `<|endoftext|>`): 90 | The beginning of sequence token. 91 | eos_token (`str`, *optional*, defaults to `<|endoftext|>`): 92 | The end of sequence token. 93 | add_prefix_space (`bool`, *optional*, defaults to `False`): 94 | Whether or not to add an initial space to the input. This allows to treat the leading word just as any 95 | other word. (GPTNeoX tokenizer detect beginning of words by the preceding space). 96 | trim_offsets (`bool`, *optional*, defaults to `True`): 97 | Whether or not the post-processing step should trim offsets to avoid including whitespaces. 
98 | """ 99 | 100 | vocab_files_names = VOCAB_FILES_NAMES 101 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 102 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 103 | model_input_names = ["input_ids", "attention_mask"] 104 | 105 | def __init__( 106 | self, 107 | vocab_file=None, 108 | merges_file=None, 109 | tokenizer_file=None, 110 | unk_token="<|endoftext|>", 111 | bos_token="<|endoftext|>", 112 | eos_token="<|endoftext|>", 113 | add_prefix_space=False, 114 | **kwargs, 115 | ): 116 | super().__init__( 117 | vocab_file, 118 | merges_file, 119 | tokenizer_file=tokenizer_file, 120 | unk_token=unk_token, 121 | bos_token=bos_token, 122 | eos_token=eos_token, 123 | add_prefix_space=add_prefix_space, 124 | **kwargs, 125 | ) 126 | 127 | pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) 128 | if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space: 129 | pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type")) 130 | pre_tok_state["add_prefix_space"] = add_prefix_space 131 | self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) 132 | 133 | self.add_prefix_space = add_prefix_space 134 | 135 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 136 | files = self._tokenizer.model.save(save_directory, name=filename_prefix) 137 | return tuple(files) 138 | 139 | def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: 140 | """This corresponds to DialoGPT variants of models.""" 141 | input_ids = [] 142 | for is_user, text in conversation.iter_texts(): 143 | input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id]) 144 | 145 | if len(input_ids) > self.model_max_length: 146 | input_ids = input_ids[-self.model_max_length :] 147 | return input_ids 148 | -------------------------------------------------------------------------------- /model/peft/tuner/bitfit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import sys 17 | sys.path.append("..") 18 | sys.path.append("../..") 19 | import torch 20 | import importlib 21 | from enum import Enum 22 | from peft.utils import PeftType 23 | from dataclasses import dataclass, field, asdict 24 | from typing import Optional, List 25 | 26 | from .pe_base_model import PEBaseModel 27 | from model.peft.utils import PetuningConfig 28 | from model.peft.utils.others import _freeze_model 29 | 30 | 31 | def is_alps_available(): 32 | return importlib.util.find_spec("alps") is not None 33 | 34 | 35 | if is_alps_available(): 36 | from alps.util import logger 37 | else: 38 | import logging 39 | logger = logging.getLogger(__file__) 40 | 41 | 42 | class PEBitfitModel(PEBaseModel): 43 | """ 44 | 只训练模型bias:参考 https://arxiv.org/pdf/2106.10199.pdf 45 | model: huggingface transformers model 46 | tokenizer: huggingface transformers tokenizer 47 | """ 48 | 49 | def __init__(self, model): 50 | self.model = model 51 | 52 | def get_model(self): 53 | not_freeze_param_name = ["bias"] 54 | set_parameter_requires_grad(self.model, not_freeze_param_name) 55 | return self.model 56 | 57 | @classmethod 58 | def restore(self, model=None, path=None): 59 | logger.info("bitfit不需要额外加载参数") 60 | return model 61 | 62 | 63 | # 根据名称锁定参数层 64 | def set_parameter_requires_grad(model, freeze_param_name=[]): 65 | if not isinstance(freeze_param_name, list): 66 | freeze_param_name = [freeze_param_name] 67 | 68 | for idx, (name, param) in enumerate(model.named_parameters()): 69 | for p in freeze_param_name: 70 | if p not in name: 71 | param.requires_grad = False 72 | # 打印参数层名 73 | for idx, (name, param) in enumerate(model.named_parameters()): 74 | for p in freeze_param_name: 75 | if p in name: 76 | print("trainable parameter name is:") 77 | print(name) 78 | param.requires_grad = True 79 | 80 | 81 | @dataclass 82 | class PeftBitfitConfig(PetuningConfig): 83 | """ 84 | This is the configuration class to store the configuration of a [`PeftBitfitModel`]. 85 | 86 | Args: 87 | modules_to_save (`List[str]`):List of modules apart from LoRA layers to be set as trainable 88 | and saved in the final checkpoint. 89 | """ 90 | 91 | modules_to_save: Optional[List[str]] = field( 92 | default=None, 93 | metadata={ 94 | "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " 95 | "For example, in Sequence Classification or Token Classification tasks, " 96 | "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 97 | }, 98 | ) 99 | 100 | def __post_init__(self): 101 | self.peft_type = PeftType.BITFIT 102 | 103 | 104 | class PeftBitfitModel(torch.nn.Module): 105 | """ 106 | Creates Bitfit model for ant peft. 107 | 108 | Args: 109 | model ([`~transformers.PreTrainedModel`]): The model to be freeze with some layers. 110 | config ([`PeftBitfitConfig`]): The configuration of the Bitfit model. 111 | 112 | Returns: 113 | `torch.nn.Module`: The Bitfit model. 
114 | 115 | Example: 116 | 117 | ```python 118 | >>> from solutions.antllm.antllm.models.glm.modeling_glm import GLMForConditionalGeneration 119 | >>> from solutions.antllm.antllm.models.peft.tuner import PeftBitfitConfig, PeftBitfitModel 120 | >>> from peft import LoraModel, LoraConfig 121 | 122 | >>> config = PeftBitfitConfig() 123 | 124 | >>> model = GLMForConditionalGeneration.from_pretrained("path_to_model") 125 | >>> roem_model = PeftBitfitModel(config, model) 126 | ``` 127 | 128 | **Attributes**: 129 | - **model** ([`~transformers.PreTrainedModel`]) -- The model to be freezed. 130 | - **peft_config** ([`PeftBitfitConfig`]): The configuration of the Bitfit model. 131 | """ 132 | 133 | def __init__(self, model, config, adapter_name): 134 | super().__init__() 135 | self.model = model 136 | 137 | self.forward = self.model.forward 138 | self.peft_config = config 139 | self.add_adapter(adapter_name, self.peft_config[adapter_name]) 140 | 141 | def add_adapter(self, adapter_name, config=None): 142 | if not isinstance(config, PeftBitfitConfig): 143 | raise ValueError( 144 | f"The PeftBitfitModel need PeftBitfitConfig, but get {type(config)}." 145 | ) 146 | 147 | if config is not None: 148 | config = self._prepare_lora_config(config) 149 | self.peft_config[adapter_name] = config 150 | 151 | if len(self.peft_config) > 1: 152 | raise ValueError( 153 | "BitfitModel supports only 1 peft config or name." 154 | "Because it only freeze the shallow layers without any additional parameters." 155 | ) 156 | 157 | self.model = PEBitfitModel(self.model).get_model() 158 | 159 | if self.peft_config[adapter_name].inference_mode: 160 | _freeze_model(self.model) 161 | 162 | @staticmethod 163 | def _prepare_lora_config(peft_config): 164 | if peft_config.inference_mode: 165 | peft_config.merge_weights = True 166 | return peft_config 167 | 168 | def __getattr__(self, name: str): 169 | """Forward missing attributes to the wrapped module.""" 170 | try: 171 | return super().__getattr__(name) # defer to nn.Module's logic 172 | except AttributeError: 173 | return getattr(self.model, name) 174 | 175 | def get_peft_config_as_dict(self, inference: bool = False): 176 | config_dict = {} 177 | for key, value in self.peft_config.items(): 178 | config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} 179 | if inference: 180 | config["inference_mode"] = True 181 | config_dict[key] = config 182 | return config -------------------------------------------------------------------------------- /data/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | """Batch samplers that work with either random or sequential data samplers.""" 19 | 20 | import torch 21 | from torch.utils import data 22 | 23 | 24 | class RandomSampler(data.sampler.Sampler): 25 | """Based off of pytorch RandomSampler and DistributedSampler. Essentially 26 | a RandomSampler, but this class lets the user set an epoch like 27 | DistributedSampler Samples elements randomly. If without replacement, then 28 | sample from a shuffled dataset. If with replacement, then user can 29 | specify ``num_samples`` to draw. 30 | Arguments: 31 | data_source (Dataset): dataset to sample from 32 | num_samples (int): number of samples to draw, default=len(dataset) 33 | replacement (bool): samples are drawn with replacement if ``True``, 34 | default=False 35 | """ 36 | 37 | def __init__(self, data_source, replacement=False, num_samples=None): 38 | self.data_source = data_source 39 | self.replacement = replacement 40 | self._num_samples = num_samples 41 | self.epoch = -1 42 | 43 | if self._num_samples is not None and replacement is False: 44 | raise ValueError( 45 | "With replacement=False, num_samples should not " 46 | "be specified, since a random permute will be " 47 | "performed." 48 | ) 49 | 50 | if not isinstance(self.num_samples, int) or self.num_samples <= 0: 51 | raise ValueError( 52 | "num_samples should be a positive integer " 53 | "value, but got num_samples={}".format(self.num_samples) 54 | ) 55 | if not isinstance(self.replacement, bool): 56 | raise ValueError( 57 | "replacement should be a boolean value, but got " 58 | "replacement={}".format(self.replacement) 59 | ) 60 | 61 | @property 62 | def num_samples(self): 63 | # dataset size might change at runtime 64 | if self._num_samples is None: 65 | return len(self.data_source) 66 | return self._num_samples 67 | 68 | def __iter__(self): 69 | n = len(self.data_source) 70 | g = torch.Generator() 71 | if self.epoch >= 0: 72 | g.manual_seed(self.epoch) 73 | if self.replacement: 74 | return iter( 75 | torch.randint( 76 | high=n, size=(self.num_samples,), dtype=torch.int64, generator=g 77 | ).tolist() 78 | ) 79 | return iter(torch.randperm(n, generator=g).tolist()) 80 | 81 | def __len__(self): 82 | return self.num_samples 83 | 84 | def set_epoch(self, epoch): 85 | self.epoch = epoch 86 | 87 | 88 | class DistributedBatchSampler(data.sampler.BatchSampler): 89 | """Similar to normal implementation of distributed sampler, except 90 | implementation is at the batch sampler level, instead of just the 91 | sampler level. This allows wrapping of arbitrary data samplers 92 | (sequential, random, WeightedRandomSampler, etc.) with this batch 93 | sampler. 94 | 95 | The `interleave` argument specifies how to distribute a batch. A value 96 | of True combined with the above random sampler is equivalent to pytorch's 97 | torch.utils.data.distributed.DistributedSampler. 
98 | 99 | For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2 100 | specifying True will result in the following samples for each gpu: 101 | GPU0: [0,2,4,6] GPU1: [1,3,5,7] 102 | specifying False will result in the following samples: 103 | GPU0: [0,1,2,3] GPU1: [4,5,6,7]""" 104 | 105 | def __init__( 106 | self, 107 | sampler, 108 | batch_size, 109 | drop_last, 110 | rank=-1, 111 | world_size=2, 112 | wrap_last=False, 113 | interleave=False, 114 | ): 115 | super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) 116 | if rank == -1: 117 | assert False, "should not be here" 118 | rank = torch.distributed.get_rank() 119 | self.rank = rank 120 | self.world_size = world_size 121 | self.sampler.wrap_around = 0 122 | self.wrap_around = 0 123 | self.wrap_last = wrap_last 124 | self.start_iter = 0 125 | self.interleave = interleave 126 | 127 | def __iter__(self): 128 | batch = [] 129 | i = 0 130 | for idx in self.data_iterator(self.sampler, wrap_around=False): 131 | batch.append(idx) 132 | if len(batch) == self.batch_size: 133 | tbatch = self._batch(batch) 134 | if i >= self.start_iter: 135 | yield tbatch 136 | self.start_iter = 0 137 | i += 1 138 | batch = [] 139 | batch_len = len(batch) 140 | if batch_len > 0 and not self.drop_last: 141 | if self.wrap_last: 142 | self.sampler.wrap_around -= self.batch_size 143 | self.wrap_around += len(batch) 144 | self.wrap_around %= self.batch_size 145 | yield self._batch(batch) 146 | if self.wrap_last: 147 | self.sampler.wrap_around += self.batch_size 148 | 149 | def data_iterator(self, _iter, wrap_around=False): 150 | """iterates through data and handles wrap around""" 151 | for i, idx in enumerate(_iter): 152 | if i < self.wrap_around % self.batch_size: 153 | continue 154 | if wrap_around: 155 | self.wrap_around += 1 156 | self.wrap_around %= self.batch_size 157 | yield idx 158 | 159 | def _batch(self, batch): 160 | """extracts samples only pertaining to this worker's batch""" 161 | if self.interleave: 162 | return batch[self.rank : self.batch_size : self.world_size] 163 | start = self.rank * self.batch_size // self.world_size 164 | end = (self.rank + 1) * self.batch_size // self.world_size 165 | return batch[start:end] 166 | -------------------------------------------------------------------------------- /data/get_data_from_hf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | import random 4 | 5 | from utils.common_utils import main_process_first 6 | from itertools import chain # noqa: E402 7 | from datasets import load_dataset, load_from_disk # noqa: E402 8 | 9 | 10 | def get_hf_dataset(args): 11 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 12 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 13 | # (the dataset will be downloaded automatically from the datasets Hub). 14 | # 15 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 16 | # 'text' is found. You can easily tweak this behavior (see below). 17 | # 18 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 19 | # download the dataset. 20 | if args.dataset_name is not None: 21 | # Downloading and loading a dataset from the hub. 
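# Illustrative example (hypothetical argument values): with args.dataset_name = "wikitext" and
# args.dataset_config_name = "wikitext-103-raw-v1", load_dataset() downloads the dataset from the
# Hugging Face Hub and returns a DatasetDict keyed by split name (e.g. "train", "validation").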
22 | raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) 23 | if "validation" not in raw_datasets.keys(): 24 | raw_datasets["validation"] = load_dataset( 25 | args.dataset_name, 26 | args.dataset_config_name, 27 | split=f"train[:{args.validation_split_percentage}%]", 28 | ) 29 | raw_datasets["train"] = load_dataset( 30 | args.dataset_name, 31 | args.dataset_config_name, 32 | split=f"train[{args.validation_split_percentage}%:]", 33 | ) 34 | elif args.dataset_path is not None: 35 | raw_datasets = load_from_disk(args.dataset_path) 36 | else: 37 | data_files = {} 38 | dataset_args = {} 39 | if args.train_file is not None: 40 | data_files["train"] = args.train_file 41 | if args.validation_file is not None: 42 | data_files["validation"] = args.validation_file 43 | extension = args.train_file.split(".")[-1] 44 | if extension == "txt": 45 | extension = "text" 46 | dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks 47 | raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) 48 | # If no validation data is there, validation_split_percentage will be used to divide the dataset. 49 | if "validation" not in raw_datasets.keys(): 50 | raw_datasets["validation"] = load_dataset( 51 | extension, 52 | data_files=data_files, 53 | split=f"train[:{args.validation_split_percentage}%]", 54 | **dataset_args, 55 | ) 56 | raw_datasets["train"] = load_dataset( 57 | extension, 58 | data_files=data_files, 59 | split=f"train[{args.validation_split_percentage}%:]", 60 | **dataset_args, 61 | ) 62 | 63 | return raw_datasets 64 | 65 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 66 | # https://huggingface.co/docs/datasets/loading_datasets.html. 67 | 68 | 69 | def preprocess_hf_datasets(args, raw_datasets, tokenizer, logger): 70 | # Preprocessing the datasets. 71 | # First we tokenize all the texts. 72 | column_names = raw_datasets["train"].column_names 73 | text_column_name = "text" if "text" in column_names else column_names[0] 74 | 75 | def tokenize_function(examples): 76 | return tokenizer(examples[text_column_name]) 77 | 78 | with main_process_first(): 79 | tokenized_datasets = raw_datasets.map( 80 | tokenize_function, 81 | batched=True, 82 | num_proc=args.preprocessing_num_workers, 83 | remove_columns=column_names, 84 | load_from_cache_file=not args.overwrite_cache, 85 | desc="Running tokenizer on dataset", 86 | ) 87 | 88 | if args.block_size is None: 89 | block_size = tokenizer.model_max_length 90 | if block_size > 1024: 91 | logger.warning( 92 | "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" 93 | " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" 94 | " override this default with `--block_size xxx`." 95 | ) 96 | block_size = 1024 97 | else: 98 | if args.block_size > tokenizer.model_max_length: 99 | logger.warning( 100 | f"The block_size passed ({args.block_size}) is larger than the maximum length for the model" 101 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 102 | ) 103 | block_size = min(args.block_size, tokenizer.model_max_length) 104 | 105 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 106 | def group_texts(examples): 107 | # Concatenate all texts. 
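# For example (with made-up token ids), {"input_ids": [[1, 2, 3], [4, 5]]} becomes
# {"input_ids": [1, 2, 3, 4, 5]} at this step, and is then re-split into block_size chunks below.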
108 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 109 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 110 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 111 | # customize this part to your needs. 112 | if total_length >= block_size: 113 | total_length = (total_length // block_size) * block_size 114 | # Split by chunks of max_len. 115 | result = { 116 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 117 | for k, t in concatenated_examples.items() 118 | } 119 | result["labels"] = result["input_ids"].copy() 120 | return result 121 | 122 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder 123 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower 124 | # to preprocess. 125 | # 126 | # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: 127 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 128 | 129 | with main_process_first(): 130 | lm_datasets = tokenized_datasets.map( 131 | group_texts, 132 | batched=True, 133 | num_proc=args.preprocessing_num_workers, 134 | load_from_cache_file=not args.overwrite_cache, 135 | desc=f"Grouping texts in chunks of {block_size}", 136 | ) 137 | 138 | train_dataset = lm_datasets["train"] 139 | eval_dataset = lm_datasets["validation"] 140 | 141 | # Log a few random samples from the training set: 142 | # for index in random.sample(range(len(train_dataset)), 3): 143 | # logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") 144 | 145 | return lm_datasets 146 | -------------------------------------------------------------------------------- /model/peft/tuner/roem.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import sys 17 | sys.path.append("..") 18 | sys.path.append("../..") 19 | import torch 20 | import importlib 21 | from enum import Enum 22 | from peft.utils import PeftType 23 | from dataclasses import dataclass, field, asdict 24 | from typing import Optional, List, Union 25 | 26 | from .pe_base_model import PEBaseModel 27 | from model.peft.utils import ( 28 | PetuningConfig, 29 | TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING 30 | ) 31 | from model.peft.utils.others import _freeze_model 32 | 33 | 34 | def is_alps_available(): 35 | return importlib.util.find_spec("alps") is not None 36 | 37 | 38 | if is_alps_available(): 39 | from alps.util import logger 40 | else: 41 | import logging 42 | logger = logging.getLogger(__file__) 43 | 44 | 45 | class PEROEMModel(PEBaseModel): 46 | """ 47 | 只训练模型中间偏上层mlp:参考 https://arxiv.org/pdf/2202.05262.pdf ; https://arxiv.org/abs/2012.14913 48 | model: huggingface transformers model 49 | tokenizer: huggingface transformers tokenizer 50 | """ 51 | 52 | def __init__(self, model, model_name, task_type=None): 53 | self.model = model 54 | self.model_name = model_name 55 | 56 | def get_model(self): 57 | layer_mapping = TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING[self.model_name] 58 | assert len(layer_mapping) == 2 59 | not_freeze_param_name = [] 60 | for i in range(layer_mapping[0], layer_mapping[1]): 61 | no_freeze_name = str(i) + ".mlp" 62 | logger.info(f"Freeze the {no_freeze_name} layer of model") 63 | not_freeze_param_name.append(no_freeze_name) 64 | set_parameter_requires_grad(self.model, not_freeze_param_name) 65 | return self.model 66 | 67 | @classmethod 68 | def restore(self, model=None, path=None): 69 | logger.info("roem不需要额外加载参数") 70 | return model 71 | 72 | 73 | # 根据名称锁定参数层 74 | def set_parameter_requires_grad(model, freeze_param_name=[]): 75 | if not isinstance(freeze_param_name, list): 76 | freeze_param_name = [freeze_param_name] 77 | 78 | for idx, (name, param) in enumerate(model.named_parameters()): 79 | for p in freeze_param_name: 80 | if p not in name: 81 | param.requires_grad = False 82 | # 打印参数层名 83 | for idx, (name, param) in enumerate(model.named_parameters()): 84 | for p in freeze_param_name: 85 | if p in name: 86 | print("The name of used parameter used by ROEM is:") 87 | print(name) 88 | param.requires_grad = True 89 | 90 | 91 | @dataclass 92 | class PeftROEMConfig(PetuningConfig): 93 | """ 94 | This is the configuration class to store the configuration of a [`PeftROEMModel`]. 95 | 96 | Args: 97 | target_layers (`Union[List[int], int]`): The names of the modules to apply Lora to. 98 | """ 99 | 100 | target_layers: Optional[Union[List[int], int]] = field( 101 | default=None, 102 | metadata={ 103 | "help": "List of layers of the model to freeze the parameters." 104 | "For example, [20, 30] or '30' " 105 | }, 106 | ) 107 | 108 | def __post_init__(self): 109 | self.peft_type = PeftType.ROEM 110 | 111 | 112 | class PeftROEMModel(torch.nn.Module): 113 | """ 114 | Creates ROEM model for ant peft. 115 | 116 | Args: 117 | model ([`~transformers.PreTrainedModel`]): The model to be freeze with some layers. 118 | config ([`PeftROEMConfig`]): The configuration of the ROEM model. 119 | 120 | Returns: 121 | `torch.nn.Module`: The ROEM model. 
122 | 123 | Example: 124 | 125 | ```python 126 | >>> from solutions.antllm.antllm.models.glm.modeling_glm import GLMForConditionalGeneration 127 | >>> from solutions.antllm.antllm.models.peft.tuner import PeftROEMConfig, PeftROEMModel 128 | >>> from peft import LoraModel, LoraConfig 129 | 130 | >>> config = PeftROEMConfig( 131 | ... target_layers=[17, 22], 132 | ... ) 133 | 134 | >>> model = GLMForConditionalGeneration.from_pretrained("path_to_model") 135 | >>> roem_model = PeftROEMModel(config, model) 136 | ``` 137 | 138 | **Attributes**: 139 | - **model** ([`~transformers.PreTrainedModel`]) -- The model to be freezed. 140 | - **peft_config** ([`PeftROEMConfig`]): The configuration of the ROEM model. 141 | """ 142 | 143 | def __init__(self, model, config, adapter_name): 144 | super().__init__() 145 | self.model = model 146 | 147 | self.forward = self.model.forward 148 | self.peft_config = config 149 | self.add_adapter(adapter_name, self.peft_config[adapter_name]) 150 | 151 | def add_adapter(self, adapter_name, config=None): 152 | if not isinstance(config, PeftROEMConfig): 153 | raise ValueError( 154 | f"The PeftROEMModel need PeftROEMConfig, but get {type(config)}." 155 | ) 156 | 157 | model_config = self.model.config.to_dict() if hasattr(self.model.config, "to_dict") else self.model.config 158 | if config is not None: 159 | config = self._prepare_lora_config(config, model_config) 160 | self.peft_config[adapter_name] = config 161 | 162 | if len(self.peft_config) > 1: 163 | raise ValueError( 164 | "ROEMModel supports only 1 peft config or name." 165 | "Because it only freeze the shallow layers without any additional parameters." 166 | ) 167 | 168 | model_name = model_config["model_type"] 169 | self.model = PEROEMModel(self.model, model_name).get_model() 170 | 171 | if self.peft_config[adapter_name].inference_mode: 172 | _freeze_model(self.model) 173 | 174 | @staticmethod 175 | def _prepare_lora_config(peft_config, model_config): 176 | if peft_config.target_layers is None: 177 | if model_config["model_type"] not in TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING: 178 | raise ValueError("Please specify `target_layers` in `peft_config`") 179 | peft_config.target_layers = TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING[model_config["model_type"]] 180 | if peft_config.inference_mode: 181 | peft_config.merge_weights = True 182 | return peft_config 183 | 184 | def __getattr__(self, name: str): 185 | """Forward missing attributes to the wrapped module.""" 186 | try: 187 | return super().__getattr__(name) # defer to nn.Module's logic 188 | except AttributeError: 189 | return getattr(self.model, name) 190 | 191 | def get_peft_config_as_dict(self, inference: bool = False): 192 | config_dict = {} 193 | for key, value in self.peft_config.items(): 194 | config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} 195 | if inference: 196 | config["inference_mode"] = True 197 | config_dict[key] = config 198 | return config -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |

6 | 🤗 Hugging Face (is coming) 7 | • 8 | 🤖 ModelScope (is coming) 9 | • 10 | 📄 Paper 11 |

12 | 13 |
14 | 15 | [![GitHub issues](https://img.shields.io/github/issues/codefuse-ai/Collinear-Constrained-Attention)](https://github.com/codefuse-ai/Collinear-Constrained-Attention/issues) 16 | [![GitHub Repo stars](https://img.shields.io/github/stars/codefuse-ai/Collinear-Constrained-Attention?style=social)](https://github.com/codefuse-ai/Collinear-Constrained-Attention) 17 | 18 |
19 | 20 | [comment]: <> ([Weights & Biases monitoring](https://wandb.ai/eleutherai/neox)) 21 | 22 | This repository provides an implementation of [CoCA (Collinear Constrained Attention)](https://arxiv.org/abs/2309.08646). The implementation is based on two Transformer models from [Hugging Face](https://huggingface.co). 23 | 24 | - [GPT-NeoX](https://github.com/huggingface/transformers/tree/main/src/transformers/models/gpt_neox), which originates from [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. 25 | - [LLaMA](https://github.com/huggingface/transformers/tree/main/src/transformers/models/llama) from the Meta AI team. 26 | 27 | Here we only point out the modifications made to implement CoCA. For more information about model training and inference, we recommend [transformers](https://github.com/huggingface/transformers). 28 | 29 | For practicality, we improved CoCA's computational and memory efficiency with [opt_einsum](https://github.com/dgasmith/opt_einsum); see that repository for more information. 30 | 31 | ![Model Structure](https://github.com/codefuse-ai/Collinear-Constrained-Attention/blob/master/assets/model.png "Model Structure") 32 | 33 | ![PPL Performance](https://github.com/codefuse-ai/Collinear-Constrained-Attention/blob/master/assets/PPL.png "PPL Performance") ![Passkey Performance](https://github.com/codefuse-ai/Collinear-Constrained-Attention/blob/master/assets/passkey.png "Passkey Performance") 34 | 35 | [comment]: <> () 36 | 37 | ## 🚀 Quick Start 38 | 39 | ### 💻 Environment 40 | Atorch is an optimized PyTorch distribution by Ant Group; it is not yet available to the open-source community, but it will be open-sourced in the near future. Until then, you may use the original PyTorch instead. 41 | 42 | ### 📂 Datasets 43 | You can use raw data or tokenized data for training. 44 | 45 | When using raw data, please make sure each sample follows this format: 46 | ```json 47 | {"content" : "It is a sentence for training."} 48 | ``` 49 | and save the data as `.jsonl` files (a minimal example of producing such a file is shown after the Training section below). 50 | 51 | You can also use tokenized data saved as `.bin` files produced with the [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) tokenizer: 52 | ```bash 53 | python ./data/tokenization/generate_dataset.py 54 | ``` 55 | Remember to modify `input_dict`, `conver_type_list`, `output_name`, and `seq_length` for your own dataset. 56 | 57 | ### 🏋️‍♂️ Training 58 | You can train a model from scratch as follows: 59 | ```bash 60 | bash ./train/run_coca.sh 32 1 8 2 61 | ``` 62 | 63 | - the first parameter is the `per gpu batch size` 64 | - the second parameter is the `tensor parallel` size (values larger than 1 are not supported yet) 65 | - the third parameter is the `data parallel` size, equal to the number of GPUs 66 | - the last parameter is the number of `train epochs` 67 | 68 | If you want to load a pre-trained model, set `--pretrained_model_path $PRETRAINED_MODEL_PATH \`.
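As a minimal sketch of preparing the raw-data format described in the Datasets section above (the file name and sentences below are placeholders, not files from this repository):

```python
import json

# Hypothetical samples; any plain-text corpus can be dumped this way.
samples = [
    "It is a sentence for training.",
    "Another sentence for training.",
]

# One JSON object per line, matching the {"content": ...} format expected above.
with open("my_raw_dataset.jsonl", "w", encoding="utf-8") as f:
    for text in samples:
        f.write(json.dumps({"content": text}, ensure_ascii=False) + "\n")
```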
69 | 70 | ### 🧠 Inference 71 | CoCA can be loaded using the `transformers` functionality: 72 | 73 | ```python 74 | from model.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXConfig 75 | from transformers import AutoTokenizer 76 | from transformers import GenerationConfig 77 | 78 | config = GPTNeoXConfig.from_pretrained(checkpoint) 79 | config.is_decoder = True 80 | 81 | # If you want to run inference beyond the training length, 82 | # CoCA is compatible with NTK-aware scaled RoPE and performs much better than the original attention structure 83 | rope_scaling = {"type": "dynamic", "factor": 4.0} 84 | config.rope_scaling = rope_scaling 85 | 86 | model = GPTNeoXForCausalLM.from_pretrained(checkpoint, 87 | config=config, 88 | device_map="auto") 89 | 90 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left") 91 | tokenizer.add_special_tokens({'eos_token': "<|endoftext|>"}) 92 | tokenizer.add_special_tokens({'pad_token': "<|pad|>"}) 93 | ``` 94 | 95 | ## 📝 Administrative Notes 96 | 97 | ### 📚 Citing CoCA 98 | 99 | If you have found the CoCA library helpful in your work, you can cite this repository as: 100 | 101 | ```bibtex 102 | @inproceedings{zhu2024coca, 103 | title={CoCA: Fusing Position Embedding with Collinear Constrained Attention in Transformers for Long Context Window Extending}, 104 | author={Shiyi Zhu and Jing Ye and Wei Jiang and Siqiao Xue and Qi Zhang and Yifan Wu and Jianguo Li}, 105 | booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics}, 106 | month = aug, 107 | year = {2024}, 108 | publisher = {Association for Computational Linguistics}, 109 | } 110 | ``` 111 | 112 | ### 📜 Licensing 113 | 114 | This repository hosts the code of the CoCA project. Copyright (c) 2023, Ant Group. Licensed under the Apache License: 115 | 116 | Licensed under the Apache License, Version 2.0 (the "License"); 117 | you may not use this file except in compliance with the License. 118 | You may obtain a copy of the License at 119 | 120 | http://www.apache.org/licenses/LICENSE-2.0 121 | 122 | Unless required by applicable law or agreed to in writing, software 123 | distributed under the License is distributed on an "AS IS" BASIS, 124 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 125 | See the License for the specific language governing permissions and 126 | limitations under the License. 127 | 128 | This repository is based off code written by EleutherAI that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by EleutherAI maintain an EleutherAI copyright header. When the EleutherAI code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License. 129 | 130 | This repository is based off code written by Meta AI that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by Meta AI maintain a Meta AI copyright header. When the Meta AI code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License. 131 | 132 | This repository is based off code written by NVIDIA that is licensed under the Apache License, Version 2.0.
In accordance with the Apache License, all files that are modifications of code originally written by NVIDIA maintain a NVIDIA copyright header. All files that do not contain such a header are the exclusive copyright of EleutherAI. When the NVIDIA code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License. 133 | 134 | This repository also contains code written by a number of other authors. Such contributions are marked and the relevant licensing is included where appropriate. 135 | 136 | For full terms, see the `LICENSE` file. If you have any questions, comments, or concerns about licensing please email me at zhushiyi.zsy@antgroup.com. 137 | -------------------------------------------------------------------------------- /model/gpt_neox/configuration_gpt_neox.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group 3 | # This file is based on code by the authors denoted below and has been modified from its original version. 4 | # 5 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | """ GPTNeoX model configuration""" 19 | 20 | from transformers.configuration_utils import PretrainedConfig 21 | from transformers.utils import logging 22 | 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json", 28 | # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox 29 | } 30 | 31 | 32 | class GPTNeoXConfig(PretrainedConfig): 33 | r""" 34 | This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an 35 | GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration 36 | with the defaults will yield a similar configuration to that of the GPTNeoX 37 | [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture. 38 | 39 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 40 | documentation from [`PretrainedConfig`] for more information. 41 | 42 | 43 | Args: 44 | vocab_size (`int`, *optional*, defaults to 50432): 45 | Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the 46 | `inputs_ids` passed when calling [`GPTNeoXModel`]. 47 | hidden_size (`int`, *optional*, defaults to 6144): 48 | Dimension of the encoder layers and the pooler layer. 49 | num_hidden_layers (`int`, *optional*, defaults to 44): 50 | Number of hidden layers in the Transformer encoder. 
51 | num_attention_heads (`int`, *optional*, defaults to 64): 52 | Number of attention heads for each attention layer in the Transformer encoder. 53 | intermediate_size (`int`, *optional*, defaults to 24576): 54 | Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 55 | hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): 56 | The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, 57 | `"relu"`, `"selu"` and `"gelu_new"` are supported. 58 | rotary_pct (`float`, *optional*, defaults to 0.25): 59 | percentage of hidden dimensions to allocate to rotary embeddings 60 | rotary_emb_base (`int`, *optional*, defaults to 10000) 61 | base for computing rotary embeddings frequency 62 | max_position_embeddings (`int`, *optional*, defaults to 2048): 63 | The maximum sequence length that this model might ever be used with. Typically set this to something large 64 | just in case (e.g., 512 or 1024 or 2048). 65 | initializer_range (`float`, *optional*, defaults to 1e-5): 66 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 67 | layer_norm_eps (`float`, *optional*, defaults to 1e-12): 68 | The epsilon used by the layer normalization layers. 69 | use_cache (`bool`, *optional*, defaults to `True`): 70 | Whether or not the model should return the last key/values attentions (not used by all models). Only 71 | relevant if `config.is_decoder=True`. 72 | use_parallel_residual (`bool`, *optional*, defaults to `True`): 73 | Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training 74 | speedup at large scales (e.g. 20B). 75 | rope_scaling (`Dict`, *optional*): 76 | Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling 77 | strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format 78 | is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update 79 | `max_position_embeddings` to the expected new maximum. See the following thread for more information on how 80 | these scaling strategies behave: 81 | https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an 82 | experimental feature, subject to breaking API changes in future versions. 
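For example, `rope_scaling={"type": "dynamic", "factor": 4.0}` enables NTK-aware dynamic scaling for inference beyond the original training length; this is the setting used in this repository's README inference snippet.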
83 | Example: 84 | 85 | ```python 86 | >>> from transformers import GPTNeoXConfig, GPTNeoXModel 87 | 88 | >>> # Initializing a GPTNeoX gpt-neox-20b style configuration 89 | >>> configuration = GPTNeoXConfig() 90 | 91 | >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration 92 | >>> model = GPTNeoXModel(configuration) # doctest: +SKIP 93 | 94 | >>> # Accessing the model configuration 95 | >>> configuration = model.config # doctest: +SKIP 96 | ```""" 97 | model_type = "gpt_neox" 98 | 99 | def __init__( 100 | self, 101 | vocab_size=50432, 102 | hidden_size=6144, 103 | num_hidden_layers=44, 104 | num_attention_heads=64, 105 | intermediate_size=24576, 106 | hidden_act="gelu", 107 | rotary_pct=0.25, 108 | rotary_emb_base=10000, 109 | max_position_embeddings=2048, 110 | initializer_range=0.02, 111 | layer_norm_eps=1e-5, 112 | use_cache=True, 113 | bos_token_id=0, 114 | eos_token_id=2, 115 | tie_word_embeddings=False, 116 | use_parallel_residual=True, 117 | rope_scaling=None, 118 | **kwargs 119 | ): 120 | super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) 121 | self.vocab_size = vocab_size 122 | self.max_position_embeddings = max_position_embeddings 123 | self.hidden_size = hidden_size 124 | self.num_hidden_layers = num_hidden_layers 125 | self.num_attention_heads = num_attention_heads 126 | self.intermediate_size = intermediate_size 127 | self.hidden_act = hidden_act 128 | self.rotary_pct = rotary_pct 129 | self.rotary_emb_base = rotary_emb_base 130 | self.initializer_range = initializer_range 131 | self.layer_norm_eps = layer_norm_eps 132 | self.use_cache = use_cache 133 | self.tie_word_embeddings = tie_word_embeddings 134 | self.use_parallel_residual = use_parallel_residual 135 | self.rope_scaling = rope_scaling 136 | self._rope_scaling_validation() 137 | 138 | if self.hidden_size % self.num_attention_heads != 0: 139 | raise ValueError( 140 | "The hidden size is not divisible by the number of attention heads! Make sure to update them!" 141 | ) 142 | 143 | # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation 144 | def _rope_scaling_validation(self): 145 | """ 146 | Validate the `rope_scaling` configuration. 147 | """ 148 | if self.rope_scaling is None: 149 | return 150 | 151 | if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: 152 | raise ValueError( 153 | "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " 154 | f"got {self.rope_scaling}" 155 | ) 156 | rope_scaling_type = self.rope_scaling.get("type", None) 157 | rope_scaling_factor = self.rope_scaling.get("factor", None) 158 | if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: 159 | raise ValueError( 160 | f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" 161 | ) 162 | if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: 163 | raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") 164 | -------------------------------------------------------------------------------- /model/llama/configuration_llama.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library.
It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ LLaMA model configuration""" 21 | 22 | from transformers.configuration_utils import PretrainedConfig 23 | from transformers.utils import logging 24 | 25 | 26 | logger = logging.get_logger(__name__) 27 | 28 | LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 29 | 30 | 31 | class LlamaConfig(PretrainedConfig): 32 | r""" 33 | This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA 34 | model according to the specified arguments, defining the model architecture. Instantiating a configuration with the 35 | defaults will yield a similar configuration to that of the LLaMA-7B. 36 | 37 | Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the 38 | documentation from [`PretrainedConfig`] for more information. 39 | 40 | 41 | Args: 42 | vocab_size (`int`, *optional*, defaults to 32000): 43 | Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the 44 | `inputs_ids` passed when calling [`LlamaModel`] 45 | hidden_size (`int`, *optional*, defaults to 4096): 46 | Dimension of the hidden representations. 47 | intermediate_size (`int`, *optional*, defaults to 11008): 48 | Dimension of the MLP representations. 49 | num_hidden_layers (`int`, *optional*, defaults to 32): 50 | Number of hidden layers in the Transformer encoder. 51 | num_attention_heads (`int`, *optional*, defaults to 32): 52 | Number of attention heads for each attention layer in the Transformer encoder. 53 | num_key_value_heads (`int`, *optional*): 54 | This is the number of key_value heads that should be used to implement Grouped Query Attention. If 55 | `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if 56 | `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When 57 | converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed 58 | by meanpooling all the original heads within that group. For more details checkout [this 59 | paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to 60 | `num_attention_heads`. 61 | pretraining_tp (`int`, *optional*, defaults to `1`): 62 | Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this 63 | document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is 64 | necessary to ensure exact reproducibility of the pretraining results. Please refer to [this 65 | issue](https://github.com/pytorch/pytorch/issues/76232). 
66 | hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): 67 | The non-linear activation function (function or string) in the decoder. 68 | max_position_embeddings (`int`, *optional*, defaults to 2048): 69 | The maximum sequence length that this model might ever be used with. Typically set this to something large 70 | just in case (e.g., 512 or 1024 or 2048). 71 | initializer_range (`float`, *optional*, defaults to 0.02): 72 | The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 73 | rms_norm_eps (`float`, *optional*, defaults to 1e-12): 74 | The epsilon used by the rms normalization layers. 75 | use_cache (`bool`, *optional*, defaults to `True`): 76 | Whether or not the model should return the last key/values attentions (not used by all models). Only 77 | relevant if `config.is_decoder=True`. 78 | tie_word_embeddings(`bool`, *optional*, defaults to `False`): 79 | Whether to tie weight embeddings 80 | rope_scaling (`Dict`, *optional*): 81 | Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling 82 | strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format 83 | is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update 84 | `max_position_embeddings` to the expected new maximum. See the following thread for more information on how 85 | these scaling strategies behave: 86 | https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an 87 | experimental feature, subject to breaking API changes in future versions. 88 | 89 | Example: 90 | 91 | ```python 92 | >>> from transformers import LlamaModel, LlamaConfig 93 | 94 | >>> # Initializing a LLaMA llama-7b style configuration 95 | >>> configuration = LlamaConfig() 96 | 97 | >>> # Initializing a model from the llama-7b style configuration 98 | >>> model = LlamaModel(configuration) 99 | 100 | >>> # Accessing the model configuration 101 | >>> configuration = model.config 102 | ```""" 103 | model_type = "llama" 104 | keys_to_ignore_at_inference = ["past_key_values"] 105 | 106 | def __init__( 107 | self, 108 | vocab_size=32000, 109 | hidden_size=4096, 110 | intermediate_size=11008, 111 | num_hidden_layers=32, 112 | num_attention_heads=32, 113 | num_key_value_heads=None, 114 | hidden_act="silu", 115 | max_position_embeddings=2048, 116 | initializer_range=0.02, 117 | rms_norm_eps=1e-6, 118 | use_cache=True, 119 | pad_token_id=None, 120 | bos_token_id=1, 121 | eos_token_id=2, 122 | pretraining_tp=1, 123 | tie_word_embeddings=False, 124 | rope_scaling=None, 125 | **kwargs, 126 | ): 127 | self.vocab_size = vocab_size 128 | self.max_position_embeddings = max_position_embeddings 129 | self.hidden_size = hidden_size 130 | self.intermediate_size = intermediate_size 131 | self.num_hidden_layers = num_hidden_layers 132 | self.num_attention_heads = num_attention_heads 133 | 134 | # for backward compatibility 135 | if num_key_value_heads is None: 136 | num_key_value_heads = num_attention_heads 137 | 138 | self.num_key_value_heads = num_key_value_heads 139 | self.hidden_act = hidden_act 140 | self.initializer_range = initializer_range 141 | self.rms_norm_eps = rms_norm_eps 142 | self.pretraining_tp = pretraining_tp 143 | self.use_cache = use_cache 144 | self.rope_scaling = rope_scaling 145 | self._rope_scaling_validation() 146 | 147 | super().__init__( 148 | pad_token_id=pad_token_id, 149 | bos_token_id=bos_token_id, 150 | 
eos_token_id=eos_token_id, 151 | tie_word_embeddings=tie_word_embeddings, 152 | **kwargs, 153 | ) 154 | 155 | def _rope_scaling_validation(self): 156 | """ 157 | Validate the `rope_scaling` configuration. 158 | """ 159 | if self.rope_scaling is None: 160 | return 161 | 162 | if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: 163 | raise ValueError( 164 | "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " 165 | f"got {self.rope_scaling}" 166 | ) 167 | rope_scaling_type = self.rope_scaling.get("type", None) 168 | rope_scaling_factor = self.rope_scaling.get("factor", None) 169 | if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: 170 | raise ValueError( 171 | f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" 172 | ) 173 | if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: 174 | raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") -------------------------------------------------------------------------------- /tools/analysis/MMapTokenIdsBinChecker.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
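The `num_key_value_heads` argument documented in the LlamaConfig above is the single switch between multi-head, grouped-query, and multi-query attention. A short illustrative sketch (the head counts are examples, not values used by this repository's checkpoints):

```python
# Illustrative sketch only; assumes model/llama/configuration_llama.py is importable as a package module.
from model.llama.configuration_llama import LlamaConfig

mha = LlamaConfig(num_attention_heads=32)                         # KV heads default to 32 -> multi-head attention
gqa = LlamaConfig(num_attention_heads=32, num_key_value_heads=8)  # 8 KV heads, each shared by 4 query heads -> GQA
mqa = LlamaConfig(num_attention_heads=32, num_key_value_heads=1)  # a single shared KV head -> multi-query attention

assert mha.num_key_value_heads == 32  # the backward-compatible default described in the docstring
```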
15 | 16 | import sys 17 | sys.path.append("..") 18 | sys.path.append("../..") 19 | 20 | import os 21 | import struct 22 | from transformers import PreTrainedTokenizerFast 23 | import random 24 | import numpy as np 25 | from model.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast 26 | 27 | tokenizer_vocab_file = '/mnt/user/bingchang/multisft/code/13b/code/v1-old/gpt-neox-2.0-sft-6b/tokenizer-ant-v5.json' 28 | 29 | table = {ord(f): ord(t) for f, t in zip( 30 | u',。!?:【】()%#@&1234567890', 31 | u',.!?:[]()%#@&1234567890')} 32 | 33 | 34 | def punctuation_format(text): 35 | # Replace non-breaking space with space 36 | text = text.strip() + '\n' 37 | text = text.replace('\u202f', ' ').replace('\xa0', ' ') 38 | # change chinese punctuation to english ones 39 | text = text.translate(table) 40 | return text 41 | 42 | 43 | def save_to_file(file_path, text): 44 | """ 45 | 写给定的追加写入到文件中 46 | """ 47 | with open(file_path, 'a') as f: 48 | f.write(f'{text}') 49 | 50 | 51 | def detokenize(input_ids, tokenizer, padding_token=None): 52 | """ 53 | 使用给定的对给定的token id列表进行解码,如果给定了padding_token,则将padding部分移除 54 | """ 55 | result = tokenizer.decode(input_ids) 56 | if padding_token and padding_token in result: 57 | result = result[:result.index(padding_token)] 58 | return result 59 | 60 | 61 | def convert_bytes_to_elements(byte_data, dtype): 62 | """ 63 | 将字节数组转为对应数据类型数组 64 | """ 65 | result = np.frombuffer(byte_data, dtype=dtype) 66 | return [x for x in result] 67 | 68 | 69 | class MMapTokenIdsBinChecker: 70 | """ 71 | 检查GPT Neox MMAP方式生成的input_ids.bin文件 72 | """ 73 | 74 | # 用于检查的随机采样数量 75 | _SAMPLING_NUM = 100 76 | 77 | _SEED = 202306192219 78 | 79 | _PADING_TOKEN = "<|pad|>" 80 | 81 | def __init__ (self, input_ids_bin_path:str, loss_mask_bin_path:str, tokenizer_path:str, detokenize_output_path:str, seq_len:int, element_size:int, dtype:np.dtype, sample_total:int, ramdom_sampling_num:int): 82 | assert os.path.exists(input_ids_bin_path), ( 83 | "给定的input_ids.bin文件路径不存在" 84 | "请确保给定的路径是存在的" 85 | ) 86 | assert os.path.isfile(input_ids_bin_path), ( 87 | "给定的input_ids.bin文件不是一个文件" 88 | "请确保给定的是一个GPT Neox MMAP方式生成的input_ids.bin文件" 89 | ) 90 | assert os.path.exists(loss_mask_bin_path) and os.path.isfile(loss_mask_bin_path), ( 91 | "给定的loss_mask.bin文件路径不存在或者非文件" 92 | "请确保给定有效的loss_mask.bin文件路径" 93 | ) 94 | assert os.path.exists(tokenizer_path) and os.path.isfile(tokenizer_path), ( 95 | "给定的词表文件不存在或者不是一个文件" 96 | "请确保给定有效的词表文件路径" 97 | ) 98 | 99 | self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path) 100 | 101 | self._SAMPLING_NUM = ramdom_sampling_num 102 | 103 | sampled_input_ids = [] 104 | sampled_loss_masks = [] 105 | sampled_indexes = [] 106 | with open(input_ids_bin_path, 'rb') as fb, open(loss_mask_bin_path, 'rb') as f_lm: 107 | # 随机选取若干个样本,以进行detokenization验证和loss mask验证 108 | random.seed(self._SEED) 109 | random_indexes = random.sample(range(0, sample_total), min(self._SAMPLING_NUM, sample_total)) 110 | print('随机采样样本索引为:', random_indexes) 111 | # 依次处理每个取样的样本 112 | for i in random_indexes: 113 | # 通过设定文件offset位置,读取取样的一个样本 114 | reset_pos = max(0, i-1)*seq_len*element_size 115 | fb.seek(reset_pos) 116 | data = fb.read(element_size*seq_len) 117 | # 将样本从byte序列转为int序列 118 | token_ids = convert_bytes_to_elements(data, dtype) 119 | sampled_input_ids.append(token_ids) 120 | text = detokenize(token_ids, self.tokenizer, self._PADING_TOKEN) 121 | # 保存到文件中供人工校验 122 | save_to_file(detokenize_output_path, '\n' + '[' + str(i) + ']' + '=*='*30 + '\n') 123 | save_to_file(detokenize_output_path, 
f"{text}\n") 124 | # 读取样本对应的loss_mask,用于检查是否只有部分的loss mask为1 125 | f_lm.seek(reset_pos) 126 | loss_mask_data = convert_bytes_to_elements(f_lm.read(seq_len*element_size), dtype) 127 | sampled_loss_masks.append(loss_mask_data) 128 | 129 | # my_text = punctuation_format(text) 130 | # my_tokenizer = GPTNeoXTokenizerFast.from_pretrained("/mnt/user/fuhang/checkpoints/neox-2.0-125m-sst-0614/hf_ckpt") 131 | # my_tokenizer.eod_token = "<|endoftext|>" 132 | # my_tokenizer.pad_token = "<|extratoken_1|>" 133 | # my_tokenizer.sop_token = "<|endoftext|>" # 适配multi task dataset 134 | # my_tokenizer.eop_token = "<|endoftext|>" 135 | # my_tokenizer.eod_id = my_tokenizer.convert_tokens_to_ids(my_tokenizer.eod_token) 136 | # my_tokenizer.pad_id = my_tokenizer.convert_tokens_to_ids(my_tokenizer.pad_token) 137 | # my_token_ids = my_tokenizer(my_text)['input_ids'] 138 | 139 | # sampled_indexes.append(i) 140 | # if i == 1926485: 141 | # print('\n\n', '=*='*50, '\n', token_ids) 142 | # print(i, text) 143 | 144 | # print('\n\n', '=*='*50, '\n', token_ids) 145 | print('\n\n', '=*='*50, '\n') 146 | print('token ids: ') 147 | print(token_ids) 148 | print('loss mask:') 149 | print(loss_mask_data) 150 | # print('\n\n', '=*='*50, '\n') 151 | # print('my token ids: ') 152 | # print(my_token_ids) 153 | # print(i) 154 | # print(text) 155 | 156 | 157 | self._sampled_input_ids = sampled_input_ids 158 | self._sampled_loss_masks = sampled_loss_masks 159 | self._sampled_indexes = sampled_indexes 160 | 161 | 162 | def check_loss_mask(self): 163 | """ 164 | 检查是否只有bot角色的内容对应的loss mask为1 165 | """ 166 | for i in range(len(self._sampled_input_ids)): 167 | sampled_input_ids = self._sampled_input_ids[i] 168 | sampled_loss_mask = self._sampled_loss_masks[i] 169 | 170 | #print(i, 'input_ids', sampled_input_ids) 171 | #print('\n') 172 | #print(i, 'loss mask', len(sampled_loss_mask), sampled_loss_mask) 173 | #print('\n\n', '=*='*30) 174 | 175 | # 找出loss mask为1的片段 176 | pieces = [] 177 | if 1 not in sampled_loss_mask: 178 | print(f'\033[1;31;47m【异常】样本{self._sampled_indexes[i]} loss mask全为0\033[0m') 179 | print('detokenizee', detokenize(sampled_input_ids, self.tokenizer, self._PADING_TOKEN)) 180 | print('input_ids', sampled_input_ids) 181 | return False 182 | if 0 not in sampled_loss_mask: 183 | print(f'\033[1;31;47m【异常】样本{self._sampled_indexes[i]} loss mask全为1\033[0m') 184 | return False 185 | 186 | start_index = sampled_loss_mask.index(1) 187 | accul_index = 0 188 | while start_index > -1: 189 | #print('start_index', start_index) 190 | 191 | if 0 in sampled_loss_mask[start_index:]: 192 | end_index = sampled_loss_mask[start_index:].index(0) 193 | end_index = len(sampled_loss_mask) 194 | else: 195 | print(self._sampled_loss_masks[i]) 196 | end_index = start_index + end_index 197 | #print('end_index', end_index) 198 | 199 | pieces.append((accul_index + start_index, accul_index + end_index)) 200 | 201 | sampled_input_ids = sampled_input_ids[end_index:] 202 | sampled_loss_mask = sampled_loss_mask[end_index:] 203 | accul_index += end_index 204 | 205 | if 1 not in sampled_loss_mask: 206 | break 207 | start_index = sampled_loss_mask.index(1) 208 | 209 | 210 | # 检查每段loss mask为1的数据对应的token ids之前的三个词是否是<|role_start|>bot<|role_end|>,最后一个词是否是<|end|> 211 | for piece in pieces: 212 | token_ids_piece = self._sampled_input_ids[i][max(0, piece[0]-3):piece[1]] 213 | text_piece = detokenize(token_ids_piece, self.tokenizer, self._PADING_TOKEN) 214 | if not text_piece.startswith("<|role_start|>bot<|role_end|>") or not text_piece.endswith('<|end|>'): 215 | 
print(f'\033[1;31;47m【异常】样本{self._sampled_indexes[i]}存在loss mask为1但对应的不是bot片段:{text_piece}\033[0m') 216 | return False 217 | 218 | return True 219 | 220 | 221 | def check(self): 222 | return self.check_loss_mask() 223 | -------------------------------------------------------------------------------- /model/llama/tokenization_llama_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import os 16 | from shutil import copyfile 17 | from typing import TYPE_CHECKING, Optional, Tuple 18 | 19 | from tokenizers import processors 20 | 21 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast 22 | from transformers.utils import is_sentencepiece_available, logging 23 | from transformers.utils.versions import require_version 24 | 25 | 26 | if TYPE_CHECKING: 27 | from transformers.pipelines.conversational import Conversation 28 | 29 | require_version("tokenizers>=0.13.3") 30 | 31 | if is_sentencepiece_available(): 32 | from .tokenization_llama import LlamaTokenizer 33 | else: 34 | LlamaTokenizer = None 35 | 36 | logger = logging.get_logger(__name__) 37 | VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"} 38 | 39 | B_INST, E_INST = "[INST]", "[/INST]" 40 | B_SYS, E_SYS = "<>\n", "\n<>\n\n" 41 | 42 | # fmt: off 43 | DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \ 44 | answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\ 45 | that your responses are socially unbiased and positive in nature. 46 | 47 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \ 48 | correct. If you don't know the answer to a question, please don't share false information.""" 49 | # fmt: on 50 | 51 | 52 | class LlamaTokenizerFast(PreTrainedTokenizerFast): 53 | """ 54 | Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. 55 | 56 | This uses notably ByteFallback and no normalization. 57 | 58 | ``` 59 | from transformers import LlamaTokenizerFast 60 | 61 | tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer") 62 | tokenizer.encode("Hello this is a test") 63 | >>> [1, 15043, 445, 338, 263, 1243] 64 | ``` 65 | 66 | If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or 67 | call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the 68 | values of the first token and final token of an encoded sequence will not be correct). For more details, checkout 69 | [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation. 
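As the LlamaTokenizerFast docstring above warns, changing the `bos_token`/`eos_token` behaviour after construction only takes effect once `update_post_processor()` runs. A hedged sketch of what that looks like in practice (the checkpoint name is the testing tokenizer already used in the docstring example; the behaviour follows the `add_bos_token`/`add_eos_token` setters defined later in this file):

```python
# Illustrative sketch only; assumes the package-relative import works in your environment.
from model.llama.tokenization_llama_fast import LlamaTokenizerFast

tok = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")

# The property setter calls update_post_processor(), so EOS is appended from now on.
tok.add_eos_token = True
ids = tok.encode("Hello this is a test")
# ids should now start with tok.bos_token_id and end with tok.eos_token_id.
```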
70 | 71 | 72 | This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should 73 | refer to this superclass for more information regarding those methods. 74 | 75 | Args: 76 | vocab_file (`str`): 77 | [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that 78 | contains the vocabulary necessary to instantiate a tokenizer. 79 | tokenizer_file (`str`): 80 | [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that 81 | contains everything needed to load the tokenizer. 82 | 83 | clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`): 84 | Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra 85 | spaces. 86 | 87 | bos_token (`str`, *optional*, defaults to `""`): 88 | The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. 89 | 90 | eos_token (`str`, *optional*, defaults to `""`): 91 | The end of sequence token. 92 | 93 | unk_token (`str`, *optional*, defaults to `""`): 94 | The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this 95 | token instead. 96 | """ 97 | 98 | vocab_files_names = VOCAB_FILES_NAMES 99 | slow_tokenizer_class = LlamaTokenizer 100 | padding_side = "left" 101 | model_input_names = ["input_ids", "attention_mask"] 102 | 103 | def __init__( 104 | self, 105 | vocab_file=None, 106 | tokenizer_file=None, 107 | clean_up_tokenization_spaces=False, 108 | unk_token="", 109 | bos_token="", 110 | eos_token="", 111 | add_bos_token=True, 112 | add_eos_token=False, 113 | **kwargs, 114 | ): 115 | super().__init__( 116 | vocab_file=vocab_file, 117 | tokenizer_file=tokenizer_file, 118 | clean_up_tokenization_spaces=clean_up_tokenization_spaces, 119 | unk_token=unk_token, 120 | bos_token=bos_token, 121 | eos_token=eos_token, 122 | **kwargs, 123 | ) 124 | self._add_bos_token = add_bos_token 125 | self._add_eos_token = add_eos_token 126 | self.update_post_processor() 127 | 128 | self.vocab_file = vocab_file 129 | self.can_save_slow_tokenizer = False if not self.vocab_file else True 130 | 131 | def update_post_processor(self): 132 | """ 133 | Updates the underlying post processor with the current `bos_token` and `eos_token`. 
134 | """ 135 | bos = self.bos_token 136 | bos_token_id = self.bos_token_id 137 | 138 | eos = self.eos_token 139 | eos_token_id = self.eos_token_id 140 | 141 | single = f"{(bos+':0 ') * self.add_bos_token}$A:0{(' '+eos+':0') * self.add_eos_token}" 142 | pair = f"{single}{(' '+bos+':1') * self.add_bos_token} $B:1{(' '+eos+':1') * self.add_eos_token}" 143 | 144 | special_tokens = [] 145 | if self.add_bos_token: 146 | special_tokens.append((bos, bos_token_id)) 147 | if self.add_eos_token: 148 | special_tokens.append((eos, eos_token_id)) 149 | self._tokenizer.post_processor = processors.TemplateProcessing( 150 | single=single, pair=pair, special_tokens=special_tokens 151 | ) 152 | 153 | @property 154 | def add_eos_token(self): 155 | return self._add_eos_token 156 | 157 | @property 158 | def add_bos_token(self): 159 | return self._add_bos_token 160 | 161 | @add_eos_token.setter 162 | def add_eos_token(self, value): 163 | self._add_eos_token = value 164 | self.update_post_processor() 165 | 166 | @add_bos_token.setter 167 | def add_bos_token(self, value): 168 | self._add_bos_token = value 169 | self.update_post_processor() 170 | 171 | def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: 172 | if not self.can_save_slow_tokenizer: 173 | raise ValueError( 174 | "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " 175 | "tokenizer." 176 | ) 177 | 178 | if not os.path.isdir(save_directory): 179 | logger.error(f"Vocabulary path ({save_directory}) should be a directory") 180 | return 181 | out_vocab_file = os.path.join( 182 | save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] 183 | ) 184 | 185 | if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): 186 | copyfile(self.vocab_file, out_vocab_file) 187 | 188 | return (out_vocab_file,) 189 | 190 | def _build_conversation_input_ids(self, conversation: "Conversation"): 191 | """Builds the input ids for a conversation. 192 | This is the format used in the provided examples. System prompts should be manually added at the beginning of 193 | the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used. 194 | ``` 195 | [INST] B_SYS SytemPrompt E_SYS Prompt [/INST] Answer 196 | [INST] Prompt [/INST] Answer 197 | [INST] Prompt [/INST] 198 | ``` 199 | 200 | If you want to use your own system prompt, make sure to use both `B_SYS` and `E_SYS` use the following: 201 | ```python 202 | >>> from transformers import Conversation 203 | 204 | >>> Conversation( 205 | ... "<>\n Only answer with emojis, and charades\n<>\n\nHow can I build a house in 10 septs?" 206 | ... ) 207 | ``` 208 | Args: 209 | conversation (`Conversation`): 210 | Conversation to build input ids for. 211 | Returns: 212 | `List[int]`: 213 | Input ids for the conversation. 
214 | """ 215 | if len(conversation.past_user_inputs) > 0: 216 | if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]: 217 | conversation.past_user_inputs[0] = ( 218 | B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0] 219 | ) 220 | elif conversation.new_user_input: 221 | if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input: 222 | conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input 223 | else: 224 | raise ValueError("Last message must be from user") 225 | 226 | dialogue = list(conversation.iter_texts()) 227 | if not all([is_user for is_user, msg in dialogue[::2]]) or not all( 228 | [not is_user for is_user, msg in dialogue[1::2]] 229 | ): 230 | raise ValueError( 231 | "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)" 232 | ) 233 | 234 | dialog_tokens = [] 235 | dialog_tokens += sum( 236 | [ 237 | [self.bos_token_id] 238 | + self.encode( 239 | f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False 240 | ) 241 | + [self.eos_token_id] 242 | for prompt, answer in zip(dialogue[::2], dialogue[1::2]) 243 | ], 244 | [], 245 | ) 246 | dialog_tokens += [self.bos_token_id] + self.encode( 247 | f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False 248 | ) 249 | return dialog_tokens -------------------------------------------------------------------------------- /data/tokenization/preprocess_data.py: -------------------------------------------------------------------------------- 1 | """Processing data for pretraining.""" 2 | 3 | import argparse 4 | import multiprocessing 5 | import os 6 | import sys 7 | import numpy as np 8 | import random 9 | # sys.path.append( 10 | # os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) 11 | # ) 12 | 13 | # 将父目录的父目录加入path 14 | current_path = os.path.abspath(__file__) 15 | parent_dir = os.path.dirname(os.path.dirname(current_path)) 16 | grandparent_dir = os.path.dirname(parent_dir) 17 | sys.path.append(grandparent_dir) 18 | # print(grandparent_dir) 19 | 20 | import data.tokenization.lm_dataformat as lmd 21 | 22 | import time 23 | import tqdm 24 | import torch 25 | import ftfy 26 | import glob 27 | 28 | from tokenizer import build_tokenizer 29 | from threading import Semaphore 30 | 31 | 32 | 33 | 34 | table = {ord(f):ord(t) for f,t in zip( 35 | u',。!?:【】()%#@&1234567890', 36 | u',.!?:[]()%#@&1234567890')} 37 | 38 | 39 | def punctuation_format(text: str): 40 | # Replace non-breaking space with space 41 | # text = text.strip() + '\n' 42 | text = text.replace('\u202f', ' ').replace('\xa0', ' ') 43 | # change chinese punctuation to english ones 44 | text = text.translate(table) 45 | return text 46 | 47 | def is_prompt_answer_format(data): 48 | 49 | if "prompt" in data and "answer" in data: 50 | return True 51 | else: 52 | return False 53 | 54 | 55 | def is_chatml_format(data): 56 | if "chat_rounds" in data and len(data["chat_rounds"]) > 0: 57 | return True 58 | else: 59 | return False 60 | 61 | 62 | def is_text_format(data): 63 | if "text" in data: 64 | return True 65 | else: 66 | return False 67 | 68 | class Encoder(object): 69 | def __init__(self, args, tokenizer=None): 70 | self.args = args 71 | self.tokenizer = tokenizer 72 | 73 | def initializer(self): 74 | # Use Encoder class as a container for global data 75 | if self.tokenizer is None: 76 | self.tokenizer = 
build_tokenizer(self.args) 77 | else: 78 | self.tokenizer = self.tokenizer 79 | 80 | def encode(self, text): 81 | if self.args.ftfy: 82 | text = ftfy.fix_text(text) 83 | ids = {} 84 | for key in self.args.jsonl_keys: 85 | doc_ids = [] 86 | text_ids = self.tokenizer.encode(text, add_special_tokens=False) 87 | if len(text_ids) > 0: 88 | doc_ids.append(text_ids) 89 | if self.args.append_eod: 90 | doc_ids[-1].append(self.tokenizer.eod_id) 91 | ids[key] = doc_ids 92 | return ids, len(text) 93 | 94 | 95 | class UniformEncoder(Encoder): 96 | def __init__(self, args, mode='sft', tokenizer=None): 97 | super().__init__(args, tokenizer=tokenizer) 98 | self.mode = mode 99 | # 实际计算时会Shift一位,因此这里seq_length + 1 100 | if args.load_raw_dataset: 101 | self.seq_length = args.seq_length + 1 102 | self.stride = args.seq_length 103 | else: 104 | self.seq_length = args.seq_length 105 | 106 | self.remain_input_ids = [] 107 | self.remain_loss_mask = [] 108 | 109 | def encode(self, data): 110 | 111 | encode_res = { 112 | "input_ids":[], 113 | "loss_mask":[] 114 | } 115 | 116 | if is_prompt_answer_format(data): 117 | data_type = 'prompt_answer' 118 | elif is_chatml_format(data): 119 | data_type = 'chatML' 120 | elif is_text_format(data): 121 | data_type = 'text' 122 | else: 123 | raise ValueError("data format not supported, please use prompt/answer, or chatML or pretrain text") 124 | 125 | for token_res in self._tokenize_fields(data, data_type=data_type): 126 | for k, v in token_res.items(): 127 | encode_res[k].append(v) 128 | 129 | length = 0 130 | if data_type == 'prompt_answer': 131 | length = len(data['prompt']) + len(data['answer']) 132 | elif data_type == 'chatML': 133 | for chat in data['chat_rounds']: 134 | length += len(chat['content']) 135 | elif data_type == 'text': 136 | length += len(data['text']) 137 | 138 | return encode_res, length 139 | 140 | 141 | def _tokenize_fields(self, data, data_type): 142 | 143 | CHAT_COL = 'chat_rounds' 144 | ROLE_COL = 'role' 145 | CONTENT_COL = 'content' 146 | 147 | PROMPT_COL = 'prompt' 148 | ANSWER_COL = 'answer' 149 | SYSTEM_COL = 'system' 150 | 151 | TEXT_COL = 'text' 152 | 153 | if self.mode == 'sft': 154 | HUMAN = 'human' 155 | BOT = 'bot' 156 | SYSTEM = 'system' 157 | ROLE_START_MARKER = '<|role_start|>' 158 | ROLE_END_MARKER = '<|role_end|>' 159 | elif self.mode == 'pretrain' or data_type == 'text': 160 | HUMAN = '' 161 | BOT = '' 162 | SYSTEM = '' 163 | ROLE_START_MARKER = '' 164 | ROLE_END_MARKER = '' 165 | else: 166 | raise ValueError(f"tokenize_mode does not support {self.mode}, please use sft or pretrain") 167 | 168 | 169 | human_marker_ids = self.tokenizer.encode(f"{ROLE_START_MARKER}{HUMAN}{ROLE_END_MARKER}", add_special_tokens=False) 170 | bot_marker_ids = self.tokenizer.encode(f"{ROLE_START_MARKER}{BOT}{ROLE_END_MARKER}", add_special_tokens=False) 171 | system_marker_ids = self.tokenizer.encode(f"{ROLE_START_MARKER}{SYSTEM}{ROLE_END_MARKER}", add_special_tokens=False) 172 | sft_end_marker_ids = [self.tokenizer.eod_id] 173 | 174 | # 处理逻辑: 175 | # 统一处理SST,单轮、多轮sft的需求 176 | 177 | input_ids = [] 178 | loss_mask = [] 179 | 180 | if data_type == "prompt_answer": 181 | system = data.get(SYSTEM_COL, '') 182 | prompt = data[PROMPT_COL] 183 | answer = data[ANSWER_COL] 184 | system = punctuation_format(system) 185 | prompt = punctuation_format(prompt) 186 | answer = punctuation_format(answer) 187 | system_ids = system_marker_ids + self.tokenizer.encode(system, add_special_tokens=False) if system else [] 188 | prompt_ids = self.tokenizer.encode(prompt, 
add_special_tokens=False) 189 | answer_ids = self.tokenizer.encode(answer, add_special_tokens=False) + sft_end_marker_ids 190 | input_ids += system_ids + human_marker_ids + prompt_ids + bot_marker_ids + answer_ids 191 | loss_mask += [0] * len(system_ids) + [0] * len(human_marker_ids) + [0] * len(prompt_ids) + \ 192 | [0] * len(bot_marker_ids) + [1] * len(answer_ids) 193 | elif data_type == 'chatML': 194 | chat = data[CHAT_COL] 195 | for r in chat: 196 | role = r[ROLE_COL] 197 | content = r[CONTENT_COL] 198 | content = punctuation_format(content) 199 | # if not content.endswith('\n'): # chatML格式 200 | # content = content + '\n' 201 | if role == HUMAN: 202 | role_marker_ids = human_marker_ids 203 | content_ids = self.tokenizer.encode(content, add_special_tokens=False) 204 | elif role == BOT: 205 | # 每一个bot输出结尾的eod,计算loss, 学会在哪里停, human和system的eod不需要计算loss 206 | role_marker_ids = bot_marker_ids 207 | content_ids = self.tokenizer.encode(content, add_special_tokens=False) + sft_end_marker_ids 208 | elif role == SYSTEM: 209 | role_marker_ids = system_marker_ids 210 | content_ids = self.tokenizer.encode(content, add_special_tokens=False) 211 | else: 212 | raise ValueError(f"Role {role} not supported.") 213 | 214 | input_ids += role_marker_ids + content_ids 215 | masklet = [1] if role == BOT else [0] 216 | loss_mask += [0] * len(role_marker_ids) + masklet * len(content_ids) 217 | elif data_type == "text": 218 | text = data[TEXT_COL] 219 | text = punctuation_format(text) 220 | text_ids = self.tokenizer.encode(text, add_special_tokens=False) + sft_end_marker_ids 221 | input_ids += text_ids 222 | loss_mask += [1] * len(text_ids) 223 | else: 224 | raise ValueError( 225 | f"data_type does not support {self.args.data_type}, please use chatML or prompt_answer or text(for pretrain)") 226 | 227 | # print(self.mode) 228 | if self.mode == 'pretrain': 229 | # change loss mask to all 1s 230 | input_ids = input_ids 231 | loss_mask = [1] * len(loss_mask) 232 | elif self.mode == 'sft': 233 | # do nothing 234 | input_ids = input_ids 235 | loss_mask = loss_mask 236 | 237 | assert len(input_ids) == len(loss_mask) 238 | if self.args.padding_mode == 'padding': 239 | if len(input_ids) <= self.seq_length: 240 | yield self.padding(input_ids, loss_mask) 241 | 242 | # 如果超长,直接抛弃 or 使用seq_length窗口滑动采样 243 | else: 244 | # cursor = 0 245 | # while cursor < len(input_ids): 246 | # end_idx = min(cursor + self.seq_length, len(input_ids)) 247 | # yield self.padding(input_ids[cursor: end_idx], loss_mask[cursor: end_idx]) 248 | # cursor = end_idx 249 | yield {} 250 | elif self.args.padding_mode == 'concat': 251 | input_ids = self.remain_input_ids + input_ids 252 | loss_mask = self.remain_loss_mask + loss_mask 253 | if len(input_ids) < self.seq_length: 254 | self.remain_input_ids = input_ids 255 | self.remain_loss_mask = loss_mask 256 | assert len(self.remain_input_ids) == len(self.remain_loss_mask) 257 | yield {} 258 | else: 259 | cursor = 0 260 | while cursor + self.seq_length <= len(input_ids): 261 | yield { 262 | "input_ids": input_ids[cursor: cursor + self.seq_length], 263 | "loss_mask": loss_mask[cursor: cursor + self.seq_length] 264 | } 265 | cursor = cursor + self.stride 266 | self.remain_input_ids = input_ids[cursor:] 267 | self.remain_loss_mask = loss_mask[cursor:] 268 | assert len(self.remain_input_ids) == len(self.remain_loss_mask) 269 | yield {} 270 | elif self.args.padding_mode == 'pack': 271 | if len(input_ids) > self.seq_length: 272 | yield {} 273 | elif len(self.remain_input_ids) + len(input_ids) > self.seq_length: 
274 | input_ids, self.remain_input_ids = self.remain_input_ids, input_ids 275 | loss_mask, self.remain_loss_mask = self.remain_loss_mask, loss_mask 276 | assert len(input_ids) == len(loss_mask) 277 | yield self.padding(input_ids, loss_mask) 278 | else: 279 | self.remain_input_ids = self.remain_input_ids + input_ids 280 | self.remain_loss_mask = self.remain_loss_mask + loss_mask 281 | assert len(self.remain_input_ids) == len(self.remain_loss_mask) 282 | yield {} 283 | 284 | 285 | def padding(self, input_ids, loss_mask): 286 | pad_id = self.tokenizer.pad_id 287 | assert len(input_ids) <= self.seq_length, f"padding sequence: {len(input_ids)} > {self.seq_length}" 288 | input_ids += [pad_id] * (self.seq_length - len(input_ids)) 289 | loss_mask += [0] * (self.seq_length - len(loss_mask)) 290 | return { 291 | "input_ids": input_ids, 292 | "loss_mask": loss_mask 293 | } 294 | 295 | # 输入为args.input, 使用","进行分割,每一个itme可以是jsonl或文件夹 296 | def find_jsonl_fnames(inputs): 297 | fnames = [] 298 | for p in inputs.split(","): 299 | if not os.path.isdir(p): 300 | if p.endswith(".jsonl"): 301 | print(f"loading from {p}") 302 | fnames.append(p) 303 | else: 304 | p_list = glob.glob(p + "/*") 305 | for p_ in p_list: 306 | if p_.endswith(".jsonl"): 307 | print(f"loading from {p_}") 308 | fnames.append(p_) 309 | return fnames 310 | 311 | def yield_from_files(fnames: list, semaphore): 312 | """ 313 | Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts / 314 | other compressed formats. Also filters out empty documents. 315 | 316 | :param fnames: list of filenames 317 | """ 318 | 319 | def yielder(fname, semaphore): 320 | for f in filter(lambda x: x, lmd.Reader(fname).stream_data(key=['task', 'src_language', 'src_code', 'tgt_language', 'tgt_code', 'sql', 'prompt', 'answer', 'bad_answer'])): 321 | semaphore.acquire() 322 | yield f 323 | 324 | for fname in fnames: 325 | semaphore.acquire() 326 | 327 | yield from yielder(fname, semaphore) -------------------------------------------------------------------------------- /model/llama/convert_llama_weights_to_hf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | import gc 16 | import json 17 | import os 18 | import shutil 19 | import warnings 20 | 21 | import torch 22 | 23 | # from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer 24 | from .configuration_llama import LlamaConfig 25 | from .modeling_llama import LlamaForCausalLM 26 | from .tokenization_llama import LlamaTokenizer 27 | 28 | 29 | try: 30 | # from transformers import LlamaTokenizerFast 31 | from tokenization_llama_fast import LlamaTokenizerFast 32 | except ImportError as e: 33 | warnings.warn(e) 34 | warnings.warn( 35 | "The converted tokenizer will be the `slow` tokenizer. 
To use the fast, update your `tokenizers` library and re-run the tokenizer conversion" 36 | ) 37 | LlamaTokenizerFast = None 38 | 39 | """ 40 | Sample usage: 41 | 42 | ``` 43 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \ 44 | --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path 45 | ``` 46 | 47 | Thereafter, models can be loaded via: 48 | 49 | ```py 50 | from transformers import LlamaForCausalLM, LlamaTokenizer 51 | 52 | model = LlamaForCausalLM.from_pretrained("/output/path") 53 | tokenizer = LlamaTokenizer.from_pretrained("/output/path") 54 | ``` 55 | 56 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions 57 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 58 | """ 59 | 60 | INTERMEDIATE_SIZE_MAP = { 61 | "7B": 11008, 62 | "13B": 13824, 63 | "30B": 17920, 64 | "65B": 22016, 65 | "70B": 28672, 66 | } 67 | NUM_SHARDS = { 68 | "7B": 1, 69 | "7Bf": 1, 70 | "13B": 2, 71 | "13Bf": 2, 72 | "30B": 4, 73 | "65B": 8, 74 | "70B": 8, 75 | "70Bf": 8, 76 | } 77 | 78 | 79 | def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): 80 | return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) 81 | 82 | 83 | def read_json(path): 84 | with open(path, "r") as f: 85 | return json.load(f) 86 | 87 | 88 | def write_json(text, path): 89 | with open(path, "w") as f: 90 | json.dump(text, f) 91 | 92 | 93 | def write_model(model_path, input_base_path, model_size, safe_serialization=True): 94 | os.makedirs(model_path, exist_ok=True) 95 | tmp_model_path = os.path.join(model_path, "tmp") 96 | os.makedirs(tmp_model_path, exist_ok=True) 97 | 98 | params = read_json(os.path.join(input_base_path, "params.json")) 99 | num_shards = NUM_SHARDS[model_size] 100 | n_layers = params["n_layers"] 101 | n_heads = params["n_heads"] 102 | n_heads_per_shard = n_heads // num_shards 103 | dim = params["dim"] 104 | dims_per_head = dim // n_heads 105 | base = 10000.0 106 | inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) 107 | 108 | if "n_kv_heads" in params: 109 | num_key_value_heads = params["n_kv_heads"] # for GQA / MQA 110 | num_local_key_value_heads = n_heads_per_shard // num_key_value_heads 111 | key_value_dim = dim // num_key_value_heads 112 | else: # compatibility with other checkpoints 113 | num_key_value_heads = n_heads 114 | num_local_key_value_heads = n_heads_per_shard 115 | key_value_dim = dim 116 | 117 | # permute for sliced rotary 118 | def permute(w, n_heads=n_heads, dim1=dim, dim2=dim): 119 | return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) 120 | 121 | print(f"Fetching all parameters from the checkpoint at {input_base_path}.") 122 | # Load weights 123 | if model_size == "7B": 124 | # Not sharded 125 | # (The sharded implementation would also work, but this is simpler.) 
126 | loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") 127 | else: 128 | # Sharded 129 | loaded = [ 130 | torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu") 131 | for i in range(num_shards) 132 | ] 133 | param_count = 0 134 | index_dict = {"weight_map": {}} 135 | for layer_i in range(n_layers): 136 | filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin" 137 | if model_size == "7B": 138 | # Unsharded 139 | state_dict = { 140 | f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( 141 | loaded[f"layers.{layer_i}.attention.wq.weight"] 142 | ), 143 | f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( 144 | loaded[f"layers.{layer_i}.attention.wk.weight"] 145 | ), 146 | f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], 147 | f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], 148 | f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], 149 | f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], 150 | f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], 151 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"], 152 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"], 153 | } 154 | else: 155 | # Sharded 156 | # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share 157 | # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is 158 | # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned. 
159 | 160 | state_dict = { 161 | f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][ 162 | f"layers.{layer_i}.attention_norm.weight" 163 | ].clone(), 164 | f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][ 165 | f"layers.{layer_i}.ffn_norm.weight" 166 | ].clone(), 167 | } 168 | state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( 169 | torch.cat( 170 | [ 171 | loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) 172 | for i in range(num_shards) 173 | ], 174 | dim=0, 175 | ).reshape(dim, dim) 176 | ) 177 | state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( 178 | torch.cat( 179 | [ 180 | loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( 181 | num_local_key_value_heads, dims_per_head, dim 182 | ) 183 | for i in range(num_shards) 184 | ], 185 | dim=0, 186 | ).reshape(key_value_dim, dim), 187 | num_key_value_heads, 188 | key_value_dim, 189 | dim, 190 | ) 191 | state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( 192 | [ 193 | loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( 194 | num_local_key_value_heads, dims_per_head, dim 195 | ) 196 | for i in range(num_shards) 197 | ], 198 | dim=0, 199 | ).reshape(key_value_dim, dim) 200 | 201 | state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( 202 | [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 203 | ) 204 | state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( 205 | [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 206 | ) 207 | state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( 208 | [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 209 | ) 210 | state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( 211 | [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 212 | ) 213 | 214 | state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq 215 | for k, v in state_dict.items(): 216 | index_dict["weight_map"][k] = filename 217 | param_count += v.numel() 218 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 219 | 220 | filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" 221 | if model_size == "7B": 222 | # Unsharded 223 | state_dict = { 224 | "model.embed_tokens.weight": loaded["tok_embeddings.weight"], 225 | "model.norm.weight": loaded["norm.weight"], 226 | "lm_head.weight": loaded["output.weight"], 227 | } 228 | else: 229 | state_dict = { 230 | "model.norm.weight": loaded[0]["norm.weight"], 231 | "model.embed_tokens.weight": torch.cat( 232 | [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 233 | ), 234 | "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), 235 | } 236 | 237 | for k, v in state_dict.items(): 238 | index_dict["weight_map"][k] = filename 239 | param_count += v.numel() 240 | torch.save(state_dict, os.path.join(tmp_model_path, filename)) 241 | 242 | # Write configs 243 | index_dict["metadata"] = {"total_size": param_count * 2} 244 | write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) 245 | ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 246 | multiple_of = params["multiple_of"] if "multiple_of" in params else 256 247 | config = LlamaConfig( 248 | hidden_size=dim, 249 | 
intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), 250 | num_attention_heads=params["n_heads"], 251 | num_hidden_layers=params["n_layers"], 252 | rms_norm_eps=params["norm_eps"], 253 | num_key_value_heads=num_key_value_heads, 254 | ) 255 | config.save_pretrained(tmp_model_path) 256 | 257 | # Make space so we can load the model properly now. 258 | del state_dict 259 | del loaded 260 | gc.collect() 261 | 262 | print("Loading the checkpoint in a Llama model.") 263 | model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 264 | # Avoid saving this as part of the config. 265 | del model.config._name_or_path 266 | 267 | print("Saving in the Transformers format.") 268 | model.save_pretrained(model_path, safe_serialization=safe_serialization) 269 | shutil.rmtree(tmp_model_path) 270 | 271 | 272 | def write_tokenizer(tokenizer_path, input_tokenizer_path): 273 | # Initialize the tokenizer based on the `spm` model 274 | tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast 275 | print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") 276 | tokenizer = tokenizer_class(input_tokenizer_path) 277 | tokenizer.save_pretrained(tokenizer_path) 278 | 279 | 280 | def main(): 281 | parser = argparse.ArgumentParser() 282 | parser.add_argument( 283 | "--input_dir", 284 | help="Location of LLaMA weights, which contains tokenizer.model and model folders", 285 | ) 286 | parser.add_argument( 287 | "--model_size", 288 | choices=["7B", "7Bf", "13B", "13Bf", "30B", "65B", "70B", "70Bf", "tokenizer_only"], 289 | help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama", 290 | ) 291 | parser.add_argument( 292 | "--output_dir", 293 | help="Location to write HF model and tokenizer", 294 | ) 295 | parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.") 296 | args = parser.parse_args() 297 | if args.model_size != "tokenizer_only": 298 | write_model( 299 | model_path=args.output_dir, 300 | input_base_path=os.path.join(args.input_dir, args.model_size), 301 | model_size=args.model_size, 302 | safe_serialization=args.safe_serialization, 303 | ) 304 | spm_path = os.path.join(args.input_dir, "tokenizer.model") 305 | write_tokenizer(args.output_dir, spm_path) 306 | 307 | 308 | if __name__ == "__main__": 309 | main() -------------------------------------------------------------------------------- /model/peft/utils/others.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright (c) 2023 Ant Group. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
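As a quick sanity check of `compute_intermediate_size` from the conversion script above, the 7B geometry can be traced by hand (dim = 4096 with the default `ffn_dim_multiplier=1` and `multiple_of=256`), and it lands exactly on the `INTERMEDIATE_SIZE_MAP["7B"]` entry:

```python
def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
    # Same formula as in convert_llama_weights_to_hf.py above.
    return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)

# int(8 * 4096 / 3) = 10922; rounded up to the next multiple of 256 -> 43 * 256 = 11008
assert compute_intermediate_size(4096) == 11008  # matches INTERMEDIATE_SIZE_MAP["7B"]
```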
16 | 17 | import sys 18 | sys.path.append("..") 19 | sys.path.append("../..") 20 | import copy 21 | 22 | import torch 23 | from .config import PetuningConfig 24 | from peft.utils import PromptLearningConfig, PeftType 25 | 26 | 27 | def prepare_model_for_int8_training(model, use_gradient_checkpointing=True): 28 | r""" 29 | This method wraps the entire protocol for preparing a model before running a training. This includes: 30 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 31 | head to fp32 32 | 33 | Args: 34 | model, (`transformers.PreTrainedModel`): 35 | The loaded model from `transformers` 36 | """ 37 | loaded_in_8bit = getattr(model, "is_loaded_in_8bit", False) 38 | 39 | for name, param in model.named_parameters(): 40 | # freeze base model's layers 41 | param.requires_grad = False 42 | 43 | # cast all non INT8 parameters to fp32 44 | for param in model.parameters(): 45 | if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16): 46 | param.data = param.data.to(torch.float32) 47 | 48 | if loaded_in_8bit and use_gradient_checkpointing: 49 | # For backward compatibility 50 | if hasattr(model, "enable_input_require_grads"): 51 | model.enable_input_require_grads() 52 | else: 53 | 54 | def make_inputs_require_grad(module, input, output): 55 | output.requires_grad_(True) 56 | 57 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 58 | 59 | # enable gradient checkpointing for memory efficiency 60 | model.gradient_checkpointing_enable() 61 | 62 | return model 63 | 64 | 65 | def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True): 66 | r""" 67 | This method wraps the entire protocol for preparing a model before running a training. This includes: 68 | 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm 69 | head to fp32 70 | 71 | Args: 72 | model, (`transformers.PreTrainedModel`): 73 | The loaded model from `transformers` 74 | """ 75 | loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) 76 | 77 | for name, param in model.named_parameters(): 78 | # freeze base model's layers 79 | param.requires_grad = False 80 | 81 | # cast all non INT8 parameters to fp32 82 | for param in model.parameters(): 83 | if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16): 84 | param.data = param.data.to(torch.float32) 85 | 86 | if loaded_in_kbit and use_gradient_checkpointing: 87 | # For backward compatibility 88 | if hasattr(model, "enable_input_require_grads"): 89 | model.enable_input_require_grads() 90 | else: 91 | 92 | def make_inputs_require_grad(module, input, output): 93 | output.requires_grad_(True) 94 | 95 | model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) 96 | 97 | # enable gradient checkpointing for memory efficiency 98 | model.gradient_checkpointing_enable() 99 | 100 | return model 101 | 102 | 103 | # copied from transformers.models.bart.modeling_bart 104 | def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): 105 | """ 106 | Shift input ids one token to the right. 107 | 108 | Args: 109 | input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids 110 | pad_token_id (`int`): The id of the `padding` token. 111 | decoder_start_token_id (`int`): The id of the `start` token. 
112 | """ 113 | shifted_input_ids = input_ids.new_zeros(input_ids.shape) 114 | shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() 115 | shifted_input_ids[:, 0] = decoder_start_token_id 116 | 117 | if pad_token_id is None: 118 | raise ValueError("self.model.config.pad_token_id has to be defined.") 119 | # replace possible -100 values in labels by `pad_token_id` 120 | shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) 121 | 122 | return shifted_input_ids 123 | 124 | 125 | class ModulesToSaveWrapper(torch.nn.Module): 126 | def __init__(self, module_to_save, adapter_name): 127 | super().__init__() 128 | self.original_module = module_to_save 129 | self.modules_to_save = torch.nn.ModuleDict({}) 130 | self.update(adapter_name) 131 | self.active_adapter = adapter_name 132 | 133 | def update(self, adapter_name): 134 | self.modules_to_save.update(torch.nn.ModuleDict({adapter_name: copy.deepcopy(self.original_module)})) 135 | 136 | def forward(self, *args, **kwargs): 137 | if self.active_adapter not in self.modules_to_save: 138 | return self.original_module(*args, **kwargs) 139 | return self.modules_to_save[self.active_adapter](*args, **kwargs) 140 | 141 | 142 | def _get_submodules(model, key): 143 | parent = model.get_submodule(".".join(key.split(".")[:-1])) 144 | target_name = key.split(".")[-1] 145 | target = model.get_submodule(key) 146 | return parent, target, target_name 147 | 148 | 149 | def _freeze_adapter(model, adapter_name): 150 | for n, p in model.named_parameters(): 151 | if adapter_name in n: 152 | p.requires_grad = False 153 | 154 | 155 | def _freeze_model(model): 156 | for n, p in model.named_parameters(): 157 | p.requires_grad = False 158 | 159 | 160 | def _set_trainable(model, adapter_name): 161 | key_list = [key for key, _ in model.named_modules()] 162 | for key in key_list: 163 | target_module_found = any(key.endswith(target_key) for target_key in model.modules_to_save) 164 | if target_module_found: 165 | parent, target, target_name = _get_submodules(model, key) 166 | if isinstance(target, ModulesToSaveWrapper): 167 | target.update(adapter_name) 168 | else: 169 | for param in target.parameters(): 170 | param.requires_grad = True 171 | setattr(parent, target_name, ModulesToSaveWrapper(target, adapter_name)) 172 | 173 | 174 | def _set_adapter(model, adapter_name): 175 | for module in model.modules(): 176 | if isinstance(module, ModulesToSaveWrapper): 177 | module.active_adapter = adapter_name 178 | 179 | 180 | def fsdp_auto_wrap_policy(model): 181 | import functools 182 | import os 183 | 184 | from accelerate import FullyShardedDataParallelPlugin 185 | from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy 186 | 187 | from peft.tuners import PrefixEncoder, PromptEmbedding, PromptEncoder 188 | 189 | def lambda_policy_fn(module): 190 | if ( 191 | len(list(module.named_children())) == 0 192 | and getattr(module, "weight", None) is not None 193 | and module.weight.requires_grad 194 | ): 195 | return True 196 | return False 197 | 198 | lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn) 199 | transformer_wrap_policy = functools.partial( 200 | transformer_auto_wrap_policy, 201 | transformer_layer_cls=( 202 | PrefixEncoder, 203 | PromptEncoder, 204 | PromptEmbedding, 205 | FullyShardedDataParallelPlugin.get_module_class_from_name( 206 | model, os.environ.get("FSDP_TRANSFORMER_CLS_TO_WRAP", "") 207 | ), 208 | ), 209 | ) 210 | 211 | auto_wrap_policy = functools.partial(_or_policy, 
policies=[lambda_policy, transformer_wrap_policy]) 212 | return auto_wrap_policy 213 | 214 | 215 | def transpose(weight, fan_in_fan_out): 216 | return weight.T if fan_in_fan_out else weight 217 | 218 | 219 | def get_peft_model_state_dict(model, state_dict=None, adapter_name="default"): 220 | """ 221 | Get the state dict of the Peft model. 222 | 223 | Args: 224 | model ([`PeftModel`]): The Peft model. When using torch.nn.DistributedDataParallel, DeepSpeed or FSDP, 225 | the model should be the underlying model/unwrapped model (i.e. model.module). 226 | state_dict (`dict`, *optional*, defaults to `None`): 227 | The state dict of the model. If not provided, the state dict of the model 228 | will be used. 229 | """ 230 | config = model.peft_config[adapter_name] 231 | if state_dict is None: 232 | state_dict = model.state_dict() 233 | if config.peft_type in (PeftType.LORA, PeftType.ADALORA, PeftType.ROUTELORA, PeftType.UNIPELT): 234 | # to_return = lora_state_dict(model, bias=model.peft_config.bias) 235 | # adapted from `https://github.com/microsoft/LoRA/blob/main/loralib/utils.py` 236 | # to be used directly with the state dict which is necessary when using DeepSpeed or FSDP 237 | bias = config.bias 238 | if bias == "none": 239 | to_return = {k: state_dict[k] for k in state_dict if "lora_" in k} 240 | elif bias == "all": 241 | to_return = {k: state_dict[k] for k in state_dict if "lora_" in k or "bias" in k} 242 | elif bias == "lora_only": 243 | to_return = {} 244 | for k in state_dict: 245 | if "lora_" in k: 246 | to_return[k] = state_dict[k] 247 | bias_name = k.split("lora_")[0] + "bias" 248 | if bias_name in state_dict: 249 | to_return[bias_name] = state_dict[bias_name] 250 | else: 251 | raise NotImplementedError 252 | to_return = {k: v for k, v in to_return.items() if (("lora_" in k and adapter_name in k) or ("bias" in k))} 253 | if config.peft_type == PeftType.ADALORA: 254 | rank_pattern = config.rank_pattern 255 | if rank_pattern is not None: 256 | rank_pattern = {k.replace(f".{adapter_name}", ""): v for k, v in rank_pattern.items()} 257 | config.rank_pattern = rank_pattern 258 | to_return = model.resize_state_dict_by_rank_pattern(rank_pattern, to_return, adapter_name) 259 | 260 | elif config.peft_type == PeftType.ADAPTION_PROMPT: 261 | to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")} 262 | elif isinstance(config, PromptLearningConfig): 263 | to_return = {} 264 | if config.inference_mode: 265 | prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight 266 | else: 267 | prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name) 268 | to_return["prompt_embeddings"] = prompt_embeddings 269 | elif isinstance(config, PetuningConfig): 270 | to_return = state_dict 271 | else: 272 | raise NotImplementedError 273 | if model.modules_to_save is not None: 274 | for key, value in state_dict.items(): 275 | if any(f"{module_name}.modules_to_save.{adapter_name}" in key for module_name in model.modules_to_save): 276 | to_return[key.replace("modules_to_save.", "")] = value 277 | 278 | to_return = {k.replace(f".{adapter_name}", ""): v for k, v in to_return.items()} 279 | return to_return 280 | 281 | 282 | def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default"): 283 | """ 284 | Set the state dict of the Peft model. 285 | 286 | Args: 287 | model ([`PeftModel`]): The Peft model. 288 | peft_model_state_dict (`dict`): The state dict of the Peft model. 
289 | """ 290 | config = model.peft_config[adapter_name] 291 | state_dict = {} 292 | if model.modules_to_save is not None: 293 | for key, value in peft_model_state_dict.items(): 294 | if any(module_name in key for module_name in model.modules_to_save): 295 | for module_name in model.modules_to_save: 296 | if module_name in key: 297 | key = key.replace(module_name, f"{module_name}.modules_to_save.{adapter_name}") 298 | break 299 | state_dict[key] = value 300 | else: 301 | state_dict = peft_model_state_dict 302 | 303 | if config.peft_type in (PeftType.LORA, PeftType.ADALORA, PeftType.ROUTELORA, PeftType.UNIPELT): 304 | peft_model_state_dict = {} 305 | for k, v in state_dict.items(): 306 | if "lora_" in k: 307 | suffix = k.split("lora_")[1] 308 | if "." in suffix: 309 | suffix_to_replace = ".".join(suffix.split(".")[1:]) 310 | k = k.replace(suffix_to_replace, f"{adapter_name}.{suffix_to_replace}") 311 | else: 312 | k = f"{k}.{adapter_name}" 313 | peft_model_state_dict[k] = v 314 | else: 315 | peft_model_state_dict[k] = v 316 | if config.peft_type == PeftType.ADALORA: 317 | rank_pattern = config.rank_pattern 318 | if rank_pattern is not None: 319 | model.resize_modules_by_rank_pattern(rank_pattern, adapter_name) 320 | elif isinstance(config, PromptLearningConfig) or config.peft_type == PeftType.ADAPTION_PROMPT: 321 | peft_model_state_dict = state_dict 322 | elif isinstance(config, PetuningConfig): 323 | peft_model_state_dict = state_dict 324 | else: 325 | raise NotImplementedError 326 | 327 | model.load_state_dict(peft_model_state_dict, strict=False) 328 | if isinstance(config, PromptLearningConfig): 329 | model.prompt_encoder[adapter_name].embedding.load_state_dict( 330 | {"weight": peft_model_state_dict["prompt_embeddings"]}, strict=True 331 | ) 332 | -------------------------------------------------------------------------------- /utils/common_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2023 Ant Group. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
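# Usage sketch for the adapter state-dict helpers defined in model/peft/utils/others.py above;
# `peft_model` and the file name are illustrative:
#   adapter_sd = get_peft_model_state_dict(peft_model, adapter_name="default")
#   torch.save(adapter_sd, "adapter_only.bin")  # for LoRA-style configs this keeps only adapter and modules_to_save weights
#   set_peft_model_state_dict(peft_model, torch.load("adapter_only.bin"), adapter_name="default")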
15 | 16 | import os 17 | import math 18 | import torch 19 | import atorch 20 | import logging 21 | import numpy as np 22 | from collections.abc import Mapping # noqa: E402 23 | from contextlib import contextmanager # noqa: E402 24 | from torch.distributed.fsdp import ( 25 | FullyShardedDataParallel as FSDP, 26 | # BackwardPrefetch, 27 | FullStateDictConfig, 28 | StateDictType, 29 | ) 30 | from transformers import get_scheduler 31 | from utils.learning_rates import AnnealingLR 32 | TASK2ID = {} 33 | ID2TASK = {} 34 | logger = logging.getLogger(__name__) 35 | 36 | def get_rank(): 37 | return atorch.rank() 38 | 39 | 40 | def get_local_rank(): 41 | return atorch.local_rank() 42 | 43 | 44 | def is_main_process(): 45 | return atorch.rank() == 0 46 | 47 | 48 | def is_local_main_process(): 49 | return atorch.local_rank() == 0 50 | 51 | 52 | def print_rank_0(*message): 53 | """If distributed is initialized print only on rank 0.""" 54 | if torch.distributed.is_initialized(): 55 | if torch.distributed.get_rank() == 0: 56 | print(*message, flush=True) 57 | else: 58 | print(*message, flush=True) 59 | 60 | 61 | def get_world_size(): 62 | return atorch.world_size() 63 | 64 | 65 | def wait_for_everyone(): 66 | torch.distributed.barrier() 67 | 68 | 69 | def atorch_init_distributed(backend="nccl"): 70 | atorch.init_distributed(backend, set_cuda_device_using_local_rank=True) 71 | # atorch.init_distributed(backend) 72 | 73 | 74 | def atorch_reset_distributed(): 75 | atorch.reset_distributed() 76 | 77 | 78 | def _goes_first(is_main): 79 | if is_main is False: 80 | wait_for_everyone() 81 | yield 82 | if is_main is True: 83 | wait_for_everyone() 84 | 85 | 86 | def get_model_params_num(model): 87 | """ 88 | Get params number of the model 89 | Args: 90 | model: model(required) 91 | Returns: 92 | the number of parameters of model 93 | """ 94 | num = 0 95 | for _, param in model.named_parameters(): 96 | num += param.nelement() 97 | return num 98 | 99 | 100 | @contextmanager 101 | def main_process_first(): 102 | yield from _goes_first(is_main_process()) 103 | 104 | 105 | def unwrap_model(model): 106 | """ 107 | Recursively unwraps a model from potential containers (as used in distributed training). 108 | 109 | Args: 110 | model (`torch.nn.Module`): The model to unwrap. 
111 | """ 112 | # since there could be multiple levels of wrapping, unwrap recursively 113 | if hasattr(model, "module"): 114 | return unwrap_model(model.module) 115 | else: 116 | return model 117 | 118 | 119 | def honor_type(obj, generator): 120 | """ 121 | Cast a generator to the same type as obj (list, tuple or namedtuple) 122 | """ 123 | try: 124 | return type(obj)(generator) 125 | except TypeError: 126 | # Some objects may not be able to instantiate from a generator directly 127 | return type(obj)(*list(generator)) 128 | 129 | 130 | def recursively_apply( 131 | func, 132 | data, 133 | *args, 134 | test_type=lambda t: isinstance(t, torch.Tensor), 135 | error_on_other_type=False, 136 | **kwargs, 137 | ): 138 | if isinstance(data, (tuple, list)): 139 | return honor_type( 140 | data, 141 | ( 142 | recursively_apply( 143 | func, 144 | o, 145 | *args, 146 | test_type=test_type, 147 | error_on_other_type=error_on_other_type, 148 | **kwargs, 149 | ) 150 | for o in data 151 | ), 152 | ) 153 | elif isinstance(data, Mapping): 154 | return type(data)( 155 | { 156 | k: recursively_apply( 157 | func, 158 | v, 159 | *args, 160 | test_type=test_type, 161 | error_on_other_type=error_on_other_type, 162 | **kwargs, 163 | ) 164 | for k, v in data.items() 165 | } 166 | ) 167 | elif test_type(data): 168 | return func(data, *args, **kwargs) 169 | elif error_on_other_type: 170 | raise TypeError( 171 | f"Can't apply {func.__name__} on object of type {type(data)}, only of nested list/tuple/dicts of objects " 172 | f"that satisfy {test_type.__name__}." 173 | ) 174 | return data 175 | 176 | 177 | def gather(tensor): 178 | def _gpu_gather_one(tensor): 179 | if tensor.ndim == 0: 180 | tensor = tensor.clone()[None] 181 | output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())] 182 | torch.distributed.all_gather(output_tensors, tensor) 183 | return torch.cat(output_tensors, dim=0) 184 | 185 | return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True) 186 | 187 | 188 | def save_ckpt(model, optimizer, lr_scheduler, epoch, steps, save_path, logger): 189 | if isinstance(model, FSDP): 190 | print('Saving a FSDP model') 191 | optim_state_dict = FSDP.full_optim_state_dict(model, optimizer) 192 | save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) 193 | with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy): 194 | model_state_dict = model.state_dict() 195 | lrs_state_dict = lr_scheduler.state_dict() 196 | else: 197 | print('Saving a normal model') 198 | model_state_dict = model.state_dict() 199 | optim_state_dict = optimizer.state_dict() 200 | lrs_state_dict = lr_scheduler.state_dict() 201 | # rank0 保存 202 | if is_main_process(): 203 | torch.save( 204 | { 205 | "epoch": epoch + 1, 206 | "step": steps, 207 | "state_dict": model_state_dict, 208 | "optimizer": optim_state_dict, 209 | "lrs_state_dict": lrs_state_dict, 210 | }, 211 | save_path, 212 | ) 213 | logger.info(f"Saved checkpoint {save_path} (epoch {epoch + 1} @ {steps} steps)") 214 | wait_for_everyone() 215 | # torch.distributed.barrier() # other rank waiting 216 | 217 | 218 | def scheduler_and_resume(args, train_dataloader, model, optimizer, checkpoint): 219 | # Scheduler and math around the number of training steps. 
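# Worked example of the step arithmetic below (illustrative numbers):
#   len(train_dataloader) = 1000, gradient_accumulation_steps = 4, num_train_epochs = 3
#   num_update_steps_per_epoch = ceil(1000 / 4) = 250
#   max_steps = 3 * 250 = 750 optimizer updates (when max_steps was left at -1)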
220 | overrode_max_steps = False 221 | args.num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) 222 | if args.max_steps == -1: 223 | args.max_steps = args.num_train_epochs * args.num_update_steps_per_epoch 224 | overrode_max_steps = True 225 | 226 | lr_scheduler = AnnealingLR( 227 | optimizer, 228 | start_lr=args.learning_rate, 229 | warmup_iter=args.num_warmup_steps, 230 | total_iters=args.max_steps * args.gradient_accumulation_steps, 231 | decay_style=args.lr_scheduler_type, 232 | last_iter=0, 233 | min_lr=args.min_lr, 234 | use_checkpoint_lr_scheduler=True, 235 | ) 236 | # lr_scheduler = get_scheduler( 237 | # name=args.lr_scheduler_type, 238 | # optimizer=optimizer, 239 | # num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, 240 | # num_training_steps=args.max_steps * args.gradient_accumulation_steps 241 | # ) 242 | 243 | if args.resume_from_checkpoint is not None: 244 | if os.path.isfile(args.resume_from_checkpoint): 245 | starting_epoch = checkpoint["epoch"] - 1 246 | steps = checkpoint["step"] 247 | args.resume_step = steps 248 | # Restore the optim state 249 | if optimizer is not None: 250 | if isinstance(model, FSDP): 251 | print('Loading optimizer for a FSDP model') 252 | full_osd = checkpoint["optimizer"] 253 | sharded_osd = FSDP.scatter_full_optim_state_dict(full_osd, model) 254 | optimizer.load_state_dict(sharded_osd) 255 | else: 256 | print('Loading optimizer for a normal model') 257 | optimizer.load_state_dict(checkpoint["optimizer"]) 258 | logging.info("Optimizer state is restored from the checkpoint") 259 | if lr_scheduler is not None: 260 | lr_scheduler.load_state_dict(checkpoint["lrs_state_dict"]) 261 | logging.info(f"Loaded checkpoint '{args.resume_from_checkpoint}' (epoch {checkpoint['epoch']} @ {steps} steps)") 262 | else: 263 | logger.info(f"No optimizer and lr scheduler checkpoint found at '{args.resume_from_checkpoint}'") 264 | 265 | # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
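# (num_update_steps_per_epoch is recomputed below; when max_steps was derived automatically
#  above, max_steps and num_train_epochs are refreshed from the new value as well.)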
266 | args.num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) 267 | if overrode_max_steps: 268 | args.max_steps = args.num_train_epochs * args.num_update_steps_per_epoch 269 | # Afterwards we recalculate our number of training epochs 270 | args.num_train_epochs = math.ceil(args.max_steps / args.num_update_steps_per_epoch) 271 | 272 | # Figure out how many steps we should save the Accelerator states 273 | # if args.checkpointing_steps is not None and args.checkpointing_steps.isdigit(): 274 | # args.checkpointing_steps = int(args.checkpointing_steps) 275 | 276 | return args, lr_scheduler, optimizer 277 | 278 | 279 | # def get_tflops(model_numel, batch_size, seq_len, step_time): 280 | # return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12) 281 | 282 | 283 | def get_computation_speed(batch_size_per_device, seq_len, step_time): 284 | 285 | return batch_size_per_device * seq_len / (step_time + 1e-12) 286 | 287 | 288 | def human_readable_flops(num): 289 | for unit in [ 290 | "", 291 | "KFLOPS", 292 | "MFLOPS", 293 | "GFLOPS", 294 | "TFLOPS", 295 | "PFLOPS", 296 | "EFLOPS", 297 | "ZFLOPS", 298 | ]: 299 | if abs(num) < 1000.0: 300 | return "%3.1f%s" % (num, unit) 301 | num /= 1000.0 302 | return "%.1f%s" % (num, "Yi") 303 | 304 | 305 | def get_tflops_new(args, batch_size, seq_len, step_time): 306 | sl = seq_len 307 | L = args.num_hidden_layers 308 | h = args.hidden_size 309 | V = args.vocab_size 310 | flops = (96 * batch_size * sl * L * h * h * (1 + sl / (6 * h) + V / (16 * L * h)) / step_time) 311 | return human_readable_flops(flops) 312 | 313 | 314 | def get_tflops_megatron(total_model_param, hidden_size, num_hidden_layers, 315 | batch_size_per_device, seq_len, step_time): 316 | 317 | ff = total_model_param * 6 318 | attn = seq_len * hidden_size * num_hidden_layers * 60 319 | flops = ( 320 | batch_size_per_device 321 | * seq_len 322 | * (ff + attn) 323 | / step_time 324 | ) 325 | return human_readable_flops(flops) 326 | 327 | 328 | def is_old_version(path): 329 | new_vocab_files = ['merge.model'] 330 | new_vocab_file_exists = [] 331 | for filename in new_vocab_files: 332 | if not os.path.exists(os.path.join(path, filename)): 333 | new_vocab_file_exists.append(False) 334 | else: 335 | new_vocab_file_exists.append(True) 336 | if all(new_vocab_file_exists): 337 | return False 338 | if any(new_vocab_file_exists): 339 | return 'new_version_file_absent' 340 | else: 341 | return True 342 | 343 | 344 | def generate_task_id(data_paths, train_mode): 345 | data_prefixes = list(data_paths[1:-1].split(',')) 346 | print("data paths: ") 347 | print(data_prefixes) 348 | 349 | for i, prefix in enumerate(data_prefixes): 350 | if train_mode == 'sft': 351 | task_name = prefix.split('/')[-1] 352 | else: 353 | task_name = prefix.split('/')[-2] 354 | TASK2ID[task_name] = i 355 | ID2TASK[i] = task_name 356 | 357 | 358 | class EarlyStopping: 359 | """Early stops the training if validation loss doesn't improve after a given patience.""" 360 | def __init__(self, patience=7, verbose=False, delta=0): 361 | """ 362 | Args: 363 | patience (int): How long to wait after last time validation loss improved. 364 | Default: 7 365 | verbose (bool): If True, prints a message for each validation loss improvement. 366 | Default: False 367 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 
368 | Default: 0 369 | """ 370 | self.patience = patience 371 | self.verbose = verbose 372 | self.counter = 0 373 | self.best_score = None 374 | self.early_stop = False 375 | self.val_loss_min = np.inf 376 | self.delta = delta 377 | 378 | def __call__(self, val_loss, model): 379 | 380 | score = -val_loss 381 | 382 | if self.best_score is None: 383 | self.best_score = score 384 | # self.save_checkpoint(val_loss, model) 385 | elif score < self.best_score + self.delta: 386 | self.counter += 1 387 | print(f'EarlyStopping counter: {self.counter} out of {self.patience}') 388 | if self.counter >= self.patience: 389 | self.early_stop = True 390 | else: 391 | self.best_score = score 392 | # self.save_checkpoint(val_loss, model) 393 | self.counter = 0 394 | 395 | def save_checkpoint(self, val_loss, model): 396 | '''Saves the model when the validation loss decreases.''' 397 | if self.verbose: 398 | print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') 399 | torch.save(model.state_dict(), 'checkpoint.pt') # stores the parameters of the best model seen so far 400 | self.val_loss_min = val_loss 401 | 402 | -------------------------------------------------------------------------------- /tokenizer/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, EleutherAI 2 | # This file is based on code by the authors denoted below and has been modified from its original version. 3 | # 4 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | """Megatron tokenizers.""" 19 | 20 | from abc import ABC 21 | from abc import abstractmethod 22 | 23 | from tokenizers import Tokenizer 24 | from transformers import GPT2Tokenizer, GPT2TokenizerFast 25 | import numpy as np 26 | import sentencepiece as spm 27 | from typing import List, Union 28 | from .gpt2_tokenization import GPT2Tokenizer 29 | from utils.common_utils import print_rank_0, is_old_version 30 | from model.glm.tokenization_glm import GLMTokenizer 31 | 32 | def build_tokenizer(args): 33 | """Initialize tokenizer.""" 34 | print_rank_0("> building {} tokenizer ...".format(args.tokenizer_type)) 35 | # if args.rank == 0: 36 | # print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True) 37 | 38 | # Select and instantiate the tokenizer.
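# Example (illustrative values): for the SentencePiece branch below, the caller sets
#   args.tokenizer_type = "SPMTokenizer"
#   args.vocab_file = "/path/to/tokenizer.model"
# before calling build_tokenizer(args); the other branches read analogous fields
# (e.g. args.pretrained_model_path for the GLMTokenizer branch).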
39 | if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower(): 40 | assert args.vocab_file is not None 41 | assert args.merge_file is not None 42 | tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) 43 | elif args.tokenizer_type.lower() == "SPMTokenizer".lower(): 44 | assert args.vocab_file is not None 45 | tokenizer = SentencePieceTokenizer(args.vocab_file) 46 | elif args.tokenizer_type.lower() == "HFTokenizer".lower(): 47 | assert args.vocab_file is not None 48 | tokenizer = HFTokenizer(args.vocab_file) 49 | elif args.tokenizer_type.lower() == "HFGPT2Tokenizer".lower(): 50 | if args.vocab_file is None: 51 | print( 52 | "WARNING: No vocab file found, loading Huggingface's pretrained GPT2Tokenizer" 53 | ) 54 | tokenizer = HFGPT2Tokenizer(args.vocab_file) 55 | elif args.tokenizer_type.lower() == "CharLevelTokenizer".lower(): 56 | tokenizer = CharLevelTokenizer(vocab_size=512) 57 | elif args.tokenizer_type.lower() == "TiktokenTokenizer".lower(): 58 | assert args.vocab_file is not None 59 | tokenizer = TiktokenTokenizer(args.vocab_file) 60 | elif args.tokenizer_type.lower() == "GLMTokenizer".lower(): 61 | if is_old_version(args.pretrained_model_path): 62 | print('is an old version') 63 | from model.glm.tokenization_glm_deprecated import GLMChineseTokenizer 64 | args.glm_mask = '[sMASK]' 65 | old_version_tokenizer = True 66 | tokenizer = GLMChineseTokenizer.from_pretrained(args.pretrained_model_path, trust_remote_code=True) 67 | else: 68 | print('is not an old version') 69 | old_version_tokenizer = False 70 | tokenizer = GLMTokenizer.from_pretrained(args.pretrained_model_path, trust_remote_code=True) 71 | else: 72 | raise NotImplementedError( 73 | "{} tokenizer is not " "implemented.".format(args.tokenizer_type) 74 | ) 75 | 76 | # Add vocab size. 
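# Worked example for _vocab_size_with_padding below (illustrative numbers):
#   tokenizer.vocab_size = 50257, make_vocab_size_divisible_by = 128, model_parallel_size = 2
#   multiple = 128 * 2 = 256; the smallest multiple of 256 that is >= 50257 is 50432,
#   so 175 dummy tokens are added and args.padded_vocab_size = 50432.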
77 | args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args) 78 | 79 | return tokenizer 80 | 81 | 82 | def _vocab_size_with_padding(orig_vocab_size, args): 83 | """Pad vocab size so it is divisible by model parallel size and 84 | still having GPU friendly size.""" 85 | 86 | after = orig_vocab_size 87 | multiple = args.make_vocab_size_divisible_by * args.model_parallel_size 88 | while (after % multiple) != 0: 89 | after += 1 90 | print_rank_0( 91 | " > padded vocab (size: {}) with {} dummy tokens " 92 | "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after) 93 | ) 94 | # if args.rank == 0: 95 | # print( 96 | # " > padded vocab (size: {}) with {} dummy tokens " 97 | # "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after), 98 | # flush=True, 99 | # ) 100 | return after 101 | 102 | 103 | class AbstractTokenizer(ABC): 104 | """Abstract class for tokenizer.""" 105 | 106 | def __init__(self, name): 107 | self.name = name 108 | super().__init__() 109 | 110 | @property 111 | @abstractmethod 112 | def vocab_size(self): 113 | pass 114 | 115 | @property 116 | @abstractmethod 117 | def vocab(self): 118 | """Dictionary from vocab text token to id token.""" 119 | pass 120 | 121 | @property 122 | @abstractmethod 123 | def inv_vocab(self): 124 | """Dictionary from vocab id token to text token.""" 125 | pass 126 | 127 | @abstractmethod 128 | def tokenize(self, text): 129 | pass 130 | 131 | def detokenize(self, token_ids): 132 | raise NotImplementedError( 133 | "detokenizer is not implemented for {} " "tokenizer".format(self.name) 134 | ) 135 | 136 | @property 137 | def cls(self): 138 | raise NotImplementedError( 139 | "CLS is not provided for {} " "tokenizer".format(self.name) 140 | ) 141 | 142 | @property 143 | def sep(self): 144 | raise NotImplementedError( 145 | "SEP is not provided for {} " "tokenizer".format(self.name) 146 | ) 147 | 148 | @property 149 | def pad(self): 150 | raise NotImplementedError( 151 | "PAD is not provided for {} " "tokenizer".format(self.name) 152 | ) 153 | 154 | @property 155 | def eod(self): 156 | raise NotImplementedError( 157 | "EOD is not provided for {} " "tokenizer".format(self.name) 158 | ) 159 | 160 | @property 161 | def mask(self): 162 | raise NotImplementedError( 163 | "MASK is not provided for {} " "tokenizer".format(self.name) 164 | ) 165 | 166 | 167 | class _GPT2BPETokenizer(AbstractTokenizer): 168 | """Original GPT2 BPE tokenizer.""" 169 | 170 | def __init__(self, vocab_file, merge_file): 171 | name = "GPT2 BPE" 172 | super().__init__(name) 173 | 174 | self.tokenizer = GPT2Tokenizer( 175 | vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None 176 | ) 177 | self.eod_id = self.tokenizer.encoder["<|endoftext|>"] 178 | 179 | @property 180 | def vocab_size(self): 181 | return len(self.tokenizer.encoder) 182 | 183 | @property 184 | def vocab(self): 185 | return self.tokenizer.encoder 186 | 187 | @property 188 | def inv_vocab(self): 189 | return self.tokenizer.decoder 190 | 191 | def tokenize(self, text): 192 | return self.tokenizer.encode(text) 193 | 194 | def detokenize(self, token_ids): 195 | return self.tokenizer.decode(token_ids) 196 | 197 | @property 198 | def eod(self): 199 | return self.eod_id 200 | 201 | 202 | class SentencePieceTokenizer(AbstractTokenizer): 203 | """Designed to Integrate SP's Tokenizer.""" 204 | 205 | def __init__(self, vocab_file): 206 | name = "SPM" 207 | super().__init__(name) 208 | 209 | self.tokenizer = spm.SentencePieceProcessor(model_file=vocab_file) 210 | # 
self.eod_id = self.tokenizer.piece_to_id("<|endoftext|>") 211 | self.eod_id = self.tokenizer.piece_to_id("") 212 | self.pad_id = self.tokenizer.piece_to_id("[PAD]") 213 | self.unk_id = self.tokenizer.piece_to_id("") 214 | 215 | @property 216 | def vocab_size(self): 217 | return self.tokenizer.get_piece_size() 218 | 219 | @property 220 | def vocab(self): 221 | return { 222 | self.tokenizer.id_to_piece(idx): idx 223 | for idx in range(self.tokenizer.get_piece_size()) 224 | } 225 | 226 | @property 227 | def inv_vocab(self): 228 | return { 229 | idx: self.tokenizer.id_to_piece(idx) 230 | for idx in range(self.tokenizer.get_piece_size()) 231 | } 232 | 233 | def tokenize(self, text): 234 | return self.tokenizer.encode(text) 235 | 236 | def detokenize(self, token_ids): 237 | return self.tokenizer.decode(token_ids) 238 | 239 | @property 240 | def eod(self): 241 | return self.eod_id 242 | 243 | 244 | class HFTokenizer(AbstractTokenizer): 245 | """Designed to Integrate HF's Tokenizer library.""" 246 | 247 | def __init__(self, vocab_file): 248 | name = "HFTokenizer" 249 | super().__init__(name) 250 | 251 | self.tokenizer = Tokenizer.from_file(vocab_file) 252 | # self.eod_id = self.tokenizer.token_to_id("<|endoftext|>") 253 | self.eod_id = self.tokenizer.token_to_id("<|end|>") 254 | # self.pad_id = self.tokenizer.token_to_id("<|padding|>") 255 | 256 | # 新词表没有<|padding|>, 用<|extratoken_1|>代替,和tokenization一致 257 | # self.pad_id = self.tokenizer.token_to_id("<|extratoken_1|>") 258 | self.pad_id = self.tokenizer.token_to_id("<|pad|>") 259 | 260 | @property 261 | def vocab_size(self): 262 | return self.tokenizer.get_vocab_size() 263 | 264 | @property 265 | def vocab(self): 266 | return self.tokenizer.get_vocab() 267 | 268 | @property 269 | def inv_vocab(self): 270 | return self.tokenizer.decoder 271 | 272 | def tokenize(self, text: str): 273 | return self.tokenizer.encode(text).ids 274 | 275 | def tokenize_batch(self, text_batch: Union[List[str], str]): 276 | return self.tokenizer.encode_batch(text_batch) 277 | 278 | def detokenize(self, token_ids): 279 | return self.tokenizer.decode(token_ids) 280 | 281 | @property 282 | def eod(self): 283 | return self.eod_id 284 | 285 | 286 | class HFGPT2Tokenizer(AbstractTokenizer): 287 | """Designed to Integrate the pretrained OpenAI GPT2 Tokenizers from HF""" 288 | 289 | def __init__(self, vocab_file=None, fast=True): 290 | name = "HFGPT2Tokenizer" 291 | if fast: 292 | name += "Fast" 293 | super().__init__(name) 294 | if vocab_file is None: 295 | vocab_file = "gpt2" 296 | if fast: 297 | self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file) 298 | else: 299 | self.tokenizer = GPT2Tokenizer.from_pretrained(vocab_file) 300 | 301 | self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"}) 302 | self.eod_id = self.tokenizer.eos_token_id 303 | self.pad_id = self.tokenizer.pad_token_id 304 | 305 | @property 306 | def vocab_size(self): 307 | return len(self.tokenizer) 308 | 309 | @property 310 | def vocab(self): 311 | return self.tokenizer.get_vocab() 312 | 313 | @property 314 | def inv_vocab(self): 315 | return self.tokenizer._tokenizer.decoder 316 | 317 | def tokenize(self, text: str): 318 | return self.tokenizer.encode(text) 319 | 320 | def tokenize_batch(self, text_batch: Union[List[str], str]): 321 | if isinstance(text_batch, str): 322 | text_batch = [text_batch] 323 | return [self.tokenize(t) for t in text_batch] 324 | 325 | def detokenize(self, token_ids): 326 | return self.tokenizer.decode(token_ids) 327 | 328 | @property 329 | def eod(self): 330 | 
return self.eod_id 331 | 332 | 333 | class CharLevelTokenizer(AbstractTokenizer): 334 | """Character Level Tokenizer""" 335 | 336 | def __init__(self, vocab_size): 337 | name = "CharLevelTokenizer" 338 | super().__init__(name) 339 | self._vocab_size = vocab_size 340 | self.eod_id = 0 341 | self.pad_id = 1 342 | 343 | def clamp(self, n): 344 | return max(32, min(n, self.vocab_size)) 345 | 346 | @property 347 | def vocab_size(self): 348 | return self._vocab_size 349 | 350 | @property 351 | def vocab(self): 352 | raise NotImplementedError 353 | 354 | @property 355 | def inv_vocab(self): 356 | raise NotImplementedError 357 | 358 | def decode_token(self, token: int): 359 | return str(chr(self.clamp(token))) 360 | 361 | def tokenize(self, text: str): 362 | return list(np.fromstring(text, dtype=np.uint8)) 363 | 364 | def tokenize_batch(self, text_batch: Union[List[str], str]): 365 | if isinstance(text_batch, list): 366 | return [self.tokenize(s) for s in text_batch] 367 | else: 368 | return self.tokenize(text_batch) 369 | 370 | def detokenize(self, token_ids): 371 | return "".join(list(map(self.decode_token, token_ids))) 372 | 373 | @property 374 | def eod(self): 375 | return self.eod_id 376 | 377 | 378 | class TiktokenTokenizer(AbstractTokenizer): 379 | """Tokenizer from OpenAI's tiktoken implementation""" 380 | 381 | def __init__(self, vocab_file): 382 | try: 383 | import tiktoken 384 | except ModuleNotFoundError: 385 | print("Please install tiktoken: (https://github.com/openai/tiktoken)") 386 | raise Exception 387 | 388 | name = "TiktokenTokenizer" 389 | super().__init__(name) 390 | 391 | self.tokenizer = tiktoken.get_encoding(vocab_file) 392 | self.eod_id = self.tokenizer.eot_token 393 | self.pad_id = None 394 | 395 | @property 396 | def vocab_size(self): 397 | return self.tokenizer.n_vocab 398 | 399 | @property 400 | def vocab(self): 401 | raise NotImplementedError( 402 | "TiktokenTokenizer does not implement vocabulary access." 403 | ) 404 | 405 | @property 406 | def inv_vocab(self): 407 | raise NotImplementedError( 408 | "TiktokenTokenizer does not implement vocabulary access. \ 409 | To get the idx-th token in vocabulary, use tokenizer.decode([idx]) ." 410 | ) 411 | 412 | def tokenize(self, text: str): 413 | return self.tokenizer.encode(text) # , allowed_special="all") 414 | 415 | def tokenize_batch(self, text_batch: List[str]): 416 | return self.tokenizer.encode_batch(text_batch, allowed_special="all") 417 | 418 | def detokenize(self, token_ids): 419 | return self.tokenizer.decode(tokens=token_ids, errors="strict") 420 | 421 | @property 422 | def eod(self): 423 | return self.eod_id 424 | 425 | @property 426 | def pad(self): 427 | raise NotImplementedError 428 | --------------------------------------------------------------------------------
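A minimal end-to-end sketch of the tokenizer factory above, using the HFTokenizer branch. The vocab path and argument values are illustrative, and it assumes the module's own imports (e.g. model.glm) resolve in the training environment:

from argparse import Namespace

from tokenizer.tokenizer import build_tokenizer

# Only the fields read by build_tokenizer() and _vocab_size_with_padding() are set here.
args = Namespace(
    tokenizer_type="HFTokenizer",
    vocab_file="/path/to/tokenizer.json",   # hypothetical `tokenizers` JSON file
    make_vocab_size_divisible_by=128,
    model_parallel_size=1,
)

tok = build_tokenizer(args)        # also sets args.padded_vocab_size
ids = tok.tokenize("hello world")  # token ids from the underlying tokenizers encoder
print(ids, tok.detokenize(ids), args.padded_vocab_size)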