├── data
    ├── __init__.py
    ├── helpers.cpython-38-x86_64-linux-gnu.so
    ├── Makefile
    ├── tokenization
    │   ├── generate_dataset.py
    │   └── preprocess_data.py
    ├── blendable_dataset.py
    ├── samplers.py
    └── get_data_from_hf.py
├── .DS_Store
├── assets
    ├── .DS_Store
    ├── PPL.png
    ├── logo.png
    ├── model.png
    └── passkey.png
├── model
    ├── .DS_Store
    ├── gpt_neox
    │   ├── .DS_Store
    │   ├── __pycache__
    │   │   ├── __init__.cpython-38.pyc
    │   │   ├── modeling_gpt_neox.cpython-38.pyc
    │   │   └── configuration_gpt_neox.cpython-38.pyc
    │   ├── generation_config.json
    │   ├── config.json
    │   ├── __init__.py
    │   ├── tokenization_gpt_neox_fast.py
    │   └── configuration_gpt_neox.py
    ├── llama
    │   ├── __pycache__
    │   │   ├── __init__.cpython-38.pyc
    │   │   ├── modeling_llama.cpython-38.pyc
    │   │   └── configuration_llama.cpython-38.pyc
    │   ├── config_7b.json
    │   ├── config.json
    │   ├── __init__.py
    │   ├── configuration_llama.py
    │   ├── tokenization_llama_fast.py
    │   └── convert_llama_weights_to_hf.py
    ├── __init__.py
    └── peft
    │   ├── tuner
    │       ├── pe_base_model.py
    │       ├── __init__.py
    │       ├── bitfit.py
    │       └── roem.py
    │   ├── __init__.py
    │   └── utils
    │       ├── __init__.py
    │       ├── config.py
    │       ├── mapping.py
    │       └── others.py
├── tools
    ├── .DS_Store
    └── analysis
    │   ├── MMapIndexDatasetParser.py
    │   ├── post_tokenization_check.py
    │   └── MMapTokenIdsBinChecker.py
├── train
    ├── .DS_Store
    ├── __init__.py
    ├── run_coca_llama.sh
    └── run_coca_neox.sh
├── inference
    ├── .DS_Store
    └── generate.py
├── .idea
    ├── .gitignore
    ├── misc.xml
    ├── vcs.xml
    ├── inspectionProfiles
    │   └── profiles_settings.xml
    ├── Collinear-Constrained-Attention.iml
    └── modules.xml
├── LEGAL.md
├── tokenizer
    ├── __init__.py
    ├── train_tokenizer.py
    └── tokenizer.py
├── utils
    ├── __init__.py
    ├── merge_base_and_lora_to_hf.py
    ├── hselect.py
    ├── learning_rates.py
    └── common_utils.py
├── dockerfile
└── README.md


/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import *


--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/.DS_Store


--------------------------------------------------------------------------------
/assets/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/.DS_Store


--------------------------------------------------------------------------------
/assets/PPL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/PPL.png


--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/logo.png


--------------------------------------------------------------------------------
/assets/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/model.png


--------------------------------------------------------------------------------
/model/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/.DS_Store


--------------------------------------------------------------------------------
/tools/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/tools/.DS_Store


--------------------------------------------------------------------------------
/train/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/train/.DS_Store


--------------------------------------------------------------------------------
/assets/passkey.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/assets/passkey.png


--------------------------------------------------------------------------------
/inference/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/inference/.DS_Store


--------------------------------------------------------------------------------
/model/gpt_neox/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/gpt_neox/.DS_Store


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # 默认忽略的文件
2 | /shelf/
3 | /workspace.xml
4 | # 数据源本地存储已忽略文件
5 | /dataSources/
6 | /dataSources.local.xml
7 | # 基于编辑器的 HTTP 客户端请求
8 | /httpRequests/
9 | 


--------------------------------------------------------------------------------
/data/helpers.cpython-38-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/data/helpers.cpython-38-x86_64-linux-gnu.so


--------------------------------------------------------------------------------
/model/llama/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/llama/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/model/gpt_neox/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/gpt_neox/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/model/gpt_neox/generation_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "bos_token_id": 100256,
3 |   "eos_token_id": 100256,
4 |   "transformers_version": "4.26.0.dev0",
5 |   "_from_model_config": true
6 | }
7 | 


--------------------------------------------------------------------------------
/model/llama/__pycache__/modeling_llama.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/llama/__pycache__/modeling_llama.cpython-38.pyc


--------------------------------------------------------------------------------
/model/llama/__pycache__/configuration_llama.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/llama/__pycache__/configuration_llama.cpython-38.pyc


--------------------------------------------------------------------------------
/model/gpt_neox/__pycache__/modeling_gpt_neox.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/gpt_neox/__pycache__/modeling_gpt_neox.cpython-38.pyc


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" />
4 | </project>


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/model/gpt_neox/__pycache__/configuration_gpt_neox.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/codefuse-ai/Collinear-Constrained-Attention/master/model/gpt_neox/__pycache__/configuration_gpt_neox.cpython-38.pyc


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="InspectionProjectProfileManager">
2 |   <settings>
3 |     <option name="USE_PROJECT_PROFILE" value="false" />
4 |     <version value="1.0" />
5 |   </settings>
6 | </component>


--------------------------------------------------------------------------------
/data/Makefile:
--------------------------------------------------------------------------------
 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes)
 3 | LIBNAME = helpers
 4 | LIBEXT = $(shell python3-config --extension-suffix)
 5 | 
 6 | default: $(LIBNAME)$(LIBEXT)
 7 | 
 8 | %$(LIBEXT): %.cpp
 9 | 	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
10 | 


--------------------------------------------------------------------------------
/.idea/Collinear-Constrained-Attention.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module type="PYTHON_MODULE" version="4">
3 |   <component name="NewModuleRootManager">
4 |     <content url="file://$MODULE_DIR$" />
5 |     <orderEntry type="inheritedJdk" />
6 |     <orderEntry type="sourceFolder" forTests="false" />
7 |   </component>
8 | </module>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/Collinear-Constrained-Attention.iml" filepath="$PROJECT_DIR$/.idea/Collinear-Constrained-Attention.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/LEGAL.md:
--------------------------------------------------------------------------------
1 | Legal Disclaimer
2 | 
3 | Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail.
4 | 
5 | 法律免责声明
6 | 
7 | 关于代码注释部分，中文注释为官方版本，其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致，当中文注释与其它语言注释存在不一致时，请以中文注释为准。


--------------------------------------------------------------------------------
/model/llama/config_7b.json:
--------------------------------------------------------------------------------
1 | {"architectures": ["LLaMAForCausalLM"], "bos_token_id": 0, "eos_token_id": 1, "hidden_act": "silu", "hidden_size": 4096, "intermediate_size": 11008, "initializer_range": 0.02, "max_sequence_length": 2048, "model_type": "llama", "num_attention_heads": 32, "num_hidden_layers": 32, "pad_token_id": -1, "rms_norm_eps": 1e-06, "torch_dtype": "float16", "transformers_version": "4.27.0.dev0", "use_cache": true, "vocab_size": 32000}


--------------------------------------------------------------------------------
/model/llama/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"architectures": ["LLaMAForCausalLM"],
 3 |     "bos_token_id": 100256,
 4 |     "eos_token_id": 100256,
 5 | 	"hidden_act": "silu",
 6 | 	"hidden_size": 1024,
 7 | 	"intermediate_size": 4096,
 8 | 	"initializer_range": 0.02,
 9 | 	"max_sequence_length": 512,
10 | 	"model_type": "llama",
11 | 	"num_attention_heads": 16,
12 | 	"num_hidden_layers": 24,
13 | 	"pad_token_id": 100737,
14 | 	"rms_norm_eps": 1e-06,
15 | 	"torch_dtype": "float16",
16 | 	"transformers_version": "4.27.0.dev0",
17 | 	"use_cache": true,
18 | 	"vocab_size": 100864,
19 | 	"use_xformers": false
20 | }


--------------------------------------------------------------------------------
/train/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | from .run_train import *


--------------------------------------------------------------------------------
/model/gpt_neox/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "architectures": [
 3 |     "GPTNeoXForCausalLM"
 4 |   ],
 5 |   "bos_token_id": 100256,
 6 |   "eos_token_id": 100256,
 7 |   "hidden_act": "gelu",
 8 |   "hidden_size": 1024,
 9 |   "initializer_range": 0.02,
10 |   "intermediate_size": 4096,
11 |   "layer_norm_eps": 1e-05,
12 |   "max_position_embeddings": 512,
13 |   "model_type": "gpt_neox",
14 |   "num_attention_heads": 16,
15 |   "num_hidden_layers": 24,
16 |   "rope_scaling": null,
17 |   "rotary_emb_base": 10000,
18 |   "rotary_pct": 1.0,
19 |   "tie_word_embeddings": false,
20 |   "torch_dtype": "float16",
21 |   "transformers_version": "4.26.1",
22 |   "use_cache": true,
23 |   "use_parallel_residual": true,
24 |   "vocab_size": 100864
25 | }


--------------------------------------------------------------------------------
/tokenizer/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | from .tokenizer import build_tokenizer
17 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | from .common_utils import *
17 | from .auto_accelerate_utils import *


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version.
 3 | #
 4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | 
18 | # from .gpt2_model import GPT2ModelPipe
19 | # from .utils import get_params_for_weight_decay_optimization
20 | # from .word_embeddings import SoftEmbedding
21 | 


--------------------------------------------------------------------------------
/model/peft/tuner/pe_base_model.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | class PEBaseModel:
17 |     """PEtuning的基类模型，定义了PEtuning模型都该有的方法"""
18 | 
19 |     def __init__():
20 |         return
21 | 
22 |     def get_model(self):
23 |         """对模型进行修改，冻结参数或者插入可训模块"""
24 |         pass
25 | 
26 |     @classmethod
27 |     def restore(self, model=None, path=None):
28 |         """从path恢复PE模型
29 | 
30 |         Args:
31 |             model (_type_, optional): 原始模型. Defaults to None.
32 |             path (_type_, optional): 增量路径. Defaults to None.
33 |         """
34 |         pass
35 | 


--------------------------------------------------------------------------------
/model/peft/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """peft models interface."""
17 | 
18 | from . import utils, tuner
19 | from peft.mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
20 | from peft.utils import TaskType
21 | from .modeling_peft import AntPeftForCausalLM, AntPeftForEmbedding
22 | 
23 | 
24 | SUPPORTED_PEFT_TYPES = ["prefix", "lora", "adalora", "bitfit", "roem", "unipelt", "prompt", "ptuning"]
25 | 
26 | # Register the Ant Causal Language Model
27 | MODEL_TYPE_TO_PEFT_MODEL_MAPPING["ANT_CAUSAL_LM"] = AntPeftForCausalLM
28 | TaskType.ANT_CAUSAL_LM = "ANT_CAUSAL_LM"
29 | 
30 | MODEL_TYPE_TO_PEFT_MODEL_MAPPING["ANT_EMBEDDING"] = AntPeftForEmbedding
31 | TaskType.ANT_EMBEDDING = "ANT_EMBEDDING"
32 | 


--------------------------------------------------------------------------------
/model/peft/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """peft utils interface."""
17 | 
18 | from .config import PeftConfig, PetuningConfig
19 | 
20 | from .mapping import TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING
21 | from .mapping import TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING
22 | from .mapping import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
23 | from .mapping import TRANSFORMERS_MODELS_TO_LORA_LAGE_TARGET_MODULES_MAPPING
24 | from .mapping import TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING
25 | from .mapping import TRANSFORMERS_MODELS_TO_ROUTELORA_TARGET_MODULES_MAPPING
26 | from .mapping import WEIGHTS_NAME, CONFIG_NAME
27 | from .mapping import bloom_model_postprocess_past_key_value
28 | 
29 | from .others import get_peft_model_state_dict, set_peft_model_state_dict, _freeze_model, prepare_model_for_kbit_training


--------------------------------------------------------------------------------
/inference/generate.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import os
17 | import re
18 | import time
19 | import json
20 | import torch
21 | import random
22 | import argparse
23 | import jsonlines
24 | import numpy as np
25 | from copy import deepcopy
26 | from transformers import AutoModelForCausalLM, AutoTokenizer
27 | 
28 | 
29 | def load_model(args):
30 |     st = time.time()
31 |     checkpoint = args.model_dir
32 |     print('LOAD CKPT: {}'.format(checkpoint))
33 |     tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
34 |     tokenizer.add_special_tokens({'eos_token': "<|endoftext|>"})
35 |     tokenizer.add_special_tokens({'pad_token': "<|pad|>"})
36 | 
37 |     model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", torch_dtype=torch.float16)
38 |     print('Model load spend: {:.4f}s'.format(time.time() - st))
39 |     return tokenizer, model


--------------------------------------------------------------------------------
/model/peft/utils/config.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # Copyright (c) 2023 Ant Group. All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | import sys
18 | sys.path.append("..")
19 | sys.path.append("../..")
20 | from typing import List, Optional
21 | from dataclasses import dataclass, field
22 | from peft.utils import PeftConfig
23 | 
24 | 
25 | @dataclass
26 | class PetuningConfig(PeftConfig):
27 |     """
28 |     This is the base configuration class to store the configuration of [`ROEM`], or [`BitFit`].
29 | 
30 |     Args:
31 |         modules_to_save (`List[str]`):List of modules apart from LoRA layers to be set as trainable
32 |             and saved in the final checkpoint.
33 |     """
34 | 
35 |     modules_to_save: Optional[List[str]] = field(
36 |         default=None,
37 |         metadata={
38 |             "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. "
39 |             "For example, in Sequence Classification or Token Classification tasks, "
40 |             "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
41 |         },
42 |     )


--------------------------------------------------------------------------------
/utils/merge_base_and_lora_to_hf.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import os
17 | import sys
18 | import torch
19 | import transformers
20 | from transformers import AutoModelForCausalLM, AutoTokenizer
21 | from peft import LoraConfig, get_peft_model
22 | from peft import PeftModelForCausalLM
23 | # from transformers import BitsAndBytesConfig
24 | # from peft import prepare_model_for_kbit_training
25 | 
26 | model_path='/output/checkpoint/mpt-fsdp-tp-bm-512-tp-1-dp-8-gpu-8-bin/checkpoint-320000'
27 | lora_adapter='/output/checkpoint/mpt-fsdp-tp-bm-512-tp-1-dp-8-gpu-8-bin-lora/checkpoint-20000'
28 | save_path='/output/checkpoint/mpt-fsdp-tp-bm-512-tp-1-dp-8-gpu-8-bin-lora/checkpoint-20000-merge'
29 | 
30 | base_model = AutoModelForCausalLM.from_pretrained(
31 |     model_path,  
32 |     trust_remote_code=True,
33 |     torch_dtype=torch.float16, 
34 |     return_dict=True,
35 |     device_map="auto"
36 | )
37 | print(base_model)
38 | model_to_merge = PeftModelForCausalLM.from_pretrained(base_model, lora_adapter)
39 | merged_model = model_to_merge.merge_and_unload()
40 | 
41 | tokenizer = AutoTokenizer.from_pretrained(model_path)
42 | 
43 | merged_model.save_pretrained(save_path)
44 | tokenizer.save_pretrained(save_path)
45 | print(f"Merge finised: {save_path} saved")


--------------------------------------------------------------------------------
/dockerfile:
--------------------------------------------------------------------------------
 1 | FROM reg.docker.alibaba-inc.com/atorch/atorch-dev:20230808torch210dev20230731cu118nlp
 2 | 
 3 | USER root
 4 | WORKDIR /root
 5 | 
 6 | ENV BASH_ENV /root/.bashrc
 7 | ENV LANGUAGE zh_cn
 8 | ENV LC_ALL zh_CN.UTF-8
 9 | ENV SHELL /bin/bash
10 | SHELL ["/bin/bash","-c"]
11 | 
12 | ADD lib /root/builder
13 | 
14 | RUN rm -rf /pai-extension && mv ~/builder/pai-extension /pai-extension && chmod 777 -R /pai-extension
15 | RUN mv ~/builder/theia-ide/.theia ~/.theia && rm -rf ~/.aistudio/hooks/*
16 | 
17 | RUN sh ~/builder/script/install-dumb-init.sh
18 | 
19 | RUN sh ~/builder/script/install-node.sh v5.20.3 \
20 |     && rm -rf ~/.aistudio && mkdir -p ~/.aistudio \
21 |     && echo 'export npm_config_user=root' >> ~/.bashrc \
22 |     && mv ~/builder/theia-ide/.aistudio/* ~/.aistudio \
23 |     && mv ~/builder/theia-ide/.aistudio/.[^.]* ~/.aistudio \
24 |     && gcc --version
25 | 
26 | RUN sh ~/builder/script/install-third-common.sh
27 | RUN sh ~/builder/script/python/install-sdk.sh || echo "install sdk failed"
28 | RUN sh ~/builder/script/matplot/installer.sh
29 | RUN pip install -I urllib3==1.26.4 && sh ~/builder/script/python/install-jupyter.sh
30 | RUN sh ~/builder/script/setup-base.sh
31 | RUN pip install jinja2==2.11.3 --no-deps
32 | RUN pip install markupsafe==1.1.1 --no-deps
33 | 
34 | # git lfs
35 | RUN pip install gradio==3.20.1
36 | 
37 | RUN npm i -g @alipay/aistudio-bootstrap \
38 |     && npm i -g @alipay/aistudio-installer-cli \
39 |     && ais-installer install full \
40 |     && ais-installer collect --version=${IMAGEVERSION} --type=${IMAGETYPE}
41 | 
42 | RUN pip install -U transformers==4.30.1
43 | RUN pip install -U bitsandbytes==0.39.0
44 | RUN pip install -U accelerate==0.20.3
45 | RUN pip install peft==0.4.0
46 | RUN pip uninstall flash_attn -y
47 | RUN pip install xformers --no-deps
48 | RUN pip install -U atorch==0.1.7rc17 --no-deps
49 | RUN pip install zstandard
50 | RUN pip install ujson
51 | RUN pip install jsonlines


--------------------------------------------------------------------------------
/model/peft/tuner/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | """peft tuner methods interface."""
17 | 
18 | from peft.utils import PeftType
19 | from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING
20 | from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
21 | 
22 | from .adalora import AdaLoraConfig, AdaLoraModel
23 | from .routelora import RouteLoraConfig, RouteLoraModel
24 | from .unipelt import UniPELTConfig, UniPELTModel, PEUniPELTModel
25 | from .pe_base_model import PEBaseModel
26 | from .bitfit import PeftBitfitConfig, PEBitfitModel, PeftBitfitModel
27 | from .roem import PeftROEMConfig, PEROEMModel, PeftROEMModel
28 | 
29 | # Register new ant peft methods
30 | PeftType.ROUTELORA = "ROUTELORA"
31 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ROUTELORA] = RouteLoraModel
32 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ROUTELORA] = RouteLoraConfig
33 | 
34 | PeftType.UNIPELT = "UNIPELT"
35 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.UNIPELT] = UniPELTModel
36 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.UNIPELT] = UniPELTConfig
37 | 
38 | PeftType.ROEM = "ROEM"
39 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ROEM] = PeftROEMModel
40 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ROEM] = PeftROEMConfig
41 | 
42 | PeftType.BITFIT = "BITFIT"
43 | PEFT_TYPE_TO_MODEL_MAPPING[PeftType.BITFIT] = PeftBitfitModel
44 | PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.BITFIT] = PeftBitfitConfig


--------------------------------------------------------------------------------
/model/gpt_neox/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import TYPE_CHECKING
15 | 
16 | from transformers.file_utils import _LazyModule, is_tokenizers_available, is_torch_available
17 | from transformers.utils import OptionalDependencyNotAvailable
18 | # from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available
19 | # from ...utils import OptionalDependencyNotAvailable
20 | 
21 | 
22 | _import_structure = {"configuration_gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"]}
23 | 
24 | try:
25 |     if not is_tokenizers_available():
26 |         raise OptionalDependencyNotAvailable()
27 | except OptionalDependencyNotAvailable:
28 |     pass
29 | else:
30 |     _import_structure["tokenization_gpt_neox_fast"] = ["GPTNeoXTokenizerFast"]
31 | 
32 | try:
33 |     if not is_torch_available():
34 |         raise OptionalDependencyNotAvailable()
35 | except OptionalDependencyNotAvailable:
36 |     pass
37 | else:
38 |     _import_structure["modeling_gpt_neox"] = [
39 |         "GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST",
40 |         "GPTNeoXForCausalLM",
41 |         "GPTNeoXLayer",
42 |         "GPTNeoXModel",
43 |         "GPTNeoXPreTrainedModel",
44 |     ]
45 | 
46 | 
47 | if TYPE_CHECKING:
48 |     from .configuration_gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig
49 | 
50 |     try:
51 |         if not is_tokenizers_available():
52 |             raise OptionalDependencyNotAvailable()
53 |     except OptionalDependencyNotAvailable:
54 |         pass
55 |     else:
56 |         from .tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
57 | 
58 |     try:
59 |         if not is_torch_available():
60 |             raise OptionalDependencyNotAvailable()
61 |     except OptionalDependencyNotAvailable:
62 |         pass
63 |     else:
64 |         from .modeling_gpt_neox import (
65 |             GPT_NEOX_PRETRAINED_MODEL_ARCHIVE_LIST,
66 |             GPTNeoXForCausalLM,
67 |             GPTNeoXLayer,
68 |             GPTNeoXModel,
69 |             GPTNeoXPreTrainedModel,
70 |         )
71 | 
72 | 
73 | else:
74 |     import sys
75 | 
76 |     sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)


--------------------------------------------------------------------------------
/data/tokenization/generate_dataset.py:
--------------------------------------------------------------------------------
 1 | # origin: 使用GPT-Neox原生的Encoder：key为text，只生成input_ids，训练时document首尾相连按窗口去取
 2 | # Prompt_padding：使用UniformEncoder：key为input_ids和loss_mask; loss_mask保证只训练Target部分的Loss；每条样本Padding到seq_length，避免了一个Sample里包含多个样本的问题，但缺点是比较浪费计算资源
 3 | prompt_padding_cmd = "python preprocess_data.py \
 4 |             --input {input} \
 5 |             --jsonl-keys input_ids loss_mask \
 6 |             --output-prefix {output_prefix} \
 7 |             --vocab ../../tokenizer-ant-v3.json  \
 8 |             --dataset-impl mmap \
 9 |             --tokenizer-type HFTokenizer \
10 |             --workers {worker} \
11 |             --encoder UniformEncoder \
12 |             --seq-length {seq_length} \
13 |             --mode sft \
14 |             --padding"
15 | 
16 | align_padding_cmd = "python preprocess_data_align.py \
17 |             --input {input} \
18 |             --jsonl-keys w_input_ids w_loss_mask l_input_ids l_loss_mask \
19 |             --output-prefix {output_prefix} \
20 |             --vocab ../../tokenizer-ant-v3.json  \
21 |             --dataset-impl mmap \
22 |             --tokenizer-type HFTokenizer \
23 |             --workers {worker} \
24 |             --encoder UniformEncoder \
25 |             --seq-length {seq_length} \
26 |             --mode align \
27 |             --padding"
28 | 
29 | origin_cmd = "python preprocess_data.py \
30 |             --input {input} \
31 |             --output-prefix {output_prefix} \
32 |             --vocab ../../tokenizer-ant-v3.json  \
33 |             --dataset-impl mmap \
34 |             --tokenizer-type HFTokenizer \
35 |             --workers {worker} \
36 |             --encoder OriginEncoder \
37 |             --append-eod"
38 | 
39 | convert_dict = {
40 |     "origin":{
41 |         "output_path":"xxx",
42 |         "cmd": origin_cmd
43 |     },
44 |     "prompt_padding":{
45 |         "output_path":"/ossfs/workspace/coh_tokenization",
46 |         "cmd": prompt_padding_cmd
47 |     },
48 |     "align_padding":{
49 |         "output_path":"/ossfs/workspace/alignment_tokenization",
50 |         "cmd": align_padding_cmd
51 |     }
52 | }
53 | 
54 | input_dict = {
55 |     'dataset1': "/path/dataset1_path",
56 |     'dataset2': "/path/dataset2_path"
57 | }
58 | # conver_type_list = ["align_padding"]
59 | conver_type_list = ["prompt_padding"]
60 | output_name = "coh"
61 | 
62 | if __name__ == "__main__":
63 |     import os
64 |     seq_length = 2048
65 |     worker = 16
66 | 
67 |     for convert_type in conver_type_list:
68 |         convert_info = convert_dict[convert_type]
69 |         output_path = convert_info["output_path"]
70 |         convert_cmd = convert_info["cmd"]
71 |         output_prefix = os.path.join(output_path, output_name)
72 | 
73 |         input_ = ",".join(input_dict.values())
74 |         print(input_)
75 |         cmd = convert_cmd.replace("{input}", input_).replace("{output_prefix}", output_prefix).replace("{seq_length}", str(seq_length)).replace("{worker}", str(worker))
76 |         os.system(cmd)


--------------------------------------------------------------------------------
/model/llama/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import TYPE_CHECKING
15 | 
16 | from transformers.utils import (
17 |     OptionalDependencyNotAvailable,
18 |     _LazyModule,
19 |     is_sentencepiece_available,
20 |     is_tokenizers_available,
21 |     is_torch_available,
22 | )
23 | 
24 | 
25 | _import_structure = {
26 |     "configuration_llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"],
27 | }
28 | 
29 | try:
30 |     if not is_sentencepiece_available():
31 |         raise OptionalDependencyNotAvailable()
32 | except OptionalDependencyNotAvailable:
33 |     pass
34 | else:
35 |     _import_structure["tokenization_llama"] = ["LlamaTokenizer"]
36 | 
37 | try:
38 |     if not is_tokenizers_available():
39 |         raise OptionalDependencyNotAvailable()
40 | except OptionalDependencyNotAvailable:
41 |     pass
42 | else:
43 |     _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"]
44 | 
45 | try:
46 |     if not is_torch_available():
47 |         raise OptionalDependencyNotAvailable()
48 | except OptionalDependencyNotAvailable:
49 |     pass
50 | else:
51 |     _import_structure["modeling_llama"] = [
52 |         "LlamaForCausalLM",
53 |         "LlamaModel",
54 |         "LlamaPreTrainedModel",
55 |         "LlamaForSequenceClassification",
56 |     ]
57 | 
58 | 
59 | if TYPE_CHECKING:
60 |     from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig
61 | 
62 |     try:
63 |         if not is_sentencepiece_available():
64 |             raise OptionalDependencyNotAvailable()
65 |     except OptionalDependencyNotAvailable:
66 |         pass
67 |     else:
68 |         from .tokenization_llama import LlamaTokenizer
69 | 
70 |     try:
71 |         if not is_tokenizers_available():
72 |             raise OptionalDependencyNotAvailable()
73 |     except OptionalDependencyNotAvailable:
74 |         pass
75 |     else:
76 |         from .tokenization_llama_fast import LlamaTokenizerFast
77 | 
78 |     try:
79 |         if not is_torch_available():
80 |             raise OptionalDependencyNotAvailable()
81 |     except OptionalDependencyNotAvailable:
82 |         pass
83 |     else:
84 |         from .modeling_llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel
85 | 
86 | 
87 | else:
88 |     import sys
89 | 
90 |     sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)


--------------------------------------------------------------------------------
/data/blendable_dataset.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021, EleutherAI
 2 | # This file is based on code by the authors denoted below and has been modified from its original version.
 3 | #
 4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 5 | #
 6 | # Licensed under the Apache License, Version 2.0 (the "License");
 7 | # you may not use this file except in compliance with the License.
 8 | # You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | 
18 | """Blendable dataset."""
19 | 
20 | import time
21 | 
22 | import numpy as np
23 | import torch
24 | 
25 | from utils.common_utils import print_rank_0
26 | 
27 | 
28 | class BlendableDataset(torch.utils.data.Dataset):
29 |     def __init__(self, datasets, weights):
30 |         self.datasets = datasets
31 |         num_datasets = len(datasets)
32 |         assert num_datasets == len(weights)
33 | 
34 |         self.size = 0
35 |         for dataset in self.datasets:
36 |             self.size += len(dataset)
37 | 
38 |         # Normalize weights.
39 |         weights = np.array(weights, dtype=np.float64)
40 |         sum_weights = np.sum(weights)
41 |         assert sum_weights > 0.0
42 |         weights /= sum_weights
43 | 
44 |         # recompute weights
45 |         weights = self.calc_weights()
46 |         
47 |         # Build indices.
48 |         start_time = time.time()
49 |         assert num_datasets < 255
50 |         self.dataset_index = np.zeros(self.size, dtype=np.uint8)
51 |         self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
52 | 
53 |         from data import helpers
54 | 
55 |         helpers.build_blending_indices(
56 |             self.dataset_index,
57 |             self.dataset_sample_index,
58 |             weights,
59 |             num_datasets,
60 |             self.size,
61 |             torch.distributed.get_rank() == 0,
62 |         )
63 | 
64 |         print(
65 |             "> RANK {} elapsed time for building blendable dataset indices: "
66 |             "{:.2f} (sec)".format(
67 |                 torch.distributed.get_rank(), time.time() - start_time
68 |             )
69 |         )
70 | 
71 |     def calc_weights(self):
72 |         dataset_sample_cnt = [len(ds) for ds in self.datasets]
73 |         total_cnt = sum(dataset_sample_cnt)
74 |         weights = np.array([(cnt + 0.0) / total_cnt for cnt in dataset_sample_cnt], dtype=np.float64)
75 |         return weights
76 |     
77 |     def __len__(self):
78 |         return self.size
79 | 
80 |     def __getitem__(self, idx):
81 |         try:
82 |             dataset_idx = self.dataset_index[idx]
83 |             sample_idx = self.dataset_sample_index[idx]
84 |             return self.datasets[dataset_idx][sample_idx]
85 |         except IndexError:
86 |             new_idx = idx % len(self)
87 |             print(
88 |                 f"WARNING: Got index out of bounds error with index {idx} - taking modulo of index instead ({new_idx})"
89 |             )
90 |             return self[new_idx]
91 | 


--------------------------------------------------------------------------------
/tools/analysis/MMapIndexDatasetParser.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (c) 2023 Ant Group. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | import struct
 17 | import os
 18 | import numpy as np
 19 | 
 20 | 
 21 | dtypes = {
 22 |     1: np.uint8,
 23 |     2: np.int8,
 24 |     3: np.int16,
 25 |     4: np.int32,
 26 |     5: np.int64,
 27 |     6: np.float32,
 28 |     7: np.float64,
 29 |     8: np.uint16,
 30 | }
 31 | 
 32 | 
 33 | class MMapIndexDataset:
 34 |     """
 35 |     描述GPT-Neox mmap实现方式获得的*.idx文件对应的数据集，即Tokenization索引数据集
 36 |     """
 37 | 
 38 |     # magic code
 39 |     _HDR_MAGIC = b"MMIDIDX\x00\x00"
 40 | 
 41 |     _VERSION = 1
 42 | 
 43 |     def __init__(self, index_dataset_file_path):
 44 |         """
 45 |         对于给定的GPT-Neox mmap实现方式的索引文件<index_dataset_file_path>生成对应的数据集描述
 46 |         """
 47 |         assert os.path.exists, (
 48 |             "给定的路径不存在"
 49 |             "请确保给定的.idx文件路径是存在的"
 50 |         )
 51 |         assert os.path.isfile, (
 52 |             "给定的路径不是一个文件"
 53 |             "请确保给定的是一个.idx文件的路径"
 54 |         )
 55 |         
 56 |         self.path = index_dataset_file_path
 57 |         
 58 |         with open(self.path, 'rb') as fb:
 59 |             magic = fb.read(9)
 60 |             assert magic == self._HDR_MAGIC, (
 61 |                 "Magic Code与期望格式不匹配"
 62 |                 "请确保提供的是GPT Neox MMAP方式生成的.idx文件"
 63 |             )
 64 | 
 65 |             version = struct.unpack('<Q', fb.read(8))
 66 |             assert version[0] == self._VERSION, (
 67 |                 "提供的文件版本与期望不一致"
 68 |             )
 69 | 
 70 |             # 每个token用什么数据类型表达，例如，4字节INT32类型，2字节UINT16类型等
 71 |             dtype_code = struct.unpack('<B', fb.read(1))[0]
 72 |             self._dtype = dtypes[dtype_code]
 73 |             self._dtype_size = self._dtype().itemsize
 74 | 
 75 |             # 数据集样本数量
 76 |             self._len = struct.unpack('<Q', fb.read(8))[0]
 77 |             # doc index数量
 78 |             self._doc_index_count = struct.unpack('<Q', fb.read(8))[0]
 79 | 
 80 |             # 每个样本token数量，这里假设是SFT Padding模式
 81 |             self._seq_len = struct.unpack('<I', fb.read(4))[0]
 82 | 
 83 | 
 84 | 
 85 | class MMapIndexDatasetChecker:
 86 |     """
 87 |     对GPT Neox MMAP方式生成的Index数据集文件进行检查
 88 |     """
 89 | 
 90 |     def __init__(self, mmap_index_dataset_path):
 91 |         self.mmap_index_dataset_path = mmap_index_dataset_path
 92 |         self.mmap_index_dataset = MMapIndexDataset(self.mmap_index_dataset_path)
 93 | 
 94 | 
 95 |     def check(self, bin_bytes_size=None, seq_len=None):
 96 |         try:
 97 |             if seq_len and self.mmap_index_dataset._seq_len != seq_len:
 98 |                 print(f"\033[1;31;47mIDX文件中单个样本长度({self.mmap_index_dataset._seq_len})与给定长度({seq_len})不一致\033[0m")
 99 |                 return False
100 |             print('bin_bytes_size', bin_bytes_size)
101 |             if seq_len and bin_bytes_size and self.mmap_index_dataset._len * self.mmap_index_dataset._dtype_size * self.mmap_index_dataset._seq_len != bin_bytes_size:
102 |                 print("\033[1;31;47m依据IDX文件中的样本数量和单个样本大小计算出来的bin文件字节数与给定的不相同\033[0m")
103 |                 return False
104 | 
105 |             return True
106 |         except:
107 |             return False
108 | 


--------------------------------------------------------------------------------
/train/run_coca_llama.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | LOAD_RAW_DATASET=False
  3 | if [ ${LOAD_RAW_DATASET} = "True" ]; then
  4 |   LOAD_RAW_DATASET="--load_raw_dataset"
  5 |   DATA_PATHS="[/path/dataset1,/path/dataset2]"
  6 |   DATA_WEIGHTS="[1.,1.]"
  7 |   DATA_SPLIT="90,10,0"
  8 |   SHUFFLE_BEFORE_SPLIT=""
  9 |   USE_RANDOM_SAMPLER=""
 10 |   USE_WEIGHTED_LOSS=""
 11 |   WEIGHT_BY_NUM_DOCUMENTS=""
 12 | else
 13 |   LOAD_RAW_DATASET=""
 14 |   DATA_PATHS="[/path/dataset1,/path/dataset2]"
 15 |   DATA_WEIGHTS="[1.,1.]"
 16 |   DATA_SPLIT="90,10,0"
 17 |   SHUFFLE_BEFORE_SPLIT="--shuffle_before_split"
 18 |   USE_RANDOM_SAMPLER="--use_random_sampler"
 19 |   USE_WEIGHTED_LOSS="--use_weighted_loss"
 20 |   WEIGHT_BY_NUM_DOCUMENTS="--weight_by_num_documents"
 21 | fi
 22 | 
 23 | #USE_XFORMERS=True
 24 | #if [ ${USE_XFORMERS} = "True" ]; then
 25 | #  USE_XFORMERS="--use_xformers"
 26 | #else
 27 | #  USE_XFORMERS=""
 28 | #fi
 29 | 
 30 | VOCAB_FILE="../tools/codegpt-13b-tokenizer.json"
 31 | TOKENIZER_TYPE="HFTokenizer"
 32 | MODEL_TYPE="llama"
 33 | MODEL_CONFIG_PATH="../model/${MODEL_TYPE}"
 34 | 
 35 | RESUME_FROM_CHECKPOINT="false"
 36 | 
 37 | PER_DEVICE_BATCH_SIZE=$1
 38 | TP=$2
 39 | DP=$3
 40 | EPOCH=$4
 41 | TOTAL_TRAIN_BATCH_SIZE=$(($PER_DEVICE_BATCH_SIZE * $TP * $DP))
 42 | 
 43 | GPU=$(($TP * $DP))
 44 | OUTPUT="/output/checkpoint/mpt-fsdp-tp-bm-${TOTAL_TRAIN_BATCH_SIZE}-tp-${TP}-dp-${DP}-gpu-${GPU}-bin"
 45 | TENSORBOARD_PATH="/output/tensorboard/mpt-fsdp-tp-bm-${TOTAL_TRAIN_BATCH_SIZE}-tp-${TP}-dp-${DP}-gpu-${GPU}-bin"
 46 | 
 47 | PREFIX="master-0"
 48 | mkdir -p $OUTPUT || true
 49 | echo "output to $OUTPUT"
 50 | mkdir -p $TENSORBOARD_PATH
 51 | chmod 777 $OUTPUT
 52 | chmod 777 $TENSORBOARD_PATH
 53 | 
 54 | # atorch environment maybe not available yet, opensource soon
 55 | pip install -U --no-deps atorch==0.1.7rc9
 56 | pip install tensorboard==2.3.0
 57 | pip install peft==0.3.0 --no-dependencies
 58 | pip install zstandard
 59 | pip install ujson
 60 | pip install jsonlines
 61 | pip list
 62 | 
 63 | python -m atorch.distributed.launch \
 64 |     --nproc_per_node=$(nvidia-smi -L | wc -l) \
 65 |     run_train.py \
 66 |     ${LOAD_RAW_DATASET} \
 67 |     ${SPLIT_BEFORE_READ} \
 68 |     --tokenize_mode 'pretrain' \
 69 |     --train_mode 'sst' \
 70 |     --config_path $MODEL_CONFIG_PATH \
 71 |     --tokenizer_type $TOKENIZER_TYPE \
 72 |     --vocab_file $VOCAB_FILE \
 73 |     --model_type $MODEL_TYPE \
 74 |     --padding \
 75 |     --data_paths $DATA_PATHS \
 76 |     --data_weights $DATA_WEIGHTS \
 77 |     --data_split $DATA_SPLIT \
 78 |     ${SHUFFLE_BEFORE_SPLIT} \
 79 |     ${USE_RANDOM_SAMPLER} \
 80 |     ${USE_WEIGHTED_LOSS} \
 81 |     ${WEIGHT_BY_NUM_DOCUMENTS} \
 82 |     --train_iters 100 \
 83 |     --num_warmup_steps 6000 \
 84 |     --custom_lr_scheduler_type 'cosine' \
 85 |     --learning_rate 1.0e-4 \
 86 |     --min_lr 1.0e-5 \
 87 |     --valid_iters 500 \
 88 |     --valid_interval 2000 \
 89 |     --num_train_epochs $EPOCH \
 90 |     --seq_length 512 \
 91 |     --total_train_batch_size $TOTAL_TRAIN_BATCH_SIZE \
 92 |     --per_device_valid_batch_size $PER_DEVICE_BATCH_SIZE \
 93 |     --seed 42 \
 94 |     --preprocessing_num_workers 6 \
 95 |     --num_workers 8 \
 96 |     --output_dir $OUTPUT \
 97 |     --tensorboard_dir $TENSORBOARD_PATH \
 98 |     --ignore_mismatched_sizes \
 99 |     --skip_atorch_autoacc_dryrun \
100 |     --tp $TP \
101 |     --dp $DP \
102 |     --bf16 \
103 |     --pipe_parallel_size 0 \
104 |     --model_parallel_size 1 \
105 |     --checkpointing_steps 5000 \
106 |     --log_interval 10 \
107 |     --make_vocab_size_divisible_by 128 \
108 |     --weighted_loss_mode 'case3' \
109 |     --checkpoint_activations \
110 |     --resume_from_checkpoint $RESUME_FROM_CHECKPOINT \
111 |     --max_grad_norm 1 \
112 |     --evaluation_strategy "steps,epoch" \
113 |     --save_strategy "steps" \
114 |     --save_total_limit 10 \
115 |     --extra_save_by_epoch \
116 |     --metric_for_best_model 'loss' \
117 |     --greater_is_better 'false' \
118 |     --zero_opt_level zero3 2>&1 | tee $OUTPUT/$PREFIX-output.txt


--------------------------------------------------------------------------------
/train/run_coca_neox.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | LOAD_RAW_DATASET=False
  3 | if [ ${LOAD_RAW_DATASET} = "True" ]; then
  4 |   LOAD_RAW_DATASET="--load_raw_dataset"
  5 |   DATA_PATHS="[/path/dataset1,/path/dataset2]"
  6 |   DATA_WEIGHTS="[1.,1.]"
  7 |   DATA_SPLIT="90,10,0"
  8 |   SHUFFLE_BEFORE_SPLIT=""
  9 |   USE_RANDOM_SAMPLER=""
 10 |   USE_WEIGHTED_LOSS=""
 11 |   WEIGHT_BY_NUM_DOCUMENTS=""
 12 | else
 13 |   LOAD_RAW_DATASET=""
 14 |   DATA_PATHS="[/path/dataset1,/path/dataset2]"
 15 |   DATA_WEIGHTS="[1.,1.]"
 16 |   DATA_SPLIT="90,10,0"
 17 |   SHUFFLE_BEFORE_SPLIT="--shuffle_before_split"
 18 |   USE_RANDOM_SAMPLER="--use_random_sampler"
 19 |   USE_WEIGHTED_LOSS="--use_weighted_loss"
 20 |   WEIGHT_BY_NUM_DOCUMENTS="--weight_by_num_documents"
 21 | fi
 22 | 
 23 | #USE_XFORMERS=True
 24 | #if [ ${USE_XFORMERS} = "True" ]; then
 25 | #  USE_XFORMERS="--use_xformers"
 26 | #else
 27 | #  USE_XFORMERS=""
 28 | #fi
 29 | 
 30 | VOCAB_FILE="../tools/codegpt-13b-tokenizer.json"
 31 | TOKENIZER_TYPE="HFTokenizer"
 32 | MODEL_TYPE="gpt_neox"
 33 | MODEL_CONFIG_PATH="../model/${MODEL_TYPE}"
 34 | 
 35 | RESUME_FROM_CHECKPOINT="false"
 36 | 
 37 | PER_DEVICE_BATCH_SIZE=$1
 38 | TP=$2
 39 | DP=$3
 40 | EPOCH=$4
 41 | TOTAL_TRAIN_BATCH_SIZE=$(($PER_DEVICE_BATCH_SIZE * $TP * $DP))
 42 | 
 43 | GPU=$(($TP * $DP))
 44 | OUTPUT="/output/checkpoint/mpt-fsdp-tp-bm-${TOTAL_TRAIN_BATCH_SIZE}-tp-${TP}-dp-${DP}-gpu-${GPU}-bin"
 45 | TENSORBOARD_PATH="/output/tensorboard/mpt-fsdp-tp-bm-${TOTAL_TRAIN_BATCH_SIZE}-tp-${TP}-dp-${DP}-gpu-${GPU}-bin"
 46 | 
 47 | PREFIX="master-0"
 48 | mkdir -p $OUTPUT || true
 49 | echo "output to $OUTPUT"
 50 | mkdir -p $TENSORBOARD_PATH
 51 | chmod 777 $OUTPUT
 52 | chmod 777 $TENSORBOARD_PATH
 53 | 
 54 | # atorch environment maybe not available yet, opensource soon
 55 | pip install -U --no-deps atorch==0.1.7rc9
 56 | pip install tensorboard==2.3.0
 57 | pip install peft==0.3.0 --no-dependencies
 58 | pip install zstandard
 59 | pip install ujson
 60 | pip install jsonlines
 61 | pip list
 62 | 
 63 | python -m atorch.distributed.launch \
 64 |     --nproc_per_node=$(nvidia-smi -L | wc -l) \
 65 |     run_train.py \
 66 |     ${LOAD_RAW_DATASET} \
 67 |     ${SPLIT_BEFORE_READ} \
 68 |     --tokenize_mode 'pretrain' \
 69 |     --train_mode 'sst' \
 70 |     --config_path $MODEL_CONFIG_PATH \
 71 |     --tokenizer_type $TOKENIZER_TYPE \
 72 |     --vocab_file $VOCAB_FILE \
 73 |     --model_type $MODEL_TYPE \
 74 |     --padding \
 75 |     --data_paths $DATA_PATHS \
 76 |     --data_weights $DATA_WEIGHTS \
 77 |     --data_split $DATA_SPLIT \
 78 |     ${SHUFFLE_BEFORE_SPLIT} \
 79 |     ${USE_RANDOM_SAMPLER} \
 80 |     ${USE_WEIGHTED_LOSS} \
 81 |     ${WEIGHT_BY_NUM_DOCUMENTS} \
 82 |     --train_iters 100 \
 83 |     --num_warmup_steps 6000 \
 84 |     --custom_lr_scheduler_type 'cosine' \
 85 |     --learning_rate 1.0e-4 \
 86 |     --min_lr 1.0e-5 \
 87 |     --valid_iters 500 \
 88 |     --valid_interval 2000 \
 89 |     --num_train_epochs $EPOCH \
 90 |     --seq_length 512 \
 91 |     --total_train_batch_size $TOTAL_TRAIN_BATCH_SIZE \
 92 |     --per_device_valid_batch_size $PER_DEVICE_BATCH_SIZE \
 93 |     --seed 42 \
 94 |     --preprocessing_num_workers 6 \
 95 |     --num_workers 8 \
 96 |     --output_dir $OUTPUT \
 97 |     --tensorboard_dir $TENSORBOARD_PATH \
 98 |     --ignore_mismatched_sizes \
 99 |     --skip_atorch_autoacc_dryrun \
100 |     --tp $TP \
101 |     --dp $DP \
102 |     --bf16 \
103 |     --pipe_parallel_size 0 \
104 |     --model_parallel_size 1 \
105 |     --checkpointing_steps 5000 \
106 |     --log_interval 10 \
107 |     --make_vocab_size_divisible_by 128 \
108 |     --weighted_loss_mode 'case3' \
109 |     --checkpoint_activations \
110 |     --resume_from_checkpoint $RESUME_FROM_CHECKPOINT \
111 |     --max_grad_norm 1 \
112 |     --evaluation_strategy "steps,epoch" \
113 |     --save_strategy "steps" \
114 |     --save_total_limit 10 \
115 |     --extra_save_by_epoch \
116 |     --metric_for_best_model 'loss' \
117 |     --greater_is_better 'false' \
118 |     --zero_opt_level zero3 2>&1 | tee $OUTPUT/$PREFIX-output.txt


--------------------------------------------------------------------------------
/tokenizer/train_tokenizer.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021, EleutherAI
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | """
 16 | Assumes a dataset of jsonl files in the same format as the neox training set.
 17 | """
 18 | 
 19 | from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
 20 | from tokenizers.normalizers import NFKC
 21 | 
 22 | from glob import glob
 23 | import os
 24 | import json
 25 | import argparse
 26 | 
 27 | 
 28 | def load_jsonl(input_path, quiet=True) -> list:
 29 |     """
 30 |     Read list of objects from a JSON lines file.
 31 |     """
 32 |     data = []
 33 |     with open(input_path, "r", encoding="utf-8") as f:
 34 |         for line in f:
 35 |             data.append(json.loads(line.rstrip("\n|\r")))
 36 |     if not quiet:
 37 |         print("Loaded {} records from {}".format(len(data), input_path))
 38 |     return data
 39 | 
 40 | 
 41 | def json_iterator(input_dir, text_key="text"):
 42 |     all_jsonls = glob(f"{input_dir}/*.jsonl") + glob(f"{input_dir}/*.json")
 43 |     for j in all_jsonls:
 44 |         data = load_jsonl(j)
 45 |         for doc in data:
 46 |             yield doc[text_key]
 47 | 
 48 | 
 49 | def train_tokenizer(
 50 |     input_dir: str, save_path: str, tokenizer_type: str = "BPE", vocab_size: int = 52000
 51 | ):
 52 |     """
 53 |     Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`
 54 | 
 55 |     :param input_dir: input directory containing jsonl files
 56 |     :param save_path: path to save tokenizer to
 57 |     :param tokenizer_type: type of tokenizer to train.
 58 |     :param vocab_size: int, size of tokenizer's vocab
 59 |     :return:
 60 |     """
 61 | 
 62 |     if tokenizer_type == "BPE":
 63 |         model = models.BPE()
 64 |     else:
 65 |         raise NotImplementedError(f"Tokenizer type {tokenizer_type} not implemented")
 66 |     tokenizer = Tokenizer(model)
 67 | 
 68 |     # Customize pre-tokenization and decoding
 69 |     tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
 70 |     tokenizer.decoder = decoders.ByteLevel()
 71 |     tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
 72 |     tokenizer.normalizer = NFKC()
 73 | 
 74 |     # And then train
 75 |     trainer = trainers.BpeTrainer(
 76 |         vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"]
 77 |     )
 78 |     tokenizer.train_from_iterator(json_iterator(input_dir), trainer)
 79 | 
 80 |     # And Save it
 81 |     tokenizer.save(save_path, pretty=True)
 82 |     print(f"Tokenizer saved at {save_path}")
 83 | 
 84 | 
 85 | def parse_args():
 86 |     parser = argparse.ArgumentParser(
 87 |         description="script for training a multilingual "
 88 |         "HF tokenizer on CC dumps with upweighting for low resource languages"
 89 |     )
 90 |     parser.add_argument(
 91 |         "--json_input_dir",
 92 |         type=str,
 93 |         help="Path to folder containing tokenizer training data in jsonl format",
 94 |     )
 95 |     parser.add_argument(
 96 |         "--tokenizer_output_path",
 97 |         type=str,
 98 |         help="Path to which your trained tokenizer will be saved (should end in .json)",
 99 |     )
100 |     parser.add_argument(
101 |         "--tokenizer_type",
102 |         type=str,
103 |         help="type of tokenizer to train, currently only BPE is supported",
104 |         choices=["BPE"],
105 |         default=["BPE"],
106 |     )
107 |     parser.add_argument(
108 |         "-v",
109 |         "--vocab_size",
110 |         help="vocabulary size of tokenizer, default=52k",
111 |         type=int,
112 |         default=52000,
113 |     )
114 |     return parser.parse_args()
115 | 
116 | 
117 | if __name__ == "__main__":
118 | 
119 |     args = parse_args()
120 | 
121 |     train_tokenizer(
122 |         args.json_input_dir,
123 |         save_path=args.tokenizer_output_path,
124 |         tokenizer_type=args.tokenizer_type,
125 |         vocab_size=args.vocab_size,
126 |     )
127 | 


--------------------------------------------------------------------------------
/tools/analysis/post_tokenization_check.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) 2023 Ant Group. All rights reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import MMapIndexDatasetParser
17 | import MMapTokenIdsBinChecker
18 | import argparse
19 | import os
20 | 
21 | 
22 | def process_dataset(index_dataset_path, input_ids_bin_path, loss_mask_bin_path, tokenizer_path, detokenization_output_path, seq_len, random_sampling_num):
23 |     # 检查index dataset IDX文件
24 |     mmap_index_dataset_checker = MMapIndexDatasetParser.MMapIndexDatasetChecker(index_dataset_path)
25 |     if not os.path.exists(input_ids_bin_path) or not os.path.isfile(input_ids_bin_path):
26 |         print(f"给定的input_ids.bin路径不存在或不是文件:{input_ids_bin_path}")
27 |         return False
28 | 
29 |     check_result = mmap_index_dataset_checker.check(bin_bytes_size=os.path.getsize(input_ids_bin_path), seq_len=seq_len)
30 |     if not check_result:
31 |         print('!!!'*40)
32 |         print(f"!\033[1;31;47mIDX检查未通过 {index_dataset_path}\033[0m")
33 |         print('!!!'*40)
34 |         return False
35 | 
36 |     # 检查input ids BIN文件
37 |     mmap_token_ids_bin_checker = MMapTokenIdsBinChecker.MMapTokenIdsBinChecker(input_ids_bin_path=input_ids_bin_path, 
38 |                                                                                loss_mask_bin_path=loss_mask_bin_path, 
39 |                                                                                tokenizer_path=tokenizer_path,
40 |                                                                                detokenize_output_path=detokenization_output_path,
41 |                                                                                seq_len=seq_len,
42 |                                                                                element_size=mmap_index_dataset_checker.mmap_index_dataset._dtype_size,
43 |                                                                                dtype=mmap_index_dataset_checker.mmap_index_dataset._dtype,
44 |                                                                                sample_total=mmap_index_dataset_checker.mmap_index_dataset._len,
45 |                                                                                ramdom_sampling_num=random_sampling_num)
46 |     
47 |     check_result = mmap_token_ids_bin_checker.check()
48 |     if not check_result:
49 |         print('!!!'*40)
50 |         print(f'!\033[1;31;47m【ERROR】数据集{loss_mask_bin_path}抽检未通过\033[0m')
51 |         print('!!!'*40)
52 |         return False
53 |     else:
54 |         print('###'*40)
55 |         print(f'#\033[1;32;47m【OK】数据集{loss_mask_bin_path}抽检通过\033[0m')
56 |         print('###'*40)
57 |         return True
58 | 
59 | 
60 | 
61 | if __name__ == "__main__":
62 |     parser = argparse.ArgumentParser(description="")
63 |     parser.add_argument('datasets_dir', help="GPT Neox MMAP方式生成的数据集目录路径，要求该路径下每个子目录对应一个数据集")
64 |     parser.add_argument('tokenizer_path', help="词表文件路径")
65 |     parser.add_argument('detokenization_output_dir', help='存储detokenization结果文件的目录路径')
66 |     parser.add_argument('seq_len', help="每个样本token数量")
67 |     parser.add_argument('--random_sampling_num', '-rsn', type=int,default=5, help='要随机抽样检查的数量，默认是100')
68 | 
69 |     args = parser.parse_args()
70 | 
71 |     for dir in os.listdir(args.datasets_dir):
72 |         #if dir != 'codecompletion':
73 |         #    continue
74 |         for file in os.listdir(os.path.join(args.datasets_dir, dir)):
75 |             if file.endswith('_input_ids.idx'):
76 |                 mmap_index_dataset_path = os.path.join(args.datasets_dir, dir, file)
77 |             elif file.endswith('_input_ids.bin'):
78 |                 mmap_input_ids_bin_path = os.path.join(args.datasets_dir, dir, file)
79 |             elif file.endswith('loss_mask.bin'):
80 |                 mmap_loss_mask_bin_path = os.path.join(args.datasets_dir, dir, file)
81 | 
82 |         detokenization_output_path = os.path.join(args.detokenization_output_dir, f"{dir}.txt")
83 |         print(f'\n\n开始检查数据集{dir}....')
84 |         process_dataset(mmap_index_dataset_path, mmap_input_ids_bin_path, mmap_loss_mask_bin_path, args.tokenizer_path, detokenization_output_path, int(args.seq_len), args.random_sampling_num)


--------------------------------------------------------------------------------
/utils/hselect.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Functions ported from the R package sm.
  3 | 
  4 | Implements different bandwidth selection methods, including:
  5 | - Scott's rule of thumb
  6 | - Silverman's rule of thumb
  7 | - Sheather-Jones estimator
  8 | '''
  9 | 
 10 | import numpy as np
 11 | # import distributions as distr
 12 | 
 13 | 
 14 | __all__ = ['wmean',
 15 |            'wvar',
 16 |            'dnorm',
 17 |            'hsilverman',
 18 |            'hscott',
 19 |            'hnorm',
 20 |            'hsj']
 21 | 
 22 | 
 23 | def wmean(x, w):
 24 |     '''
 25 |     Weighted mean
 26 |     '''
 27 |     return sum(x * w) / float(sum(w))
 28 | 
 29 | 
 30 | def wvar(x, w):
 31 |     '''
 32 |     Weighted variance
 33 |     '''
 34 |     return sum(w * (x - wmean(x, w)) ** 2) / float(sum(w) - 1)
 35 | 
 36 | 
 37 | def dnorm(x):
 38 |     return distr.normal.pdf(x, 0.0, 1.0)
 39 | 
 40 | 
 41 | def bowman(x):
 42 |     pass
 43 |     # TODO: implement?
 44 |     #hx = median(abs(x - median(x))) / 0.6745 * (4 / 3 / r.n) ^ 0.2
 45 |     #hy = median(abs(y - median(y))) / 0.6745 * (4 / 3 / r.n) ^ 0.2
 46 |     #h = sqrt(hy * hx)
 47 | 
 48 | 
 49 | def hsilverman(x, weights=None):
 50 |     IQR = np.percentile(x, 75) - np.percentile(x, 25)
 51 |     A = min(np.std(x, ddof=1), IQR / 1.349)
 52 | 
 53 |     if weights is None:
 54 |         weights = np.ones(len(x))
 55 |     n = float(sum(weights))
 56 | 
 57 |     return 0.9 * A * n ** (-0.2)
 58 | 
 59 | 
 60 | def hscott(x, weights=None):
 61 | 
 62 |     IQR = np.percentile(x, 75) - np.percentile(x, 25)
 63 |     A = min(np.std(x, ddof=1), IQR / 1.349)
 64 | 
 65 |     if weights is None:
 66 |         weights = np.ones(len(x))
 67 |     n = float(sum(weights))
 68 | 
 69 |     return 1.059 * A * n ** (-0.2)
 70 | 
 71 | 
 72 | def hnorm(x, weights=None):
 73 |     '''
 74 |     Bandwidth estimate assuming f is normal. See paragraph 2.4.2 of
 75 |     Bowman and Azzalini[1]_ for details.
 76 | 
 77 |     References
 78 |     ----------
 79 |     .. [1] Applied Smoothing Techniques for Data Analysis: the
 80 |         Kernel Approach with S-Plus Illustrations.
 81 |         Bowman, A.W. and Azzalini, A. (1997).
 82 |         Oxford University Press, Oxford
 83 |     '''
 84 | 
 85 |     x = np.asarray(x)
 86 | 
 87 |     if weights is None:
 88 |         weights = np.ones(len(x))
 89 | 
 90 |     n = float(sum(weights))
 91 | 
 92 |     if len(x.shape) == 1:
 93 |         sd = np.sqrt(wvar(x, weights))
 94 |         return sd * (4 / (3 * n)) ** (1 / 5.0)
 95 | 
 96 |     # TODO: make this work for more dimensions
 97 |     # ((4 / (p + 2) * n)^(1 / (p+4)) * sigma_i
 98 |     if len(x.shape) == 2:
 99 |         ndim = x.shape[1]
100 |         sd = np.sqrt(np.apply_along_axis(wvar, 1, x, weights))
101 |         return (4.0 / ((ndim + 2.0) * n) ** (1.0 / (ndim + 4.0))) * sd
102 | 
103 | 
104 | def hsj(x, weights=None):
105 |     '''
106 |     Sheather-Jones bandwidth estimator [1]_.
107 | 
108 |     References
109 |     ----------
110 |     .. [1] A reliable data-based bandwidth selection method for kernel
111 |         density estimation. Simon J. Sheather and Michael C. Jones.
112 |         Journal of the Royal Statistical Society, Series B. 1991
113 |     '''
114 | 
115 |     h0 = hnorm(x)
116 |     v0 = sj(x, h0)
117 | 
118 |     if v0 > 0:
119 |         hstep = 1.1
120 |     else:
121 |         hstep = 0.9
122 | 
123 |     h1 = h0 * hstep
124 |     v1 = sj(x, h1)
125 | 
126 |     while v1 * v0 > 0:
127 |         h0 = h1
128 |         v0 = v1
129 |         h1 = h0 * hstep
130 |         v1 = sj(x, h1)
131 | 
132 |     return h0 + (h1 - h0) * abs(v0) / (abs(v0) + abs(v1))
133 | 
134 | 
135 | def sj(x, h):
136 |     '''
137 |     Equation 12 of Sheather and Jones [1]_
138 | 
139 |     References
140 |     ----------
141 |     .. [1] A reliable data-based bandwidth selection method for kernel
142 |         density estimation. Simon J. Sheather and Michael C. Jones.
143 |         Journal of the Royal Statistical Society, Series B. 1991
144 |     '''
145 |     phi6 = lambda x: (x ** 6 - 15 * x ** 4 + 45 * x ** 2 - 15) * dnorm(x)
146 |     phi4 = lambda x: (x ** 4 - 6 * x ** 2 + 3) * dnorm(x)
147 | 
148 |     n = len(x)
149 |     one = np.ones((1, n))
150 | 
151 |     lam = np.percentile(x, 75) - np.percentile(x, 25)
152 |     a = 0.92 * lam * n ** (-1 / 7.0)
153 |     b = 0.912 * lam * n ** (-1 / 9.0)
154 | 
155 |     W = np.tile(x, (n, 1))
156 |     W = W - W.T
157 | 
158 |     W1 = phi6(W / b)
159 |     tdb = np.dot(np.dot(one, W1), one.T)
160 |     tdb = -tdb / (n * (n - 1) * b ** 7)
161 | 
162 |     W1 = phi4(W / a)
163 |     sda = np.dot(np.dot(one, W1), one.T)
164 |     sda = sda / (n * (n - 1) * a ** 5)
165 | 
166 |     alpha2 = 1.357 * (abs(sda / tdb)) ** (1 / 7.0) * h ** (5 / 7.0)
167 | 
168 |     W1 = phi4(W / alpha2)
169 |     sdalpha2 = np.dot(np.dot(one, W1), one.T)
170 |     sdalpha2 = sdalpha2 / (n * (n - 1) * alpha2 ** 5)
171 | 
172 |     return (distr.normal.pdf(0, 0, np.sqrt(2)) /
173 |             (n * abs(sdalpha2[0, 0]))) ** 0.2 - h


--------------------------------------------------------------------------------
/utils/learning_rates.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021, EleutherAI
  2 | # This file is based on code by the authors denoted below and has been modified from its original version.
  3 | #
  4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  5 | #
  6 | # Licensed under the Apache License, Version 2.0 (the "License");
  7 | # you may not use this file except in compliance with the License.
  8 | # You may obtain a copy of the License at
  9 | #
 10 | #     http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | 
 18 | """Learning rate decay functions."""
 19 | 
 20 | import math
 21 | 
 22 | # from .common_utils import print_rank_0
 23 | 
 24 | 
 25 | class AnnealingLR(object):
 26 |     """Anneals the learning rate."""
 27 | 
 28 |     def __init__(
 29 |         self,
 30 |         optimizer,
 31 |         start_lr,
 32 |         warmup_iter,
 33 |         total_iters,
 34 |         decay_style,
 35 |         last_iter,
 36 |         min_lr=0.0,
 37 |         use_checkpoint_lr_scheduler=True,
 38 |         override_lr_scheduler=False,
 39 |         use_mup=False,
 40 |     ):
 41 | 
 42 |         # Class values.
 43 |         self.optimizer = optimizer
 44 |         self.start_lr = start_lr
 45 |         self.min_lr = min_lr
 46 |         self.warmup_iter = warmup_iter
 47 |         self.num_iters = last_iter
 48 |         self.end_iter = total_iters
 49 |         assert self.end_iter > 0
 50 |         self.decay_style = decay_style
 51 |         self.override_lr_scheduler = override_lr_scheduler
 52 |         self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler
 53 |         self.use_mup = use_mup
 54 |         if self.override_lr_scheduler:
 55 |             assert not self.use_checkpoint_lr_scheduler, (
 56 |                 "both override and " "use-checkpoint are set."
 57 |             )
 58 |         # Set the learning rate
 59 |         self.step(self.num_iters)
 60 | 
 61 |         print("> learning rate decay style: {}".format(self.decay_style))
 62 | 
 63 |     def update_lr(self, lr):
 64 |         self.start_lr = lr
 65 |     
 66 |     def get_lr(self):
 67 |         """Learning rate decay functions from:
 68 |         https://openreview.net/pdf?id=BJYwwY9ll pg. 4"""
 69 | 
 70 |         num_iters_ = min(self.num_iters, self.end_iter - self.warmup_iter)
 71 |         # Warmup.
 72 |         if self.warmup_iter > 0 and self.num_iters <= self.warmup_iter:
 73 |             return float(self.start_lr) * num_iters_ / self.warmup_iter
 74 | 
 75 |         num_iters_ = num_iters_ - self.warmup_iter
 76 |         if self.decay_style == "linear":
 77 |             lr = self.start_lr * (self.end_iter - num_iters_) / self.end_iter
 78 |         elif self.decay_style == "cosine":
 79 |             lr = (
 80 |                 self.start_lr
 81 |                 / 2.0
 82 |                 * (math.cos(math.pi * num_iters_ / self.end_iter) + 1)
 83 |             )
 84 |         elif self.decay_style == "exponential":
 85 |             # exp(-0.693) = 1/2
 86 |             lr = self.start_lr * math.exp(-0.693 * num_iters_ / self.end_iter)
 87 |         else:
 88 |             lr = self.start_lr
 89 |         return max(lr, self.min_lr)
 90 | 
 91 |     def step(self, step_num=None):
 92 |         """Set lr for all parameters groups."""
 93 |         if step_num is None:
 94 |             step_num = self.num_iters + 1
 95 |         self.num_iters = step_num
 96 |         new_lr = self.get_lr()
 97 |         for group in self.optimizer.param_groups:
 98 |             if self.use_mup and "width_mult" in group:
 99 |                 group["lr"] = new_lr / group["width_mult"]
100 |             else:
101 |                 group["lr"] = new_lr
102 | 
103 |     def state_dict(self):
104 |         state_dict = {
105 |             "start_lr": self.start_lr,
106 |             "warmup_iter": self.warmup_iter,
107 |             "num_iters": self.num_iters,
108 |             "decay_style": self.decay_style,
109 |             "end_iter": self.end_iter,
110 |             "min_lr": self.min_lr,
111 |         }
112 |         return state_dict
113 | 
114 |     def _check_and_set(self, cls_value, sd_value, name):
115 |         """Auxiliary function for checking the values in the checkpoint and
116 |         setting them."""
117 |         if self.override_lr_scheduler:
118 |             print_rank_0(" > overriding {} value to {}".format(name, cls_value))
119 |             return cls_value
120 | 
121 |         if not self.use_checkpoint_lr_scheduler:
122 |             assert cls_value == sd_value, (
123 |                 "AnnealingLR: class input value"
124 |                 "and checkpoint values for {} do not match".format(name)
125 |             )
126 |         print_rank_0(" > using checkpoint value {} for {}".format(sd_value, name))
127 |         return sd_value
128 | 
129 |     def load_state_dict(self, sd):
130 | 
131 |         self.start_lr = self._check_and_set(
132 |             self.start_lr, sd["start_lr"], "learning rate"
133 |         )
134 |         self.min_lr = self._check_and_set(
135 |             self.min_lr, sd["min_lr"], "minimum learning rate"
136 |         )
137 |         self.warmup_iter = self._check_and_set(
138 |             self.warmup_iter, sd["warmup_iter"], "warmup iterations"
139 |         )
140 |         self.end_iter = self._check_and_set(
141 |             self.end_iter, sd["end_iter"], "total number of iterations"
142 |         )
143 |         self.decay_style = self._check_and_set(
144 |             self.decay_style, sd["decay_style"], "decay style"
145 |         )
146 | 
147 |         self.num_iters = sd["num_iters"]
148 |         self.step(self.num_iters)
149 | 


--------------------------------------------------------------------------------
/model/peft/utils/mapping.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # Copyright (c) 2023 Ant Group. All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import sys
 18 | sys.path.append("..")
 19 | sys.path.append("../..")
 20 | import torch
 21 | from peft.utils import (
 22 |     TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING,
 23 |     TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING
 24 | )
 25 | 
 26 | 
 27 | # needed for prefix-tuning of bloom model
 28 | def bloom_model_postprocess_past_key_value(past_key_values):
 29 |     past_key_values = torch.cat(past_key_values)
 30 |     (
 31 |         total_layers,
 32 |         batch_size,
 33 |         num_attention_heads,
 34 |         num_virtual_tokens,
 35 |         head_dim,
 36 |     ) = past_key_values.shape
 37 |     keys = past_key_values[: total_layers // 2]
 38 |     keys = keys.transpose(2, 3).reshape(
 39 |         total_layers // 2,
 40 |         batch_size * num_attention_heads,
 41 |         head_dim,
 42 |         num_virtual_tokens,
 43 |     )
 44 |     values = past_key_values[total_layers // 2 :]
 45 |     values = values.reshape(
 46 |         total_layers // 2,
 47 |         batch_size * num_attention_heads,
 48 |         num_virtual_tokens,
 49 |         head_dim,
 50 |     )
 51 | 
 52 |     return tuple(zip(keys, values))
 53 | 
 54 | 
 55 | NEW_TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = {
 56 |     "t5": ["q", "v"],
 57 |     "mt5": ["q", "v"],
 58 |     "bart": ["q_proj", "v_proj"],
 59 |     "gpt2": ["c_attn"],
 60 |     "bloom": ["query_key_value"],
 61 |     "bloomz": ["query_key_value"],
 62 |     "blip-2": ["q", "v", "q_proj", "v_proj"],
 63 |     "opt": ["q_proj", "v_proj"],
 64 |     "gptj": ["q_proj", "v_proj"],
 65 |     "gpt_neox": ["query_key_value"],
 66 |     "gpt_neo": ["q_proj", "v_proj"],
 67 |     "bert": ["query", "value"],
 68 |     "roberta": ["query", "value"],
 69 |     "xlm-roberta": ["query", "value"],
 70 |     "electra": ["query", "value"],
 71 |     "deberta-v2": ["query_proj", "value_proj"],
 72 |     "deberta": ["in_proj"],
 73 |     "layoutlm": ["query", "value"],
 74 |     "llama": ["q_proj", "v_proj"],
 75 |     "chatglm": ["query_key_value"],
 76 |     "antglm": ["query_key_value"],
 77 |     "glm": ["query_key_value"],
 78 | }
 79 | 
 80 | NEW_TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = {
 81 |     "t5": ["q", "k", "v", "o", "wi", "wo"],
 82 |     "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
 83 |     "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
 84 |     # "gpt2": ["c_attn"],
 85 |     "bloom": ["query_key_value"],
 86 |     "bloomz": ["query_key_value"],
 87 |     "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
 88 |     # "gptj": ["q_proj", "v_proj"],
 89 |     # "gpt_neox": ["query_key_value"],
 90 |     # "gpt_neo": ["q_proj", "v_proj"],
 91 |     # "bert": ["query", "value"],
 92 |     "roberta": ["query", "key", "value", "dense"],
 93 |     # "xlm-roberta": ["query", "value"],
 94 |     # "electra": ["query", "value"],
 95 |     "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"],
 96 |     "chatglm": ["query_key_value"],
 97 |     "antglm": ["query_key_value"],
 98 |     "glm": ["query_key_value"],
 99 |     # "deberta": ["in_proj"],
100 |     # "layoutlm": ["query", "value"],
101 | }
102 | 
103 | TRANSFORMERS_MODELS_TO_LORA_LAGE_TARGET_MODULES_MAPPING = {
104 |     "t5": ["q", "k", "v", "o", "wi", "wo"],
105 |     "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
106 |     "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
107 |     # "gpt2": ["c_attn"],
108 |     "bloom": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
109 |     "bloomz": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
110 |     "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
111 |     # "gptj": ["q_proj", "v_proj"],
112 |     # "gpt_neox": ["query_key_value"],
113 |     # "gpt_neo": ["q_proj", "v_proj"],
114 |     # "bert": ["query", "value"],
115 |     "roberta": ["query", "key", "value", "dense"],
116 |     # "xlm-roberta": ["query", "value"],
117 |     # "electra": ["query", "value"],
118 |     "llama": ["q_proj", "v_proj"],
119 |     "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"],
120 |     "antglm": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
121 |     "glm": ["query_key_value", "dense"]
122 |     # "deberta": ["in_proj"],
123 |     # "layoutlm": ["query", "value"],
124 | }
125 | 
126 | TRANSFORMERS_MODELS_TO_ROUTELORA_TARGET_MODULES_MAPPING = {
127 |     "t5": ["q", "k", "v", "o", "wi", "wo"],
128 |     "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"],
129 |     "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
130 |     "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
131 |     "roberta": ["query", "key", "value", "dense"],
132 |     "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"],
133 |     "chatglm": ["query_key_value"],
134 |     "glm": ["query_key_value"]
135 | }
136 | 
137 | TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING = {
138 |     "glm": [0, 22],
139 |     "antglm": [17, 22],
140 |     "bloom": [17, 22],
141 |     "bloomz": [17, 22],
142 | }
143 | 
144 | TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = {
145 |     "bloom": bloom_model_postprocess_past_key_value,
146 |     "bloomz": bloom_model_postprocess_past_key_value,
147 | }
148 | 
149 | WEIGHTS_NAME = "adapter_model.bin"
150 | CONFIG_NAME = "adapter_config.json"
151 | 
152 | 
153 | TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.update(
154 |     NEW_TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
155 | )
156 | TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING.update(
157 |     NEW_TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING
158 | )
159 | 


--------------------------------------------------------------------------------
/model/gpt_neox/tokenization_gpt_neox_fast.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (c) 2023 Ant Group
  3 | # This file is based on code by the authors denoted below and has been modified from its original version.
  4 | #
  5 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | """Tokenization classes for GPTNeoX."""
 19 | import json
 20 | from typing import TYPE_CHECKING, List, Optional, Tuple
 21 | 
 22 | from tokenizers import pre_tokenizers
 23 | 
 24 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 25 | from transformers.utils import logging
 26 | 
 27 | 
 28 | if TYPE_CHECKING:
 29 |     from transformers.pipelines.conversational import Conversation
 30 | 
 31 | 
 32 | logger = logging.get_logger(__name__)
 33 | 
 34 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
 35 | 
 36 | PRETRAINED_VOCAB_FILES_MAP = {
 37 |     "tokenizer_file": {
 38 |         "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/tokenizer.json",
 39 |     },
 40 | }
 41 | 
 42 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 43 |     "gpt-neox-20b": 2048,
 44 | }
 45 | 
 46 | 
 47 | class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
 48 |     """
 49 |     Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
 50 |     Byte-Pair-Encoding.
 51 | 
 52 |     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
 53 |     be encoded differently whether it is at the beginning of the sentence (without space) or not:
 54 | 
 55 |     ```python
 56 |     >>> from transformers import GPTNeoXTokenizerFast
 57 | 
 58 |     >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("gpt2")
 59 |     >>> tokenizer("Hello world")["input_ids"]
 60 |     [15496, 995]
 61 | 
 62 |     >>> tokenizer(" Hello world")["input_ids"]
 63 |     [18435, 995]
 64 |     ```
 65 | 
 66 |     You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
 67 |     the model was not pretrained this way, it might yield a decrease in performance.
 68 | 
 69 |     <Tip>
 70 | 
 71 |     When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
 72 | 
 73 |     </Tip>
 74 | 
 75 |     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
 76 |     refer to this superclass for more information regarding those methods.
 77 | 
 78 |     Args:
 79 |         vocab_file (`str`):
 80 |             Path to the vocabulary file.
 81 |         merges_file (`str`):
 82 |             Path to the merges file.
 83 |         errors (`str`, *optional*, defaults to `"replace"`):
 84 |             Paradigm to follow when decoding bytes to UTF-8. See
 85 |             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
 86 |         unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
 87 |             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
 88 |             token instead.
 89 |         bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
 90 |             The beginning of sequence token.
 91 |         eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
 92 |             The end of sequence token.
 93 |         add_prefix_space (`bool`, *optional*, defaults to `False`):
 94 |             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
 95 |             other word. (GPTNeoX tokenizer detect beginning of words by the preceding space).
 96 |         trim_offsets (`bool`, *optional*, defaults to `True`):
 97 |             Whether or not the post-processing step should trim offsets to avoid including whitespaces.
 98 |     """
 99 | 
100 |     vocab_files_names = VOCAB_FILES_NAMES
101 |     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
102 |     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
103 |     model_input_names = ["input_ids", "attention_mask"]
104 | 
105 |     def __init__(
106 |         self,
107 |         vocab_file=None,
108 |         merges_file=None,
109 |         tokenizer_file=None,
110 |         unk_token="<|endoftext|>",
111 |         bos_token="<|endoftext|>",
112 |         eos_token="<|endoftext|>",
113 |         add_prefix_space=False,
114 |         **kwargs,
115 |     ):
116 |         super().__init__(
117 |             vocab_file,
118 |             merges_file,
119 |             tokenizer_file=tokenizer_file,
120 |             unk_token=unk_token,
121 |             bos_token=bos_token,
122 |             eos_token=eos_token,
123 |             add_prefix_space=add_prefix_space,
124 |             **kwargs,
125 |         )
126 | 
127 |         pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
128 |         if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
129 |             pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
130 |             pre_tok_state["add_prefix_space"] = add_prefix_space
131 |             self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
132 | 
133 |         self.add_prefix_space = add_prefix_space
134 | 
135 |     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
136 |         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
137 |         return tuple(files)
138 | 
139 |     def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
140 |         """This corresponds to DialoGPT variants of models."""
141 |         input_ids = []
142 |         for is_user, text in conversation.iter_texts():
143 |             input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])
144 | 
145 |         if len(input_ids) > self.model_max_length:
146 |             input_ids = input_ids[-self.model_max_length :]
147 |         return input_ids
148 | 


--------------------------------------------------------------------------------
/model/peft/tuner/bitfit.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (c) 2023 Ant Group. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | import sys
 17 | sys.path.append("..")
 18 | sys.path.append("../..")
 19 | import torch
 20 | import importlib
 21 | from enum import Enum
 22 | from peft.utils import PeftType
 23 | from dataclasses import dataclass, field, asdict
 24 | from typing import Optional, List
 25 | 
 26 | from .pe_base_model import PEBaseModel
 27 | from model.peft.utils import PetuningConfig
 28 | from model.peft.utils.others import _freeze_model
 29 | 
 30 | 
 31 | def is_alps_available():
 32 |     return importlib.util.find_spec("alps") is not None
 33 | 
 34 | 
 35 | if is_alps_available():
 36 |     from alps.util import logger
 37 | else:
 38 |     import logging
 39 |     logger = logging.getLogger(__file__)
 40 | 
 41 | 
 42 | class PEBitfitModel(PEBaseModel):
 43 |     """
 44 |     只训练模型bias：参考 https://arxiv.org/pdf/2106.10199.pdf
 45 |     model: huggingface transformers model
 46 |     tokenizer:  huggingface transformers tokenizer
 47 |     """
 48 | 
 49 |     def __init__(self, model):
 50 |         self.model = model
 51 | 
 52 |     def get_model(self):
 53 |         not_freeze_param_name = ["bias"]
 54 |         set_parameter_requires_grad(self.model, not_freeze_param_name)
 55 |         return self.model
 56 | 
 57 |     @classmethod
 58 |     def restore(self, model=None, path=None):
 59 |         logger.info("bitfit不需要额外加载参数")
 60 |         return model
 61 | 
 62 | 
 63 | # 根据名称锁定参数层
 64 | def set_parameter_requires_grad(model, freeze_param_name=[]):
 65 |     if not isinstance(freeze_param_name, list):
 66 |         freeze_param_name = [freeze_param_name]
 67 | 
 68 |     for idx, (name, param) in enumerate(model.named_parameters()):
 69 |         for p in freeze_param_name:
 70 |             if p not in name:
 71 |                 param.requires_grad = False
 72 |         # 打印参数层名
 73 |     for idx, (name, param) in enumerate(model.named_parameters()):
 74 |         for p in freeze_param_name:
 75 |             if p in name:
 76 |                 print("trainable parameter name is:")
 77 |                 print(name)
 78 |                 param.requires_grad = True
 79 | 
 80 | 
 81 | @dataclass
 82 | class PeftBitfitConfig(PetuningConfig):
 83 |     """
 84 |     This is the configuration class to store the configuration of a [`PeftBitfitModel`].
 85 | 
 86 |     Args:
 87 |         modules_to_save (`List[str]`):List of modules apart from LoRA layers to be set as trainable
 88 |             and saved in the final checkpoint.
 89 |     """
 90 | 
 91 |     modules_to_save: Optional[List[str]] = field(
 92 |         default=None,
 93 |         metadata={
 94 |             "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. "
 95 |             "For example, in Sequence Classification or Token Classification tasks, "
 96 |             "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
 97 |         },
 98 |     )
 99 | 
100 |     def __post_init__(self):
101 |         self.peft_type = PeftType.BITFIT
102 | 
103 | 
104 | class PeftBitfitModel(torch.nn.Module):
105 |     """
106 |     Creates Bitfit model for ant peft.
107 | 
108 |     Args:
109 |         model ([`~transformers.PreTrainedModel`]): The model to be freeze with some layers.
110 |         config ([`PeftBitfitConfig`]): The configuration of the Bitfit model.
111 | 
112 |     Returns:
113 |         `torch.nn.Module`: The Bitfit model.
114 | 
115 |     Example:
116 | 
117 |         ```python
118 |         >>> from solutions.antllm.antllm.models.glm.modeling_glm import GLMForConditionalGeneration
119 |         >>> from solutions.antllm.antllm.models.peft.tuner import PeftBitfitConfig, PeftBitfitModel
120 |         >>> from peft import LoraModel, LoraConfig
121 | 
122 |         >>> config = PeftBitfitConfig()
123 | 
124 |         >>> model = GLMForConditionalGeneration.from_pretrained("path_to_model")
125 |         >>> roem_model = PeftBitfitModel(config, model)
126 |         ```
127 | 
128 |     **Attributes**:
129 |         - **model** ([`~transformers.PreTrainedModel`]) -- The model to be freezed.
130 |         - **peft_config** ([`PeftBitfitConfig`]): The configuration of the Bitfit model.
131 |     """
132 | 
133 |     def __init__(self, model, config, adapter_name):
134 |         super().__init__()
135 |         self.model = model
136 | 
137 |         self.forward = self.model.forward
138 |         self.peft_config = config
139 |         self.add_adapter(adapter_name, self.peft_config[adapter_name])
140 | 
141 |     def add_adapter(self, adapter_name, config=None):
142 |         if not isinstance(config, PeftBitfitConfig):
143 |             raise ValueError(
144 |                 f"The PeftBitfitModel need PeftBitfitConfig, but get {type(config)}."
145 |             )
146 | 
147 |         if config is not None:
148 |             config = self._prepare_lora_config(config)
149 |             self.peft_config[adapter_name] = config
150 | 
151 |         if len(self.peft_config) > 1:
152 |             raise ValueError(
153 |                 "BitfitModel supports only 1 peft config or name."
154 |                 "Because it only freeze the shallow layers without any additional parameters."
155 |             )
156 | 
157 |         self.model = PEBitfitModel(self.model).get_model()
158 | 
159 |         if self.peft_config[adapter_name].inference_mode:
160 |             _freeze_model(self.model)
161 | 
162 |     @staticmethod
163 |     def _prepare_lora_config(peft_config):
164 |         if peft_config.inference_mode:
165 |             peft_config.merge_weights = True
166 |         return peft_config
167 | 
168 |     def __getattr__(self, name: str):
169 |         """Forward missing attributes to the wrapped module."""
170 |         try:
171 |             return super().__getattr__(name)  # defer to nn.Module's logic
172 |         except AttributeError:
173 |             return getattr(self.model, name)
174 | 
175 |     def get_peft_config_as_dict(self, inference: bool = False):
176 |         config_dict = {}
177 |         for key, value in self.peft_config.items():
178 |             config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
179 |             if inference:
180 |                 config["inference_mode"] = True
181 |         config_dict[key] = config
182 |         return config


--------------------------------------------------------------------------------
/data/samplers.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021, EleutherAI
  2 | # This file is based on code by the authors denoted below and has been modified from its original version.
  3 | #
  4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  5 | #
  6 | # Licensed under the Apache License, Version 2.0 (the "License");
  7 | # you may not use this file except in compliance with the License.
  8 | # You may obtain a copy of the License at
  9 | #
 10 | #     http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | 
 18 | """Batch samplers that work with either random or sequential data samplers."""
 19 | 
 20 | import torch
 21 | from torch.utils import data
 22 | 
 23 | 
 24 | class RandomSampler(data.sampler.Sampler):
 25 |     """Based off of pytorch RandomSampler and DistributedSampler. Essentially
 26 |     a RandomSampler, but this class lets the user set an epoch like
 27 |     DistributedSampler Samples elements randomly. If without replacement, then
 28 |     sample from a shuffled dataset. If with replacement, then user can
 29 |     specify ``num_samples`` to draw.
 30 |     Arguments:
 31 |         data_source (Dataset): dataset to sample from
 32 |         num_samples (int): number of samples to draw, default=len(dataset)
 33 |         replacement (bool): samples are drawn with replacement if ``True``,
 34 |         default=False
 35 |     """
 36 | 
 37 |     def __init__(self, data_source, replacement=False, num_samples=None):
 38 |         self.data_source = data_source
 39 |         self.replacement = replacement
 40 |         self._num_samples = num_samples
 41 |         self.epoch = -1
 42 | 
 43 |         if self._num_samples is not None and replacement is False:
 44 |             raise ValueError(
 45 |                 "With replacement=False, num_samples should not "
 46 |                 "be specified, since a random permute will be "
 47 |                 "performed."
 48 |             )
 49 | 
 50 |         if not isinstance(self.num_samples, int) or self.num_samples <= 0:
 51 |             raise ValueError(
 52 |                 "num_samples should be a positive integer "
 53 |                 "value, but got num_samples={}".format(self.num_samples)
 54 |             )
 55 |         if not isinstance(self.replacement, bool):
 56 |             raise ValueError(
 57 |                 "replacement should be a boolean value, but got "
 58 |                 "replacement={}".format(self.replacement)
 59 |             )
 60 | 
 61 |     @property
 62 |     def num_samples(self):
 63 |         # dataset size might change at runtime
 64 |         if self._num_samples is None:
 65 |             return len(self.data_source)
 66 |         return self._num_samples
 67 | 
 68 |     def __iter__(self):
 69 |         n = len(self.data_source)
 70 |         g = torch.Generator()
 71 |         if self.epoch >= 0:
 72 |             g.manual_seed(self.epoch)
 73 |         if self.replacement:
 74 |             return iter(
 75 |                 torch.randint(
 76 |                     high=n, size=(self.num_samples,), dtype=torch.int64, generator=g
 77 |                 ).tolist()
 78 |             )
 79 |         return iter(torch.randperm(n, generator=g).tolist())
 80 | 
 81 |     def __len__(self):
 82 |         return self.num_samples
 83 | 
 84 |     def set_epoch(self, epoch):
 85 |         self.epoch = epoch
 86 | 
 87 | 
 88 | class DistributedBatchSampler(data.sampler.BatchSampler):
 89 |     """Similar to normal implementation of distributed sampler, except
 90 |     implementation is at the batch sampler level, instead of just the
 91 |     sampler level. This allows wrapping of arbitrary data samplers
 92 |     (sequential, random, WeightedRandomSampler, etc.) with this batch
 93 |     sampler.
 94 | 
 95 |     The `interleave` argument specifies how to distribute a batch. A value
 96 |     of True combined with the above random sampler is equivalent to pytorch's
 97 |     torch.utils.data.distributed.DistributedSampler.
 98 | 
 99 |     For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2
100 |     specifying True will result in the following samples for each gpu:
101 |         GPU0: [0,2,4,6] GPU1: [1,3,5,7]
102 |     specifying False will result in the following samples:
103 |         GPU0: [0,1,2,3] GPU1: [4,5,6,7]"""
104 | 
105 |     def __init__(
106 |         self,
107 |         sampler,
108 |         batch_size,
109 |         drop_last,
110 |         rank=-1,
111 |         world_size=2,
112 |         wrap_last=False,
113 |         interleave=False,
114 |     ):
115 |         super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last)
116 |         if rank == -1:
117 |             assert False, "should not be here"
118 |             rank = torch.distributed.get_rank()
119 |         self.rank = rank
120 |         self.world_size = world_size
121 |         self.sampler.wrap_around = 0
122 |         self.wrap_around = 0
123 |         self.wrap_last = wrap_last
124 |         self.start_iter = 0
125 |         self.interleave = interleave
126 | 
127 |     def __iter__(self):
128 |         batch = []
129 |         i = 0
130 |         for idx in self.data_iterator(self.sampler, wrap_around=False):
131 |             batch.append(idx)
132 |             if len(batch) == self.batch_size:
133 |                 tbatch = self._batch(batch)
134 |                 if i >= self.start_iter:
135 |                     yield tbatch
136 |                     self.start_iter = 0
137 |                 i += 1
138 |                 batch = []
139 |         batch_len = len(batch)
140 |         if batch_len > 0 and not self.drop_last:
141 |             if self.wrap_last:
142 |                 self.sampler.wrap_around -= self.batch_size
143 |                 self.wrap_around += len(batch)
144 |                 self.wrap_around %= self.batch_size
145 |             yield self._batch(batch)
146 |         if self.wrap_last:
147 |             self.sampler.wrap_around += self.batch_size
148 | 
149 |     def data_iterator(self, _iter, wrap_around=False):
150 |         """iterates through data and handles wrap around"""
151 |         for i, idx in enumerate(_iter):
152 |             if i < self.wrap_around % self.batch_size:
153 |                 continue
154 |             if wrap_around:
155 |                 self.wrap_around += 1
156 |                 self.wrap_around %= self.batch_size
157 |             yield idx
158 | 
159 |     def _batch(self, batch):
160 |         """extracts samples only pertaining to this worker's batch"""
161 |         if self.interleave:
162 |             return batch[self.rank : self.batch_size : self.world_size]
163 |         start = self.rank * self.batch_size // self.world_size
164 |         end = (self.rank + 1) * self.batch_size // self.world_size
165 |         return batch[start:end]
166 | 


--------------------------------------------------------------------------------
/data/get_data_from_hf.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | sys.path.append("..")
  3 | import random
  4 | 
  5 | from utils.common_utils import main_process_first
  6 | from itertools import chain  # noqa: E402
  7 | from datasets import load_dataset, load_from_disk  # noqa: E402
  8 | 
  9 | 
 10 | def get_hf_dataset(args):
 11 |     # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
 12 |     # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
 13 |     # (the dataset will be downloaded automatically from the datasets Hub).
 14 |     #
 15 |     # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
 16 |     # 'text' is found. You can easily tweak this behavior (see below).
 17 |     #
 18 |     # In distributed training, the load_dataset function guarantee that only one local process can concurrently
 19 |     # download the dataset.
 20 |     if args.dataset_name is not None:
 21 |         # Downloading and loading a dataset from the hub.
 22 |         raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
 23 |         if "validation" not in raw_datasets.keys():
 24 |             raw_datasets["validation"] = load_dataset(
 25 |                 args.dataset_name,
 26 |                 args.dataset_config_name,
 27 |                 split=f"train[:{args.validation_split_percentage}%]",
 28 |             )
 29 |             raw_datasets["train"] = load_dataset(
 30 |                 args.dataset_name,
 31 |                 args.dataset_config_name,
 32 |                 split=f"train[{args.validation_split_percentage}%:]",
 33 |             )
 34 |     elif args.dataset_path is not None:
 35 |         raw_datasets = load_from_disk(args.dataset_path)
 36 |     else:
 37 |         data_files = {}
 38 |         dataset_args = {}
 39 |         if args.train_file is not None:
 40 |             data_files["train"] = args.train_file
 41 |         if args.validation_file is not None:
 42 |             data_files["validation"] = args.validation_file
 43 |         extension = args.train_file.split(".")[-1]
 44 |         if extension == "txt":
 45 |             extension = "text"
 46 |             dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
 47 |         raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
 48 |         # If no validation data is there, validation_split_percentage will be used to divide the dataset.
 49 |         if "validation" not in raw_datasets.keys():
 50 |             raw_datasets["validation"] = load_dataset(
 51 |                 extension,
 52 |                 data_files=data_files,
 53 |                 split=f"train[:{args.validation_split_percentage}%]",
 54 |                 **dataset_args,
 55 |             )
 56 |             raw_datasets["train"] = load_dataset(
 57 |                 extension,
 58 |                 data_files=data_files,
 59 |                 split=f"train[{args.validation_split_percentage}%:]",
 60 |                 **dataset_args,
 61 |             )
 62 |     
 63 |     return raw_datasets
 64 | 
 65 |     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
 66 |     # https://huggingface.co/docs/datasets/loading_datasets.html.
 67 | 
 68 | 
 69 | def preprocess_hf_datasets(args, raw_datasets, tokenizer, logger):
 70 |     # Preprocessing the datasets.
 71 |     # First we tokenize all the texts.
 72 |     column_names = raw_datasets["train"].column_names
 73 |     text_column_name = "text" if "text" in column_names else column_names[0]
 74 | 
 75 |     def tokenize_function(examples):
 76 |         return tokenizer(examples[text_column_name])
 77 | 
 78 |     with main_process_first():
 79 |         tokenized_datasets = raw_datasets.map(
 80 |             tokenize_function,
 81 |             batched=True,
 82 |             num_proc=args.preprocessing_num_workers,
 83 |             remove_columns=column_names,
 84 |             load_from_cache_file=not args.overwrite_cache,
 85 |             desc="Running tokenizer on dataset",
 86 |         )
 87 | 
 88 |     if args.block_size is None:
 89 |         block_size = tokenizer.model_max_length
 90 |         if block_size > 1024:
 91 |             logger.warning(
 92 |                 "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
 93 |                 " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
 94 |                 " override this default with `--block_size xxx`."
 95 |             )
 96 |         block_size = 1024
 97 |     else:
 98 |         if args.block_size > tokenizer.model_max_length:
 99 |             logger.warning(
100 |                 f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
101 |                 f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
102 |             )
103 |         block_size = min(args.block_size, tokenizer.model_max_length)
104 | 
105 |     # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
106 |     def group_texts(examples):
107 |         # Concatenate all texts.
108 |         concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
109 |         total_length = len(concatenated_examples[list(examples.keys())[0]])
110 |         # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
111 |         # customize this part to your needs.
112 |         if total_length >= block_size:
113 |             total_length = (total_length // block_size) * block_size
114 |         # Split by chunks of max_len.
115 |         result = {
116 |             k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
117 |             for k, t in concatenated_examples.items()
118 |         }
119 |         result["labels"] = result["input_ids"].copy()
120 |         return result
121 | 
122 |     # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
123 |     # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
124 |     # to preprocess.
125 |     #
126 |     # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
127 |     # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
128 | 
129 |     with main_process_first():
130 |         lm_datasets = tokenized_datasets.map(
131 |             group_texts,
132 |             batched=True,
133 |             num_proc=args.preprocessing_num_workers,
134 |             load_from_cache_file=not args.overwrite_cache,
135 |             desc=f"Grouping texts in chunks of {block_size}",
136 |         )
137 | 
138 |     train_dataset = lm_datasets["train"]
139 |     eval_dataset = lm_datasets["validation"]
140 | 
141 |     # Log a few random samples from the training set:
142 |     # for index in random.sample(range(len(train_dataset)), 3):
143 |     #     logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
144 |     
145 |     return lm_datasets
146 | 


--------------------------------------------------------------------------------
/model/peft/tuner/roem.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (c) 2023 Ant Group. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | import sys
 17 | sys.path.append("..")
 18 | sys.path.append("../..")
 19 | import torch
 20 | import importlib
 21 | from enum import Enum
 22 | from peft.utils import PeftType
 23 | from dataclasses import dataclass, field, asdict
 24 | from typing import Optional, List, Union
 25 | 
 26 | from .pe_base_model import PEBaseModel
 27 | from model.peft.utils import (
 28 |     PetuningConfig,
 29 |     TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING
 30 | )
 31 | from model.peft.utils.others import _freeze_model
 32 | 
 33 | 
 34 | def is_alps_available():
 35 |     return importlib.util.find_spec("alps") is not None
 36 | 
 37 | 
 38 | if is_alps_available():
 39 |     from alps.util import logger
 40 | else:
 41 |     import logging
 42 |     logger = logging.getLogger(__file__)
 43 | 
 44 | 
 45 | class PEROEMModel(PEBaseModel):
 46 |     """
 47 |     只训练模型中间偏上层mlp：参考 https://arxiv.org/pdf/2202.05262.pdf ； https://arxiv.org/abs/2012.14913
 48 |     model: huggingface transformers model
 49 |     tokenizer:  huggingface transformers tokenizer
 50 |     """
 51 | 
 52 |     def __init__(self, model, model_name, task_type=None):
 53 |         self.model = model
 54 |         self.model_name = model_name
 55 | 
 56 |     def get_model(self):
 57 |         layer_mapping = TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING[self.model_name]
 58 |         assert len(layer_mapping) == 2
 59 |         not_freeze_param_name = []
 60 |         for i in range(layer_mapping[0], layer_mapping[1]):
 61 |             no_freeze_name = str(i) + ".mlp"
 62 |             logger.info(f"Freeze the {no_freeze_name} layer of model")
 63 |             not_freeze_param_name.append(no_freeze_name)
 64 |         set_parameter_requires_grad(self.model, not_freeze_param_name)
 65 |         return self.model
 66 | 
 67 |     @classmethod
 68 |     def restore(self, model=None, path=None):
 69 |         logger.info("roem不需要额外加载参数")
 70 |         return model
 71 | 
 72 | 
 73 | # 根据名称锁定参数层
 74 | def set_parameter_requires_grad(model, freeze_param_name=[]):
 75 |     if not isinstance(freeze_param_name, list):
 76 |         freeze_param_name = [freeze_param_name]
 77 | 
 78 |     for idx, (name, param) in enumerate(model.named_parameters()):
 79 |         for p in freeze_param_name:
 80 |             if p not in name:
 81 |                 param.requires_grad = False
 82 |         # 打印参数层名
 83 |     for idx, (name, param) in enumerate(model.named_parameters()):
 84 |         for p in freeze_param_name:
 85 |             if p in name:
 86 |                 print("The name of used parameter used by ROEM is:")
 87 |                 print(name)
 88 |                 param.requires_grad = True
 89 | 
 90 | 
 91 | @dataclass
 92 | class PeftROEMConfig(PetuningConfig):
 93 |     """
 94 |     This is the configuration class to store the configuration of a [`PeftROEMModel`].
 95 | 
 96 |     Args:
 97 |         target_layers (`Union[List[int], int]`): The names of the modules to apply Lora to.
 98 |     """
 99 | 
100 |     target_layers: Optional[Union[List[int], int]] = field(
101 |         default=None,
102 |         metadata={
103 |             "help": "List of layers of the model to freeze the parameters."
104 |             "For example, [20, 30] or '30' "
105 |         },
106 |     )
107 | 
108 |     def __post_init__(self):
109 |         self.peft_type = PeftType.ROEM
110 | 
111 | 
112 | class PeftROEMModel(torch.nn.Module):
113 |     """
114 |     Creates ROEM model for ant peft.
115 | 
116 |     Args:
117 |         model ([`~transformers.PreTrainedModel`]): The model to be freeze with some layers.
118 |         config ([`PeftROEMConfig`]): The configuration of the ROEM model.
119 | 
120 |     Returns:
121 |         `torch.nn.Module`: The ROEM model.
122 | 
123 |     Example:
124 | 
125 |         ```python
126 |         >>> from solutions.antllm.antllm.models.glm.modeling_glm import GLMForConditionalGeneration
127 |         >>> from solutions.antllm.antllm.models.peft.tuner import PeftROEMConfig, PeftROEMModel
128 |         >>> from peft import LoraModel, LoraConfig
129 | 
130 |         >>> config = PeftROEMConfig(
131 |         ...     target_layers=[17, 22],
132 |         ... )
133 | 
134 |         >>> model = GLMForConditionalGeneration.from_pretrained("path_to_model")
135 |         >>> roem_model = PeftROEMModel(config, model)
136 |         ```
137 | 
138 |     **Attributes**:
139 |         - **model** ([`~transformers.PreTrainedModel`]) -- The model to be freezed.
140 |         - **peft_config** ([`PeftROEMConfig`]): The configuration of the ROEM model.
141 |     """
142 | 
143 |     def __init__(self, model, config, adapter_name):
144 |         super().__init__()
145 |         self.model = model
146 | 
147 |         self.forward = self.model.forward
148 |         self.peft_config = config
149 |         self.add_adapter(adapter_name, self.peft_config[adapter_name])
150 | 
151 |     def add_adapter(self, adapter_name, config=None):
152 |         if not isinstance(config, PeftROEMConfig):
153 |             raise ValueError(
154 |                 f"The PeftROEMModel need PeftROEMConfig, but get {type(config)}."
155 |             )
156 | 
157 |         model_config = self.model.config.to_dict() if hasattr(self.model.config, "to_dict") else self.model.config
158 |         if config is not None:
159 |             config = self._prepare_lora_config(config, model_config)
160 |             self.peft_config[adapter_name] = config
161 | 
162 |         if len(self.peft_config) > 1:
163 |             raise ValueError(
164 |                 "ROEMModel supports only 1 peft config or name."
165 |                 "Because it only freeze the shallow layers without any additional parameters."
166 |             )
167 | 
168 |         model_name = model_config["model_type"]
169 |         self.model = PEROEMModel(self.model, model_name).get_model()
170 | 
171 |         if self.peft_config[adapter_name].inference_mode:
172 |             _freeze_model(self.model)
173 | 
174 |     @staticmethod
175 |     def _prepare_lora_config(peft_config, model_config):
176 |         if peft_config.target_layers is None:
177 |             if model_config["model_type"] not in TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING:
178 |                 raise ValueError("Please specify `target_layers` in `peft_config`")
179 |             peft_config.target_layers = TRANSFORMERS_MODELS_ROME_LAYER_MODULES_MAPPING[model_config["model_type"]]
180 |         if peft_config.inference_mode:
181 |             peft_config.merge_weights = True
182 |         return peft_config
183 | 
184 |     def __getattr__(self, name: str):
185 |         """Forward missing attributes to the wrapped module."""
186 |         try:
187 |             return super().__getattr__(name)  # defer to nn.Module's logic
188 |         except AttributeError:
189 |             return getattr(self.model, name)
190 | 
191 |     def get_peft_config_as_dict(self, inference: bool = False):
192 |         config_dict = {}
193 |         for key, value in self.peft_config.items():
194 |             config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()}
195 |             if inference:
196 |                 config["inference_mode"] = True
197 |         config_dict[key] = config
198 |         return config


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 | <img src="https://github.com/codefuse-ai/Collinear-Constrained-Attention/blob/master/assets/logo.png" width="540px">
  3 | </p>
  4 | 
  5 | <p align="center">
  6 | 🤗 <a href="https://huggingface.co/codefuse-ai/Collinear-Constrained-Attention" target="_blank">Hugging Face(is coming)</a> 
  7 | • 
  8 | 🤖 <a href="https://modelscope.cn/models/codefuse-ai/Collinear-Constrained-Attention/summary" target="_blank">ModelScope(is coming)</a> 
  9 |   • 
 10 | 📄 <a href="https://arxiv.org/abs/2309.08646" target="_blank">Paper</a>
 11 | </p>
 12 | 
 13 | <div align="center">
 14 | 
 15 | [![GitHub issues](https://img.shields.io/github/issues/codefuse-ai/Collinear-Constrained-Attention)](https://github.com/codefuse-ai/Collinear-Constrained-Attention/issues)
 16 | [![GitHub Repo stars](https://img.shields.io/github/stars/codefuse-ai/Collinear-Constrained-Attention?style=social)](https://github.com/codefuse-ai/Collinear-Constrained-Attention)
 17 | 
 18 | </div>
 19 | 
 20 | [comment]: <> ([<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Weights & Biases monitoring" height=20>]&#40;https://wandb.ai/eleutherai/neox&#41;)
 21 | 
 22 | This repository provides an implementation of [CoCA (Collinear Constrained Attention)](https://arxiv.org/abs/2309.08646). This implementation is based on 2 transformer models in [Hugging Face]().
 23 | 
 24 | - [GPT-NeoX](https://github.com/huggingface/transformers/tree/main/src/transformers/models/gpt_neox) which is an [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs.
 25 | - [LLaMA](https://github.com/huggingface/transformers/tree/main/src/transformers/models/llama) from Meta AI team.
 26 | 
 27 | We just point out those modifications which made to implement CoCA here. For more information about model training and inference, we recommend [transformers](https://github.com/huggingface/transformers).
 28 | 
 29 | For practicality, we enhanced CoCA's computational and spatial efficiency with [opt_einsum](https://github.com/dgasmith/opt_einsum), view this repository for more information.
 30 | 
 31 | ![Model Structure](https://github.com/codefuse-ai/Collinear-Constrained-Attention/blob/master/assets/model.png "Model Structure")
 32 | 
 33 | ![PPL Performance](https://github.com/codefuse-ai/Collinear-Constrained-Attention/blob/master/assets/PPL.png "PPL Performance") ![Passkey Performance](https://github.com/codefuse-ai/Collinear-Constrained-Attention/blob/master/assets/passkey.png "Passkey Performance")
 34 | 
 35 | [comment]: <> (<img src="https://github.com/codefuse-ai/Collinear-Constrained-Attention/blob/master/assets/PPL.png" width="210px">)
 36 | 
 37 | ## 🚀 Quick Start
 38 | 
 39 | ### 💻 Environment
 40 | Atorch is an optimized torch version by Ant Group, it's not available for opensource community yet. It will be opensource in near future. Before that, you may use origin torch version instead.
 41 | 
 42 | ### 📂 Datasets
 43 | You can use raw data or tokenized data for training.
 44 | 
 45 | When using raw data, please ensure the data format as:
 46 | ```json
 47 | {"content" : "It is a sentence for training."}
 48 | ```
 49 | using `.jsonl` for saving data.
 50 | 
 51 | You can also use tokenized data saving in `.bin` via [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) tokenizer.
 52 | ```bash
 53 | python ./data/tokenization/generate_dataset.py
 54 | ```
 55 | notice to modify `input_dict`, `conver_type_list`, `output_name`, `seq_length` for your own dataset.
 56 | 
 57 | ### 🏋️‍♂️ Training
 58 | You can train a model from scratch as follows:
 59 | ```bash
 60 | bash ./train/run_coca.sh 32 1 8 2
 61 | ```
 62 | 
 63 | - first parameter means `per gpu batch size`
 64 | - second parameter means `tensor parallel`(larger than 1 is not supported yet)
 65 | - third parameter means `data parallel`, equals to the number of GPUs
 66 | - last parameter means `train epochs`
 67 | 
 68 | If you want to load a pre-trained model, set `--pretrained_model_path $PRETRAINED_MODEL_PATH \`.
 69 | 
 70 | ### 🧠 Inference
 71 | CoCA can be loaded using the `transformers` functionality:
 72 | 
 73 | ```python
 74 | from model.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXConfig
 75 | from transformers import AutoTokenizer
 76 | from transformers import GenerationConfig
 77 | 
 78 | config = GPTNeoXConfig.from_pretrained(checkpoint)
 79 | config.is_decoder = True
 80 | 
 81 | # If you want to inference out of training length, 
 82 | # CoCA is compatible with NTK-aware scaled RoPE and performs much more better than original attention structure
 83 | rope_scaling= {"type": "dynamic", "factor": 4.0}
 84 | config.rope_scaling = rope_scaling
 85 | 
 86 | model = GPTNeoXForCausalLM.from_pretrained(checkpoint, 
 87 |                                            config=config, 
 88 |                                            device_map="auto")
 89 | 
 90 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")
 91 | tokenizer.add_special_tokens({'eos_token': "<|endoftext|>"})
 92 | tokenizer.add_special_tokens({'pad_token': "<|pad|>"})
 93 | ```
 94 | 
 95 | ## 📝 Administrative Notes
 96 | 
 97 | ### 📚 Citing CoCA
 98 | 
 99 | If you have found the CoCA library helpful in your work, you can cite this repository as
100 | 
101 | ```bibtex
102 | @inproceedings{zhu2024coca,
103 |     title={CoCA: Fusing Position Embedding with Collinear Constrained Attention in Transformers for Long Context Window Extending}, 
104 |     author={Shiyi Zhu and Jing Ye and Wei Jiang and Siqiao Xue and Qi Zhang and Yifan Wu and Jianguo Li},
105 |     booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics},
106 |     month = august,
107 |     year = {2024},
108 |     publisher = {Association for Computational Linguistics},
109 | }
110 | ```
111 | 
112 | ### 📜 Licensing
113 | 
114 | This repository hosts code of CoCA project. Copyright (c) 2023, Ant Group. Licensed under the Apache License:
115 | 
116 |     Licensed under the Apache License, Version 2.0 (the "License");
117 |     you may not use this file except in compliance with the License.
118 |     You may obtain a copy of the License at
119 |     
120 |         http://www.apache.org/licenses/LICENSE-2.0
121 |     
122 |     Unless required by applicable law or agreed to in writing, software
123 |     distributed under the License is distributed on an "AS IS" BASIS,
124 |     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
125 |     See the License for the specific language governing permissions and
126 |     limitations under the License.
127 | 
128 | This repository is based off code written by EleutherAI that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by EleutherAI maintain a EleutherAI copyright header. When the EleutherAI code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License.
129 | 
130 | This repository is based off code written by Meta AI that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by Meta AI maintain a Meta AI copyright header. When the Meta AI code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License.
131 | 
132 | This repository is based off code written by NVIDIA that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by NVIDIA maintain a NVIDIA copyright header. All files that do not contain such a header are the exclusive copyright of EleutherAI. When the NVIDIA code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License.
133 | 
134 | This repository also contains code written by a number of other authors. Such contributions are marked and the relevant licensing is included where appropriate.
135 | 
136 | For full terms, see the `LICENSE` file. If you have any questions, comments, or concerns about licensing please email me at zhushiyi.zsy@antgroup.com.
137 | 


--------------------------------------------------------------------------------
/model/gpt_neox/configuration_gpt_neox.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (c) 2023 Ant Group
  3 | # This file is based on code by the authors denoted below and has been modified from its original version.
  4 | #
  5 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
  6 | #
  7 | # Licensed under the Apache License, Version 2.0 (the "License");
  8 | # you may not use this file except in compliance with the License.
  9 | # You may obtain a copy of the License at
 10 | #
 11 | #     http://www.apache.org/licenses/LICENSE-2.0
 12 | #
 13 | # Unless required by applicable law or agreed to in writing, software
 14 | # distributed under the License is distributed on an "AS IS" BASIS,
 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 16 | # See the License for the specific language governing permissions and
 17 | # limitations under the License.
 18 | """ GPTNeoX model configuration"""
 19 | 
 20 | from transformers.configuration_utils import PretrainedConfig
 21 | from transformers.utils import logging
 22 | 
 23 | 
 24 | logger = logging.get_logger(__name__)
 25 | 
 26 | GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 27 |     "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json",
 28 |     # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox
 29 | }
 30 | 
 31 | 
 32 | class GPTNeoXConfig(PretrainedConfig):
 33 |     r"""
 34 |     This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an
 35 |     GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration
 36 |     with the defaults will yield a similar configuration to that of the GPTNeoX
 37 |     [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture.
 38 | 
 39 |     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
 40 |     documentation from [`PretrainedConfig`] for more information.
 41 | 
 42 | 
 43 |     Args:
 44 |         vocab_size (`int`, *optional*, defaults to 50432):
 45 |             Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the
 46 |             `inputs_ids` passed when calling [`GPTNeoXModel`].
 47 |         hidden_size (`int`, *optional*, defaults to 6144):
 48 |             Dimension of the encoder layers and the pooler layer.
 49 |         num_hidden_layers (`int`, *optional*, defaults to 44):
 50 |             Number of hidden layers in the Transformer encoder.
 51 |         num_attention_heads (`int`, *optional*, defaults to 64):
 52 |             Number of attention heads for each attention layer in the Transformer encoder.
 53 |         intermediate_size (`int`, *optional*, defaults to 24576):
 54 |             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 55 |         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
 56 |             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
 57 |             `"relu"`, `"selu"` and `"gelu_new"` are supported.
 58 |         rotary_pct (`float`, *optional*, defaults to 0.25):
 59 |             percentage of hidden dimensions to allocate to rotary embeddings
 60 |         rotary_emb_base (`int`, *optional*, defaults to 10000)
 61 |             base for computing rotary embeddings frequency
 62 |         max_position_embeddings (`int`, *optional*, defaults to 2048):
 63 |             The maximum sequence length that this model might ever be used with. Typically set this to something large
 64 |             just in case (e.g., 512 or 1024 or 2048).
 65 |         initializer_range (`float`, *optional*, defaults to 1e-5):
 66 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 67 |         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
 68 |             The epsilon used by the layer normalization layers.
 69 |         use_cache (`bool`, *optional*, defaults to `True`):
 70 |             Whether or not the model should return the last key/values attentions (not used by all models). Only
 71 |             relevant if `config.is_decoder=True`.
 72 |         use_parallel_residual (`bool`, *optional*, defaults to `True`):
 73 |             Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training
 74 |             speedup at large scales (e.g. 20B).
 75 |         rope_scaling (`Dict`, *optional*):
 76 |             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling
 77 |             strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
 78 |             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
 79 |             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
 80 |             these scaling strategies behave:
 81 |             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
 82 |             experimental feature, subject to breaking API changes in future versions.
 83 |         Example:
 84 | 
 85 |     ```python
 86 |     >>> from transformers import GPTNeoXConfig, GPTNeoXModel
 87 | 
 88 |     >>> # Initializing a GPTNeoX gpt-neox-20b style configuration
 89 |     >>> configuration = GPTNeoXConfig()
 90 | 
 91 |     >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration
 92 |     >>> model = GPTNeoXModel(configuration)  # doctest: +SKIP
 93 | 
 94 |     >>> # Accessing the model configuration
 95 |     >>> configuration = model.config  # doctest: +SKIP
 96 |     ```"""
 97 |     model_type = "gpt_neox"
 98 | 
 99 |     def __init__(
100 |         self,
101 |         vocab_size=50432,
102 |         hidden_size=6144,
103 |         num_hidden_layers=44,
104 |         num_attention_heads=64,
105 |         intermediate_size=24576,
106 |         hidden_act="gelu",
107 |         rotary_pct=0.25,
108 |         rotary_emb_base=10000,
109 |         max_position_embeddings=2048,
110 |         initializer_range=0.02,
111 |         layer_norm_eps=1e-5,
112 |         use_cache=True,
113 |         bos_token_id=0,
114 |         eos_token_id=2,
115 |         tie_word_embeddings=False,
116 |         use_parallel_residual=True,
117 |         rope_scaling=None,
118 |         **kwargs
119 |     ):
120 |         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
121 |         self.vocab_size = vocab_size
122 |         self.max_position_embeddings = max_position_embeddings
123 |         self.hidden_size = hidden_size
124 |         self.num_hidden_layers = num_hidden_layers
125 |         self.num_attention_heads = num_attention_heads
126 |         self.intermediate_size = intermediate_size
127 |         self.hidden_act = hidden_act
128 |         self.rotary_pct = rotary_pct
129 |         self.rotary_emb_base = rotary_emb_base
130 |         self.initializer_range = initializer_range
131 |         self.layer_norm_eps = layer_norm_eps
132 |         self.use_cache = use_cache
133 |         self.tie_word_embeddings = tie_word_embeddings
134 |         self.use_parallel_residual = use_parallel_residual
135 |         self.rope_scaling = rope_scaling
136 |         self._rope_scaling_validation()
137 | 
138 |         if self.hidden_size % self.num_attention_heads != 0:
139 |             raise ValueError(
140 |                 "The hidden size is not divisble by the number of attention heads! Make sure to update them!"
141 |             )
142 | 
143 |     # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
144 |     def _rope_scaling_validation(self):
145 |         """
146 |         Validate the `rope_scaling` configuration.
147 |         """
148 |         if self.rope_scaling is None:
149 |             return
150 | 
151 |         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
152 |             raise ValueError(
153 |                 "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, "
154 |                 f"got {self.rope_scaling}"
155 |             )
156 |         rope_scaling_type = self.rope_scaling.get("type", None)
157 |         rope_scaling_factor = self.rope_scaling.get("factor", None)
158 |         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
159 |             raise ValueError(
160 |                 f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
161 |             )
162 |         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
163 |             raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")
164 | 


--------------------------------------------------------------------------------
/model/llama/configuration_llama.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
  3 | #
  4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
  5 | # and OPT implementations in this library. It has been modified from its
  6 | # original forms to accommodate minor architectural differences compared
  7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
  8 | #
  9 | # Licensed under the Apache License, Version 2.0 (the "License");
 10 | # you may not use this file except in compliance with the License.
 11 | # You may obtain a copy of the License at
 12 | #
 13 | #     http://www.apache.org/licenses/LICENSE-2.0
 14 | #
 15 | # Unless required by applicable law or agreed to in writing, software
 16 | # distributed under the License is distributed on an "AS IS" BASIS,
 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 18 | # See the License for the specific language governing permissions and
 19 | # limitations under the License.
 20 | """ LLaMA model configuration"""
 21 | 
 22 | from transformers.configuration_utils import PretrainedConfig
 23 | from transformers.utils import logging
 24 | 
 25 | 
 26 | logger = logging.get_logger(__name__)
 27 | 
 28 | LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 29 | 
 30 | 
 31 | class LlamaConfig(PretrainedConfig):
 32 |     r"""
 33 |     This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
 34 |     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
 35 |     defaults will yield a similar configuration to that of the LLaMA-7B.
 36 | 
 37 |     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
 38 |     documentation from [`PretrainedConfig`] for more information.
 39 | 
 40 | 
 41 |     Args:
 42 |         vocab_size (`int`, *optional*, defaults to 32000):
 43 |             Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
 44 |             `inputs_ids` passed when calling [`LlamaModel`]
 45 |         hidden_size (`int`, *optional*, defaults to 4096):
 46 |             Dimension of the hidden representations.
 47 |         intermediate_size (`int`, *optional*, defaults to 11008):
 48 |             Dimension of the MLP representations.
 49 |         num_hidden_layers (`int`, *optional*, defaults to 32):
 50 |             Number of hidden layers in the Transformer encoder.
 51 |         num_attention_heads (`int`, *optional*, defaults to 32):
 52 |             Number of attention heads for each attention layer in the Transformer encoder.
 53 |         num_key_value_heads (`int`, *optional*):
 54 |             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
 55 |             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
 56 |             `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
 57 |             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
 58 |             by meanpooling all the original heads within that group. For more details checkout [this
 59 |             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
 60 |             `num_attention_heads`.
 61 |         pretraining_tp (`int`, *optional*, defaults to `1`):
 62 |             Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
 63 |             document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
 64 |             necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
 65 |             issue](https://github.com/pytorch/pytorch/issues/76232).
 66 |         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
 67 |             The non-linear activation function (function or string) in the decoder.
 68 |         max_position_embeddings (`int`, *optional*, defaults to 2048):
 69 |             The maximum sequence length that this model might ever be used with. Typically set this to something large
 70 |             just in case (e.g., 512 or 1024 or 2048).
 71 |         initializer_range (`float`, *optional*, defaults to 0.02):
 72 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 73 |         rms_norm_eps (`float`, *optional*, defaults to 1e-12):
 74 |             The epsilon used by the rms normalization layers.
 75 |         use_cache (`bool`, *optional*, defaults to `True`):
 76 |             Whether or not the model should return the last key/values attentions (not used by all models). Only
 77 |             relevant if `config.is_decoder=True`.
 78 |         tie_word_embeddings(`bool`, *optional*, defaults to `False`):
 79 |             Whether to tie weight embeddings
 80 |         rope_scaling (`Dict`, *optional*):
 81 |             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
 82 |             strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
 83 |             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
 84 |             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
 85 |             these scaling strategies behave:
 86 |             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
 87 |             experimental feature, subject to breaking API changes in future versions.
 88 | 
 89 |         Example:
 90 | 
 91 |     ```python
 92 |     >>> from transformers import LlamaModel, LlamaConfig
 93 | 
 94 |     >>> # Initializing a LLaMA llama-7b style configuration
 95 |     >>> configuration = LlamaConfig()
 96 | 
 97 |     >>> # Initializing a model from the llama-7b style configuration
 98 |     >>> model = LlamaModel(configuration)
 99 | 
100 |     >>> # Accessing the model configuration
101 |     >>> configuration = model.config
102 |     ```"""
103 |     model_type = "llama"
104 |     keys_to_ignore_at_inference = ["past_key_values"]
105 | 
106 |     def __init__(
107 |         self,
108 |         vocab_size=32000,
109 |         hidden_size=4096,
110 |         intermediate_size=11008,
111 |         num_hidden_layers=32,
112 |         num_attention_heads=32,
113 |         num_key_value_heads=None,
114 |         hidden_act="silu",
115 |         max_position_embeddings=2048,
116 |         initializer_range=0.02,
117 |         rms_norm_eps=1e-6,
118 |         use_cache=True,
119 |         pad_token_id=None,
120 |         bos_token_id=1,
121 |         eos_token_id=2,
122 |         pretraining_tp=1,
123 |         tie_word_embeddings=False,
124 |         rope_scaling=None,
125 |         **kwargs,
126 |     ):
127 |         self.vocab_size = vocab_size
128 |         self.max_position_embeddings = max_position_embeddings
129 |         self.hidden_size = hidden_size
130 |         self.intermediate_size = intermediate_size
131 |         self.num_hidden_layers = num_hidden_layers
132 |         self.num_attention_heads = num_attention_heads
133 | 
134 |         # for backward compatibility
135 |         if num_key_value_heads is None:
136 |             num_key_value_heads = num_attention_heads
137 | 
138 |         self.num_key_value_heads = num_key_value_heads
139 |         self.hidden_act = hidden_act
140 |         self.initializer_range = initializer_range
141 |         self.rms_norm_eps = rms_norm_eps
142 |         self.pretraining_tp = pretraining_tp
143 |         self.use_cache = use_cache
144 |         self.rope_scaling = rope_scaling
145 |         self._rope_scaling_validation()
146 | 
147 |         super().__init__(
148 |             pad_token_id=pad_token_id,
149 |             bos_token_id=bos_token_id,
150 |             eos_token_id=eos_token_id,
151 |             tie_word_embeddings=tie_word_embeddings,
152 |             **kwargs,
153 |         )
154 | 
155 |     def _rope_scaling_validation(self):
156 |         """
157 |         Validate the `rope_scaling` configuration.
158 |         """
159 |         if self.rope_scaling is None:
160 |             return
161 | 
162 |         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
163 |             raise ValueError(
164 |                 "`rope_scaling` must be a dictionary with with two fields, `name` and `factor`, "
165 |                 f"got {self.rope_scaling}"
166 |             )
167 |         rope_scaling_type = self.rope_scaling.get("type", None)
168 |         rope_scaling_factor = self.rope_scaling.get("factor", None)
169 |         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
170 |             raise ValueError(
171 |                 f"`rope_scaling`'s name field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
172 |             )
173 |         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
174 |             raise ValueError(f"`rope_scaling`'s factor field must be an float > 1, got {rope_scaling_factor}")


--------------------------------------------------------------------------------
/tools/analysis/MMapTokenIdsBinChecker.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (c) 2023 Ant Group. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | import sys
 17 | sys.path.append("..")
 18 | sys.path.append("../..")
 19 | 
 20 | import os
 21 | import struct
 22 | from transformers import PreTrainedTokenizerFast
 23 | import random
 24 | import numpy as np
 25 | from model.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast
 26 | 
 27 | tokenizer_vocab_file = '/mnt/user/bingchang/multisft/code/13b/code/v1-old/gpt-neox-2.0-sft-6b/tokenizer-ant-v5.json'
 28 | 
 29 | table = {ord(f): ord(t) for f, t in zip(
 30 |          u'，。！？：【】（）％＃＠＆１２３４５６７８９０',
 31 |          u',.!?:[]()%#@&1234567890')}
 32 | 
 33 | 
 34 | def punctuation_format(text):
 35 |     # Replace non-breaking space with space
 36 |     text = text.strip() + '\n'
 37 |     text = text.replace('\u202f', ' ').replace('\xa0', ' ')
 38 |     # change chinese punctuation to english ones
 39 |     text = text.translate(table)
 40 |     return text
 41 | 
 42 | 
 43 | def save_to_file(file_path, text):
 44 |     """
 45 |     写给定的<text>追加写入到<file_path>文件中
 46 |     """
 47 |     with open(file_path, 'a') as f:
 48 |         f.write(f'{text}')
 49 | 
 50 | 
 51 | def detokenize(input_ids, tokenizer, padding_token=None):
 52 |     """
 53 |     使用给定的<tokenizer>对给定的token id列表<input_ids>进行解码，如果给定了padding_token，则将padding部分移除
 54 |     """
 55 |     result = tokenizer.decode(input_ids)
 56 |     if padding_token and padding_token in result:
 57 |         result = result[:result.index(padding_token)]
 58 |     return result
 59 | 
 60 | 
 61 | def convert_bytes_to_elements(byte_data, dtype):
 62 |     """
 63 |     将字节数组转为对应数据类型数组
 64 |     """
 65 |     result = np.frombuffer(byte_data, dtype=dtype)
 66 |     return [x for x in result]
 67 | 
 68 | 
 69 | class MMapTokenIdsBinChecker:
 70 |     """
 71 |     检查GPT Neox MMAP方式生成的input_ids.bin文件
 72 |     """
 73 |     
 74 |     # 用于检查的随机采样数量
 75 |     _SAMPLING_NUM = 100
 76 | 
 77 |     _SEED = 202306192219
 78 | 
 79 |     _PADING_TOKEN = "<|pad|>"
 80 | 
 81 |     def __init__ (self, input_ids_bin_path:str, loss_mask_bin_path:str, tokenizer_path:str, detokenize_output_path:str, seq_len:int, element_size:int, dtype:np.dtype, sample_total:int, ramdom_sampling_num:int):
 82 |         assert os.path.exists(input_ids_bin_path), (
 83 |             "给定的input_ids.bin文件路径不存在"
 84 |             "请确保给定的路径是存在的"
 85 |         )
 86 |         assert os.path.isfile(input_ids_bin_path), (
 87 |             "给定的input_ids.bin文件不是一个文件"
 88 |             "请确保给定的是一个GPT Neox MMAP方式生成的input_ids.bin文件"
 89 |         )
 90 |         assert os.path.exists(loss_mask_bin_path) and os.path.isfile(loss_mask_bin_path), (
 91 |             "给定的loss_mask.bin文件路径不存在或者非文件"
 92 |             "请确保给定有效的loss_mask.bin文件路径"
 93 |         )
 94 |         assert os.path.exists(tokenizer_path) and os.path.isfile(tokenizer_path), (
 95 |             "给定的词表文件不存在或者不是一个文件"
 96 |             "请确保给定有效的词表文件路径"
 97 |         )
 98 | 
 99 |         self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
100 | 
101 |         self._SAMPLING_NUM = ramdom_sampling_num
102 |         
103 |         sampled_input_ids = []
104 |         sampled_loss_masks = []
105 |         sampled_indexes = []
106 |         with open(input_ids_bin_path, 'rb') as fb, open(loss_mask_bin_path, 'rb') as f_lm:
107 |             # 随机选取若干个样本，以进行detokenization验证和loss mask验证
108 |             random.seed(self._SEED)
109 |             random_indexes = random.sample(range(0, sample_total), min(self._SAMPLING_NUM, sample_total))
110 |             print('随机采样样本索引为：', random_indexes)
111 |             # 依次处理每个取样的样本
112 |             for i in random_indexes:
113 |                 # 通过设定文件offset位置，读取取样的一个样本
114 |                 reset_pos = max(0, i-1)*seq_len*element_size
115 |                 fb.seek(reset_pos)
116 |                 data = fb.read(element_size*seq_len)
117 |                 # 将样本从byte序列转为int序列
118 |                 token_ids = convert_bytes_to_elements(data, dtype)
119 |                 sampled_input_ids.append(token_ids)
120 |                 text = detokenize(token_ids, self.tokenizer, self._PADING_TOKEN)
121 |                 # 保存到文件中供人工校验
122 |                 save_to_file(detokenize_output_path, '\n' + '[' + str(i) + ']' + '=*='*30 + '\n')
123 |                 save_to_file(detokenize_output_path, f"{text}\n")
124 |                 # 读取样本对应的loss_mask，用于检查是否只有<bot>部分的loss mask为1
125 |                 f_lm.seek(reset_pos)
126 |                 loss_mask_data = convert_bytes_to_elements(f_lm.read(seq_len*element_size), dtype)
127 |                 sampled_loss_masks.append(loss_mask_data)
128 | 
129 |                 # my_text = punctuation_format(text)
130 |                 # my_tokenizer = GPTNeoXTokenizerFast.from_pretrained("/mnt/user/fuhang/checkpoints/neox-2.0-125m-sst-0614/hf_ckpt")
131 |                 # my_tokenizer.eod_token = "<|endoftext|>"
132 |                 # my_tokenizer.pad_token = "<|extratoken_1|>"
133 |                 # my_tokenizer.sop_token = "<|endoftext|>"  # 适配multi task dataset
134 |                 # my_tokenizer.eop_token = "<|endoftext|>"
135 |                 # my_tokenizer.eod_id = my_tokenizer.convert_tokens_to_ids(my_tokenizer.eod_token)
136 |                 # my_tokenizer.pad_id = my_tokenizer.convert_tokens_to_ids(my_tokenizer.pad_token)
137 |                 # my_token_ids = my_tokenizer(my_text)['input_ids']
138 | 
139 |                 # sampled_indexes.append(i)
140 |                 # if i == 1926485:
141 |                 #     print('\n\n', '=*='*50, '\n', token_ids)
142 |                 #     print(i, text)
143 | 
144 |                 # print('\n\n', '=*='*50, '\n', token_ids)
145 |                 print('\n\n', '=*='*50, '\n')
146 |                 print('token ids: ')
147 |                 print(token_ids)
148 |                 print('loss mask:')
149 |                 print(loss_mask_data)
150 |                 # print('\n\n', '=*='*50, '\n')
151 |                 # print('my token ids: ')
152 |                 # print(my_token_ids)
153 |                 # print(i)
154 |                 # print(text)
155 | 
156 |                 
157 |         self._sampled_input_ids = sampled_input_ids
158 |         self._sampled_loss_masks = sampled_loss_masks
159 |         self._sampled_indexes = sampled_indexes
160 | 
161 |     
162 |     def check_loss_mask(self):
163 |         """
164 |         检查是否只有bot角色的内容对应的loss mask为1
165 |         """
166 |         for i in range(len(self._sampled_input_ids)):
167 |             sampled_input_ids = self._sampled_input_ids[i]
168 |             sampled_loss_mask = self._sampled_loss_masks[i]
169 | 
170 |             #print(i, 'input_ids', sampled_input_ids)
171 |             #print('\n')
172 |             #print(i, 'loss mask', len(sampled_loss_mask), sampled_loss_mask)
173 |             #print('\n\n', '=*='*30)
174 | 
175 |             # 找出loss mask为1的片段
176 |             pieces = []
177 |             if 1 not in sampled_loss_mask:
178 |                 print(f'\033[1;31;47m【异常】样本{self._sampled_indexes[i]} loss mask全为0\033[0m')
179 |                 print('detokenizee', detokenize(sampled_input_ids, self.tokenizer, self._PADING_TOKEN))
180 |                 print('input_ids', sampled_input_ids)
181 |                 return False
182 |             if 0 not in sampled_loss_mask:
183 |                 print(f'\033[1;31;47m【异常】样本{self._sampled_indexes[i]} loss mask全为1\033[0m')
184 |                 return False
185 |     
186 |             start_index = sampled_loss_mask.index(1)
187 |             accul_index = 0
188 |             while start_index > -1:
189 |                 #print('start_index', start_index)
190 |                 
191 |                 if 0 in sampled_loss_mask[start_index:]:
192 |                     end_index = sampled_loss_mask[start_index:].index(0)
193 |                     end_index = len(sampled_loss_mask)
194 |                 else:
195 |                     print(self._sampled_loss_masks[i])
196 |                     end_index = start_index + end_index
197 |                 #print('end_index', end_index)
198 | 
199 |                 pieces.append((accul_index + start_index, accul_index + end_index))
200 | 
201 |                 sampled_input_ids = sampled_input_ids[end_index:]
202 |                 sampled_loss_mask = sampled_loss_mask[end_index:]
203 |                 accul_index += end_index
204 | 
205 |                 if 1 not in sampled_loss_mask:
206 |                     break
207 |                 start_index = sampled_loss_mask.index(1)
208 |             
209 | 
210 |             # 检查每段loss mask为1的数据对应的token ids之前的三个词是否是<|role_start|>bot<|role_end|>，最后一个词是否是<|end|>
211 |             for piece in pieces:
212 |                 token_ids_piece = self._sampled_input_ids[i][max(0, piece[0]-3):piece[1]]
213 |                 text_piece = detokenize(token_ids_piece, self.tokenizer, self._PADING_TOKEN)
214 |                 if not text_piece.startswith("<|role_start|>bot<|role_end|>") or not text_piece.endswith('<|end|>'):
215 |                     print(f'\033[1;31;47m【异常】样本{self._sampled_indexes[i]}存在loss mask为1但对应的不是bot片段:{text_piece}\033[0m')
216 |                     return False
217 |                 
218 |         return True
219 |     
220 | 
221 |     def check(self):
222 |         return self.check_loss_mask()
223 | 


--------------------------------------------------------------------------------
/model/llama/tokenization_llama_fast.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2020 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | import os
 16 | from shutil import copyfile
 17 | from typing import TYPE_CHECKING, Optional, Tuple
 18 | 
 19 | from tokenizers import processors
 20 | 
 21 | from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
 22 | from transformers.utils import is_sentencepiece_available, logging
 23 | from transformers.utils.versions import require_version
 24 | 
 25 | 
 26 | if TYPE_CHECKING:
 27 |     from transformers.pipelines.conversational import Conversation
 28 | 
 29 | require_version("tokenizers>=0.13.3")
 30 | 
 31 | if is_sentencepiece_available():
 32 |     from .tokenization_llama import LlamaTokenizer
 33 | else:
 34 |     LlamaTokenizer = None
 35 | 
 36 | logger = logging.get_logger(__name__)
 37 | VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
 38 | 
 39 | B_INST, E_INST = "[INST]", "[/INST]"
 40 | B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
 41 | 
 42 | # fmt: off
 43 | DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
 44 | answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
 45 |  that your responses are socially unbiased and positive in nature.
 46 | 
 47 | If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
 48 | correct. If you don't know the answer to a question, please don't share false information."""
 49 | # fmt: on
 50 | 
 51 | 
 52 | class LlamaTokenizerFast(PreTrainedTokenizerFast):
 53 |     """
 54 |     Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.
 55 | 
 56 |     This uses notably ByteFallback and no normalization.
 57 | 
 58 |     ```
 59 |     from transformers import LlamaTokenizerFast
 60 | 
 61 |     tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
 62 |     tokenizer.encode("Hello this is a test")
 63 |     >>> [1, 15043, 445, 338, 263, 1243]
 64 |     ```
 65 | 
 66 |     If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
 67 |     call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
 68 |     values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
 69 |     [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
 70 | 
 71 | 
 72 |     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
 73 |     refer to this superclass for more information regarding those methods.
 74 | 
 75 |     Args:
 76 |         vocab_file (`str`):
 77 |             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
 78 |             contains the vocabulary necessary to instantiate a tokenizer.
 79 |         tokenizer_file (`str`):
 80 |             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
 81 |             contains everything needed to load the tokenizer.
 82 | 
 83 |         clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
 84 |             Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
 85 |             spaces.
 86 | 
 87 |         bos_token (`str`, *optional*, defaults to `"<s>"`):
 88 |             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 89 | 
 90 |         eos_token (`str`, *optional*, defaults to `"</s>"`):
 91 |             The end of sequence token.
 92 | 
 93 |         unk_token (`str`, *optional*, defaults to `"<unk>"`):
 94 |             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
 95 |             token instead.
 96 |     """
 97 | 
 98 |     vocab_files_names = VOCAB_FILES_NAMES
 99 |     slow_tokenizer_class = LlamaTokenizer
100 |     padding_side = "left"
101 |     model_input_names = ["input_ids", "attention_mask"]
102 | 
103 |     def __init__(
104 |         self,
105 |         vocab_file=None,
106 |         tokenizer_file=None,
107 |         clean_up_tokenization_spaces=False,
108 |         unk_token="<unk>",
109 |         bos_token="<s>",
110 |         eos_token="</s>",
111 |         add_bos_token=True,
112 |         add_eos_token=False,
113 |         **kwargs,
114 |     ):
115 |         super().__init__(
116 |             vocab_file=vocab_file,
117 |             tokenizer_file=tokenizer_file,
118 |             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
119 |             unk_token=unk_token,
120 |             bos_token=bos_token,
121 |             eos_token=eos_token,
122 |             **kwargs,
123 |         )
124 |         self._add_bos_token = add_bos_token
125 |         self._add_eos_token = add_eos_token
126 |         self.update_post_processor()
127 | 
128 |         self.vocab_file = vocab_file
129 |         self.can_save_slow_tokenizer = False if not self.vocab_file else True
130 | 
131 |     def update_post_processor(self):
132 |         """
133 |         Updates the underlying post processor with the current `bos_token` and `eos_token`.
134 |         """
135 |         bos = self.bos_token
136 |         bos_token_id = self.bos_token_id
137 | 
138 |         eos = self.eos_token
139 |         eos_token_id = self.eos_token_id
140 | 
141 |         single = f"{(bos+':0 ') * self.add_bos_token}$A:0{(' '+eos+':0') * self.add_eos_token}"
142 |         pair = f"{single}{(' '+bos+':1') * self.add_bos_token} $B:1{(' '+eos+':1') * self.add_eos_token}"
143 | 
144 |         special_tokens = []
145 |         if self.add_bos_token:
146 |             special_tokens.append((bos, bos_token_id))
147 |         if self.add_eos_token:
148 |             special_tokens.append((eos, eos_token_id))
149 |         self._tokenizer.post_processor = processors.TemplateProcessing(
150 |             single=single, pair=pair, special_tokens=special_tokens
151 |         )
152 | 
153 |     @property
154 |     def add_eos_token(self):
155 |         return self._add_eos_token
156 | 
157 |     @property
158 |     def add_bos_token(self):
159 |         return self._add_bos_token
160 | 
161 |     @add_eos_token.setter
162 |     def add_eos_token(self, value):
163 |         self._add_eos_token = value
164 |         self.update_post_processor()
165 | 
166 |     @add_bos_token.setter
167 |     def add_bos_token(self, value):
168 |         self._add_bos_token = value
169 |         self.update_post_processor()
170 | 
171 |     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
172 |         if not self.can_save_slow_tokenizer:
173 |             raise ValueError(
174 |                 "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
175 |                 "tokenizer."
176 |             )
177 | 
178 |         if not os.path.isdir(save_directory):
179 |             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
180 |             return
181 |         out_vocab_file = os.path.join(
182 |             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
183 |         )
184 | 
185 |         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
186 |             copyfile(self.vocab_file, out_vocab_file)
187 | 
188 |         return (out_vocab_file,)
189 | 
190 |     def _build_conversation_input_ids(self, conversation: "Conversation"):
191 |         """Builds the input ids for a conversation.
192 |         This is the format used in the provided examples. System prompts should be manually added at the beginning of
193 |         the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
194 |         ```
195 |         <bos>[INST] B_SYS SytemPrompt E_SYS Prompt [/INST] Answer <eos>
196 |         <bos>[INST] Prompt [/INST] Answer <eos>
197 |         <bos>[INST] Prompt [/INST]
198 |         ```
199 | 
200 |         If you want to use your own system prompt, make sure to use both `B_SYS` and `E_SYS` use the following:
201 |         ```python
202 |         >>> from transformers import Conversation
203 | 
204 |         >>> Conversation(
205 |         ...     "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
206 |         ... )
207 |         ```
208 |         Args:
209 |             conversation (`Conversation`):
210 |                 Conversation to build input ids for.
211 |         Returns:
212 |             `List[int]`:
213 |                 Input ids for the conversation.
214 |         """
215 |         if len(conversation.past_user_inputs) > 0:
216 |             if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
217 |                 conversation.past_user_inputs[0] = (
218 |                     B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
219 |                 )
220 |         elif conversation.new_user_input:
221 |             if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
222 |                 conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
223 |         else:
224 |             raise ValueError("Last message must be from user")
225 | 
226 |         dialogue = list(conversation.iter_texts())
227 |         if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
228 |             [not is_user for is_user, msg in dialogue[1::2]]
229 |         ):
230 |             raise ValueError(
231 |                 "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
232 |             )
233 | 
234 |         dialog_tokens = []
235 |         dialog_tokens += sum(
236 |             [
237 |                 [self.bos_token_id]
238 |                 + self.encode(
239 |                     f"{B_INST} {(prompt[1]).strip()} {E_INST} {(answer[1]).strip()} ", add_special_tokens=False
240 |                 )
241 |                 + [self.eos_token_id]
242 |                 for prompt, answer in zip(dialogue[::2], dialogue[1::2])
243 |             ],
244 |             [],
245 |         )
246 |         dialog_tokens += [self.bos_token_id] + self.encode(
247 |             f"{B_INST} {(dialogue[-1][1]).strip()} {E_INST}", add_special_tokens=False
248 |         )
249 |         return dialog_tokens


--------------------------------------------------------------------------------
/data/tokenization/preprocess_data.py:
--------------------------------------------------------------------------------
  1 | """Processing data for pretraining."""
  2 | 
  3 | import argparse
  4 | import multiprocessing
  5 | import os
  6 | import sys
  7 | import numpy as np
  8 | import random
  9 | # sys.path.append(
 10 | #     os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
 11 | # )
 12 | 
 13 | # 将父目录的父目录加入path 
 14 | current_path = os.path.abspath(__file__)
 15 | parent_dir = os.path.dirname(os.path.dirname(current_path))
 16 | grandparent_dir = os.path.dirname(parent_dir)
 17 | sys.path.append(grandparent_dir)
 18 | # print(grandparent_dir)
 19 | 
 20 | import data.tokenization.lm_dataformat as lmd
 21 | 
 22 | import time
 23 | import tqdm
 24 | import torch
 25 | import ftfy
 26 | import glob
 27 | 
 28 | from tokenizer import build_tokenizer
 29 | from threading import Semaphore
 30 | 
 31 | 
 32 | 
 33 | 
 34 | table = {ord(f):ord(t) for f,t in zip(
 35 |      u'，。！？：【】（）％＃＠＆１２３４５６７８９０',
 36 |      u',.!?:[]()%#@&1234567890')}
 37 | 
 38 | 
 39 | def punctuation_format(text: str):
 40 |     # Replace non-breaking space with space
 41 |     # text = text.strip() + '\n'
 42 |     text = text.replace('\u202f', ' ').replace('\xa0', ' ')
 43 |     # change chinese punctuation to english ones
 44 |     text = text.translate(table)
 45 |     return text
 46 | 
 47 | def is_prompt_answer_format(data):
 48 | 
 49 |     if "prompt" in data and "answer" in data:
 50 |         return True
 51 |     else:
 52 |         return False
 53 | 
 54 | 
 55 | def is_chatml_format(data):
 56 |     if "chat_rounds" in data and len(data["chat_rounds"]) > 0:
 57 |         return True
 58 |     else:
 59 |         return False
 60 | 
 61 | 
 62 | def is_text_format(data):
 63 |     if "text" in data:
 64 |         return True
 65 |     else:
 66 |         return False
 67 | 
 68 | class Encoder(object):
 69 |     def __init__(self, args, tokenizer=None):
 70 |         self.args = args
 71 |         self.tokenizer = tokenizer
 72 | 
 73 |     def initializer(self):
 74 |         # Use Encoder class as a container for global data
 75 |         if self.tokenizer is None:
 76 |             self.tokenizer = build_tokenizer(self.args)
 77 |         else:
 78 |             self.tokenizer = self.tokenizer
 79 | 
 80 |     def encode(self, text):
 81 |         if self.args.ftfy:
 82 |             text = ftfy.fix_text(text)
 83 |         ids = {}
 84 |         for key in self.args.jsonl_keys:
 85 |             doc_ids = []
 86 |             text_ids = self.tokenizer.encode(text, add_special_tokens=False)
 87 |             if len(text_ids) > 0:
 88 |                 doc_ids.append(text_ids)
 89 |             if self.args.append_eod:
 90 |                 doc_ids[-1].append(self.tokenizer.eod_id)
 91 |             ids[key] = doc_ids
 92 |         return ids, len(text)
 93 | 
 94 | 
 95 | class UniformEncoder(Encoder):
 96 |     def __init__(self, args, mode='sft', tokenizer=None):
 97 |         super().__init__(args, tokenizer=tokenizer)
 98 |         self.mode = mode
 99 |         # 实际计算时会Shift一位，因此这里seq_length + 1
100 |         if args.load_raw_dataset:
101 |             self.seq_length = args.seq_length + 1
102 |             self.stride = args.seq_length
103 |         else:
104 |             self.seq_length = args.seq_length
105 |             
106 |         self.remain_input_ids = []
107 |         self.remain_loss_mask = []
108 | 
109 |     def encode(self, data):
110 |         
111 |         encode_res = {
112 |             "input_ids":[],
113 |             "loss_mask":[]
114 |         }
115 | 
116 |         if is_prompt_answer_format(data):
117 |             data_type = 'prompt_answer'
118 |         elif is_chatml_format(data):
119 |             data_type = 'chatML'
120 |         elif is_text_format(data):
121 |             data_type = 'text'
122 |         else:
123 |             raise ValueError("data format not supported, please use prompt/answer, or chatML or pretrain text")
124 | 
125 |         for token_res in self._tokenize_fields(data, data_type=data_type):
126 |             for k, v in token_res.items():
127 |                 encode_res[k].append(v)
128 |         
129 |         length = 0
130 |         if data_type == 'prompt_answer':
131 |             length = len(data['prompt']) + len(data['answer'])
132 |         elif data_type == 'chatML':
133 |             for chat in data['chat_rounds']:
134 |                 length += len(chat['content'])
135 |         elif data_type == 'text':
136 |             length += len(data['text'])
137 |         
138 |         return encode_res, length
139 | 
140 | 
141 |     def _tokenize_fields(self, data, data_type):
142 | 
143 |         CHAT_COL = 'chat_rounds'
144 |         ROLE_COL = 'role'
145 |         CONTENT_COL = 'content'
146 | 
147 |         PROMPT_COL = 'prompt'
148 |         ANSWER_COL = 'answer'
149 |         SYSTEM_COL = 'system'
150 | 
151 |         TEXT_COL = 'text'
152 | 
153 |         if self.mode == 'sft':
154 |             HUMAN = 'human'
155 |             BOT = 'bot'
156 |             SYSTEM = 'system'
157 |             ROLE_START_MARKER = '<|role_start|>'
158 |             ROLE_END_MARKER = '<|role_end|>'
159 |         elif self.mode == 'pretrain' or data_type == 'text':
160 |             HUMAN = ''
161 |             BOT = ''
162 |             SYSTEM = ''
163 |             ROLE_START_MARKER = ''
164 |             ROLE_END_MARKER = ''
165 |         else:
166 |             raise ValueError(f"tokenize_mode does not support {self.mode}, please use sft or pretrain")
167 | 
168 | 
169 |         human_marker_ids = self.tokenizer.encode(f"{ROLE_START_MARKER}{HUMAN}{ROLE_END_MARKER}", add_special_tokens=False)
170 |         bot_marker_ids = self.tokenizer.encode(f"{ROLE_START_MARKER}{BOT}{ROLE_END_MARKER}", add_special_tokens=False)
171 |         system_marker_ids = self.tokenizer.encode(f"{ROLE_START_MARKER}{SYSTEM}{ROLE_END_MARKER}", add_special_tokens=False)
172 |         sft_end_marker_ids = [self.tokenizer.eod_id]
173 | 
174 |         # 处理逻辑：
175 |         # 统一处理SST,单轮、多轮sft的需求
176 | 
177 |         input_ids = []
178 |         loss_mask = []
179 | 
180 |         if data_type == "prompt_answer":
181 |             system = data.get(SYSTEM_COL, '')
182 |             prompt = data[PROMPT_COL]
183 |             answer = data[ANSWER_COL]
184 |             system = punctuation_format(system)
185 |             prompt = punctuation_format(prompt)
186 |             answer = punctuation_format(answer)
187 |             system_ids = system_marker_ids + self.tokenizer.encode(system, add_special_tokens=False) if system else []
188 |             prompt_ids = self.tokenizer.encode(prompt, add_special_tokens=False)
189 |             answer_ids = self.tokenizer.encode(answer, add_special_tokens=False) + sft_end_marker_ids
190 |             input_ids += system_ids + human_marker_ids + prompt_ids + bot_marker_ids + answer_ids
191 |             loss_mask += [0] * len(system_ids) + [0] * len(human_marker_ids) + [0] * len(prompt_ids) + \
192 |                          [0] * len(bot_marker_ids) + [1] * len(answer_ids)
193 |         elif data_type == 'chatML':
194 |             chat = data[CHAT_COL]
195 |             for r in chat:
196 |                 role = r[ROLE_COL]
197 |                 content = r[CONTENT_COL]
198 |                 content = punctuation_format(content)
199 |                 # if not content.endswith('\n'):  # chatML格式
200 |                 #     content = content + '\n'
201 |                 if role == HUMAN:
202 |                     role_marker_ids = human_marker_ids
203 |                     content_ids = self.tokenizer.encode(content, add_special_tokens=False)
204 |                 elif role == BOT:
205 |                     # 每一个bot输出结尾的eod,计算loss, 学会在哪里停， human和system的eod不需要计算loss
206 |                     role_marker_ids = bot_marker_ids
207 |                     content_ids = self.tokenizer.encode(content, add_special_tokens=False) + sft_end_marker_ids
208 |                 elif role == SYSTEM:
209 |                     role_marker_ids = system_marker_ids
210 |                     content_ids = self.tokenizer.encode(content, add_special_tokens=False)
211 |                 else:
212 |                     raise ValueError(f"Role {role} not supported.")
213 | 
214 |                 input_ids += role_marker_ids + content_ids
215 |                 masklet = [1] if role == BOT else [0]
216 |                 loss_mask += [0] * len(role_marker_ids) + masklet * len(content_ids)
217 |         elif data_type == "text":
218 |             text = data[TEXT_COL]
219 |             text = punctuation_format(text)
220 |             text_ids = self.tokenizer.encode(text, add_special_tokens=False) + sft_end_marker_ids
221 |             input_ids += text_ids
222 |             loss_mask += [1] * len(text_ids)
223 |         else:
224 |             raise ValueError(
225 |                 f"data_type does not support {self.args.data_type}, please use chatML or prompt_answer or text(for pretrain)")
226 |             
227 |         # print(self.mode)
228 |         if self.mode == 'pretrain':
229 |             # change loss mask to all 1s
230 |             input_ids = input_ids
231 |             loss_mask = [1] * len(loss_mask)
232 |         elif self.mode == 'sft':
233 |             # do nothing
234 |             input_ids = input_ids
235 |             loss_mask = loss_mask
236 | 
237 |         assert len(input_ids) == len(loss_mask)
238 |         if self.args.padding_mode == 'padding':
239 |             if len(input_ids) <= self.seq_length:
240 |                 yield self.padding(input_ids, loss_mask)
241 | 
242 |             # 如果超长，直接抛弃 or 使用seq_length窗口滑动采样
243 |             else:
244 |                 # cursor = 0
245 |                 # while cursor < len(input_ids):
246 |                 #     end_idx = min(cursor + self.seq_length, len(input_ids))
247 |                 #     yield self.padding(input_ids[cursor: end_idx], loss_mask[cursor: end_idx])
248 |                 #     cursor = end_idx
249 |                 yield {}
250 |         elif self.args.padding_mode == 'concat':
251 |             input_ids = self.remain_input_ids + input_ids
252 |             loss_mask = self.remain_loss_mask + loss_mask
253 |             if len(input_ids) < self.seq_length:
254 |                 self.remain_input_ids = input_ids
255 |                 self.remain_loss_mask = loss_mask
256 |                 assert len(self.remain_input_ids) == len(self.remain_loss_mask)
257 |                 yield {}
258 |             else:
259 |                 cursor = 0
260 |                 while cursor + self.seq_length <= len(input_ids):
261 |                     yield {
262 |                         "input_ids": input_ids[cursor: cursor + self.seq_length],
263 |                         "loss_mask": loss_mask[cursor: cursor + self.seq_length]
264 |                     }
265 |                     cursor = cursor + self.stride
266 |                 self.remain_input_ids = input_ids[cursor:]
267 |                 self.remain_loss_mask = loss_mask[cursor:]
268 |                 assert len(self.remain_input_ids) == len(self.remain_loss_mask)
269 |                 yield {}
270 |         elif self.args.padding_mode == 'pack':
271 |             if len(input_ids) > self.seq_length:
272 |                 yield {}
273 |             elif len(self.remain_input_ids) + len(input_ids) > self.seq_length:
274 |                 input_ids, self.remain_input_ids = self.remain_input_ids, input_ids
275 |                 loss_mask, self.remain_loss_mask = self.remain_loss_mask, loss_mask
276 |                 assert len(input_ids) == len(loss_mask)
277 |                 yield self.padding(input_ids, loss_mask)
278 |             else:
279 |                 self.remain_input_ids = self.remain_input_ids + input_ids
280 |                 self.remain_loss_mask = self.remain_loss_mask + loss_mask
281 |                 assert len(self.remain_input_ids) == len(self.remain_loss_mask)
282 |                 yield {}
283 | 
284 | 
285 |     def padding(self, input_ids, loss_mask):
286 |         pad_id = self.tokenizer.pad_id
287 |         assert len(input_ids) <= self.seq_length, f"padding sequence: {len(input_ids)} > {self.seq_length}"
288 |         input_ids += [pad_id] * (self.seq_length - len(input_ids))
289 |         loss_mask += [0] * (self.seq_length - len(loss_mask))
290 |         return {
291 |             "input_ids": input_ids,
292 |             "loss_mask": loss_mask
293 |         } 
294 | 
295 | # 输入为args.input, 使用","进行分割，每一个itme可以是jsonl或文件夹
296 | def find_jsonl_fnames(inputs):
297 |     fnames = []
298 |     for p in inputs.split(","):
299 |         if not os.path.isdir(p):
300 |             if p.endswith(".jsonl"):
301 |                 print(f"loading from {p}")
302 |                 fnames.append(p)
303 |         else:
304 |             p_list = glob.glob(p + "/*")
305 |             for p_ in p_list:
306 |                 if p_.endswith(".jsonl"):
307 |                     print(f"loading from {p_}")
308 |                     fnames.append(p_)
309 |     return fnames
310 | 
311 | def yield_from_files(fnames: list, semaphore):
312 |     """
313 |     Iterator over input documents using lm_dataformat. Should be able to handle jsons / texts /
314 |     other compressed formats. Also filters out empty documents.
315 | 
316 |     :param fnames: list of filenames
317 |     """
318 | 
319 |     def yielder(fname, semaphore):
320 |         for f in filter(lambda x: x, lmd.Reader(fname).stream_data(key=['task', 'src_language', 'src_code', 'tgt_language', 'tgt_code', 'sql', 'prompt', 'answer', 'bad_answer'])):
321 |             semaphore.acquire()
322 |             yield f
323 | 
324 |     for fname in fnames:
325 |         semaphore.acquire()
326 | 
327 |         yield from yielder(fname, semaphore)


--------------------------------------------------------------------------------
/model/llama/convert_llama_weights_to_hf.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | import argparse
 15 | import gc
 16 | import json
 17 | import os
 18 | import shutil
 19 | import warnings
 20 | 
 21 | import torch
 22 | 
 23 | # from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
 24 | from .configuration_llama import LlamaConfig
 25 | from .modeling_llama import LlamaForCausalLM
 26 | from .tokenization_llama import LlamaTokenizer
 27 | 
 28 | 
 29 | try:
 30 |     # from transformers import LlamaTokenizerFast
 31 |     from tokenization_llama_fast import LlamaTokenizerFast
 32 | except ImportError as e:
 33 |     warnings.warn(e)
 34 |     warnings.warn(
 35 |         "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
 36 |     )
 37 |     LlamaTokenizerFast = None
 38 | 
 39 | """
 40 | Sample usage:
 41 | 
 42 | ```
 43 | python src/transformers/models/llama/convert_llama_weights_to_hf.py \
 44 |     --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
 45 | ```
 46 | 
 47 | Thereafter, models can be loaded via:
 48 | 
 49 | ```py
 50 | from transformers import LlamaForCausalLM, LlamaTokenizer
 51 | 
 52 | model = LlamaForCausalLM.from_pretrained("/output/path")
 53 | tokenizer = LlamaTokenizer.from_pretrained("/output/path")
 54 | ```
 55 | 
 56 | Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
 57 | come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
 58 | """
 59 | 
 60 | INTERMEDIATE_SIZE_MAP = {
 61 |     "7B": 11008,
 62 |     "13B": 13824,
 63 |     "30B": 17920,
 64 |     "65B": 22016,
 65 |     "70B": 28672,
 66 | }
 67 | NUM_SHARDS = {
 68 |     "7B": 1,
 69 |     "7Bf": 1,
 70 |     "13B": 2,
 71 |     "13Bf": 2,
 72 |     "30B": 4,
 73 |     "65B": 8,
 74 |     "70B": 8,
 75 |     "70Bf": 8,
 76 | }
 77 | 
 78 | 
 79 | def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
 80 |     return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
 81 | 
 82 | 
 83 | def read_json(path):
 84 |     with open(path, "r") as f:
 85 |         return json.load(f)
 86 | 
 87 | 
 88 | def write_json(text, path):
 89 |     with open(path, "w") as f:
 90 |         json.dump(text, f)
 91 | 
 92 | 
 93 | def write_model(model_path, input_base_path, model_size, safe_serialization=True):
 94 |     os.makedirs(model_path, exist_ok=True)
 95 |     tmp_model_path = os.path.join(model_path, "tmp")
 96 |     os.makedirs(tmp_model_path, exist_ok=True)
 97 | 
 98 |     params = read_json(os.path.join(input_base_path, "params.json"))
 99 |     num_shards = NUM_SHARDS[model_size]
100 |     n_layers = params["n_layers"]
101 |     n_heads = params["n_heads"]
102 |     n_heads_per_shard = n_heads // num_shards
103 |     dim = params["dim"]
104 |     dims_per_head = dim // n_heads
105 |     base = 10000.0
106 |     inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
107 | 
108 |     if "n_kv_heads" in params:
109 |         num_key_value_heads = params["n_kv_heads"]  # for GQA / MQA
110 |         num_local_key_value_heads = n_heads_per_shard // num_key_value_heads
111 |         key_value_dim = dim // num_key_value_heads
112 |     else:  # compatibility with other checkpoints
113 |         num_key_value_heads = n_heads
114 |         num_local_key_value_heads = n_heads_per_shard
115 |         key_value_dim = dim
116 | 
117 |     # permute for sliced rotary
118 |     def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
119 |         return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
120 | 
121 |     print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
122 |     # Load weights
123 |     if model_size == "7B":
124 |         # Not sharded
125 |         # (The sharded implementation would also work, but this is simpler.)
126 |         loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
127 |     else:
128 |         # Sharded
129 |         loaded = [
130 |             torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
131 |             for i in range(num_shards)
132 |         ]
133 |     param_count = 0
134 |     index_dict = {"weight_map": {}}
135 |     for layer_i in range(n_layers):
136 |         filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
137 |         if model_size == "7B":
138 |             # Unsharded
139 |             state_dict = {
140 |                 f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
141 |                     loaded[f"layers.{layer_i}.attention.wq.weight"]
142 |                 ),
143 |                 f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
144 |                     loaded[f"layers.{layer_i}.attention.wk.weight"]
145 |                 ),
146 |                 f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
147 |                 f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
148 |                 f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
149 |                 f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
150 |                 f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
151 |                 f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
152 |                 f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
153 |             }
154 |         else:
155 |             # Sharded
156 |             # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share
157 |             # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is
158 |             # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned.
159 | 
160 |             state_dict = {
161 |                 f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][
162 |                     f"layers.{layer_i}.attention_norm.weight"
163 |                 ].clone(),
164 |                 f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
165 |                     f"layers.{layer_i}.ffn_norm.weight"
166 |                 ].clone(),
167 |             }
168 |             state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
169 |                 torch.cat(
170 |                     [
171 |                         loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
172 |                         for i in range(num_shards)
173 |                     ],
174 |                     dim=0,
175 |                 ).reshape(dim, dim)
176 |             )
177 |             state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
178 |                 torch.cat(
179 |                     [
180 |                         loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(
181 |                             num_local_key_value_heads, dims_per_head, dim
182 |                         )
183 |                         for i in range(num_shards)
184 |                     ],
185 |                     dim=0,
186 |                 ).reshape(key_value_dim, dim),
187 |                 num_key_value_heads,
188 |                 key_value_dim,
189 |                 dim,
190 |             )
191 |             state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
192 |                 [
193 |                     loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(
194 |                         num_local_key_value_heads, dims_per_head, dim
195 |                     )
196 |                     for i in range(num_shards)
197 |                 ],
198 |                 dim=0,
199 |             ).reshape(key_value_dim, dim)
200 | 
201 |             state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
202 |                 [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
203 |             )
204 |             state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
205 |                 [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
206 |             )
207 |             state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
208 |                 [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
209 |             )
210 |             state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
211 |                 [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
212 |             )
213 | 
214 |         state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
215 |         for k, v in state_dict.items():
216 |             index_dict["weight_map"][k] = filename
217 |             param_count += v.numel()
218 |         torch.save(state_dict, os.path.join(tmp_model_path, filename))
219 | 
220 |     filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
221 |     if model_size == "7B":
222 |         # Unsharded
223 |         state_dict = {
224 |             "model.embed_tokens.weight": loaded["tok_embeddings.weight"],
225 |             "model.norm.weight": loaded["norm.weight"],
226 |             "lm_head.weight": loaded["output.weight"],
227 |         }
228 |     else:
229 |         state_dict = {
230 |             "model.norm.weight": loaded[0]["norm.weight"],
231 |             "model.embed_tokens.weight": torch.cat(
232 |                 [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1
233 |             ),
234 |             "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
235 |         }
236 | 
237 |     for k, v in state_dict.items():
238 |         index_dict["weight_map"][k] = filename
239 |         param_count += v.numel()
240 |     torch.save(state_dict, os.path.join(tmp_model_path, filename))
241 | 
242 |     # Write configs
243 |     index_dict["metadata"] = {"total_size": param_count * 2}
244 |     write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
245 |     ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1
246 |     multiple_of = params["multiple_of"] if "multiple_of" in params else 256
247 |     config = LlamaConfig(
248 |         hidden_size=dim,
249 |         intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of),
250 |         num_attention_heads=params["n_heads"],
251 |         num_hidden_layers=params["n_layers"],
252 |         rms_norm_eps=params["norm_eps"],
253 |         num_key_value_heads=num_key_value_heads,
254 |     )
255 |     config.save_pretrained(tmp_model_path)
256 | 
257 |     # Make space so we can load the model properly now.
258 |     del state_dict
259 |     del loaded
260 |     gc.collect()
261 | 
262 |     print("Loading the checkpoint in a Llama model.")
263 |     model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
264 |     # Avoid saving this as part of the config.
265 |     del model.config._name_or_path
266 | 
267 |     print("Saving in the Transformers format.")
268 |     model.save_pretrained(model_path, safe_serialization=safe_serialization)
269 |     shutil.rmtree(tmp_model_path)
270 | 
271 | 
272 | def write_tokenizer(tokenizer_path, input_tokenizer_path):
273 |     # Initialize the tokenizer based on the `spm` model
274 |     tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast
275 |     print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.")
276 |     tokenizer = tokenizer_class(input_tokenizer_path)
277 |     tokenizer.save_pretrained(tokenizer_path)
278 | 
279 | 
280 | def main():
281 |     parser = argparse.ArgumentParser()
282 |     parser.add_argument(
283 |         "--input_dir",
284 |         help="Location of LLaMA weights, which contains tokenizer.model and model folders",
285 |     )
286 |     parser.add_argument(
287 |         "--model_size",
288 |         choices=["7B", "7Bf", "13B", "13Bf", "30B", "65B", "70B", "70Bf", "tokenizer_only"],
289 |         help="'f' models correspond to the finetuned versions, and are specific to the Llama2 official release. For more details on Llama2, checkout the original repo: https://huggingface.co/meta-llama",
290 |     )
291 |     parser.add_argument(
292 |         "--output_dir",
293 |         help="Location to write HF model and tokenizer",
294 |     )
295 |     parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
296 |     args = parser.parse_args()
297 |     if args.model_size != "tokenizer_only":
298 |         write_model(
299 |             model_path=args.output_dir,
300 |             input_base_path=os.path.join(args.input_dir, args.model_size),
301 |             model_size=args.model_size,
302 |             safe_serialization=args.safe_serialization,
303 |         )
304 |     spm_path = os.path.join(args.input_dir, "tokenizer.model")
305 |     write_tokenizer(args.output_dir, spm_path)
306 | 
307 | 
308 | if __name__ == "__main__":
309 |     main()


--------------------------------------------------------------------------------
/model/peft/utils/others.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # Copyright (c) 2023 Ant Group. All rights reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #     http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | import sys
 18 | sys.path.append("..")
 19 | sys.path.append("../..")
 20 | import copy
 21 | 
 22 | import torch
 23 | from .config import PetuningConfig
 24 | from peft.utils import PromptLearningConfig, PeftType
 25 | 
 26 | 
 27 | def prepare_model_for_int8_training(model, use_gradient_checkpointing=True):
 28 |     r"""
 29 |     This method wraps the entire protocol for preparing a model before running a training. This includes:
 30 |         1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm
 31 |         head to fp32
 32 | 
 33 |     Args:
 34 |         model, (`transformers.PreTrainedModel`):
 35 |             The loaded model from `transformers`
 36 |     """
 37 |     loaded_in_8bit = getattr(model, "is_loaded_in_8bit", False)
 38 | 
 39 |     for name, param in model.named_parameters():
 40 |         # freeze base model's layers
 41 |         param.requires_grad = False
 42 | 
 43 |     # cast all non INT8 parameters to fp32
 44 |     for param in model.parameters():
 45 |         if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
 46 |             param.data = param.data.to(torch.float32)
 47 | 
 48 |     if loaded_in_8bit and use_gradient_checkpointing:
 49 |         # For backward compatibility
 50 |         if hasattr(model, "enable_input_require_grads"):
 51 |             model.enable_input_require_grads()
 52 |         else:
 53 | 
 54 |             def make_inputs_require_grad(module, input, output):
 55 |                 output.requires_grad_(True)
 56 | 
 57 |             model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
 58 | 
 59 |         # enable gradient checkpointing for memory efficiency
 60 |         model.gradient_checkpointing_enable()
 61 | 
 62 |     return model
 63 | 
 64 | 
 65 | def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
 66 |     r"""
 67 |     This method wraps the entire protocol for preparing a model before running a training. This includes:
 68 |         1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm
 69 |         head to fp32
 70 | 
 71 |     Args:
 72 |         model, (`transformers.PreTrainedModel`):
 73 |             The loaded model from `transformers`
 74 |     """
 75 |     loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)
 76 | 
 77 |     for name, param in model.named_parameters():
 78 |         # freeze base model's layers
 79 |         param.requires_grad = False
 80 |     
 81 |     # cast all non INT8 parameters to fp32
 82 |     for param in model.parameters():
 83 |         if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
 84 |             param.data = param.data.to(torch.float32)
 85 |     
 86 |     if loaded_in_kbit and use_gradient_checkpointing:
 87 |         # For backward compatibility
 88 |         if hasattr(model, "enable_input_require_grads"):
 89 |             model.enable_input_require_grads()
 90 |         else:
 91 |             
 92 |             def make_inputs_require_grad(module, input, output):
 93 |                 output.requires_grad_(True)
 94 | 
 95 |             model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
 96 | 
 97 |         # enable gradient checkpointing for memory efficiency
 98 |         model.gradient_checkpointing_enable()
 99 | 
100 |     return model
101 | 
102 | 
103 | # copied from transformers.models.bart.modeling_bart
104 | def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
105 |     """
106 |     Shift input ids one token to the right.
107 | 
108 |     Args:
109 |         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids
110 |         pad_token_id (`int`): The id of the `padding` token.
111 |         decoder_start_token_id (`int`): The id of the `start` token.
112 |     """
113 |     shifted_input_ids = input_ids.new_zeros(input_ids.shape)
114 |     shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
115 |     shifted_input_ids[:, 0] = decoder_start_token_id
116 | 
117 |     if pad_token_id is None:
118 |         raise ValueError("self.model.config.pad_token_id has to be defined.")
119 |     # replace possible -100 values in labels by `pad_token_id`
120 |     shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
121 | 
122 |     return shifted_input_ids
123 | 
124 | 
125 | class ModulesToSaveWrapper(torch.nn.Module):
126 |     def __init__(self, module_to_save, adapter_name):
127 |         super().__init__()
128 |         self.original_module = module_to_save
129 |         self.modules_to_save = torch.nn.ModuleDict({})
130 |         self.update(adapter_name)
131 |         self.active_adapter = adapter_name
132 | 
133 |     def update(self, adapter_name):
134 |         self.modules_to_save.update(torch.nn.ModuleDict({adapter_name: copy.deepcopy(self.original_module)}))
135 | 
136 |     def forward(self, *args, **kwargs):
137 |         if self.active_adapter not in self.modules_to_save:
138 |             return self.original_module(*args, **kwargs)
139 |         return self.modules_to_save[self.active_adapter](*args, **kwargs)
140 | 
141 | 
142 | def _get_submodules(model, key):
143 |     parent = model.get_submodule(".".join(key.split(".")[:-1]))
144 |     target_name = key.split(".")[-1]
145 |     target = model.get_submodule(key)
146 |     return parent, target, target_name
147 | 
148 | 
149 | def _freeze_adapter(model, adapter_name):
150 |     for n, p in model.named_parameters():
151 |         if adapter_name in n:
152 |             p.requires_grad = False
153 | 
154 | 
155 | def _freeze_model(model):
156 |     for n, p in model.named_parameters():
157 |         p.requires_grad = False
158 | 
159 | 
160 | def _set_trainable(model, adapter_name):
161 |     key_list = [key for key, _ in model.named_modules()]
162 |     for key in key_list:
163 |         target_module_found = any(key.endswith(target_key) for target_key in model.modules_to_save)
164 |         if target_module_found:
165 |             parent, target, target_name = _get_submodules(model, key)
166 |             if isinstance(target, ModulesToSaveWrapper):
167 |                 target.update(adapter_name)
168 |             else:
169 |                 for param in target.parameters():
170 |                     param.requires_grad = True
171 |                 setattr(parent, target_name, ModulesToSaveWrapper(target, adapter_name))
172 | 
173 | 
174 | def _set_adapter(model, adapter_name):
175 |     for module in model.modules():
176 |         if isinstance(module, ModulesToSaveWrapper):
177 |             module.active_adapter = adapter_name
178 | 
179 | 
180 | def fsdp_auto_wrap_policy(model):
181 |     import functools
182 |     import os
183 | 
184 |     from accelerate import FullyShardedDataParallelPlugin
185 |     from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy
186 | 
187 |     from peft.tuners import PrefixEncoder, PromptEmbedding, PromptEncoder
188 | 
189 |     def lambda_policy_fn(module):
190 |         if (
191 |             len(list(module.named_children())) == 0
192 |             and getattr(module, "weight", None) is not None
193 |             and module.weight.requires_grad
194 |         ):
195 |             return True
196 |         return False
197 | 
198 |     lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn)
199 |     transformer_wrap_policy = functools.partial(
200 |         transformer_auto_wrap_policy,
201 |         transformer_layer_cls=(
202 |             PrefixEncoder,
203 |             PromptEncoder,
204 |             PromptEmbedding,
205 |             FullyShardedDataParallelPlugin.get_module_class_from_name(
206 |                 model, os.environ.get("FSDP_TRANSFORMER_CLS_TO_WRAP", "")
207 |             ),
208 |         ),
209 |     )
210 | 
211 |     auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy])
212 |     return auto_wrap_policy
213 | 
214 | 
215 | def transpose(weight, fan_in_fan_out):
216 |     return weight.T if fan_in_fan_out else weight
217 | 
218 | 
219 | def get_peft_model_state_dict(model, state_dict=None, adapter_name="default"):
220 |     """
221 |     Get the state dict of the Peft model.
222 | 
223 |     Args:
224 |         model ([`PeftModel`]): The Peft model. When using torch.nn.DistributedDataParallel, DeepSpeed or FSDP,
225 |         the model should be the underlying model/unwrapped model (i.e. model.module).
226 |         state_dict (`dict`, *optional*, defaults to `None`):
227 |             The state dict of the model. If not provided, the state dict of the model
228 |         will be used.
229 |     """
230 |     config = model.peft_config[adapter_name]
231 |     if state_dict is None:
232 |         state_dict = model.state_dict()
233 |     if config.peft_type in (PeftType.LORA, PeftType.ADALORA, PeftType.ROUTELORA, PeftType.UNIPELT):
234 |         # to_return = lora_state_dict(model, bias=model.peft_config.bias)
235 |         # adapted from `https://github.com/microsoft/LoRA/blob/main/loralib/utils.py`
236 |         # to be used directly with the state dict which is necessary when using DeepSpeed or FSDP
237 |         bias = config.bias
238 |         if bias == "none":
239 |             to_return = {k: state_dict[k] for k in state_dict if "lora_" in k}
240 |         elif bias == "all":
241 |             to_return = {k: state_dict[k] for k in state_dict if "lora_" in k or "bias" in k}
242 |         elif bias == "lora_only":
243 |             to_return = {}
244 |             for k in state_dict:
245 |                 if "lora_" in k:
246 |                     to_return[k] = state_dict[k]
247 |                     bias_name = k.split("lora_")[0] + "bias"
248 |                     if bias_name in state_dict:
249 |                         to_return[bias_name] = state_dict[bias_name]
250 |         else:
251 |             raise NotImplementedError
252 |         to_return = {k: v for k, v in to_return.items() if (("lora_" in k and adapter_name in k) or ("bias" in k))}
253 |         if config.peft_type == PeftType.ADALORA:
254 |             rank_pattern = config.rank_pattern
255 |             if rank_pattern is not None:
256 |                 rank_pattern = {k.replace(f".{adapter_name}", ""): v for k, v in rank_pattern.items()}
257 |                 config.rank_pattern = rank_pattern
258 |                 to_return = model.resize_state_dict_by_rank_pattern(rank_pattern, to_return, adapter_name)
259 | 
260 |     elif config.peft_type == PeftType.ADAPTION_PROMPT:
261 |         to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")}
262 |     elif isinstance(config, PromptLearningConfig):
263 |         to_return = {}
264 |         if config.inference_mode:
265 |             prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight
266 |         else:
267 |             prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name)
268 |         to_return["prompt_embeddings"] = prompt_embeddings
269 |     elif isinstance(config, PetuningConfig):
270 |         to_return = state_dict
271 |     else:
272 |         raise NotImplementedError
273 |     if model.modules_to_save is not None:
274 |         for key, value in state_dict.items():
275 |             if any(f"{module_name}.modules_to_save.{adapter_name}" in key for module_name in model.modules_to_save):
276 |                 to_return[key.replace("modules_to_save.", "")] = value
277 | 
278 |     to_return = {k.replace(f".{adapter_name}", ""): v for k, v in to_return.items()}
279 |     return to_return
280 | 
281 | 
282 | def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default"):
283 |     """
284 |     Set the state dict of the Peft model.
285 | 
286 |     Args:
287 |         model ([`PeftModel`]): The Peft model.
288 |         peft_model_state_dict (`dict`): The state dict of the Peft model.
289 |     """
290 |     config = model.peft_config[adapter_name]
291 |     state_dict = {}
292 |     if model.modules_to_save is not None:
293 |         for key, value in peft_model_state_dict.items():
294 |             if any(module_name in key for module_name in model.modules_to_save):
295 |                 for module_name in model.modules_to_save:
296 |                     if module_name in key:
297 |                         key = key.replace(module_name, f"{module_name}.modules_to_save.{adapter_name}")
298 |                         break
299 |             state_dict[key] = value
300 |     else:
301 |         state_dict = peft_model_state_dict
302 | 
303 |     if config.peft_type in (PeftType.LORA, PeftType.ADALORA, PeftType.ROUTELORA, PeftType.UNIPELT):
304 |         peft_model_state_dict = {}
305 |         for k, v in state_dict.items():
306 |             if "lora_" in k:
307 |                 suffix = k.split("lora_")[1]
308 |                 if "." in suffix:
309 |                     suffix_to_replace = ".".join(suffix.split(".")[1:])
310 |                     k = k.replace(suffix_to_replace, f"{adapter_name}.{suffix_to_replace}")
311 |                 else:
312 |                     k = f"{k}.{adapter_name}"
313 |                 peft_model_state_dict[k] = v
314 |             else:
315 |                 peft_model_state_dict[k] = v
316 |         if config.peft_type == PeftType.ADALORA:
317 |             rank_pattern = config.rank_pattern
318 |             if rank_pattern is not None:
319 |                 model.resize_modules_by_rank_pattern(rank_pattern, adapter_name)
320 |     elif isinstance(config, PromptLearningConfig) or config.peft_type == PeftType.ADAPTION_PROMPT:
321 |         peft_model_state_dict = state_dict
322 |     elif isinstance(config, PetuningConfig):
323 |         peft_model_state_dict = state_dict
324 |     else:
325 |         raise NotImplementedError
326 | 
327 |     model.load_state_dict(peft_model_state_dict, strict=False)
328 |     if isinstance(config, PromptLearningConfig):
329 |         model.prompt_encoder[adapter_name].embedding.load_state_dict(
330 |             {"weight": peft_model_state_dict["prompt_embeddings"]}, strict=True
331 |         )
332 | 


--------------------------------------------------------------------------------
/utils/common_utils.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright (c) 2023 Ant Group. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | import os
 17 | import math
 18 | import torch
 19 | import atorch
 20 | import logging
 21 | import numpy as np
 22 | from collections.abc import Mapping  # noqa: E402
 23 | from contextlib import contextmanager  # noqa: E402
 24 | from torch.distributed.fsdp import (
 25 |     FullyShardedDataParallel as FSDP,
 26 |     # BackwardPrefetch,
 27 |     FullStateDictConfig,
 28 |     StateDictType,
 29 | )
 30 | from transformers import get_scheduler
 31 | from utils.learning_rates import AnnealingLR
 32 | TASK2ID = {}
 33 | ID2TASK = {}
 34 | logger = logging.getLogger(__name__)
 35 | 
 36 | def get_rank():
 37 |     return atorch.rank()
 38 | 
 39 | 
 40 | def get_local_rank():
 41 |     return atorch.local_rank()
 42 | 
 43 | 
 44 | def is_main_process():
 45 |     return atorch.rank() == 0
 46 | 
 47 | 
 48 | def is_local_main_process():
 49 |     return atorch.local_rank() == 0
 50 | 
 51 | 
 52 | def print_rank_0(*message):
 53 |     """If distributed is initialized print only on rank 0."""
 54 |     if torch.distributed.is_initialized():
 55 |         if torch.distributed.get_rank() == 0:
 56 |             print(*message, flush=True)
 57 |     else:
 58 |         print(*message, flush=True)
 59 | 
 60 | 
 61 | def get_world_size():
 62 |     return atorch.world_size()
 63 | 
 64 | 
 65 | def wait_for_everyone():
 66 |     torch.distributed.barrier()
 67 | 
 68 | 
 69 | def atorch_init_distributed(backend="nccl"):
 70 |     atorch.init_distributed(backend, set_cuda_device_using_local_rank=True)
 71 |     # atorch.init_distributed(backend)
 72 | 
 73 | 
 74 | def atorch_reset_distributed():
 75 |     atorch.reset_distributed()
 76 | 
 77 | 
 78 | def _goes_first(is_main):
 79 |     if is_main is False:
 80 |         wait_for_everyone()
 81 |     yield
 82 |     if is_main is True:
 83 |         wait_for_everyone()
 84 | 
 85 | 
 86 | def get_model_params_num(model):
 87 |     """
 88 |     Get params number of the model
 89 |     Args:
 90 |         model: model(required)
 91 |     Returns:
 92 |         the number of parameters of model
 93 |     """
 94 |     num = 0
 95 |     for _, param in model.named_parameters():
 96 |         num += param.nelement()
 97 |     return num
 98 | 
 99 | 
100 | @contextmanager
101 | def main_process_first():
102 |     yield from _goes_first(is_main_process())
103 | 
104 | 
105 | def unwrap_model(model):
106 |     """
107 |     Recursively unwraps a model from potential containers (as used in distributed training).
108 | 
109 |     Args:
110 |         model (`torch.nn.Module`): The model to unwrap.
111 |     """
112 |     # since there could be multiple levels of wrapping, unwrap recursively
113 |     if hasattr(model, "module"):
114 |         return unwrap_model(model.module)
115 |     else:
116 |         return model
117 | 
118 | 
119 | def honor_type(obj, generator):
120 |     """
121 |     Cast a generator to the same type as obj (list, tuple or namedtuple)
122 |     """
123 |     try:
124 |         return type(obj)(generator)
125 |     except TypeError:
126 |         # Some objects may not be able to instantiate from a generator directly
127 |         return type(obj)(*list(generator))
128 | 
129 | 
130 | def recursively_apply(
131 |     func,
132 |     data,
133 |     *args,
134 |     test_type=lambda t: isinstance(t, torch.Tensor),
135 |     error_on_other_type=False,
136 |     **kwargs,
137 | ):
138 |     if isinstance(data, (tuple, list)):
139 |         return honor_type(
140 |             data,
141 |             (
142 |                 recursively_apply(
143 |                     func,
144 |                     o,
145 |                     *args,
146 |                     test_type=test_type,
147 |                     error_on_other_type=error_on_other_type,
148 |                     **kwargs,
149 |                 )
150 |                 for o in data
151 |             ),
152 |         )
153 |     elif isinstance(data, Mapping):
154 |         return type(data)(
155 |             {
156 |                 k: recursively_apply(
157 |                     func,
158 |                     v,
159 |                     *args,
160 |                     test_type=test_type,
161 |                     error_on_other_type=error_on_other_type,
162 |                     **kwargs,
163 |                 )
164 |                 for k, v in data.items()
165 |             }
166 |         )
167 |     elif test_type(data):
168 |         return func(data, *args, **kwargs)
169 |     elif error_on_other_type:
170 |         raise TypeError(
171 |             f"Can't apply {func.__name__} on object of type {type(data)}, only of nested list/tuple/dicts of objects "
172 |             f"that satisfy {test_type.__name__}."
173 |         )
174 |     return data
175 | 
176 | 
177 | def gather(tensor):
178 |     def _gpu_gather_one(tensor):
179 |         if tensor.ndim == 0:
180 |             tensor = tensor.clone()[None]
181 |         output_tensors = [tensor.clone() for _ in range(torch.distributed.get_world_size())]
182 |         torch.distributed.all_gather(output_tensors, tensor)
183 |         return torch.cat(output_tensors, dim=0)
184 | 
185 |     return recursively_apply(_gpu_gather_one, tensor, error_on_other_type=True)
186 | 
187 | 
188 | def save_ckpt(model, optimizer, lr_scheduler, epoch, steps, save_path, logger):
189 |     if isinstance(model, FSDP):
190 |         print('Saving a FSDP model')
191 |         optim_state_dict = FSDP.full_optim_state_dict(model, optimizer)
192 |         save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
193 |         with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy):
194 |             model_state_dict = model.state_dict()
195 |         lrs_state_dict = lr_scheduler.state_dict()
196 |     else:
197 |         print('Saving a normal model')
198 |         model_state_dict = model.state_dict()
199 |         optim_state_dict = optimizer.state_dict()
200 |         lrs_state_dict = lr_scheduler.state_dict()
201 |     # rank0 保存
202 |     if is_main_process():
203 |         torch.save(
204 |             {
205 |                 "epoch": epoch + 1,
206 |                 "step": steps,
207 |                 "state_dict": model_state_dict,
208 |                 "optimizer": optim_state_dict,
209 |                 "lrs_state_dict": lrs_state_dict,
210 |             },
211 |             save_path,
212 |         )
213 |         logger.info(f"Saved checkpoint {save_path} (epoch {epoch + 1} @ {steps} steps)")
214 |     wait_for_everyone()
215 |     # torch.distributed.barrier()  # other rank waiting
216 | 
217 | 
218 | def scheduler_and_resume(args, train_dataloader, model, optimizer, checkpoint):
219 |     # Scheduler and math around the number of training steps.
220 |     overrode_max_steps = False
221 |     args.num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
222 |     if args.max_steps == -1:
223 |         args.max_steps = args.num_train_epochs * args.num_update_steps_per_epoch
224 |         overrode_max_steps = True
225 | 
226 |     lr_scheduler = AnnealingLR(
227 |         optimizer,
228 |         start_lr=args.learning_rate,
229 |         warmup_iter=args.num_warmup_steps,
230 |         total_iters=args.max_steps * args.gradient_accumulation_steps,
231 |         decay_style=args.lr_scheduler_type,
232 |         last_iter=0,
233 |         min_lr=args.min_lr,
234 |         use_checkpoint_lr_scheduler=True,
235 |     )
236 |     # lr_scheduler = get_scheduler(
237 |     #     name=args.lr_scheduler_type,
238 |     #     optimizer=optimizer,
239 |     #     num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
240 |     #     num_training_steps=args.max_steps * args.gradient_accumulation_steps
241 |     # )
242 | 
243 |     if args.resume_from_checkpoint is not None:
244 |         if os.path.isfile(args.resume_from_checkpoint):
245 |             starting_epoch = checkpoint["epoch"] - 1
246 |             steps = checkpoint["step"]
247 |             args.resume_step = steps
248 |             # Restore the optim state
249 |             if optimizer is not None:
250 |                 if isinstance(model, FSDP):
251 |                     print('Loading optimizer for a FSDP model')
252 |                     full_osd = checkpoint["optimizer"]
253 |                     sharded_osd = FSDP.scatter_full_optim_state_dict(full_osd, model)
254 |                     optimizer.load_state_dict(sharded_osd)
255 |                 else:
256 |                     print('Loading optimizer for a normal model')
257 |                     optimizer.load_state_dict(checkpoint["optimizer"])
258 |                 logging.info("Optimizer state is restored from the checkpoint")
259 |             if lr_scheduler is not None:
260 |                 lr_scheduler.load_state_dict(checkpoint["lrs_state_dict"])
261 |             logging.info(f"Loaded checkpoint '{args.resume_from_checkpoint}' (epoch {checkpoint['epoch']} @ {steps} steps)")
262 |         else:
263 |             logger.info(f"No optimizer and lr scheduler checkpoint found at '{args.resume_from_checkpoint}'")
264 | 
265 |     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
266 |     args.num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
267 |     if overrode_max_steps:
268 |         args.max_steps = args.num_train_epochs * args.num_update_steps_per_epoch
269 |     # Afterwards we recalculate our number of training epochs
270 |     args.num_train_epochs = math.ceil(args.max_steps / args.num_update_steps_per_epoch)
271 | 
272 |     # Figure out how many steps we should save the Accelerator states
273 |     # if args.checkpointing_steps is not None and args.checkpointing_steps.isdigit():
274 |     #     args.checkpointing_steps = int(args.checkpointing_steps)
275 |     
276 |     return args, lr_scheduler, optimizer
277 | 
278 | 
279 | # def get_tflops(model_numel, batch_size, seq_len, step_time):
280 | #     return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
281 | 
282 | 
283 | def get_computation_speed(batch_size_per_device, seq_len, step_time):
284 | 
285 |     return batch_size_per_device * seq_len / (step_time + 1e-12)
286 | 
287 | 
288 | def human_readable_flops(num):
289 |     for unit in [
290 |         "",
291 |         "KFLOPS",
292 |         "MFLOPS",
293 |         "GFLOPS",
294 |         "TFLOPS",
295 |         "PFLOPS",
296 |         "EFLOPS",
297 |         "ZFLOPS",
298 |     ]:
299 |         if abs(num) < 1000.0:
300 |             return "%3.1f%s" % (num, unit)
301 |         num /= 1000.0
302 |     return "%.1f%s" % (num, "Yi")
303 | 
304 | 
305 | def get_tflops_new(args, batch_size, seq_len, step_time):
306 |     sl = seq_len
307 |     L = args.num_hidden_layers
308 |     h = args.hidden_size
309 |     V = args.vocab_size
310 |     flops = (96 * batch_size * sl * L * h * h * (1 + sl / (6 * h) + V / (16 * L * h)) / step_time)
311 |     return human_readable_flops(flops)
312 | 
313 | 
314 | def get_tflops_megatron(total_model_param, hidden_size, num_hidden_layers, 
315 |                         batch_size_per_device, seq_len, step_time):
316 | 
317 |     ff = total_model_param * 6
318 |     attn = seq_len * hidden_size * num_hidden_layers * 60
319 |     flops = (
320 |         batch_size_per_device
321 |         * seq_len
322 |         * (ff + attn)
323 |         / step_time
324 |     )
325 |     return human_readable_flops(flops)
326 | 
327 | 
328 | def is_old_version(path):
329 |     new_vocab_files = ['merge.model']
330 |     new_vocab_file_exists = []
331 |     for filename in new_vocab_files:
332 |         if not os.path.exists(os.path.join(path, filename)):
333 |             new_vocab_file_exists.append(False)
334 |         else:
335 |             new_vocab_file_exists.append(True)
336 |     if all(new_vocab_file_exists):
337 |         return False
338 |     if any(new_vocab_file_exists):
339 |         return 'new_version_file_absent'
340 |     else:
341 |         return True
342 | 
343 | 
344 | def generate_task_id(data_paths, train_mode):
345 |     data_prefixes = list(data_paths[1:-1].split(','))
346 |     print("data paths: ")
347 |     print(data_prefixes)
348 | 
349 |     for i, prefix in enumerate(data_prefixes):
350 |         if train_mode == 'sft':
351 |             task_name = prefix.split('/')[-1]
352 |         else:
353 |             task_name = prefix.split('/')[-2]
354 |         TASK2ID[task_name] = i
355 |         ID2TASK[i] = task_name
356 | 
357 | 
358 | class EarlyStopping:
359 |     """Early stops the training if validation loss doesn't improve after a given patience."""
360 |     def __init__(self, patience=7, verbose=False, delta=0):
361 |         """
362 |         Args:
363 |             patience (int): How long to wait after last time validation loss improved.
364 |                             Default: 7
365 |             verbose (bool): If True, prints a message for each validation loss improvement. 
366 |                             Default: False
367 |             delta (float): Minimum change in the monitored quantity to qualify as an improvement.
368 |                             Default: 0
369 |         """
370 |         self.patience = patience
371 |         self.verbose = verbose
372 |         self.counter = 0
373 |         self.best_score = None
374 |         self.early_stop = False
375 |         self.val_loss_min = np.Inf
376 |         self.delta = delta
377 | 
378 |     def __call__(self, val_loss, model):
379 | 
380 |         score = -val_loss
381 | 
382 |         if self.best_score is None:
383 |             self.best_score = score
384 |             # self.save_checkpoint(val_loss, model)
385 |         elif score < self.best_score + self.delta:
386 |             self.counter += 1
387 |             print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
388 |             if self.counter >= self.patience:
389 |                 self.early_stop = True
390 |         else:
391 |             self.best_score = score
392 |             # self.save_checkpoint(val_loss, model)
393 |             self.counter = 0
394 | 
395 |     def save_checkpoint(self, val_loss, model):
396 |         '''Saves model when validation loss decrease.'''
397 |         if self.verbose:
398 |             print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
399 |         torch.save(model.state_dict(), 'checkpoint.pt')	# 这里会存储迄今最优模型的参数
400 |         self.val_loss_min = val_loss
401 | 
402 | 


--------------------------------------------------------------------------------
/tokenizer/tokenizer.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021, EleutherAI
  2 | # This file is based on code by the authors denoted below and has been modified from its original version.
  3 | #
  4 | # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  5 | #
  6 | # Licensed under the Apache License, Version 2.0 (the "License");
  7 | # you may not use this file except in compliance with the License.
  8 | # You may obtain a copy of the License at
  9 | #
 10 | #     http://www.apache.org/licenses/LICENSE-2.0
 11 | #
 12 | # Unless required by applicable law or agreed to in writing, software
 13 | # distributed under the License is distributed on an "AS IS" BASIS,
 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | # See the License for the specific language governing permissions and
 16 | # limitations under the License.
 17 | 
 18 | """Megatron tokenizers."""
 19 | 
 20 | from abc import ABC
 21 | from abc import abstractmethod
 22 | 
 23 | from tokenizers import Tokenizer
 24 | from transformers import GPT2Tokenizer, GPT2TokenizerFast
 25 | import numpy as np
 26 | import sentencepiece as spm
 27 | from typing import List, Union
 28 | from .gpt2_tokenization import GPT2Tokenizer
 29 | from utils.common_utils import print_rank_0, is_old_version
 30 | from model.glm.tokenization_glm import GLMTokenizer
 31 | 
 32 | def build_tokenizer(args):
 33 |     """Initialize tokenizer."""
 34 |     print_rank_0("> building {} tokenizer ...".format(args.tokenizer_type))
 35 |     # if args.rank == 0:
 36 |     #     print("> building {} tokenizer ...".format(args.tokenizer_type), flush=True)
 37 | 
 38 |     # Select and instantiate the tokenizer.
 39 |     if args.tokenizer_type.lower() == "GPT2BPETokenizer".lower():
 40 |         assert args.vocab_file is not None
 41 |         assert args.merge_file is not None
 42 |         tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
 43 |     elif args.tokenizer_type.lower() == "SPMTokenizer".lower():
 44 |         assert args.vocab_file is not None
 45 |         tokenizer = SentencePieceTokenizer(args.vocab_file)
 46 |     elif args.tokenizer_type.lower() == "HFTokenizer".lower():
 47 |         assert args.vocab_file is not None
 48 |         tokenizer = HFTokenizer(args.vocab_file)
 49 |     elif args.tokenizer_type.lower() == "HFGPT2Tokenizer".lower():
 50 |         if args.vocab_file is None:
 51 |             print(
 52 |                 "WARNING: No vocab file found, loading Huggingface's pretrained GPT2Tokenizer"
 53 |             )
 54 |         tokenizer = HFGPT2Tokenizer(args.vocab_file)
 55 |     elif args.tokenizer_type.lower() == "CharLevelTokenizer".lower():
 56 |         tokenizer = CharLevelTokenizer(vocab_size=512)
 57 |     elif args.tokenizer_type.lower() == "TiktokenTokenizer".lower():
 58 |         assert args.vocab_file is not None
 59 |         tokenizer = TiktokenTokenizer(args.vocab_file)
 60 |     elif args.tokenizer_type.lower() == "GLMTokenizer".lower():
 61 |         if is_old_version(args.pretrained_model_path):
 62 |             print('is an old version')
 63 |             from model.glm.tokenization_glm_deprecated import GLMChineseTokenizer
 64 |             args.glm_mask = '[sMASK]'
 65 |             old_version_tokenizer = True
 66 |             tokenizer = GLMChineseTokenizer.from_pretrained(args.pretrained_model_path, trust_remote_code=True)
 67 |         else:
 68 |             print('is not an old version')
 69 |             old_version_tokenizer = False
 70 |             tokenizer = GLMTokenizer.from_pretrained(args.pretrained_model_path, trust_remote_code=True)
 71 |     else:
 72 |         raise NotImplementedError(
 73 |             "{} tokenizer is not " "implemented.".format(args.tokenizer_type)
 74 |         )
 75 | 
 76 |     # Add vocab size.
 77 |     args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size, args)
 78 | 
 79 |     return tokenizer
 80 | 
 81 | 
 82 | def _vocab_size_with_padding(orig_vocab_size, args):
 83 |     """Pad vocab size so it is divisible by model parallel size and
 84 |     still having GPU friendly size."""
 85 | 
 86 |     after = orig_vocab_size
 87 |     multiple = args.make_vocab_size_divisible_by * args.model_parallel_size
 88 |     while (after % multiple) != 0:
 89 |         after += 1
 90 |     print_rank_0(
 91 |         " > padded vocab (size: {}) with {} dummy tokens "
 92 |         "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after)
 93 |     )
 94 |     # if args.rank == 0:
 95 |     #     print(
 96 |     #         " > padded vocab (size: {}) with {} dummy tokens "
 97 |     #         "(new size: {})".format(orig_vocab_size, after - orig_vocab_size, after),
 98 |     #         flush=True,
 99 |     #     )
100 |     return after
101 | 
102 | 
103 | class AbstractTokenizer(ABC):
104 |     """Abstract class for tokenizer."""
105 | 
106 |     def __init__(self, name):
107 |         self.name = name
108 |         super().__init__()
109 | 
110 |     @property
111 |     @abstractmethod
112 |     def vocab_size(self):
113 |         pass
114 | 
115 |     @property
116 |     @abstractmethod
117 |     def vocab(self):
118 |         """Dictionary from vocab text token to id token."""
119 |         pass
120 | 
121 |     @property
122 |     @abstractmethod
123 |     def inv_vocab(self):
124 |         """Dictionary from vocab id token to text token."""
125 |         pass
126 | 
127 |     @abstractmethod
128 |     def tokenize(self, text):
129 |         pass
130 | 
131 |     def detokenize(self, token_ids):
132 |         raise NotImplementedError(
133 |             "detokenizer is not implemented for {} " "tokenizer".format(self.name)
134 |         )
135 | 
136 |     @property
137 |     def cls(self):
138 |         raise NotImplementedError(
139 |             "CLS is not provided for {} " "tokenizer".format(self.name)
140 |         )
141 | 
142 |     @property
143 |     def sep(self):
144 |         raise NotImplementedError(
145 |             "SEP is not provided for {} " "tokenizer".format(self.name)
146 |         )
147 | 
148 |     @property
149 |     def pad(self):
150 |         raise NotImplementedError(
151 |             "PAD is not provided for {} " "tokenizer".format(self.name)
152 |         )
153 | 
154 |     @property
155 |     def eod(self):
156 |         raise NotImplementedError(
157 |             "EOD is not provided for {} " "tokenizer".format(self.name)
158 |         )
159 | 
160 |     @property
161 |     def mask(self):
162 |         raise NotImplementedError(
163 |             "MASK is not provided for {} " "tokenizer".format(self.name)
164 |         )
165 | 
166 | 
167 | class _GPT2BPETokenizer(AbstractTokenizer):
168 |     """Original GPT2 BPE tokenizer."""
169 | 
170 |     def __init__(self, vocab_file, merge_file):
171 |         name = "GPT2 BPE"
172 |         super().__init__(name)
173 | 
174 |         self.tokenizer = GPT2Tokenizer(
175 |             vocab_file, merge_file, errors="replace", special_tokens=[], max_len=None
176 |         )
177 |         self.eod_id = self.tokenizer.encoder["<|endoftext|>"]
178 | 
179 |     @property
180 |     def vocab_size(self):
181 |         return len(self.tokenizer.encoder)
182 | 
183 |     @property
184 |     def vocab(self):
185 |         return self.tokenizer.encoder
186 | 
187 |     @property
188 |     def inv_vocab(self):
189 |         return self.tokenizer.decoder
190 | 
191 |     def tokenize(self, text):
192 |         return self.tokenizer.encode(text)
193 | 
194 |     def detokenize(self, token_ids):
195 |         return self.tokenizer.decode(token_ids)
196 | 
197 |     @property
198 |     def eod(self):
199 |         return self.eod_id
200 | 
201 | 
202 | class SentencePieceTokenizer(AbstractTokenizer):
203 |     """Designed to Integrate SP's Tokenizer."""
204 | 
205 |     def __init__(self, vocab_file):
206 |         name = "SPM"
207 |         super().__init__(name)
208 | 
209 |         self.tokenizer = spm.SentencePieceProcessor(model_file=vocab_file)
210 |         # self.eod_id = self.tokenizer.piece_to_id("<|endoftext|>")
211 |         self.eod_id = self.tokenizer.piece_to_id("</s>")
212 |         self.pad_id = self.tokenizer.piece_to_id("[PAD]")
213 |         self.unk_id = self.tokenizer.piece_to_id("<unk>")
214 | 
215 |     @property
216 |     def vocab_size(self):
217 |         return self.tokenizer.get_piece_size()
218 | 
219 |     @property
220 |     def vocab(self):
221 |         return {
222 |             self.tokenizer.id_to_piece(idx): idx
223 |             for idx in range(self.tokenizer.get_piece_size())
224 |         }
225 | 
226 |     @property
227 |     def inv_vocab(self):
228 |         return {
229 |             idx: self.tokenizer.id_to_piece(idx)
230 |             for idx in range(self.tokenizer.get_piece_size())
231 |         }
232 | 
233 |     def tokenize(self, text):
234 |         return self.tokenizer.encode(text)
235 | 
236 |     def detokenize(self, token_ids):
237 |         return self.tokenizer.decode(token_ids)
238 | 
239 |     @property
240 |     def eod(self):
241 |         return self.eod_id
242 | 
243 | 
244 | class HFTokenizer(AbstractTokenizer):
245 |     """Designed to Integrate HF's Tokenizer library."""
246 | 
247 |     def __init__(self, vocab_file):
248 |         name = "HFTokenizer"
249 |         super().__init__(name)
250 | 
251 |         self.tokenizer = Tokenizer.from_file(vocab_file)
252 |         # self.eod_id = self.tokenizer.token_to_id("<|endoftext|>")
253 |         self.eod_id = self.tokenizer.token_to_id("<|end|>")
254 |         # self.pad_id = self.tokenizer.token_to_id("<|padding|>")
255 |         
256 |         # 新词表没有<|padding|>, 用<|extratoken_1|>代替，和tokenization一致
257 |         # self.pad_id = self.tokenizer.token_to_id("<|extratoken_1|>")
258 |         self.pad_id = self.tokenizer.token_to_id("<|pad|>")
259 | 
260 |     @property
261 |     def vocab_size(self):
262 |         return self.tokenizer.get_vocab_size()
263 | 
264 |     @property
265 |     def vocab(self):
266 |         return self.tokenizer.get_vocab()
267 | 
268 |     @property
269 |     def inv_vocab(self):
270 |         return self.tokenizer.decoder
271 | 
272 |     def tokenize(self, text: str):
273 |         return self.tokenizer.encode(text).ids
274 | 
275 |     def tokenize_batch(self, text_batch: Union[List[str], str]):
276 |         return self.tokenizer.encode_batch(text_batch)
277 | 
278 |     def detokenize(self, token_ids):
279 |         return self.tokenizer.decode(token_ids)
280 | 
281 |     @property
282 |     def eod(self):
283 |         return self.eod_id
284 | 
285 | 
286 | class HFGPT2Tokenizer(AbstractTokenizer):
287 |     """Designed to Integrate the pretrained OpenAI GPT2 Tokenizers from HF"""
288 | 
289 |     def __init__(self, vocab_file=None, fast=True):
290 |         name = "HFGPT2Tokenizer"
291 |         if fast:
292 |             name += "Fast"
293 |         super().__init__(name)
294 |         if vocab_file is None:
295 |             vocab_file = "gpt2"
296 |         if fast:
297 |             self.tokenizer = GPT2TokenizerFast.from_pretrained(vocab_file)
298 |         else:
299 |             self.tokenizer = GPT2Tokenizer.from_pretrained(vocab_file)
300 | 
301 |         self.tokenizer.add_special_tokens({"pad_token": "<|padding|>"})
302 |         self.eod_id = self.tokenizer.eos_token_id
303 |         self.pad_id = self.tokenizer.pad_token_id
304 | 
305 |     @property
306 |     def vocab_size(self):
307 |         return len(self.tokenizer)
308 | 
309 |     @property
310 |     def vocab(self):
311 |         return self.tokenizer.get_vocab()
312 | 
313 |     @property
314 |     def inv_vocab(self):
315 |         return self.tokenizer._tokenizer.decoder
316 | 
317 |     def tokenize(self, text: str):
318 |         return self.tokenizer.encode(text)
319 | 
320 |     def tokenize_batch(self, text_batch: Union[List[str], str]):
321 |         if isinstance(text_batch, str):
322 |             text_batch = [text_batch]
323 |         return [self.tokenize(t) for t in text_batch]
324 | 
325 |     def detokenize(self, token_ids):
326 |         return self.tokenizer.decode(token_ids)
327 | 
328 |     @property
329 |     def eod(self):
330 |         return self.eod_id
331 | 
332 | 
333 | class CharLevelTokenizer(AbstractTokenizer):
334 |     """Character Level Tokenizer"""
335 | 
336 |     def __init__(self, vocab_size):
337 |         name = "CharLevelTokenizer"
338 |         super().__init__(name)
339 |         self._vocab_size = vocab_size
340 |         self.eod_id = 0
341 |         self.pad_id = 1
342 | 
343 |     def clamp(self, n):
344 |         return max(32, min(n, self.vocab_size))
345 | 
346 |     @property
347 |     def vocab_size(self):
348 |         return self._vocab_size
349 | 
350 |     @property
351 |     def vocab(self):
352 |         raise NotImplementedError
353 | 
354 |     @property
355 |     def inv_vocab(self):
356 |         raise NotImplementedError
357 | 
358 |     def decode_token(self, token: int):
359 |         return str(chr(self.clamp(token)))
360 | 
361 |     def tokenize(self, text: str):
362 |         return list(np.fromstring(text, dtype=np.uint8))
363 | 
364 |     def tokenize_batch(self, text_batch: Union[List[str], str]):
365 |         if isinstance(text_batch, list):
366 |             return [self.tokenize(s) for s in text_batch]
367 |         else:
368 |             return self.tokenize(text_batch)
369 | 
370 |     def detokenize(self, token_ids):
371 |         return "".join(list(map(self.decode_token, token_ids)))
372 | 
373 |     @property
374 |     def eod(self):
375 |         return self.eod_id
376 | 
377 | 
378 | class TiktokenTokenizer(AbstractTokenizer):
379 |     """Tokenizer from OpenAI's tiktoken implementation"""
380 | 
381 |     def __init__(self, vocab_file):
382 |         try:
383 |             import tiktoken
384 |         except ModuleNotFoundError:
385 |             print("Please install tiktoken: (https://github.com/openai/tiktoken)")
386 |             raise Exception
387 | 
388 |         name = "TiktokenTokenizer"
389 |         super().__init__(name)
390 | 
391 |         self.tokenizer = tiktoken.get_encoding(vocab_file)
392 |         self.eod_id = self.tokenizer.eot_token
393 |         self.pad_id = None
394 | 
395 |     @property
396 |     def vocab_size(self):
397 |         return self.tokenizer.n_vocab
398 | 
399 |     @property
400 |     def vocab(self):
401 |         raise NotImplementedError(
402 |             "TiktokenTokenizer does not implement vocabulary access."
403 |         )
404 | 
405 |     @property
406 |     def inv_vocab(self):
407 |         raise NotImplementedError(
408 |             "TiktokenTokenizer does not implement vocabulary access. \
409 |                 To get the idx-th token in vocabulary, use tokenizer.decode([idx]) ."
410 |         )
411 | 
412 |     def tokenize(self, text: str):
413 |         return self.tokenizer.encode(text)  # ,  allowed_special="all")
414 | 
415 |     def tokenize_batch(self, text_batch: List[str]):
416 |         return self.tokenizer.encode_batch(text_batch, allowed_special="all")
417 | 
418 |     def detokenize(self, token_ids):
419 |         return self.tokenizer.decode(tokens=token_ids, errors="strict")
420 | 
421 |     @property
422 |     def eod(self):
423 |         return self.eod_id
424 | 
425 |     @property
426 |     def pad(self):
427 |         raise NotImplementedError
428 | 


--------------------------------------------------------------------------------