├── images ├── contact_me_qr.png ├── transnormerllm-arch.png └── transnormerllm-benchmark.png ├── requirements.txt ├── fine-tune ├── requirements.txt ├── run.sh ├── configs │ └── zero3.json ├── utils.py └── train.py ├── LICENSE ├── README_CN.md └── README.md /images/contact_me_qr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNLPLab/TransnormerLLM/HEAD/images/contact_me_qr.png -------------------------------------------------------------------------------- /images/transnormerllm-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNLPLab/TransnormerLLM/HEAD/images/transnormerllm-arch.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | transformers 3 | torch==2.0.0 4 | sentencepiece 5 | tokenizers 6 | triton==2.0.0 7 | einops 8 | -------------------------------------------------------------------------------- /images/transnormerllm-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenNLPLab/TransnormerLLM/HEAD/images/transnormerllm-benchmark.png -------------------------------------------------------------------------------- /fine-tune/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | transformers 3 | torch==2.0.0 4 | sentencepiece 5 | tokenizers 6 | accelerate 7 | deepspeed 8 | triton==2.0.0 9 | einops 10 | -------------------------------------------------------------------------------- /fine-tune/run.sh: -------------------------------------------------------------------------------- 1 | GPUs=$1 2 | MODEL=path/to/model 3 | DATA_PATH=path/to/data 4 | 5 | torchrun \ 6 | --nproc_per_node=$GPUs \ 7 | train.py \ 8 | --model_name_or_path $MODEL \ 9 | --data_path $DATA_PATH \ 10 | --output_dir output/test \ 11 | --num_train_epochs 1 \ 12 | --per_device_train_batch_size 2 \ 13 | --per_device_eval_batch_size 1 \ 14 | --gradient_accumulation_steps 1 \ 15 | --bf16 true \ 16 | --adam_beta1 0.9 \ 17 | --adam_beta2 0.95 \ 18 | --evaluation_strategy "no" \ 19 | --save_strategy "steps" \ 20 | --save_steps 5000 \ 21 | --save_total_limit 30 \ 22 | --learning_rate 1e-4 \ 23 | --weight_decay 0.1 \ 24 | --warmup_ratio 0.1 \ 25 | --lr_scheduler_type "cosine" \ 26 | --deepspeed 'configs/zero3.json' \ 27 | --logging_steps 1 \ 28 | --dataloader_num_workers 24 \ 29 | --ddp_find_unused_parameters false \ 30 | --tf32 true \ 31 | -------------------------------------------------------------------------------- /fine-tune/configs/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "optimizer": { 6 | "type": "AdamW", 7 | "params": { 8 | "lr": "auto", 9 | "betas": "auto", 10 | "eps": "auto", 11 | "weight_decay": "auto" 12 | } 13 | }, 14 | "scheduler": { 15 | "type": "WarmupDecayLR", 16 | "params": { 17 | "total_num_steps": "auto", 18 | "warmup_min_lr": "auto", 19 | "warmup_max_lr": "auto", 20 | "warmup_num_steps": "auto" 21 | } 22 | }, 23 | "zero_optimization": { 24 | "stage": 3, 25 | "overlap_comm": true, 26 | "contiguous_gradients": true, 27 | "sub_group_size": 1e9, 28 | "stage3_gather_16bit_weights_on_model_save": true 29 | }, 30 | "gradient_accumulation_steps": "auto", 31 | 
"gradient_clipping": "auto", 32 | "steps_per_print": 5, 33 | "train_batch_size": "auto", 34 | "train_micro_batch_size_per_gpu": "auto", 35 | "wall_clock_breakdown": false, 36 | "activation_checkpointing": { 37 | "partition_activations": false, 38 | "cpu_checkpointing": false, 39 | "contiguous_memory_optimization": false, 40 | "number_checkpoints": null, 41 | "synchronize_checkpoint_boundary": false, 42 | "profile": false 43 | } 44 | } -------------------------------------------------------------------------------- /fine-tune/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 OpenNLPLab 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import io 16 | import json 17 | import os 18 | 19 | 20 | def _make_w_io_base(f, mode: str): 21 | if not isinstance(f, io.IOBase): 22 | f_dirname = os.path.dirname(f) 23 | if f_dirname != "": 24 | os.makedirs(f_dirname, exist_ok=True) 25 | f = open(f, mode=mode) 26 | return f 27 | 28 | 29 | def _make_r_io_base(f, mode: str): 30 | if not isinstance(f, io.IOBase): 31 | f = open(f, mode=mode) 32 | return f 33 | 34 | 35 | def jdump(obj, f, mode="w", indent=4, default=str): 36 | """Dump a str or dictionary to a file in json format. 37 | 38 | Args: 39 | obj: An object to be written. 40 | f: A string path to the location on disk. 41 | mode: Mode for opening the file. 42 | indent: Indent for storing json dictionaries. 43 | default: A function to handle non-serializable entries; defaults to `str`. 44 | """ 45 | f = _make_w_io_base(f, mode) 46 | if isinstance(obj, (dict, list)): 47 | json.dump(obj, f, indent=indent, default=default) 48 | elif isinstance(obj, str): 49 | f.write(obj) 50 | else: 51 | raise ValueError(f"Unexpected type: {type(obj)}") 52 | f.close() 53 | 54 | 55 | def jload(f, mode="r"): 56 | """Load a .json file into a dictionary.""" 57 | f = _make_r_io_base(f, mode) 58 | jdict = json.load(f) 59 | f.close() 60 | return jdict 61 | -------------------------------------------------------------------------------- /fine-tune/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 OpenNLPLab 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import copy 16 | from dataclasses import dataclass, field 17 | import logging 18 | from typing import Dict, Optional, Sequence 19 | 20 | import torch 21 | from torch.utils.data import Dataset 22 | import transformers 23 | from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer 24 | import utils 25 | 26 | IGNORE_INDEX = -100 27 | HUMAN_PREFIX = 'Human:\n' 28 | ASSISTANT_PREFIX = 'Assistant:\n' 29 | 30 | 31 | @dataclass 32 | class ModelArguments: 33 | model_name_or_path: Optional[str] = field( 34 | default="OpenNLPLab/transnormerllm-410m") 35 | 36 | 37 | @dataclass 38 | class DataArguments: 39 | data_path: str = field(default=None, 40 | metadata={"help": "Path to the training data."}) 41 | 42 | 43 | @dataclass 44 | class TrainingArguments(transformers.TrainingArguments): 45 | cache_dir: Optional[str] = field(default=None) 46 | optim: str = field(default="adamw_torch") 47 | model_max_length: int = field( 48 | default=2048, 49 | metadata={ 50 | "help": 51 | "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 52 | }, 53 | ) 54 | 55 | 56 | def _tokenize_fn(strings: Sequence[str], 57 | tokenizer: transformers.PreTrainedTokenizer) -> Dict: 58 | """Tokenize a list of strings.""" 59 | tokenized_list = [ 60 | tokenizer( 61 | text, 62 | return_tensors="pt", 63 | # padding="longest", 64 | max_length=2048, 65 | truncation=True, 66 | ) for text in strings 67 | ] 68 | input_ids = labels = [ 69 | tokenized.input_ids[0] for tokenized in tokenized_list 70 | ] 71 | input_ids_lens = labels_lens = [ 72 | tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() 73 | for tokenized in tokenized_list 74 | ] 75 | return dict( 76 | input_ids=input_ids, 77 | labels=labels, 78 | input_ids_lens=input_ids_lens, 79 | labels_lens=labels_lens, 80 | ) 81 | 82 | 83 | def preprocess( 84 | sources: Sequence[str], 85 | targets: Sequence[str], 86 | tokenizer: transformers.PreTrainedTokenizer, 87 | ) -> Dict: 88 | """Preprocess the data by tokenizing.""" 89 | examples = [s + t for s, t in zip(sources, targets)] 90 | examples_tokenized, sources_tokenized = [ 91 | _tokenize_fn(strings, tokenizer) for strings in (examples, sources) 92 | ] 93 | input_ids = examples_tokenized["input_ids"] 94 | labels = copy.deepcopy(input_ids) 95 | 96 | for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]): 97 | label[:source_len] = IGNORE_INDEX 98 | return dict(input_ids=input_ids, labels=labels) 99 | 100 | 101 | class SupervisedDataset(Dataset): 102 | """Dataset for supervised fine-tuning.""" 103 | 104 | def __init__(self, data_path: str, 105 | tokenizer: transformers.PreTrainedTokenizer): 106 | super(SupervisedDataset, self).__init__() 107 | logging.warning("Loading data...") 108 | 109 | list_data_dict = utils.jload(data_path) 110 | 111 | logging.warning("Formatting inputs...") 112 | sources = [] 113 | targets = [] 114 | 115 | for example in list_data_dict: 116 | if len(example['instruction']) > 0 and len(example['input']) > 0: 117 | sources.append(example['instruction'] + '\n' + HUMAN_PREFIX + 118 | example['input'] + '\n' + ASSISTANT_PREFIX) 119 | else: 120 | _input = example['instruction'] if len( 121 | example['input']) == 0 else example['input'] 122 | sources.append(HUMAN_PREFIX + _input + '\n' + ASSISTANT_PREFIX) 123 | 124 | targets.append(example['output'] + tokenizer.eos_token) 125 | 126 | logging.warning("Tokenizing inputs... 
This may take some time...") 127 | data_dict = preprocess(sources, targets, tokenizer) 128 | 129 | self.input_ids = data_dict["input_ids"] 130 | self.labels = data_dict["labels"] 131 | 132 | def __len__(self): 133 | return len(self.input_ids) 134 | 135 | def __getitem__(self, i) -> Dict[str, torch.Tensor]: 136 | return dict(input_ids=self.input_ids[i], labels=self.labels[i]) 137 | 138 | 139 | @dataclass 140 | class DataCollatorForSupervisedDataset(object): 141 | """Collate examples for supervised fine-tuning.""" 142 | 143 | tokenizer: transformers.PreTrainedTokenizer 144 | 145 | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 146 | input_ids, labels = tuple([instance[key] for instance in instances] 147 | for key in ("input_ids", "labels")) 148 | input_ids = torch.nn.utils.rnn.pad_sequence( 149 | input_ids, 150 | batch_first=True, 151 | padding_value=self.tokenizer.pad_token_id) 152 | labels = torch.nn.utils.rnn.pad_sequence(labels, 153 | batch_first=True, 154 | padding_value=IGNORE_INDEX) 155 | 156 | return dict( 157 | input_ids=input_ids, 158 | labels=labels, 159 | attention_mask=input_ids.ne(self.tokenizer.pad_token_id).to( 160 | torch.int), 161 | ) 162 | 163 | 164 | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, 165 | data_args) -> Dict: 166 | """Make dataset and collator for supervised fine-tuning.""" 167 | train_dataset = SupervisedDataset(tokenizer=tokenizer, 168 | data_path=data_args.data_path) 169 | data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) 170 | return dict(train_dataset=train_dataset, 171 | eval_dataset=None, 172 | data_collator=data_collator) 173 | 174 | 175 | def train(): 176 | parser = transformers.HfArgumentParser( 177 | (ModelArguments, DataArguments, TrainingArguments)) 178 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 179 | 180 | # load model 181 | model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, 182 | trust_remote_code=True, 183 | torch_dtype=getattr( 184 | torch, 'bfloat16')) 185 | model.train() 186 | 187 | # load tokenizer 188 | tokenizer = AutoTokenizer.from_pretrained( 189 | model_args.model_name_or_path, 190 | use_fast=True, 191 | model_max_length=training_args.model_max_length, 192 | padding_side="right", 193 | trust_remote_code=True) 194 | # setup pad token 195 | model.config.pad_token_id = 0 196 | tokenizer.pad_token_id = 0 197 | 198 | data_module = make_supervised_data_module(tokenizer=tokenizer, 199 | data_args=data_args) 200 | 201 | torch.cuda.empty_cache() 202 | trainer = Trainer(model=model, 203 | tokenizer=tokenizer, 204 | args=training_args, 205 | **data_module) 206 | 207 | trainer.train() 208 | trainer.save_state() 209 | trainer.save_model(output_dir=training_args.output_dir) 210 | 211 | 212 | if __name__ == "__main__": 213 | train() 214 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2023] [OpenNLPLab] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 |

6 | TransNormerLLM -- A Faster and Better LLM 7 |

8 |
9 | 10 |

11 | 🤗 Hugging Face • 12 | 🤖 Model Scope • 13 | 💬 Discord • 14 | 💬 微信 15 |

16 |
17 | 18 | [![license](https://img.shields.io/github/license/modelscope/modelscope.svg)](https://github.com/OpenNLPLab/TransNormerLLM/blob/main/LICENSE) 19 |

20 |

21 | English | 22 | 中文 23 |

24 |

25 |
26 | 27 | ------ 28 | - [入门简介](#入门简介) 29 | - [开源模型](#开源模型) 30 | - [评测结果](#评测结果) 31 | - [通用领域](#通用领域) 32 | - [模型结果](#模型结果) 33 | - [推理部署](#推理部署) 34 | - [安装依赖](#安装依赖) 35 | - [特别注意](#特别注意) 36 | - [Python 推理代码](#python-推理代码) 37 | - [基础模型推理演示](#基础模型推理演示) 38 | - [微调模型](#微调模型) 39 | - [依赖安装](#依赖安装) 40 | - [训练](#训练) 41 | - [社区生态](#社区生态) 42 | - [许可声明](#许可声明) 43 | - [声明](#声明) 44 | - [协议](#协议) 45 | - [致谢](#致谢) 46 | - [引用](#引用) 47 | 48 | # 入门简介 49 | 50 | 我们正在重新定义大型语言模型(LLM)。该代码仓库是[TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf)的官方实现。 我们的 TransNormerLLM 开放权现在可供个人、创作者、研究人员和各种规模的企业使用,以便他们能够负责任地实验、创新和扩展他们的想法。 51 | 52 | 我们开放的版本包含 TransNormerLLM 模型实现、开源权重和监督微调 (SFT) 的起始代码。 我们将展示如何加载 [TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf) 模型、运行 SFT 并对其进行推理的示例。 53 | 54 | - TransNormerLLM 是第一个基于线性注意力的 LLM,在准确性和效率方面均优于传统的基于 softmax 注意力的模型。 它是在具有 **1.4 万亿** 的高质量token语料库上进行训练的。 55 | - TransNormerLLM 从之前的线性注意力架构 TransNormer 演变而来,进行了一系列的优化,包括 LRPE 位置嵌入、闪电注意力加速、新的门控和标准化机制。 56 | - TransNormerLLM 在多个广受认可的中文、英文以及多语言通用和特定领域基准测试中取得了同类规模的非常有竞争性的性能。 57 | - 此版本包括具有 **385M**、**1B** 和 **7B** 参数的 **Base** 版本。 58 | - 所有版本均完全开放给学术研究。 开发者只需通过电子邮件申请并获得官方商业许可即可免费商业使用。 59 | - 欲了解更多信息,欢迎阅读我们的学术论文[TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf)。 60 | 61 | ![](./images/TransNormerLLM-arch.png) 62 | 63 | # 开源模型 64 | 65 | 具体发布版本及下载链接如下: 66 | 67 | | | 基础模型 | 68 | |:-------:|:-----------:| 69 | | 385M | 🤗 [TransNormerLLM-385M](https://huggingface.co/OpenNLPLab/TransNormerLLM-385M) | 70 | | 1B | 🤗 [TransNormerLLM-1B](https://huggingface.co/OpenNLPLab/TransNormerLLM-1B) | 71 | | 7B | 🤗 [TransNormerLLM-7B](https://huggingface.co/OpenNLPLab/TransNormerLLM-7B) | 72 | 73 | # 评测结果 74 | 75 | 为了验证 TransNormerLLM,我们在 Commonsense Reasoning Task、MMLU、CMMLU 和 C-Eval 上测试了 385M、1B 和 7B 模型。 为了进行比较,我们选择了几个开源模型作为比较,包括基于 Transformer 的模型,如 OPT、Pythia、BLOOM、GPT-Neo、GPT-J、MPT、Falcon、LLaMA1/2、OpenLLAMA v1/v2、Baichuan 1/ 2、ChatGLM 1/2,以及非Transformer模型RWKV。 可以看出,与这些模型相比,TransNormerLLM仍然具有很强的竞争力。 76 | 77 | **常识推理** 我们报告 BoolQ、PIQA、SIQA、 78 | HellaSwag、WinoGrande、ARC 简单和挑战、OpenBookQA 及其平均值。 我们使用 LM-Eval-Harness 报告所有基准测试的0-shot结果。 79 | 与现有最先进的大语言模型相比,我们所有的模型都取得了具有竞争力的表现,展示了理解和应用常识推理的卓越能力。 80 | 81 | **汇总基准** 82 | 我们报告 MMLU、CMMLU、C-Eval 的总体结果。使用官方脚本来评估 MMLU、CMMLU 和 C-Eval,所有评估结果均采用 5-shot结果。 与业界顶级的开源模型相比,我们的模型在英文和中文基准测试中都表现出了相匹配的性能。 83 | 84 | 85 | ## 通用领域 86 | 87 | 在通用领域,我们对以下数据集进行了 5-shot 测试: 88 | - [C-Eval](https://cevalbenchmark.com/index.html#home)是一个综合性的中文基础模型评估数据集,涵盖52个学科和四个难度级别。 我们使用该数据集的开发集作为小样本学习的来源,并在测试集上进行测试。 我们的评估方法遵循 [LM-Evaluation-Harness](https://github.com/EleutherAI/lm-evaluation-harness)。 89 | - [MMLU](https://arxiv.org/abs/2009.03300)是一个英语评估数据集,包含57个任务,涵盖小学数学、美国历史、计算机科学、法律等,难度从高中水平到专家水平 。 它是主流的LLM评估数据集。 我们使用其[官方](https://github.com/hendrycks/test)评估方法。 90 | - [CMMLU](https://github.com/haonan-li/CMMLU)是一个涵盖67个主题的综合中文评估基准,专门用于评估语言模型在中文背景下的知识和推理能力。 我们采用了其[官方](https://github.com/haonan-li/CMMLU)评估方法。 91 | 92 | 93 | ### 模型结果 94 | 95 | **常识推理和通用领域性能比较。** 为了公平比较,我们报告了我们使用其发布的模型重现的竞争方法的结果。 PS:参数大小(十亿)。 T:Tokens(万亿)。 HS:HellaSwag。 WG:WinoGrande。 96 | 97 | | Model | PS | T | BoolQ | PIQA | HS | WG | ARC-e | ARC-c | OBQA | MMLU | CMMLU | C-Eval | 98 | |-------------|------|------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------| 99 | | OPT | 0.35 | 0.30 | 57.74 | 64.58 | 36.69 | 52.49 | 44.02 | 23.89 | 28.20 | 26.02 | 25.34 | 25.71 | 100 | | Pythia | 0.40 | 0.30 | 60.40 | 67.08 | 40.52 | 53.59 | 51.81 | 24.15 | 29.40 | 25.99 | 25.16 
| 24.81 | 101 | | BLOOM | 0.56 | 0.35 | 55.14 | 64.09 | 36.97 | 52.80 | 47.35 | 23.98 | 28.20 | 24.80 | 25.35 | 27.14 | 102 | | RWKV | 0.43 | - | - | 67.52 | 40.90 | 51.14 | 52.86 | 25.17 | 32.40 | 24.85 | - | - | 103 | | **Ours** | 0.39 | 1.0 | 62.14 | 66.70 | 46.27 | 54.46 | 55.43 | 27.99 | 32.40 | 25.90 | 25.05 | 25.24 | 104 | | GPT-Neo | 1.3 | 0.3 | 61.99 | 71.11 | 48.93 | 54.93 | 56.19 | 25.85 | 33.60 | 24.82 | 26.03 | 23.94 | 105 | | OPT | 1.3 | 0.3 | 57.77 | 71.71 | 53.70 | 59.35 | 57.24 | 29.69 | 33.20 | 24.96 | 24.97 | 25.32 | 106 | | Pythia | 1.4 | 0.3 | 60.73 | 70.67 | 47.18 | 53.51 | 56.99 | 26.88 | 31.40 | 26.55 | 25.13 | 24.25 | 107 | | BLOOM | 1.1 | 0.35 | 59.08 | 67.14 | 42.98 | 54.93 | 51.47 | 25.68 | 29.40 | 27.30 | 25.09 | 26.50 | 108 | | RWKV | 1.5 | - | - | 72.36 | 52.48 | 54.62 | 60.48 | 29.44 | 34.00 | 25.77 | - | - | 109 | | Falcon | 1.0 | 0.35 | 61.38 | 75.14 | 61.50 | 60.30 | 63.38 | 32.17 | 35.60 | 25.28 | 24.88 | 25.66 | 110 | | **Ours** | 1.0 | 1.2 | 63.27 | 72.09 | 56.49 | 60.38 | 63.68 | 35.24 | 36.60 | 27.10 | 25.88 | 26.01 | 111 | | GPT-J | 6.9 | 0.3 | 65.44 | 75.41 | 66.25 | 64.09 | 66.92 | 36.60 | 38.20 | 25.40 | 26.47 | 23.39 | 112 | | OPT | 6.7 | 0.3 | 66.18 | 76.22 | 67.21 | 65.19 | 65.66 | 34.64 | 37.20 | 24.57 | 25.36 | 25.32 | 113 | | Pythia | 6.9 | 0.3 | 63.46 | 75.14 | 63.92 | 60.77 | 67.34 | 35.41 | 37.00 | 24.64 | 25.56 | 26.40 | 114 | | BLOOM | 7.1 | 0.35 | 62.91 | 72.69 | 62.33 | 64.01 | 65.11 | 33.45 | 35.80 | 26.25 | 24.97 | 24.25 | 115 | | RWKV | 7.4 | - | - | 76.06 | 65.51 | 61.01 | 67.80 | 37.46 | 40.20 | 24.96 | - | - | 116 | | MPT | 6.9 | 1.0 | 73.88 | 79.43 | 76.25 | 68.27 | 74.79 | 41.72 | 42.20 | 30.80 | 25.99 | 24.06 | 117 | | Falcon | 7.2 | 1.5 | 73.73 | 79.38 | 76.3 | 67.17 | 74.62 | 43.60 | 43.80 | 27.79 | 25.73 | 22.92 | 118 | | Baichuan1 | 7.0 | 1.2 | 70.09 | 76.01 | 70.06 | 64.09 | 71.72 | 40.53 | 38.20 | 42.30 | 44.43 | 42.80 | 119 | | Baichuan2 | 7.0 | 2.6 | 72.72 | 76.50 | 72.17 | 68.35 | 75.17 | 42.32 | 39.60 | 54.16 | 57.07 | 54.00 | 120 | | ChatGLM1 | 6.7 | 1.0 | 74.74 | 68.88 | 45.57 | 52.25 | 48.78 | 31.66 | 36.80 | 40.63 | 37.48 | 40.23 | 121 | | ChatGLM2 | 7.1 | 1.4 | 77.65 | 69.37 | 50.51 | 57.62 | 59.13 | 34.30 | 37.00 | 45.46 | 48.80 | 52.55 | 122 | | OpenLLaMAv1 | 6.7 | 1.0 | 70.43 | 75.68 | 69.23 | 66.69 | 71.17 | 38.57 | 39.00 | 30.49 | 25.40 | 26.09 | 123 | | OpenLLaMAv2 | 6.7 | 1.0 | 72.20 | 78.84 | 74.51 | 65.67 | 72.39 | 41.30 | 41.00 | 41.29 | 29.58 | 30.01 | 124 | | LLaMA1 | 6.7 | 1.0 | 76.50 | 79.80 | 76.10 | 70.10 | 72.80 | 47.60 | 57.20 | 35.10 | 25.62 | 25.72 | 125 | | LLaMA2 | 6.7 | 2.0 | 77.68 | 78.07 | 76.02 | 68.98 | 76.30 | 46.33 | 44.20 | 45.30 | 32.96 | 33.20 | 126 | | **Ours** | 6.8 | 1.4 | 75.87 | 80.09 | 75.21 | 66.06 | 75.42 | 44.40 | 63.40 | 43.10 | 47.99 | 43.18 | 127 | 128 | 129 | # 推理部署 130 | 131 | 推理所需的模型权重、源代码和配置已在 Hugging Face 上发布。 下载链接可以在本文档开头的[表格](#开源模型)中找到。 下面,我们以 TransNormerLLM-1B 为例演示各种推理方法。 程序会自动从Hugging Face下载所需的资源。 132 | ## 安装依赖 133 | 134 | ```shell 135 | pip install -r requirements.txt 136 | ``` 137 | 138 | ## 特别注意 139 | 如果遇到Triton相关错误,请设置以下环境变量: 140 | ``` 141 | export use_triton=False 142 | ``` 143 | 144 | ## Python 推理代码 145 | 146 | ### 基础模型推理演示 147 | 148 | ```python 149 | >>> from transformers import AutoModelForCausalLM, AutoTokenizer 150 | >>> tokenizer = AutoTokenizer.from_pretrained("OpenNLPLab/TransNormerLLM-1B", trust_remote_code=True) 151 | >>> model = AutoModelForCausalLM.from_pretrained("OpenNLPLab/TransNormerLLM-1B", device_map="auto", trust_remote_code=True) 152 
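>>> # 下面是一个简单的生成示例（仅作演示，假设模型支持 transformers 标准的 generate 接口；提示词与 max_new_tokens 可按需调整）
>>> inputs = tokenizer("你好，请介绍一下自己。", return_tensors="pt").to(model.device)
>>> outputs = model.generate(**inputs, max_new_tokens=64)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))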
| ``` 153 | 154 | > 在上面的代码片段中,模型加载指定`device_map='auto'`,它将使用所有可用的GPU。 如果需要指定要使用的设备,可以通过类似于“export CUDA_VISIBLE_DEVICES=0,1”(使用0和1显卡)的方式进行控制。 155 | 156 | # 微调模型 157 | 158 | ## 依赖安装 159 | 160 | ```shell 161 | git clone https://github.com/OpenNLPLab/TransNormerLLM.git 162 | cd TransNormerLLM/fine-tune 163 | pip install -r requirements.txt 164 | ``` 165 | - 要使用LoRA等轻量级微调方法,您必须另外安装[peft](https://github.com/huggingface/peft)。 166 | 167 | ## 训练 168 | 169 | 下面,我们提供了使用 ZeRO-3 在单台机器上微调 TransNormerLLM-7B-Base 的示例。 170 | 171 | 训练数据:`alpaca_data.json`。 此示例数据取自 [alpaca_data.json](https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json),包含 52,002 个条目的选择,并已重新格式化。 主要目的是演示如何SFT我们的模型,不保证有效性。 172 | 173 | ```shell 174 | torchrun \ 175 | --nproc_per_node=8 \ 176 | train.py \ 177 | --model_name_or_path OpenNLPLab/TransNormerLLM-1B \ 178 | --data_path ./alpaca_data.json \ 179 | --output_dir output \ 180 | --num_train_epochs 1 \ 181 | --per_device_train_batch_size 2 \ 182 | --per_device_eval_batch_size 1 \ 183 | --gradient_accumulation_steps 1 \ 184 | --bf16 true \ 185 | --adam_beta1 0.9 \ 186 | --adam_beta2 0.95 \ 187 | --evaluation_strategy "no" \ 188 | --save_strategy "steps" \ 189 | --save_steps 5000 \ 190 | --save_total_limit 30 \ 191 | --learning_rate 1e-4 \ 192 | --weight_decay 0.1 \ 193 | --warmup_ratio 0.1 \ 194 | --lr_scheduler_type "cosine" \ 195 | --deepspeed 'configs/zero3.json' \ 196 | --logging_steps 1 \ 197 | --dataloader_num_workers 24 \ 198 | --ddp_find_unused_parameters false \ 199 | --tf32 true \ 200 | ``` 201 | 202 | # 社区生态 203 | 204 | **📢📢📢我们将不断更新这里社区和生态系统对 TransNormerLLM 的支持😀😀😀** 205 | - [nanoTransnormer](https://github.com/Doraemonzzz/nanoTransNormer) 206 | 207 | # 许可声明 208 | 209 | ## 声明 210 | 211 | 212 | 我们特此声明,我们的团队没有开发过任何基于 TransNormerLLM 模型的应用程序,也没有在 iOS、Android、Web 或任何其他平台上开发过。 我们强烈呼吁所有用户不要利用TransNormerLLM模型进行任何危害国家/社会安全或违法的活动。 此外,我们要求用户不要将 TransNormerLLM 模型用于未经过适当安全审查和备案的互联网服务。 我们希望所有用户都能遵守这一原则,确保技术的发展在规范、合法的环境中进行。 213 | 214 | 我们已尽力确保模型训练过程中使用的数据的合规性。 然而,尽管我们付出了巨大的努力,由于模型和数据的复杂性,仍然可能会出现一些不可预见的问题。 因此,如果因使用TransNormerLLM开源模型而出现任何问题,包括但不限于数据安全问题、舆情风险,或模型被误导、滥用、传播或不当利用带来的任何风险和问题, 我们将不承担任何责任。 215 | 216 | ## 协议 217 | 218 | TransNormerLLM 模型的社区使用需要遵守 [Apache 2.0](https://github.com/OpenNLPLab/TransNormerLLM/blob/main/LICENSE) 和 [TransNormerLLM 模型社区许可证](https://huggingface.co/OpenNLPLab/TransNormerLLM-1B/blob/main/TransNormerLLM模型社区许可协议.pdf)。 TransNormerLLM 模型支持商业用途。 如果您计划将TransNormerLLM模型或其衍生品用于商业目的,请确保您的实体满足以下条件: 219 | 220 | 1. 您或您关联公司的服务或产品的日活跃用户(DAU)低于100万。 221 | 2. 您或您的关联公司都不是软件服务提供商或云服务提供商。 222 | 3. 
未经 TransNormerLLM 许可,您或您的关联公司不可能将给予您的商业许可授予或重新授权给其他第三方。 223 | 224 | 满足上述条件后,您需要通过以下联系邮箱提交TransNormerLLM模型社区许可协议所需的申请材料:opennlplab@gmail.com。 一旦获得批准,TransNormerLLM 将特此授予您非排他性、全球性、不可转让、不可再许可、可撤销的商业版权许可。 225 | 226 | ## 致谢 227 | 我们的项目基于如下开源项目进行开发: 228 | - [Baichuan](https://github.com/baichuan-inc/Baichuan-7B)用于tokenizer部分。 229 | - [metaseq](https://github.com/facebookresearch/metaseq)用于训练部分。 230 | - [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)用于测评部分。 231 | 232 | 233 | ## 引用 234 | 235 | 如果您想引用我们的工作,请使用以下参考文献: 236 | ``` 237 | @misc{qin2024transnormerllm, 238 | title={TransNormerLLM: A Faster and Better Large Language Model with Improved TransNormer}, 239 | author={Zhen Qin and Dong Li and Weigao Sun and Weixuan Sun and Xuyang Shen and Xiaodong Han and Yunshen Wei and Baohong Lv and Xiao Luo and Yu Qiao and Yiran Zhong}, 240 | year={2024}, 241 | eprint={2307.14995}, 242 | archivePrefix={arXiv}, 243 | primaryClass={cs.CL} 244 | } 245 | 246 | @misc{qin2024lightning, 247 | title={Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models}, 248 | author={Zhen Qin and Weigao Sun and Dong Li and Xuyang Shen and Weixuan Sun and Yiran Zhong}, 249 | year={2024}, 250 | eprint={2401.04658}, 251 | archivePrefix={arXiv}, 252 | primaryClass={cs.CL} 253 | } 254 | 255 | ``` 256 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 |

6 | TransNormerLLM -- A Faster and Better LLM 7 |

8 |
9 | 10 |

11 | 🤗 Hugging Face • 12 | 🤖 Model Scope • 13 | 💬 Discord • 14 | 💬 WeChat • 15 | 🔢 GPTQ 16 |

17 |
18 | 19 | 20 | [![license](https://img.shields.io/github/license/modelscope/modelscope.svg)](https://github.com/OpenNLPLab/TransNormerLLM/blob/main/LICENSE) 21 |

22 |

23 | English | 24 | 中文 25 |

26 |

27 |
28 | 29 | 30 | ------ 31 | - [Introduction](#introduction) 32 | - [Released Weights](#released-weights) 33 | - [Benchmark Results](#benchmark-results) 34 | - [General Domain](#general-domain) 35 | - [Model Results](#model-results) 36 | - [Inference and Deployment](#inference-and-deployment) 37 | - [Dependency Installation](#dependency-installation) 38 | - [Notice](#notice) 39 | - [Python Code Inference](#python-code-inference) 40 | - [Demonstration of Base Model Inference](#demonstration-of-base-model-inference) 41 | - [Fine-tuning the Model](#fine-tuning-the-model) 42 | - [Dependency Installation](#dependency-installation-1) 43 | - [Training](#training) 44 | - [Community and Ecosystem](#community-and-ecosystem) 45 | - [Disclaimer, License and Citation](#disclaimer-license-and-citation) 46 | - [Disclaimer](#disclaimer) 47 | - [License](#license) 48 | - [Acknowledgments](#acknowledgments) 49 | - [Citation](#citation) 50 | 51 | # Introduction 52 | 53 | We are re-inventing the Large Language Model (LLM). This is the official implementation of [TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf). Our opened weights of TransNormerLLM are now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly. 54 | 55 | Our release contains the TransNormerLLM model implementation, the open-source weights and the starting code for Supervised Fine-tuning (SFT). We will show examples on how to load [TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf) models, run SFT and inference on it. 56 | 57 | - TransNormerLLM is the first linear attention-based LLM that outperforms conventional softmax attention-based models in terms of both accuracy and efficiency. It was trained on a high-quality corpus with up to **1.4 trillion** tokens. 58 | - TransNormerLLM evolves from the previous linear attention architecture TransNormer by making advanced modifications that include LRPE positional embedding, Lightning Attention acceleration, new gating and normalization mechanisms. 59 | - TransNormerLLM achieved competitive performance of its size on multiple well-approved Chinese, English, and multi-language general and domain-specific benchmarks. 60 | - This release includes **Base** versions with **385M**, **1B**, and **7B** parameters. 61 | - All versions are fully open to academic research. Developers only need to apply via email and obtain official commercial permission to use it for free commercially. 62 | - For more information, welcome reading our academic paper [TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf). 63 | - 🔥Get excited!🔥 Our **15B** model is currently in training! Click the [link](https://api.wandb.ai/links/opennlplab/kip314lq) 👀 to track our thrilling progress in real time! 🚀 64 | 65 | ![](./images/TransNormerLLM-arch.png) 66 | 67 | # Released Weights 68 | 69 | The specific released versions and download links are shown as below: 70 | 71 | | | Base Models | 72 | | :---------------: | :----------------------------------------------------------------------------: | 73 | | 385M | 🤗 [TransNormerLLM-385M](https://huggingface.co/OpenNLPLab/TransNormerLLM-385M) | 74 | | 1B | 🤗 [TransNormerLLM-1B](https://huggingface.co/OpenNLPLab/TransNormerLLM-1B) | 75 | | 7B | 🤗 [TransNormerLLM-7B](https://huggingface.co/OpenNLPLab/TransNormerLLM-7B) | 76 | 77 | # Benchmark Results 78 | 79 | To validate TransNormerLLM, we tested our 385M, 1B, and 7B models on Commonsense Reasoning Task, MMLU, CMMLU, and C-Eval. 
For comparison, we selected several open-source models as competitors, including Transformer-based models such as OPT, Pythia, BLOOM, GPT-Neo, GPT-J, MPT, Falcon, LLaMA1/2, OpenLLAMA v1/v2, Baichuan 1/2, ChatGLM 1/2, and the non-Transformer model RWKV. It can be observed that, compared to these models, TransNormerLLM remains highly competitive. 80 | 81 | **Commonsense Reasoning** We report BoolQ, PIQA, SIQA, 82 | HellaSwag, WinoGrande, ARC easy and challenge, OpenBookQA and their average. We report 0-shot results for all benchmarks using LM-Eval-Harness. 83 | All of our models achieve competitive performance compared to existing state-of-the-art LLMs, showcasing a remarkable ability to comprehend and apply commonsense reasoning. 84 | 85 | **Aggregated Benchmarks** 86 | We report the overall results for MMLU, CMMLU, and C-Eval. Official scripts were used for evaluating MMLU, CMMLU, and C-Eval, with all evaluations conducted in a 5-shot setting. In comparison to top-tier open-source models available in the industry, our models demonstrate comparable performance on both English and Chinese benchmarks. 87 | 88 | ## General Domain 89 | 90 | In the general domain, we conducted 5-shot tests on the following datasets: 91 | - [C-Eval](https://cevalbenchmark.com/index.html#home) is a comprehensive evaluation dataset for Chinese foundation models, covering 52 disciplines and four levels of difficulty. Our evaluation approach followed that of [LM-Evaluation-Harness](https://github.com/EleutherAI/lm-evaluation-harness). 92 | - [MMLU](https://arxiv.org/abs/2009.03300) is an English evaluation dataset comprising 57 tasks, encompassing elementary math, American history, computer science, law, etc. The difficulty ranges from high school level to expert level. It is a mainstream LLM evaluation dataset. We used its [official](https://github.com/hendrycks/test) evaluation approach. 93 | - [CMMLU](https://github.com/haonan-li/CMMLU) is a comprehensive Chinese evaluation benchmark covering 67 topics, specifically designed to assess language models' knowledge and reasoning capabilities in a Chinese context. We adopted its [official](https://github.com/haonan-li/CMMLU) evaluation approach. 94 | 95 | 96 | ### Model Results 97 | **Performance Comparison on Commonsense Reasoning and Aggregated Benchmarks.** For a fair comparison, we report competing methods' results reproduced by us using their released models. PS: parameter size (billion). T: tokens (trillion). HS: HellaSwag. WG: WinoGrande.
98 | 99 | | Model | PS | T | BoolQ | PIQA | HS | WG | ARC-e | ARC-c | OBQA | MMLU | CMMLU | C-Eval | 100 | | ----------- | ---- | ---- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ------ | 101 | | OPT | 0.35 | 0.30 | 57.74 | 64.58 | 36.69 | 52.49 | 44.02 | 23.89 | 28.20 | 26.02 | 25.34 | 25.71 | 102 | | Pythia | 0.40 | 0.30 | 60.40 | 67.08 | 40.52 | 53.59 | 51.81 | 24.15 | 29.40 | 25.99 | 25.16 | 24.81 | 103 | | BLOOM | 0.56 | 0.35 | 55.14 | 64.09 | 36.97 | 52.80 | 47.35 | 23.98 | 28.20 | 24.80 | 25.35 | 27.14 | 104 | | RWKV | 0.43 | - | - | 67.52 | 40.90 | 51.14 | 52.86 | 25.17 | 32.40 | 24.85 | - | - | 105 | | **Ours** | 0.39 | 1.0 | 62.14 | 66.70 | 46.27 | 54.46 | 55.43 | 27.99 | 32.40 | 25.90 | 25.05 | 25.24 | 106 | | GPT-Neo | 1.3 | 0.3 | 61.99 | 71.11 | 48.93 | 54.93 | 56.19 | 25.85 | 33.60 | 24.82 | 26.03 | 23.94 | 107 | | OPT | 1.3 | 0.3 | 57.77 | 71.71 | 53.70 | 59.35 | 57.24 | 29.69 | 33.20 | 24.96 | 24.97 | 25.32 | 108 | | Pythia | 1.4 | 0.3 | 60.73 | 70.67 | 47.18 | 53.51 | 56.99 | 26.88 | 31.40 | 26.55 | 25.13 | 24.25 | 109 | | BLOOM | 1.1 | 0.35 | 59.08 | 67.14 | 42.98 | 54.93 | 51.47 | 25.68 | 29.40 | 27.30 | 25.09 | 26.50 | 110 | | RWKV | 1.5 | - | - | 72.36 | 52.48 | 54.62 | 60.48 | 29.44 | 34.00 | 25.77 | - | - | 111 | | Falcon | 1.0 | 0.35 | 61.38 | 75.14 | 61.50 | 60.30 | 63.38 | 32.17 | 35.60 | 25.28 | 24.88 | 25.66 | 112 | | **Ours** | 1.0 | 1.2 | 63.27 | 72.09 | 56.49 | 60.38 | 63.68 | 35.24 | 36.60 | 27.10 | 25.88 | 26.01 | 113 | | GPT-J | 6.9 | 0.3 | 65.44 | 75.41 | 66.25 | 64.09 | 66.92 | 36.60 | 38.20 | 25.40 | 26.47 | 23.39 | 114 | | OPT | 6.7 | 0.3 | 66.18 | 76.22 | 67.21 | 65.19 | 65.66 | 34.64 | 37.20 | 24.57 | 25.36 | 25.32 | 115 | | Pythia | 6.9 | 0.3 | 63.46 | 75.14 | 63.92 | 60.77 | 67.34 | 35.41 | 37.00 | 24.64 | 25.56 | 26.40 | 116 | | BLOOM | 7.1 | 0.35 | 62.91 | 72.69 | 62.33 | 64.01 | 65.11 | 33.45 | 35.80 | 26.25 | 24.97 | 24.25 | 117 | | RWKV | 7.4 | - | - | 76.06 | 65.51 | 61.01 | 67.80 | 37.46 | 40.20 | 24.96 | - | - | 118 | | MPT | 6.9 | 1.0 | 73.88 | 79.43 | 76.25 | 68.27 | 74.79 | 41.72 | 42.20 | 30.80 | 25.99 | 24.06 | 119 | | Falcon | 7.2 | 1.5 | 73.73 | 79.38 | 76.3 | 67.17 | 74.62 | 43.60 | 43.80 | 27.79 | 25.73 | 22.92 | 120 | | Baichuan1 | 7.0 | 1.2 | 70.09 | 76.01 | 70.06 | 64.09 | 71.72 | 40.53 | 38.20 | 42.30 | 44.43 | 42.80 | 121 | | Baichuan2 | 7.0 | 2.6 | 72.72 | 76.50 | 72.17 | 68.35 | 75.17 | 42.32 | 39.60 | 54.16 | 57.07 | 54.00 | 122 | | ChatGLM1 | 6.7 | 1.0 | 74.74 | 68.88 | 45.57 | 52.25 | 48.78 | 31.66 | 36.80 | 40.63 | 37.48 | 40.23 | 123 | | ChatGLM2 | 7.1 | 1.4 | 77.65 | 69.37 | 50.51 | 57.62 | 59.13 | 34.30 | 37.00 | 45.46 | 48.80 | 52.55 | 124 | | OpenLLaMAv1 | 6.7 | 1.0 | 70.43 | 75.68 | 69.23 | 66.69 | 71.17 | 38.57 | 39.00 | 30.49 | 25.40 | 26.09 | 125 | | OpenLLaMAv2 | 6.7 | 1.0 | 72.20 | 78.84 | 74.51 | 65.67 | 72.39 | 41.30 | 41.00 | 41.29 | 29.58 | 30.01 | 126 | | LLaMA1 | 6.7 | 1.0 | 76.50 | 79.80 | 76.10 | 70.10 | 72.80 | 47.60 | 57.20 | 35.10 | 25.62 | 25.72 | 127 | | LLaMA2 | 6.7 | 2.0 | 77.68 | 78.07 | 76.02 | 68.98 | 76.30 | 46.33 | 44.20 | 45.30 | 32.96 | 33.20 | 128 | | **Ours** | 6.8 | 1.4 | 75.87 | 80.09 | 75.21 | 66.06 | 75.42 | 44.40 | 63.40 | 43.10 | 47.99 | 43.18 | 129 | 130 | 131 | # Inference and Deployment 132 | 133 | The model weights, source code, and configuration needed for inference have been released on Hugging Face. Download links can be found in the [table](#released-weights). 
Below, we demonstrate various inference methods using TransNormerLLM-1B as an example. The program will automatically download the required resources from Hugging Face. 134 | 135 | ## Dependency Installation 136 | 137 | ```shell 138 | pip install -r requirements.txt 139 | ``` 140 | 141 | ## Notice 142 | If you encounter errors related to Triton, please set the following environment variable: 143 | ``` 144 | export use_triton=False 145 | ``` 146 | 147 | 148 | ## Python Code Inference 149 | 150 | ### Demonstration of Base Model Inference 151 | 152 | ```python 153 | >>> from transformers import AutoModelForCausalLM, AutoTokenizer 154 | >>> tokenizer = AutoTokenizer.from_pretrained("OpenNLPLab/TransNormerLLM-1B", trust_remote_code=True) >>> model = AutoModelForCausalLM.from_pretrained("OpenNLPLab/TransNormerLLM-1B", device_map="auto", trust_remote_code=True) 155 | ``` 156 | 157 | > In the above code snippet, the model is loaded with `device_map='auto'`, which will use all available GPUs. If you need to specify the device(s) to use, you can control it in a way similar to `export CUDA_VISIBLE_DEVICES=0,1` (to use GPUs 0 and 1). 158 | 159 | 160 | # Fine-tuning the Model 161 | 162 | ## Dependency Installation 163 | 164 | ```shell 165 | git clone https://github.com/OpenNLPLab/TransNormerLLM.git 166 | cd TransNormerLLM/fine-tune 167 | pip install -r requirements.txt 168 | ``` 169 | - To use lightweight fine-tuning methods like LoRA, you must additionally install [peft](https://github.com/huggingface/peft). 170 | 171 | ## Training 172 | 173 | Below, we provide an example of fine-tuning TransNormerLLM-1B on a single machine with ZeRO-3. 174 | 175 | Training Data: `alpaca_data.json`. This sample data was drawn from [alpaca_data.json](https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json), consisting of a selection of 52,002 entries, and has been reformatted. The main purpose is to demonstrate how to run SFT on our model; effectiveness is not guaranteed. 176 | 177 | ```shell 178 | torchrun \ 179 | --nproc_per_node=8 \ 180 | train.py \ 181 | --model_name_or_path OpenNLPLab/TransNormerLLM-1B \ 182 | --data_path ./alpaca_data.json \ 183 | --output_dir output \ 184 | --num_train_epochs 1 \ 185 | --per_device_train_batch_size 2 \ 186 | --per_device_eval_batch_size 1 \ 187 | --gradient_accumulation_steps 1 \ 188 | --bf16 true \ 189 | --adam_beta1 0.9 \ 190 | --adam_beta2 0.95 \ 191 | --evaluation_strategy "no" \ 192 | --save_strategy "steps" \ 193 | --save_steps 5000 \ 194 | --save_total_limit 30 \ 195 | --learning_rate 1e-4 \ 196 | --weight_decay 0.1 \ 197 | --warmup_ratio 0.1 \ 198 | --lr_scheduler_type "cosine" \ 199 | --deepspeed 'configs/zero3.json' \ 200 | --logging_steps 1 \ 201 | --dataloader_num_workers 24 \ 202 | --ddp_find_unused_parameters false \ 203 | --tf32 true \ 204 | ``` 205 | 206 | # Community and Ecosystem 207 | 208 | **📢📢📢 We will continuously update community and ecosystem support for TransNormerLLM here 😀😀😀** 209 | - [nanoTransnormer](https://github.com/Doraemonzzz/nanoTransNormer) 210 | 211 | # Disclaimer, License and Citation 212 | 213 | ## Disclaimer 214 | We hereby declare that our team has not developed any applications based on TransNormerLLM models, whether on iOS, Android, the web, or any other platform. We strongly call on all users not to use TransNormerLLM models for any activities that harm national or social security or violate the law. Also, we ask users not to use TransNormerLLM models for Internet services that have not undergone appropriate security reviews and filings.
We hope that all users can abide by this principle and ensure that the development of technology proceeds in a regulated and legal environment. 215 | 216 | We have done our best to ensure the compliance of the data used in the model training process. However, despite our considerable efforts, there may still be some unforeseeable issues due to the complexity of the model and data. Therefore, if any problems arise due to the use of TransNormerLLM open-source models, including but not limited to data security issues, public opinion risks, or any risks and problems brought about by the model being misled, abused, spread or improperly exploited, we will not assume any responsibility. 217 | 218 | ## License 219 | The community usage of TransNormerLLM model requires adherence to [Apache 2.0](https://github.com/OpenNLPLab/TransNormerLLM/blob/main/LICENSE) and [Community License for TransNormerLLM Model](https://huggingface.co/OpenNLPLab/TransNormerLLM-1B/blob/main/Community%20License%20for%20TransNormerLLM%20Model.pdf). The TransNormerLLM model supports commercial use. If you plan to use the TransNormerLLM model or its derivatives for commercial purposes, please ensure that your entity meets the following conditions: 220 | 221 | 1. The Daily Active Users (DAU) of your or your affiliate's service or product is less than 1 million. 222 | 2. Neither you nor your affiliates are software service providers or cloud service providers. 223 | 3. There is no possibility for you or your affiliates to grant the commercial license given to you, to reauthorize it to other third parties without TransNormerLLM's permission. 224 | 225 | Upon meeting the above conditions, you need to submit the application materials required by the TransNormerLLM Model Community License Agreement via the following contact email: opennlplab@gmail.com. Once approved, TransNormerLLM will hereby grant you a non-exclusive, global, non-transferable, non-sublicensable, revocable commercial copyright license. 226 | 227 | ## Acknowledgments 228 | Our project is developed based on the following open source projects: 229 | - [Baichuan](https://github.com/baichuan-inc/Baichuan-7B) for the tokenizer. 230 | - [metaseq](https://github.com/facebookresearch/metaseq) for training. 231 | - [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for evaluation. 232 | 233 | ## Citation 234 | If you wish to cite our work, please use the following reference: 235 | ``` 236 | @misc{qin2024transnormerllm, 237 | title={TransNormerLLM: A Faster and Better Large Language Model with Improved TransNormer}, 238 | author={Zhen Qin and Dong Li and Weigao Sun and Weixuan Sun and Xuyang Shen and Xiaodong Han and Yunshen Wei and Baohong Lv and Xiao Luo and Yu Qiao and Yiran Zhong}, 239 | year={2024}, 240 | eprint={2307.14995}, 241 | archivePrefix={arXiv}, 242 | primaryClass={cs.CL} 243 | } 244 | 245 | @misc{qin2024lightning, 246 | title={Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models}, 247 | author={Zhen Qin and Weigao Sun and Dong Li and Xuyang Shen and Weixuan Sun and Yiran Zhong}, 248 | year={2024}, 249 | eprint={2401.04658}, 250 | archivePrefix={arXiv}, 251 | primaryClass={cs.CL} 252 | } 253 | 254 | ``` 255 | --------------------------------------------------------------------------------