├── images
    ├── contact_me_qr.png
    ├── transnormerllm-arch.png
    └── transnormerllm-benchmark.png
├── requirements.txt
├── fine-tune
    ├── requirements.txt
    ├── run.sh
    ├── configs
    │   └── zero3.json
    ├── utils.py
    └── train.py
├── LICENSE
├── README_CN.md
└── README.md


/images/contact_me_qr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenNLPLab/TransnormerLLM/HEAD/images/contact_me_qr.png


--------------------------------------------------------------------------------
/images/transnormerllm-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenNLPLab/TransnormerLLM/HEAD/images/transnormerllm-arch.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | transformers
3 | torch==2.0.0
4 | sentencepiece
5 | tokenizers
6 | triton==2.0.0
7 | einops
8 | 


--------------------------------------------------------------------------------
/images/transnormerllm-benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenNLPLab/TransnormerLLM/HEAD/images/transnormerllm-benchmark.png


--------------------------------------------------------------------------------
/fine-tune/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy
 2 | transformers
 3 | torch==2.0.0
 4 | sentencepiece
 5 | tokenizers
 6 | accelerate
 7 | deepspeed
 8 | triton==2.0.0
 9 | einops
10 | 


--------------------------------------------------------------------------------
/fine-tune/run.sh:
--------------------------------------------------------------------------------
 1 | GPUs=$1
 2 | MODEL=path/to/model
 3 | DATA_PATH=path/to/data
 4 | 
 5 | torchrun \
 6 |     --nproc_per_node=$GPUs \
 7 |     train.py \
 8 |     --model_name_or_path $MODEL \
 9 |     --data_path $DATA_PATH \
10 |     --output_dir output/test \
11 |     --num_train_epochs 1 \
12 |     --per_device_train_batch_size 2 \
13 |     --per_device_eval_batch_size 1 \
14 |     --gradient_accumulation_steps 1 \
15 |     --bf16 true \
16 |     --adam_beta1 0.9 \
17 |     --adam_beta2 0.95 \
18 |     --evaluation_strategy "no" \
19 |     --save_strategy "steps" \
20 |     --save_steps 5000 \
21 |     --save_total_limit 30 \
22 |     --learning_rate 1e-4 \
23 |     --weight_decay 0.1 \
24 |     --warmup_ratio 0.1 \
25 |     --lr_scheduler_type "cosine" \
26 |     --deepspeed 'configs/zero3.json' \
27 |     --logging_steps 1 \
28 |     --dataloader_num_workers 24 \
29 |     --ddp_find_unused_parameters false \
30 |     --tf32 true \
31 | 


--------------------------------------------------------------------------------
/fine-tune/configs/zero3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "bf16": {
 3 |         "enabled": "auto"
 4 |     },
 5 |     "optimizer": {
 6 |         "type": "AdamW",
 7 |         "params": {
 8 |             "lr": "auto",
 9 |             "betas": "auto",
10 |             "eps": "auto",
11 |             "weight_decay": "auto"
12 |         }
13 |     },
14 |     "scheduler": {
15 |         "type": "WarmupDecayLR",
16 |         "params": {
17 |             "total_num_steps": "auto",
18 |             "warmup_min_lr": "auto",
19 |             "warmup_max_lr": "auto",
20 |             "warmup_num_steps": "auto"
21 |         }
22 |     },
23 |     "zero_optimization": {
24 |         "stage": 3,
25 |         "overlap_comm": true,
26 |         "contiguous_gradients": true,
27 |         "sub_group_size": 1e9,
28 |         "stage3_gather_16bit_weights_on_model_save": true
29 |     },
30 |     "gradient_accumulation_steps": "auto",
31 |     "gradient_clipping": "auto",
32 |     "steps_per_print": 5,
33 |     "train_batch_size": "auto",
34 |     "train_micro_batch_size_per_gpu": "auto",
35 |     "wall_clock_breakdown": false,
36 |     "activation_checkpointing": {
37 |         "partition_activations": false,
38 |         "cpu_checkpointing": false,
39 |         "contiguous_memory_optimization": false,
40 |         "number_checkpoints": null,
41 |         "synchronize_checkpoint_boundary": false,
42 |         "profile": false
43 |     }
44 | }


--------------------------------------------------------------------------------
/fine-tune/utils.py:
--------------------------------------------------------------------------------
 1 | #    Copyright 2023 OpenNLPLab
 2 | #
 3 | #    Licensed under the Apache License, Version 2.0 (the "License");
 4 | #    you may not use this file except in compliance with the License.
 5 | #    You may obtain a copy of the License at
 6 | #
 7 | #        http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | #    Unless required by applicable law or agreed to in writing, software
10 | #    distributed under the License is distributed on an "AS IS" BASIS,
11 | #    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | #    See the License for the specific language governing permissions and
13 | #    limitations under the License.
14 | 
15 | import io
16 | import json
17 | import os
18 | 
19 | 
20 | def _make_w_io_base(f, mode: str):
21 |     if not isinstance(f, io.IOBase):
22 |         f_dirname = os.path.dirname(f)
23 |         if f_dirname != "":
24 |             os.makedirs(f_dirname, exist_ok=True)
25 |         f = open(f, mode=mode)
26 |     return f
27 | 
28 | 
29 | def _make_r_io_base(f, mode: str):
30 |     if not isinstance(f, io.IOBase):
31 |         f = open(f, mode=mode)
32 |     return f
33 | 
34 | 
35 | def jdump(obj, f, mode="w", indent=4, default=str):
36 |     """Dump a str or dictionary to a file in json format.
37 | 
38 |     Args:
39 |         obj: An object to be written.
40 |         f: A string path to the location on disk.
41 |         mode: Mode for opening the file.
42 |         indent: Indent for storing json dictionaries.
43 |         default: A function to handle non-serializable entries; defaults to `str`.
44 |     """
45 |     f = _make_w_io_base(f, mode)
46 |     if isinstance(obj, (dict, list)):
47 |         json.dump(obj, f, indent=indent, default=default)
48 |     elif isinstance(obj, str):
49 |         f.write(obj)
50 |     else:
51 |         raise ValueError(f"Unexpected type: {type(obj)}")
52 |     f.close()
53 | 
54 | 
55 | def jload(f, mode="r"):
56 |     """Load a .json file into a dictionary."""
57 |     f = _make_r_io_base(f, mode)
58 |     jdict = json.load(f)
59 |     f.close()
60 |     return jdict
61 | 


--------------------------------------------------------------------------------
/fine-tune/train.py:
--------------------------------------------------------------------------------
  1 | #    Copyright 2023 OpenNLPLab
  2 | #
  3 | #    Licensed under the Apache License, Version 2.0 (the "License");
  4 | #    you may not use this file except in compliance with the License.
  5 | #    You may obtain a copy of the License at
  6 | #
  7 | #        http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | #    Unless required by applicable law or agreed to in writing, software
 10 | #    distributed under the License is distributed on an "AS IS" BASIS,
 11 | #    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | #    See the License for the specific language governing permissions and
 13 | #    limitations under the License.
 14 | 
 15 | import copy
 16 | from dataclasses import dataclass, field
 17 | import logging
 18 | from typing import Dict, Optional, Sequence
 19 | 
 20 | import torch
 21 | from torch.utils.data import Dataset
 22 | import transformers
 23 | from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer
 24 | import utils
 25 | 
 26 | IGNORE_INDEX = -100
 27 | HUMAN_PREFIX = 'Human:\n'
 28 | ASSISTANT_PREFIX = 'Assistant:\n'
 29 | 
 30 | 
 31 | @dataclass
 32 | class ModelArguments:
 33 |     model_name_or_path: Optional[str] = field(
 34 |         default="OpenNLPLab/transnormerllm-410m")
 35 | 
 36 | 
 37 | @dataclass
 38 | class DataArguments:
 39 |     data_path: str = field(default=None,
 40 |                            metadata={"help": "Path to the training data."})
 41 | 
 42 | 
 43 | @dataclass
 44 | class TrainingArguments(transformers.TrainingArguments):
 45 |     cache_dir: Optional[str] = field(default=None)
 46 |     optim: str = field(default="adamw_torch")
 47 |     model_max_length: int = field(
 48 |         default=2048,
 49 |         metadata={
 50 |             "help":
 51 |             "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
 52 |         },
 53 |     )
 54 | 
 55 | 
 56 | def _tokenize_fn(strings: Sequence[str],
 57 |                  tokenizer: transformers.PreTrainedTokenizer) -> Dict:
 58 |     """Tokenize a list of strings."""
 59 |     tokenized_list = [
 60 |         tokenizer(
 61 |             text,
 62 |             return_tensors="pt",
 63 |             # padding="longest",
 64 |             max_length=2048,
 65 |             truncation=True,
 66 |         ) for text in strings
 67 |     ]
 68 |     input_ids = labels = [
 69 |         tokenized.input_ids[0] for tokenized in tokenized_list
 70 |     ]
 71 |     input_ids_lens = labels_lens = [
 72 |         tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
 73 |         for tokenized in tokenized_list
 74 |     ]
 75 |     return dict(
 76 |         input_ids=input_ids,
 77 |         labels=labels,
 78 |         input_ids_lens=input_ids_lens,
 79 |         labels_lens=labels_lens,
 80 |     )
 81 | 
 82 | 
 83 | def preprocess(
 84 |     sources: Sequence[str],
 85 |     targets: Sequence[str],
 86 |     tokenizer: transformers.PreTrainedTokenizer,
 87 | ) -> Dict:
 88 |     """Preprocess the data by tokenizing."""
 89 |     examples = [s + t for s, t in zip(sources, targets)]
 90 |     examples_tokenized, sources_tokenized = [
 91 |         _tokenize_fn(strings, tokenizer) for strings in (examples, sources)
 92 |     ]
 93 |     input_ids = examples_tokenized["input_ids"]
 94 |     labels = copy.deepcopy(input_ids)
 95 | 
 96 |     for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
 97 |         label[:source_len] = IGNORE_INDEX
 98 |     return dict(input_ids=input_ids, labels=labels)
 99 | 
100 | 
101 | class SupervisedDataset(Dataset):
102 |     """Dataset for supervised fine-tuning."""
103 | 
104 |     def __init__(self, data_path: str,
105 |                  tokenizer: transformers.PreTrainedTokenizer):
106 |         super(SupervisedDataset, self).__init__()
107 |         logging.warning("Loading data...")
108 | 
109 |         list_data_dict = utils.jload(data_path)
110 | 
111 |         logging.warning("Formatting inputs...")
112 |         sources = []
113 |         targets = []
114 | 
115 |         for example in list_data_dict:
116 |             if len(example['instruction']) > 0 and len(example['input']) > 0:
117 |                 sources.append(example['instruction'] + '\n' + HUMAN_PREFIX +
118 |                                example['input'] + '\n' + ASSISTANT_PREFIX)
119 |             else:
120 |                 _input = example['instruction'] if len(
121 |                     example['input']) == 0 else example['input']
122 |                 sources.append(HUMAN_PREFIX + _input + '\n' + ASSISTANT_PREFIX)
123 | 
124 |             targets.append(example['output'] + tokenizer.eos_token)
125 | 
126 |         logging.warning("Tokenizing inputs... This may take some time...")
127 |         data_dict = preprocess(sources, targets, tokenizer)
128 | 
129 |         self.input_ids = data_dict["input_ids"]
130 |         self.labels = data_dict["labels"]
131 | 
132 |     def __len__(self):
133 |         return len(self.input_ids)
134 | 
135 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
136 |         return dict(input_ids=self.input_ids[i], labels=self.labels[i])
137 | 
138 | 
139 | @dataclass
140 | class DataCollatorForSupervisedDataset(object):
141 |     """Collate examples for supervised fine-tuning."""
142 | 
143 |     tokenizer: transformers.PreTrainedTokenizer
144 | 
145 |     def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
146 |         input_ids, labels = tuple([instance[key] for instance in instances]
147 |                                   for key in ("input_ids", "labels"))
148 |         input_ids = torch.nn.utils.rnn.pad_sequence(
149 |             input_ids,
150 |             batch_first=True,
151 |             padding_value=self.tokenizer.pad_token_id)
152 |         labels = torch.nn.utils.rnn.pad_sequence(labels,
153 |                                                  batch_first=True,
154 |                                                  padding_value=IGNORE_INDEX)
155 | 
156 |         return dict(
157 |             input_ids=input_ids,
158 |             labels=labels,
159 |             attention_mask=input_ids.ne(self.tokenizer.pad_token_id).to(
160 |                 torch.int),
161 |         )
162 | 
163 | 
164 | def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
165 |                                 data_args) -> Dict:
166 |     """Make dataset and collator for supervised fine-tuning."""
167 |     train_dataset = SupervisedDataset(tokenizer=tokenizer,
168 |                                       data_path=data_args.data_path)
169 |     data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
170 |     return dict(train_dataset=train_dataset,
171 |                 eval_dataset=None,
172 |                 data_collator=data_collator)
173 | 
174 | 
175 | def train():
176 |     parser = transformers.HfArgumentParser(
177 |         (ModelArguments, DataArguments, TrainingArguments))
178 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
179 | 
180 |     # load model
181 |     model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path,
182 |                                                  trust_remote_code=True,
183 |                                                  torch_dtype=getattr(
184 |                                                      torch, 'bfloat16'))
185 |     model.train()
186 | 
187 |     # load tokenizer
188 |     tokenizer = AutoTokenizer.from_pretrained(
189 |         model_args.model_name_or_path,
190 |         use_fast=True,
191 |         model_max_length=training_args.model_max_length,
192 |         padding_side="right",
193 |         trust_remote_code=True)
194 |     # setup pad token
195 |     model.config.pad_token_id = 0
196 |     tokenizer.pad_token_id = 0
197 | 
198 |     data_module = make_supervised_data_module(tokenizer=tokenizer,
199 |                                               data_args=data_args)
200 | 
201 |     torch.cuda.empty_cache()
202 |     trainer = Trainer(model=model,
203 |                       tokenizer=tokenizer,
204 |                       args=training_args,
205 |                       **data_module)
206 | 
207 |     trainer.train()
208 |     trainer.save_state()
209 |     trainer.save_model(output_dir=training_args.output_dir)
210 | 
211 | 
212 | if __name__ == "__main__":
213 |     train()
214 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [2023] [OpenNLPLab]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README_CN.md:
--------------------------------------------------------------------------------
  1 | <!-- markdownlint-disable first-line-h1 -->
  2 | <!-- markdownlint-disable html -->
  3 | 
  4 | <div align="center">
  5 | <h1>
  6 |   TransNormerLLM -- A Faster and Better LLM
  7 | </h1>
  8 | </div>
  9 | 
 10 | <p align="center">
 11 | 🤗 <a href="https://huggingface.co/OpenNLPLab/" target="_blank">Hugging Face</a> •
 12 | 🤖 <a href="https://modelscope.cn/models/OpenNLPLab/TransNormerLLM-1B/" target="_blank">Model Scope</a> •
 13 | 💬 <a href="https://discord.com/invite/GB3zWJAK" target="_blank">Discord</a> •
 14 | 💬 <a href="./images/contact_me_qr.png" target="_blank">微信</a> 
 15 | </p>
 16 | <div align="center">
 17 | 
 18 | [![license](https://img.shields.io/github/license/modelscope/modelscope.svg)](https://github.com/OpenNLPLab/TransNormerLLM/blob/main/LICENSE)
 19 | <h4 align="center">
 20 |     <p>
 21 |         <a href="https://github.com/OpenNLPLab/TransNormerLLM/blob/main/README.md">English</a> |
 22 |         <b>中文</b> 
 23 |     <p>
 24 | </h4>
 25 | </div>
 26 | 
 27 | ------
 28 | - [入门简介](#入门简介)
 29 | - [开源模型](#开源模型)
 30 | - [评测结果](#评测结果)
 31 |   - [通用领域](#通用领域)
 32 |     - [模型结果](#模型结果)
 33 | - [推理部署](#推理部署)
 34 |   - [安装依赖](#安装依赖)
 35 |   - [特别注意](#特别注意)
 36 |   - [Python 推理代码](#python-推理代码)
 37 |     - [基础模型推理演示](#基础模型推理演示)
 38 | - [微调模型](#微调模型)
 39 |   - [依赖安装](#依赖安装)
 40 |   - [训练](#训练)
 41 | - [社区生态](#社区生态)
 42 | - [许可声明](#许可声明)
 43 |   - [声明](#声明)
 44 |   - [协议](#协议)
 45 |   - [致谢](#致谢)
 46 |   - [引用](#引用)
 47 | 
 48 | # 入门简介
 49 | 
 50 | 我们正在重新定义大型语言模型（LLM）。该代码仓库是[TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf)的官方实现。 我们的 TransNormerLLM 开放权现在可供个人、创作者、研究人员和各种规模的企业使用，以便他们能够负责任地实验、创新和扩展他们的想法。
 51 | 
 52 | 我们开放的版本包含 TransNormerLLM 模型实现、开源权重和监督微调 (SFT) 的起始代码。 我们将展示如何加载 [TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf) 模型、运行 SFT 并对其进行推理的示例。
 53 | 
 54 | - TransNormerLLM 是第一个基于线性注意力的 LLM，在准确性和效率方面均优于传统的基于 softmax 注意力的模型。 它是在具有 **1.4 万亿** 的高质量token语料库上进行训练的。
 55 | - TransNormerLLM 从之前的线性注意力架构 TransNormer 演变而来，进行了一系列的优化，包括 LRPE 位置嵌入、闪电注意力加速、新的门控和标准化机制。
 56 | - TransNormerLLM 在多个广受认可的中文、英文以及多语言通用和特定领域基准测试中取得了同类规模的非常有竞争性的性能。
 57 | - 此版本包括具有 **385M**、**1B** 和 **7B** 参数的 **Base** 版本。
 58 | - 所有版本均完全开放给学术研究。 开发者只需通过电子邮件申请并获得官方商业许可即可免费商业使用。
 59 | - 欲了解更多信息，欢迎阅读我们的学术论文[TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf)。
 60 | 
 61 | ![](./images/TransNormerLLM-arch.png)
 62 | 
 63 | # 开源模型
 64 | 
 65 | 具体发布版本及下载链接如下：
 66 | 
 67 | |         | 基础模型  | 
 68 | |:-------:|:-----------:|
 69 | | 385M      | 🤗 [TransNormerLLM-385M](https://huggingface.co/OpenNLPLab/TransNormerLLM-385M) | 
 70 | | 1B     | 🤗 [TransNormerLLM-1B](https://huggingface.co/OpenNLPLab/TransNormerLLM-1B) |
 71 | | 7B   | 🤗 [TransNormerLLM-7B](https://huggingface.co/OpenNLPLab/TransNormerLLM-7B) | 
 72 | 
 73 | # 评测结果
 74 | 
 75 | 为了验证 TransNormerLLM，我们在 Commonsense Reasoning Task、MMLU、CMMLU 和 C-Eval 上测试了 385M、1B 和 7B 模型。 为了进行比较，我们选择了几个开源模型作为比较，包括基于 Transformer 的模型，如 OPT、Pythia、BLOOM、GPT-Neo、GPT-J、MPT、Falcon、LLaMA1/2、OpenLLAMA v1/v2、Baichuan 1/ 2、ChatGLM 1/2，以及非Transformer模型RWKV。 可以看出，与这些模型相比，TransNormerLLM仍然具有很强的竞争力。
 76 | 
 77 | **常识推理** 我们报告 BoolQ、PIQA、SIQA、
 78 | HellaSwag、WinoGrande、ARC 简单和挑战、OpenBookQA 及其平均值。 我们使用 LM-Eval-Harness 报告所有基准测试的0-shot结果。
 79 | 与现有最先进的大语言模型相比，我们所有的模型都取得了具有竞争力的表现，展示了理解和应用常识推理的卓越能力。
 80 | 
 81 | **汇总基准**
 82 | 我们报告 MMLU、CMMLU、C-Eval 的总体结果。使用官方脚本来评估 MMLU、CMMLU 和 C-Eval，所有评估结果均采用 5-shot结果。 与业界顶级的开源模型相比，我们的模型在英文和中文基准测试中都表现出了相匹配的性能。
 83 | 
 84 | 
 85 | ## 通用领域
 86 | 
 87 | 在通用领域，我们对以下数据集进行了 5-shot 测试：
 88 | - [C-Eval](https://cevalbenchmark.com/index.html#home)是一个综合性的中文基础模型评估数据集，涵盖52个学科和四个难度级别。 我们使用该数据集的开发集作为小样本学习的来源，并在测试集上进行测试。 我们的评估方法遵循 [LM-Evaluation-Harness](https://github.com/EleutherAI/lm-evaluation-harness)。
 89 | - [MMLU](https://arxiv.org/abs/2009.03300)是一个英语评估数据集，包含57个任务，涵盖小学数学、美国历史、计算机科学、法律等，难度从高中水平到专家水平 。 它是主流的LLM评估数据集。 我们使用其[官方](https://github.com/hendrycks/test)评估方法。
 90 | - [CMMLU](https://github.com/haonan-li/CMMLU)是一个涵盖67个主题的综合中文评估基准，专门用于评估语言模型在中文背景下的知识和推理能力。 我们采用了其[官方](https://github.com/haonan-li/CMMLU)评估方法。
 91 | 
 92 | 
 93 | ### 模型结果
 94 | 
 95 | **常识推理和通用领域性能比较。** 为了公平比较，我们报告了我们使用其发布的模型重现的竞争方法的结果。  PS：参数大小（十亿）。 T：Tokens（万亿）。 HS：HellaSwag。 WG：WinoGrande。
 96 | 
 97 | | Model       | PS   | T    | BoolQ          | PIQA           | HS             | WG             | ARC-e          | ARC-c          | OBQA           | MMLU           | CMMLU          | C-Eval         |
 98 | |-------------|------|------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
 99 | | OPT         | 0.35 | 0.30 | 57.74          | 64.58          | 36.69          | 52.49          | 44.02          | 23.89          | 28.20          | 26.02          | 25.34          | 25.71          |
100 | | Pythia      | 0.40 | 0.30 | 60.40          | 67.08          | 40.52          | 53.59          | 51.81          | 24.15          | 29.40          | 25.99          | 25.16          | 24.81          |
101 | | BLOOM       | 0.56 | 0.35 | 55.14          | 64.09          | 36.97          | 52.80          | 47.35          | 23.98          | 28.20          | 24.80          | 25.35          | 27.14          |
102 | | RWKV        | 0.43 | -    | -              | 67.52   | 40.90 | 51.14 | 52.86 | 25.17 | 32.40 | 24.85          | -              | -              |
103 | | **Ours**        | 0.39 | 1.0  | 62.14          | 66.70          | 46.27          | 54.46          | 55.43          | 27.99          | 32.40          | 25.90          | 25.05          | 25.24          |
104 | | GPT-Neo     | 1.3  | 0.3  | 61.99          | 71.11          | 48.93          | 54.93          | 56.19          | 25.85          | 33.60          | 24.82          | 26.03          | 23.94          |
105 | | OPT         | 1.3  | 0.3  | 57.77          | 71.71          | 53.70          | 59.35          | 57.24          | 29.69          | 33.20          | 24.96          | 24.97          | 25.32          |
106 | | Pythia      | 1.4  | 0.3  | 60.73          | 70.67          | 47.18          | 53.51          | 56.99          | 26.88          | 31.40          | 26.55          | 25.13          | 24.25          |
107 | | BLOOM       | 1.1  | 0.35 | 59.08          | 67.14          | 42.98          | 54.93          | 51.47          | 25.68          | 29.40          | 27.30          | 25.09          | 26.50          |
108 | | RWKV        | 1.5  | -    | -              | 72.36 | 52.48 | 54.62 | 60.48 | 29.44 | 34.00 | 25.77          | -              | -              |
109 | | Falcon      | 1.0  | 0.35 | 61.38          | 75.14          | 61.50          | 60.30          | 63.38          | 32.17          | 35.60          | 25.28          | 24.88          | 25.66          |
110 | | **Ours**        | 1.0  | 1.2  | 63.27          | 72.09          | 56.49          | 60.38          | 63.68          | 35.24          | 36.60          | 27.10          | 25.88          | 26.01          |
111 | | GPT-J       | 6.9  | 0.3  | 65.44          | 75.41          | 66.25          | 64.09          | 66.92          | 36.60          | 38.20          | 25.40          | 26.47          | 23.39          |
112 | | OPT         | 6.7  | 0.3  | 66.18          | 76.22          | 67.21          | 65.19          | 65.66          | 34.64          | 37.20          | 24.57          | 25.36          | 25.32          |
113 | | Pythia      | 6.9  | 0.3  | 63.46          | 75.14          | 63.92          | 60.77          | 67.34          | 35.41          | 37.00          | 24.64          | 25.56          | 26.40          |
114 | | BLOOM       | 7.1  | 0.35 | 62.91          | 72.69          | 62.33          | 64.01          | 65.11          | 33.45          | 35.80          | 26.25          | 24.97          | 24.25          |
115 | | RWKV        | 7.4  | -    | -              | 76.06 | 65.51 | 61.01 | 67.80 | 37.46 | 40.20 | 24.96          | -              | -              |
116 | | MPT         | 6.9  | 1.0  | 73.88          | 79.43          | 76.25          | 68.27          | 74.79          | 41.72          | 42.20          | 30.80          | 25.99          | 24.06          |
117 | | Falcon      | 7.2  | 1.5  | 73.73          | 79.38          | 76.3           | 67.17          | 74.62          | 43.60          | 43.80          | 27.79          | 25.73          | 22.92          |
118 | | Baichuan1   | 7.0  | 1.2  | 70.09          | 76.01          | 70.06          | 64.09          | 71.72          | 40.53          | 38.20          | 42.30 | 44.43 | 42.80 |
119 | | Baichuan2   | 7.0  | 2.6  | 72.72          | 76.50          | 72.17          | 68.35          | 75.17          | 42.32          | 39.60          | 54.16 | 57.07 | 54.00 |
120 | | ChatGLM1    | 6.7  | 1.0  | 74.74          | 68.88          | 45.57          | 52.25          | 48.78          | 31.66          | 36.80          | 40.63 | 37.48          | 40.23 |
121 | | ChatGLM2    | 7.1  | 1.4  | 77.65          | 69.37          | 50.51          | 57.62          | 59.13          | 34.30          | 37.00          | 45.46 | 48.80          | 52.55 |
122 | | OpenLLaMAv1 | 6.7  | 1.0  | 70.43          | 75.68          | 69.23          | 66.69          | 71.17          | 38.57          | 39.00          | 30.49          | 25.40          | 26.09          |
123 | | OpenLLaMAv2 | 6.7  | 1.0  | 72.20          | 78.84          | 74.51          | 65.67          | 72.39          | 41.30          | 41.00          | 41.29          | 29.58          | 30.01          |
124 | | LLaMA1      | 6.7  | 1.0  | 76.50 | 79.80 | 76.10 | 70.10 | 72.80 | 47.60 | 57.20 | 35.10 | 25.62          | 25.72          |
125 | | LLaMA2      | 6.7  | 2.0  | 77.68 | 78.07 | 76.02 | 68.98 | 76.30 | 46.33 | 44.20 | 45.30 | 32.96          | 33.20          |
126 | | **Ours**        | 6.8  | 1.4  | 75.87          | 80.09          | 75.21          | 66.06          | 75.42          | 44.40          | 63.40          | 43.10          | 47.99          | 43.18          |
127 | 
128 | 
129 | # 推理部署
130 | 
131 | 推理所需的模型权重、源代码和配置已在 Hugging Face 上发布。 下载链接可以在本文档开头的[表格](#开源模型)中找到。 下面，我们以 TransNormerLLM-1B 为例演示各种推理方法。 程序会自动从Hugging Face下载所需的资源。
132 | ## 安装依赖
133 | 
134 | ```shell
135 | pip install -r requirements.txt
136 | ```
137 | 
138 | ## 特别注意
139 | 如果遇到Triton相关错误，请设置以下环境变量:
140 | ```
141 | export use_triton=False
142 | ```
143 | 
144 | ## Python 推理代码
145 | 
146 | ### 基础模型推理演示
147 | 
148 | ```python
149 | >>> from transformers import AutoModelForCausalLM, AutoTokenizer
150 | >>> tokenizer = AutoTokenizer.from_pretrained("OpenNLPLab/TransNormerLLM-1B", trust_remote_code=True)
151 | >>> model = AutoModelForCausalLM.from_pretrained("OpenNLPLab/TransNormerLLM-1B", device_map="auto", trust_remote_code=True)
152 | ```
153 | 
154 | > 在上面的代码片段中，模型加载指定`device_map='auto'`，它将使用所有可用的GPU。 如果需要指定要使用的设备，可以通过类似于“export CUDA_VISIBLE_DEVICES=0,1”（使用0和1显卡）的方式进行控制。
155 | 
156 | # 微调模型
157 | 
158 | ## 依赖安装
159 | 
160 | ```shell
161 | git clone https://github.com/OpenNLPLab/TransNormerLLM.git
162 | cd TransNormerLLM/fine-tune
163 | pip install -r requirements.txt
164 | ```
165 | - 要使用LoRA等轻量级微调方法，您必须另外安装[peft](https://github.com/huggingface/peft)。
166 | 
167 | ## 训练
168 | 
169 | 下面，我们提供了使用 ZeRO-3 在单台机器上微调 TransNormerLLM-7B-Base 的示例。
170 | 
171 | 训练数据：`alpaca_data.json`。 此示例数据取自 [alpaca_data.json](https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json)，包含 52,002 个条目的选择，并已重新格式化。 主要目的是演示如何SFT我们的模型，不保证有效性。
172 | 
173 | ```shell
174 | torchrun \
175 |     --nproc_per_node=8 \
176 |     train.py \
177 |     --model_name_or_path OpenNLPLab/TransNormerLLM-1B \
178 |     --data_path ./alpaca_data.json \
179 |     --output_dir output \
180 |     --num_train_epochs 1 \
181 |     --per_device_train_batch_size 2 \
182 |     --per_device_eval_batch_size 1 \
183 |     --gradient_accumulation_steps 1 \
184 |     --bf16 true \
185 |     --adam_beta1 0.9 \
186 |     --adam_beta2 0.95 \
187 |     --evaluation_strategy "no" \
188 |     --save_strategy "steps" \
189 |     --save_steps 5000 \
190 |     --save_total_limit 30 \
191 |     --learning_rate 1e-4 \
192 |     --weight_decay 0.1 \
193 |     --warmup_ratio 0.1 \
194 |     --lr_scheduler_type "cosine" \
195 |     --deepspeed 'configs/zero3.json' \
196 |     --logging_steps 1 \
197 |     --dataloader_num_workers 24 \
198 |     --ddp_find_unused_parameters false \
199 |     --tf32 true \
200 | ```
201 | 
202 | # 社区生态
203 | 
204 | **📢📢📢我们将不断更新这里社区和生态系统对 TransNormerLLM 的支持😀😀😀**
205 | - [nanoTransnormer](https://github.com/Doraemonzzz/nanoTransNormer)
206 | 
207 | # 许可声明
208 | 
209 | ## 声明
210 | 
211 | 
212 | 我们特此声明，我们的团队没有开发过任何基于 TransNormerLLM 模型的应用程序，也没有在 iOS、Android、Web 或任何其他平台上开发过。 我们强烈呼吁所有用户不要利用TransNormerLLM模型进行任何危害国家/社会安全或违法的活动。 此外，我们要求用户不要将 TransNormerLLM 模型用于未经过适当安全审查和备案的互联网服务。 我们希望所有用户都能遵守这一原则，确保技术的发展在规范、合法的环境中进行。
213 | 
214 | 我们已尽力确保模型训练过程中使用的数据的合规性。 然而，尽管我们付出了巨大的努力，由于模型和数据的复杂性，仍然可能会出现一些不可预见的问题。 因此，如果因使用TransNormerLLM开源模型而出现任何问题，包括但不限于数据安全问题、舆情风险，或模型被误导、滥用、传播或不当利用带来的任何风险和问题， 我们将不承担任何责任。
215 | 
216 | ## 协议
217 | 
218 | TransNormerLLM 模型的社区使用需要遵守 [Apache 2.0](https://github.com/OpenNLPLab/TransNormerLLM/blob/main/LICENSE) 和 [TransNormerLLM 模型社区许可证](https://huggingface.co/OpenNLPLab/TransNormerLLM-1B/blob/main/TransNormerLLM模型社区许可协议.pdf)。  TransNormerLLM 模型支持商业用途。 如果您计划将TransNormerLLM模型或其衍生品用于商业目的，请确保您的实体满足以下条件：
219 | 
220 |    1. 您或您关联公司的服务或产品的日活跃用户（DAU）低于100万。
221 |    2. 您或您的关联公司都不是软件服务提供商或云服务提供商。
222 |    3. 未经 TransNormerLLM 许可，您或您的关联公司不可能将给予您的商业许可授予或重新授权给其他第三方。
223 | 
224 | 满足上述条件后，您需要通过以下联系邮箱提交TransNormerLLM模型社区许可协议所需的申请材料：opennlplab@gmail.com。 一旦获得批准，TransNormerLLM 将特此授予您非排他性、全球性、不可转让、不可再许可、可撤销的商业版权许可。
225 | 
226 | ## 致谢
227 | 我们的项目基于如下开源项目进行开发:
228 | - [Baichuan](https://github.com/baichuan-inc/Baichuan-7B)用于tokenizer部分。
229 | - [metaseq](https://github.com/facebookresearch/metaseq)用于训练部分。
230 | - [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)用于测评部分。
231 | 
232 | 
233 | ## 引用
234 | 
235 | 如果您想引用我们的工作，请使用以下参考文献：
236 | ```
237 | @misc{qin2024transnormerllm,
238 |       title={TransNormerLLM: A Faster and Better Large Language Model with Improved TransNormer},
239 |       author={Zhen Qin and Dong Li and Weigao Sun and Weixuan Sun and Xuyang Shen and Xiaodong Han and Yunshen Wei and Baohong Lv and Xiao Luo and Yu Qiao and Yiran Zhong},
240 |       year={2024},
241 |       eprint={2307.14995},
242 |       archivePrefix={arXiv},
243 |       primaryClass={cs.CL}
244 | }
245 | 
246 | @misc{qin2024lightning,
247 |       title={Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models},
248 |       author={Zhen Qin and Weigao Sun and Dong Li and Xuyang Shen and Weixuan Sun and Yiran Zhong},
249 |       year={2024},
250 |       eprint={2401.04658},
251 |       archivePrefix={arXiv},
252 |       primaryClass={cs.CL}
253 | }
254 | 
255 | ```
256 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <!-- markdownlint-disable first-line-h1 -->
  2 | <!-- markdownlint-disable html -->
  3 | 
  4 | <div align="center">
  5 | <h1>
  6 |   TransNormerLLM -- A Faster and Better LLM
  7 | </h1>
  8 | </div>
  9 | 
 10 | <p align="center">
 11 | 🤗 <a href="https://huggingface.co/OpenNLPLab/" target="_blank">Hugging Face</a> •
 12 | 🤖 <a href="https://modelscope.cn/models/OpenNLPLab/TransNormerLLM-7B" target="_blank">Model Scope</a> •
 13 | 💬 <a href="https://discord.gg/A8UrpM6A4" target="_blank">Discord</a> •
 14 | 💬 <a href="./images/contact_me_qr.png" target="_blank">WeChat</a> •
 15 | 🔢 <a href="https://github.com/LaaZa/AutoGPTQ/tree/TransNormer" target="_blank">GPTQ</a> 
 16 | </p>
 17 | <div align="center">
 18 | 
 19 | 
 20 | [![license](https://img.shields.io/github/license/modelscope/modelscope.svg)](https://github.com/OpenNLPLab/TransNormerLLM/blob/main/LICENSE)
 21 | <h4 align="center">
 22 |     <p>
 23 |         <b>English</b> |
 24 |         <a href="https://github.com/OpenNLPLab/TransNormerLLM/blob/main/README_CN.md">中文</a>
 25 |     <p>
 26 | </h4>
 27 | </div>
 28 | 
 29 | 
 30 | ------
 31 | - [Introduction](#introduction)
 32 | - [Released Weights](#released-weights)
 33 | - [Benchmark Results](#benchmark-results)
 34 |   - [General Domain](#general-domain)
 35 |     - [Model Results](#model-results)
 36 | - [Inference and Deployment](#inference-and-deployment)
 37 |   - [Dependency Installation](#dependency-installation)
 38 |   - [Notice](#notice)
 39 |   - [Python Code Inference](#python-code-inference)
 40 |     - [Demonstration of Base Model Inference](#demonstration-of-base-model-inference)
 41 | - [Fine-tuning the Model](#fine-tuning-the-model)
 42 |   - [Dependency Installation](#dependency-installation-1)
 43 |   - [Training](#training)
 44 | - [Community and Ecosystem](#community-and-ecosystem)
 45 | - [Disclaimer, License and Citation](#disclaimer-license-and-citation)
 46 |   - [Disclaimer](#disclaimer)
 47 |   - [License](#license)
 48 |   - [Acknowledgments](#acknowledgments)
 49 |   - [Citation](#citation)
 50 | 
 51 | # Introduction
 52 | 
 53 | We are re-inventing the Large Language Model (LLM). This is the official implementation of [TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf). Our opened weights of TransNormerLLM are now accessible to individuals, creators, researchers and businesses of all sizes so that they can experiment, innovate and scale their ideas responsibly.
 54 | 
 55 | Our release contains the TransNormerLLM model implementation, the open-source weights and the starting code for Supervised Fine-tuning (SFT). We will show examples on how to load [TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf) models, run SFT and inference on it.
 56 | 
 57 | - TransNormerLLM is the first linear attention-based LLM that outperforms conventional softmax attention-based models in terms of both accuracy and efficiency. It was trained on a high-quality corpus with up to **1.4 trillion** tokens.
 58 | - TransNormerLLM evolves from the previous linear attention architecture TransNormer by making advanced modifications that include LRPE positional embedding, Lightning Attention acceleration, new gating and normalization mechanisms.
 59 | - TransNormerLLM achieved competitive performance of its size on multiple well-approved Chinese, English, and multi-language general and domain-specific benchmarks.
 60 | - This release includes **Base** versions with **385M**, **1B**, and **7B** parameters.
 61 | - All versions are fully open to academic research. Developers only need to apply via email and obtain official commercial permission to use it for free commercially.
 62 | - For more information, welcome reading our academic paper [TransNormerLLM](https://arxiv.org/pdf/2307.14995.pdf).
 63 | - 🔥Get excited!🔥 Our **15B** model is currently in training! Click the [link](https://api.wandb.ai/links/opennlplab/kip314lq) 👀 to track our thrilling progress in real time! 🚀
 64 | 
 65 | ![](./images/TransNormerLLM-arch.png)
 66 | 
 67 | # Released Weights
 68 | 
 69 | The specific released versions and download links are shown as below:
 70 | 
 71 | |                   |                                  Base Models                                   |
 72 | | :---------------: | :----------------------------------------------------------------------------: |
 73 | |       385M        | 🤗 [TransNormerLLM-385M](https://huggingface.co/OpenNLPLab/TransNormerLLM-385M) |
 74 | |        1B         |   🤗 [TransNormerLLM-1B](https://huggingface.co/OpenNLPLab/TransNormerLLM-1B)   |
 75 | |        7B         |   🤗 [TransNormerLLM-7B](https://huggingface.co/OpenNLPLab/TransNormerLLM-7B)   |
 76 | 
 77 | # Benchmark Results
 78 | 
 79 | To validate TransNormerLLM, we tested our 385M, 1B, and 7B models on Commonsense Reasoning Task, MMLU, CMMLU, and C-Eval. For comparison, we selected several open-source models as competitors, including Transformer-based models such as OPT, Pythia, BLOOM, GPT-Neo, GPT-J, MPT, Falcon, LLaMA1/2, OpenLLAMA v1/v2, Baichuan 1/2, ChatGLM 1/2, and non-Transformer model RWKV. It can be observed that, compared to these models, TransNormerLLM remains highly competitive.
 80 | 
 81 | **Commonsense Reasoning** We report BoolQ, PIQA, SIQA,
 82 | HellaSwag, WinoGrande, ARC easy and challenge, OpenBookQA and their average. We report 0-shot results for all benchmarks using LM-Eval-Harness.
 83 | All of our models achieve competitive performance compared to existing state-of-the-art LLMs, showcasing a remarkable ability to comprehend and apply commonsense reasoning.
 84 | 
 85 | **Aggregated Benchmarks**
 86 | We report the overall results for MMLU, CMMLU, C-Eval. Official scripts were used for evaluating MMLU, CMMLU, and C-Eval, with all evaluation results being conducted with a 5-shot setup. In comparison to top-tier open-source models available in the industry, our models have demonstrated matched performance in both English and Chinese benchmarks.
 87 | 
 88 | ## General Domain
 89 | 
 90 | In the general domain, we conducted 5-shot tests on the following datasets:
 91 | - [C-Eval](https://cevalbenchmark.com/index.html#home) is a comprehensive Chinese basic model evaluation dataset, covering 52 disciplines and four levels of difficulty. Our evaluation approach followed that of [LM-Evaluation-Harness](https://github.com/EleutherAI/lm-evaluation-harness).
 92 | - [MMLU](https://arxiv.org/abs/2009.03300) is an English evaluation dataset comprising 57 tasks, encompassing elementary math, American history, computer science, law, etc. The difficulty ranges from high school level to expert level. It's a mainstream LLM evaluation dataset. We used its [official](https://github.com/hendrycks/test) evaluation approach.
 93 | - [CMMLU](https://github.com/haonan-li/CMMLU) is a comprehensive Chinese evaluation benchmark covering 67 topics, specifically designed to assess language models' knowledge and reasoning capabilities in a Chinese context. We adopted its [official](https://github.com/haonan-li/CMMLU) evaluation approach.
 94 | 
 95 | 
 96 | ### Model Results
 97 | **Performance Comparison on Commonsense Reasoning and Aggregated Benchmarks.** For a fair comparison, we report competing methods' results reproduced by us using their released models. PS: parameter size (billion). T: tokens (trillion). HS: HellaSwag. WG: WinoGrande.
 98 | 
 99 | | Model       | PS   | T    | BoolQ | PIQA  | HS    | WG    | ARC-e | ARC-c | OBQA  | MMLU  | CMMLU | C-Eval |
100 | | ----------- | ---- | ---- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ------ |
101 | | OPT         | 0.35 | 0.30 | 57.74 | 64.58 | 36.69 | 52.49 | 44.02 | 23.89 | 28.20 | 26.02 | 25.34 | 25.71  |
102 | | Pythia      | 0.40 | 0.30 | 60.40 | 67.08 | 40.52 | 53.59 | 51.81 | 24.15 | 29.40 | 25.99 | 25.16 | 24.81  |
103 | | BLOOM       | 0.56 | 0.35 | 55.14 | 64.09 | 36.97 | 52.80 | 47.35 | 23.98 | 28.20 | 24.80 | 25.35 | 27.14  |
104 | | RWKV        | 0.43 | -    | -     | 67.52 | 40.90 | 51.14 | 52.86 | 25.17 | 32.40 | 24.85 | -     | -      |
105 | | **Ours**    | 0.39 | 1.0  | 62.14 | 66.70 | 46.27 | 54.46 | 55.43 | 27.99 | 32.40 | 25.90 | 25.05 | 25.24  |
106 | | GPT-Neo     | 1.3  | 0.3  | 61.99 | 71.11 | 48.93 | 54.93 | 56.19 | 25.85 | 33.60 | 24.82 | 26.03 | 23.94  |
107 | | OPT         | 1.3  | 0.3  | 57.77 | 71.71 | 53.70 | 59.35 | 57.24 | 29.69 | 33.20 | 24.96 | 24.97 | 25.32  |
108 | | Pythia      | 1.4  | 0.3  | 60.73 | 70.67 | 47.18 | 53.51 | 56.99 | 26.88 | 31.40 | 26.55 | 25.13 | 24.25  |
109 | | BLOOM       | 1.1  | 0.35 | 59.08 | 67.14 | 42.98 | 54.93 | 51.47 | 25.68 | 29.40 | 27.30 | 25.09 | 26.50  |
110 | | RWKV        | 1.5  | -    | -     | 72.36 | 52.48 | 54.62 | 60.48 | 29.44 | 34.00 | 25.77 | -     | -      |
111 | | Falcon      | 1.0  | 0.35 | 61.38 | 75.14 | 61.50 | 60.30 | 63.38 | 32.17 | 35.60 | 25.28 | 24.88 | 25.66  |
112 | | **Ours**    | 1.0  | 1.2  | 63.27 | 72.09 | 56.49 | 60.38 | 63.68 | 35.24 | 36.60 | 27.10 | 25.88 | 26.01  |
113 | | GPT-J       | 6.9  | 0.3  | 65.44 | 75.41 | 66.25 | 64.09 | 66.92 | 36.60 | 38.20 | 25.40 | 26.47 | 23.39  |
114 | | OPT         | 6.7  | 0.3  | 66.18 | 76.22 | 67.21 | 65.19 | 65.66 | 34.64 | 37.20 | 24.57 | 25.36 | 25.32  |
115 | | Pythia      | 6.9  | 0.3  | 63.46 | 75.14 | 63.92 | 60.77 | 67.34 | 35.41 | 37.00 | 24.64 | 25.56 | 26.40  |
116 | | BLOOM       | 7.1  | 0.35 | 62.91 | 72.69 | 62.33 | 64.01 | 65.11 | 33.45 | 35.80 | 26.25 | 24.97 | 24.25  |
117 | | RWKV        | 7.4  | -    | -     | 76.06 | 65.51 | 61.01 | 67.80 | 37.46 | 40.20 | 24.96 | -     | -      |
118 | | MPT         | 6.9  | 1.0  | 73.88 | 79.43 | 76.25 | 68.27 | 74.79 | 41.72 | 42.20 | 30.80 | 25.99 | 24.06  |
119 | | Falcon      | 7.2  | 1.5  | 73.73 | 79.38 | 76.3  | 67.17 | 74.62 | 43.60 | 43.80 | 27.79 | 25.73 | 22.92  |
120 | | Baichuan1   | 7.0  | 1.2  | 70.09 | 76.01 | 70.06 | 64.09 | 71.72 | 40.53 | 38.20 | 42.30 | 44.43 | 42.80  |
121 | | Baichuan2   | 7.0  | 2.6  | 72.72 | 76.50 | 72.17 | 68.35 | 75.17 | 42.32 | 39.60 | 54.16 | 57.07 | 54.00  |
122 | | ChatGLM1    | 6.7  | 1.0  | 74.74 | 68.88 | 45.57 | 52.25 | 48.78 | 31.66 | 36.80 | 40.63 | 37.48 | 40.23  |
123 | | ChatGLM2    | 7.1  | 1.4  | 77.65 | 69.37 | 50.51 | 57.62 | 59.13 | 34.30 | 37.00 | 45.46 | 48.80 | 52.55  |
124 | | OpenLLaMAv1 | 6.7  | 1.0  | 70.43 | 75.68 | 69.23 | 66.69 | 71.17 | 38.57 | 39.00 | 30.49 | 25.40 | 26.09  |
125 | | OpenLLaMAv2 | 6.7  | 1.0  | 72.20 | 78.84 | 74.51 | 65.67 | 72.39 | 41.30 | 41.00 | 41.29 | 29.58 | 30.01  |
126 | | LLaMA1      | 6.7  | 1.0  | 76.50 | 79.80 | 76.10 | 70.10 | 72.80 | 47.60 | 57.20 | 35.10 | 25.62 | 25.72  |
127 | | LLaMA2      | 6.7  | 2.0  | 77.68 | 78.07 | 76.02 | 68.98 | 76.30 | 46.33 | 44.20 | 45.30 | 32.96 | 33.20  |
128 | | **Ours**    | 6.8  | 1.4  | 75.87 | 80.09 | 75.21 | 66.06 | 75.42 | 44.40 | 63.40 | 43.10 | 47.99 | 43.18  |
129 | 
130 | 
131 | # Inference and Deployment
132 | 
133 | The model weights, source code, and configuration needed for inference have been released on Hugging Face. Download links can be found in the [table](#released-weights). Below, we demonstrate various inference methods using TransNormerLLM-1B as an example. The program will automatically download the required resources from Hugging Face.
134 | 
135 | ## Dependency Installation
136 | 
137 | ```shell
138 | pip install -r requirements.txt
139 | ```
140 | 
141 | ## Notice
142 | If you encounter errors related to Triton, please set the following environment variables:
143 | ```
144 | export use_triton=False
145 | ```
146 | 
147 | 
148 | ## Python Code Inference
149 | 
150 | ### Demonstration of Base Model Inference
151 | 
152 | ```python
153 | >>> from transformers import AutoModelForCausalLM, AutoTokenizer
154 | >>> tokenizer = AutoTokenizer.from_pretrained("OpenNLPLab/TransNormerLLM-1B", trust_remote_code=True)
155 | ```
156 | 
157 | > In the above code snippets, the model loading specifies `device_map='auto'`, which will use all available GPUs. If you need to specify the device(s) to use, you can control it in a way similar to `export CUDA_VISIBLE_DEVICES=0,1` (using the 0 and 1 graphics cards).
158 | 
159 | 
160 | # Fine-tuning the Model
161 | 
162 | ## Dependency Installation
163 | 
164 | ```shell
165 | git clone https://github.com/OpenNLPLab/TransNormerLLM.git
166 | cd TransNormerLLM/fine-tune
167 | pip install -r requirements.txt
168 | ```
169 | - To use lightweight fine-tuning methods like LoRA, you must additionally install [peft](https://github.com/huggingface/peft).
170 | 
171 | ## Training
172 | 
173 | Below, we provide an example of fine-tuning the TransNormerLLM-1B on a single machine with ZeRO-3.
174 | 
175 | Training Data: `alpaca_data.json`. This sample data was drawn from [alpaca_data.json](https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json), consisting of a selection of 52,002 entries, and has been reformatted. The main purpose is to demonstrate how to SFT our model, and effectiveness is not guaranteed.
176 | 
177 | ```shell
178 | torchrun \
179 |     --nproc_per_node=8 \
180 |     train.py \
181 |     --model_name_or_path OpenNLPLab/TransNormerLLM-1B \
182 |     --data_path ./alpaca_data.json \
183 |     --output_dir output \
184 |     --num_train_epochs 1 \
185 |     --per_device_train_batch_size 2 \
186 |     --per_device_eval_batch_size 1 \
187 |     --gradient_accumulation_steps 1 \
188 |     --bf16 true \
189 |     --adam_beta1 0.9 \
190 |     --adam_beta2 0.95 \
191 |     --evaluation_strategy "no" \
192 |     --save_strategy "steps" \
193 |     --save_steps 5000 \
194 |     --save_total_limit 30 \
195 |     --learning_rate 1e-4 \
196 |     --weight_decay 0.1 \
197 |     --warmup_ratio 0.1 \
198 |     --lr_scheduler_type "cosine" \
199 |     --deepspeed 'configs/zero3.json' \
200 |     --logging_steps 1 \
201 |     --dataloader_num_workers 24 \
202 |     --ddp_find_unused_parameters false \
203 |     --tf32 true \
204 | ```
205 | 
206 | # Community and Ecosystem
207 | 
208 | **📢📢📢 We will continuously update the support for TransNormerLLM from the community and ecosystem here 😀😀😀**
209 | - [nanoTransnormer](https://github.com/Doraemonzzz/nanoTransNormer)
210 | 
211 | # Disclaimer, License and Citation
212 | 
213 | ## Disclaimer
214 | We hereby declare that our team has not developed any applications based on TransNormerLLM models, not on iOS, Android, the web, or any other platform. We strongly call on all users not to use TransNormerLLM models for any activities that harm national / social security or violate the law. Also, we ask users not to use TransNormerLLM models for Internet services that have not undergone appropriate security reviews and filings. We hope that all users can abide by this principle and ensure that the development of technology proceeds in a regulated and legal environment.
215 | 
216 | We have done our best to ensure the compliance of the data used in the model training process. However, despite our considerable efforts, there may still be some unforeseeable issues due to the complexity of the model and data. Therefore, if any problems arise due to the use of TransNormerLLM open-source models, including but not limited to data security issues, public opinion risks, or any risks and problems brought about by the model being misled, abused, spread or improperly exploited, we will not assume any responsibility.
217 | 
218 | ## License
219 | The community usage of TransNormerLLM model requires adherence to [Apache 2.0](https://github.com/OpenNLPLab/TransNormerLLM/blob/main/LICENSE) and [Community License for TransNormerLLM Model](https://huggingface.co/OpenNLPLab/TransNormerLLM-1B/blob/main/Community%20License%20for%20TransNormerLLM%20Model.pdf). The TransNormerLLM model supports commercial use. If you plan to use the TransNormerLLM model or its derivatives for commercial purposes, please ensure that your entity meets the following conditions:
220 | 
221 |   1. The Daily Active Users (DAU) of your or your affiliate's service or product is less than 1 million.
222 |   2. Neither you nor your affiliates are software service providers or cloud service providers.
223 |   3. There is no possibility for you or your affiliates to grant the commercial license given to you, to reauthorize it to other third parties without TransNormerLLM's permission.
224 | 
225 | Upon meeting the above conditions, you need to submit the application materials required by the TransNormerLLM Model Community License Agreement via the following contact email: opennlplab@gmail.com. Once approved, TransNormerLLM will hereby grant you a non-exclusive, global, non-transferable, non-sublicensable, revocable commercial copyright license.
226 | 
227 | ## Acknowledgments
228 | Our project is developed based on the following open source projects:
229 | - [Baichuan](https://github.com/baichuan-inc/Baichuan-7B) for the tokenizer.
230 | - [metaseq](https://github.com/facebookresearch/metaseq) for training.
231 | - [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for evaluation.
232 | 
233 | ## Citation
234 | If you wish to cite our work, please use the following reference:
235 | ```
236 | @misc{qin2024transnormerllm,
237 |       title={TransNormerLLM: A Faster and Better Large Language Model with Improved TransNormer},
238 |       author={Zhen Qin and Dong Li and Weigao Sun and Weixuan Sun and Xuyang Shen and Xiaodong Han and Yunshen Wei and Baohong Lv and Xiao Luo and Yu Qiao and Yiran Zhong},
239 |       year={2024},
240 |       eprint={2307.14995},
241 |       archivePrefix={arXiv},
242 |       primaryClass={cs.CL}
243 | }
244 | 
245 | @misc{qin2024lightning,
246 |       title={Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models},
247 |       author={Zhen Qin and Weigao Sun and Dong Li and Xuyang Shen and Weixuan Sun and Yiran Zhong},
248 |       year={2024},
249 |       eprint={2401.04658},
250 |       archivePrefix={arXiv},
251 |       primaryClass={cs.CL}
252 | }
253 | 
254 | ```
255 | 


--------------------------------------------------------------------------------