├── .gitignore ├── LICENSE ├── README.md ├── README_zh.md ├── assets ├── eng1.png ├── eng2.png ├── eng3.png ├── image1.png ├── image2.png ├── image3.png ├── instruct_loss.png ├── logo.png ├── multiturn_chat.jpeg ├── multiturn_chat_en.jpeg ├── multiturn_chat_en.jpg └── pretrain_loss.png ├── chat_server.py ├── configs ├── accelerate_configs │ ├── ds_stage1.yaml │ ├── ds_stage2.yaml │ ├── ds_stage3.yaml │ └── ds_stage3_offload.yaml ├── instruct_config.yaml ├── model_configs │ ├── 13B.json │ ├── 33B.json │ ├── 65B.json │ └── 7B.json ├── pretrain_config.yaml └── tokenizer_models │ ├── 10w_vocab_wudao5_pile10.model │ ├── 4w_cn_vocab_wudao15.model │ ├── llama_tokenizer.model │ └── llama_tokenizer_extended.model ├── data ├── download_instruct.sh ├── download_the_pile.sh ├── download_wudao.sh ├── preprocess_instruction.py ├── preprocess_the_pile.py └── preprocess_wudao.py ├── dataset ├── dataset.py └── validation.py ├── requirements.txt ├── solver └── trainer.py ├── train_lm.py └── utils ├── convert_ckpt.py ├── merge_tokenizer.py ├── speed_test ├── accelerate │ ├── ddp.yaml │ ├── deepspeed_stage1.yaml │ ├── deepspeed_stage2.yaml │ ├── deepspeed_stage3.yaml │ ├── deepspeed_stage3_dynamo.yaml │ ├── deepspeed_stage3_offload.yaml │ ├── fsdp.yaml │ ├── megatron.yaml │ ├── run.py │ └── run.sh ├── colossal-ai │ ├── run.py │ ├── run.sh │ └── utils.py └── lightning │ └── run.py └── train_tokenizer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .DS_Store 131 | pretrain_data/ 132 | wandb/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 S 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 11 | [**中文**](./README_zh.md) | [**English**](./README.md) 12 | 13 | ![camel](assets/logo.png) 14 | 15 | # Open-Llama 16 | 17 |

18 | GitHub 19 | GitHub release (latest by date) 20 | GitHub top language 21 | GitHub last commit 22 |

23 | 24 | Open-Llama is an open-source project that offers a complete training pipeline for building large language models, ranging from dataset preparation to tokenization, pre-training, prompt tuning, lora, and the reinforcement learning technique RLHF. 25 | 26 | **You can try this model directly from the [Demo](http://home.ustc.edu.cn/~sl9292/).** 27 | 28 | Join [discord](https://discord.gg/TrKxrTpnab) to discuss the development of large language models. 29 | 30 | ## **Main contents** 31 | 32 | - **Support Transformers/HuggingFace.** The CheckPoint after Instruct-tuning is open-source on [Hugging Face: s-JoL/Open-Llama-V2](https://huggingface.co/s-JoL/Open-Llama-V2). 33 | 34 | - **By adopting the same evaluation method as the FastChat project, Open-Llama's performance is compared to GPT3.5’s. After testing, it can reach 89% of GPT3.5's performance on Chinese questions.** 35 | 36 | - **The training speed reaches 3620 tokens/s, faster than the 3370 tokens/s reported in the original Llama paper, reaching the current state-of-the-art level.** 37 | 38 | ``` python 39 | from transformers import AutoModelForCausalLM, AutoTokenizer 40 | 41 | tokenizer = AutoTokenizer.from_pretrained("s-JoL/Open-Llama-V2", use_fast=False) 42 | model = AutoModelForCausalLM.from_pretrained("s-JoL/Open-Llama-V2", device_map="auto") 43 | 44 | inputs = tokenizer('user:implement quick sort in python\nsystem:', return_tensors='pt', return_attention_mask=False, add_special_tokens=False) 45 | for k, v in inputs.items(): 46 | inputs[k] = v.cuda() 47 | pred = model.generate(**inputs, max_new_tokens=512, do_sample=True) 48 | print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)) 49 | 50 | ``` 51 | The CheckPoint after pre-training only is also uploaded to [s-JoL/Open-Llama-V2-pretrain](https://huggingface.co/s-JoL/Open-Llama-V2-pretrain). 52 | 53 | We have completed 330B token pre-training, training a total of 80 K steps. The Global Batch Size is consistent with Llama at 4M. 54 | Using a total of 7 parts of data to constitute the Instruction-tuning data, the model has certain programming abilities, mathematical abilities, and multi-turn dialogue abilities. Specific data can be found in the Instruction-Tuning section. 55 | 56 | Below is a display of the model's multi-turn dialogue ability regarding code: 57 | 58 | ![image4](assets/multiturn_chat_en.jpg) 59 | 60 | ## **Updates** 61 | 62 | **[2023.5.8] Release v2.1** 63 | 64 | - This update adds support for larger model training. Using DeepSpeed stage3 + offload + activation checkpoint, you can **train a 65B model with A100-80G**. 65 | 66 | - The peft library is introduced to **support training such as lora**. 67 | 68 | - The following table compares the training speed of Open-Llama and the original Llama, and the performance data of Llama is quoted from the original Llama paper. 
69 | 70 | 71 | | | DeepSpeed Stage | Offload | Activation Checkpoint | Total Token | GPU hours | Speed token/s/gpu | Batch Size | 72 | |----------------|-----------------|---------|-----------------------|-------------|-----------|-------------------|------------| 73 | | Open-Llama 7B | 1 | False | False | 173.7B | 13412 | 3620 | 2 | 74 | | Open-Llama 13B | 3 | False | True | - | - | 1856 | 24 | 75 | | Open-Llama 33B | 3 | False | True | - | - | 708 | 12 | 76 | | Open-Llama 65B | 3 | True | True | - | - | 369 | 12 | 77 | | Llama 7B | - | - | - | 1T | 82432 | 3370 | - | 78 | | Llama 13B | - | - | - | 1T | 135168 | 2055 | - | 79 | | Llama 33B | - | - | - | 1.4T | 530432 | 733 | - | 80 | | Llama 65B | - | - | - | 1.4T | 1022362 | 380 | - | 81 | 82 | **[2023.4.28] Release v2.0** 83 | 84 | This update mainly includes the following aspects, increasing the effective training speed by **50%** compared to the v1 version, reducing padding from **30%** to **5%**, and improving training speed from **3200 tokens/s** to **3620 tokens/s**. 0.95 * 3620 / (0.7 * 3200) = 1.521 85 | 86 | 1. Use Hugging Face's datasets library for data reading, with the process as follows: 87 | 1. Use the transform function to unify data formats from different datasets to {'text': 'xxx'} 88 | 2. Tokenize using Tokenizer 89 | 3. Sample long sequences; currently, three modes are provided: truncation, sampling (refer to the [Gopher paper](https://arxiv.org/abs/2112.11446)), and splitting 90 | 4. Optional: concatenate texts from different docs, reducing padding in the data and accelerating training. In the v1 version, padding accounted for **30%**; after concatenation, padding is reduced to **5%**. 91 | 2. Add Trainer, which can be reused for both pre-training and instruction fine-tuning, see solver/trainer.py 92 | 3. Unify the pre-training and instruction fine-tuning training entry to train_lm.py 93 | 4. Provide more convenient configuration, see configs/pretrain_config.yaml 94 | 5. Provide functionality to continue pre-training based on other pre-trained models and supplementing vocabulary 95 | 6. Resuming training from a checkpoint is supported, including loading optimizer parameters/learning rate and skipping duplicate data 96 | 97 | [2023.4.16] Release v1.0 98 | 99 | Basic pre-training and instruction fine-tuning codes are provided, with a training speed comparable to that of the original Llama. The pre-trained and fine-tuned models are already open-sourced on Hugging Face. 100 | 101 | v1 version code can be seen at https://github.com/s-JoL/Open-Llama/tree/v1.0 102 | 103 | ## **Features** 104 | 105 | ### Easy to use 106 | 107 | We believe that ease of use is one of the most important features when building large language models. To make Open-LLAMA more accessible, we have focused on the following aspects: 108 | 109 | - **Minimal implementation**: We have adopted the simplest implementation methods, lowering the entry threshold and allowing beginners to get started with ease. 110 | - **Complete pipeline**: We have published the complete code from dataset construction to training, making every step in the process of building a large language model clear and visible. 111 | 112 | ### High performance 113 | 114 | Due to the high cost of training large language models, high performance is also crucial when building them. 
To achieve high-performance training, we have employed the following techniques: 115 | 116 | - **Fused CUDA kernel**: Using the fused CUDA kernel provided in [xformers](https://github.com/facebookresearch/xformers) can fuse multiple operations, reducing data transfer between the GPU and CPU, thereby improving training efficiency. 117 | - **Parallelized training**: We employ the [Accelerate](https://huggingface.co/docs/accelerate/index) library to support parallelized training on multiple GPUs to speed up the training process. 118 | 119 | For a 7B model, the training speed with the native PyTorch Llama model in Transformers is **1378 tokens/s/GPU**. Using this codebase, the training speed reaches **3626 tokens/s/GPU**, exceeding **3370 tokens/s/GPU** reported in the [original Llama paper](https://arxiv.org/pdf/2302.13971.pdf). 120 | 121 | If pre-training with 500B tokens, 38300 GPU hours are required. According to the hourly price for 8 A100-80G Spot GPUs on Google Cloud, which is 12.6 US dollars, the total cost is 60,300 US dollars. 122 | When using the unaccelerated version for training, the cost is 158,744 US dollars. The final training cost is reduced by 98,000 US dollars. 123 | 124 | For more testing, see [performance comparison with other open-source models](https://github.com/s-JoL/Open-Llama#%E5%92%8C%E5%85%B6%E4%BB%96%E5%BC%80%E6%BA%90%E6%A8%A1%E5%9E%8B%E6%80%A7%E8%83%BD%E5%AF%B9%E6%AF%94). 125 | 126 | ### Versatility 127 | 128 | When training language models, our goal is to build a versatile model that can handle different languages and domains. To achieve this, we have employed the following strategies: 129 | 130 | - **Multi-language support**: We support multiple language corpora, including English, Chinese, Japanese, and many other languages, allowing users to choose according to their requirements. 131 | - **Domain versatility**: We hope that the model can not only help with everyday questions but also assist in professional domains such as science, law, etc. 132 | - **Interaction with the world**: By incorporating reinforcement learning (RL), we hope to give the model the ability to interact with the world. 133 | 134 | ## **Requirements** 135 | 136 | - Python 3.7 or higher 137 | - PyTorch 1.13 138 | - [Transformers library](https://github.com/huggingface/transformers) 139 | - [Accelerate library](https://huggingface.co/docs/accelerate/index) 140 | - CUDA 11.6 or higher (for GPU acceleration) 141 | - Hardware configuration: currently using (64 CPU, 1000G Memory, 8xA100-80G) x N. There is a rather curious phenomenon that when more CPUs are used, the system runs slightly slower. I speculate this may have something to do with the multi-processing of dataloader. 142 | 143 | ## **Getting Started** 144 | ### Installation 145 | 146 | Use the following command to install related dependencies: 147 | 148 | ```bash 149 | pip install -r requirements.txt 150 | ``` 151 | 152 | ### Dataset Preparation 153 | 154 | Currently provided are the Wudao dataset open-sourced by Zhiyuan and the Pile dataset open-sourced by EleutherAI. Dataset download and processing scripts are located in the data directory. 155 | Due to the required agreement for downloading the Wudao dataset, you may need to modify the link in download_wudao. [Wudao](https://data.baai.ac.cn/details/WuDaoCorporaText). 156 | 157 | Thanks to [@skepsun](https://github.com/skepsun)'s suggestion, using scidb to download the wudao dataset does not require login, and the download is more stable. 
https://github.com/s-JoL/Open-Llama/issues/42. 158 | 159 | **Note that data download may fail. It is recommended to divide the download and processing in the script into two parts for multiple attempts, which will automatically resume downloads from breakpoints.** 160 | 161 | Run the following commands to download the data and perform partitioning: 162 | ```bash 163 | bash data/download_the_pile.sh 164 | bash data/download_wudao.sh 165 | ``` 166 | The data will be stored as small files, with a maximum of 16384 lines per file, for easy reading during multi-process training. The storage format is jsonl.zst, compressed using zstd, with a final data size of 519.5 GB, consisting of 16,466 files in total. 167 | 168 | The Pile dataset contains 210,607,728 JSON lines, while the Wudao dataset contains 59,132,213 JSON lines. 169 | 170 | The specific data format is as follows: 171 | ``` 172 | WuDao 173 | {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'} 174 | 175 | The Pile 176 | {'text': 'some text', 'meta': {'pile_set_name': 'Github'}} 177 | ``` 178 | Check the data integrity in [issue](https://github.com/s-JoL/Open-Llama/issues/5). 179 | 180 | ### Related Tools 181 | 182 | In the utils directory, training tokenizer/supplementing existing tokenizer models and conversion checkpoint code are provided. 183 | 184 | Use SentencePiece to train a tokenizer with the following command: 185 | 186 | ```bash 187 | python3 utils/train_tokenizer.py 188 | ``` 189 | 190 | In configs, a tokenizer model with a 40k vocabulary, trained only using the Wudao dataset (4w_cn_vocab_wudao15.model), is provided. 191 | 192 | To supplement the vocabulary based on an existing tokenizer model, refer to: 193 | 194 | ```bash 195 | python3 utils/merge_tokenizer.py 196 | ``` 197 | 198 | A bilingual English and Chinese tokenizer model (llama_tokenizer_extended.model) is created by merging the META official tokenizer model with the 40k Chinese tokenizer mentioned above. 199 | 200 | To convert existing Llama model checkpoints, refer to: 201 | 202 | ```bash 203 | python3 utils/convert_ckpt.py 204 | ``` 205 | 206 | ### Data Loading 207 | 208 | Data loading-related code can be found in dataset/dataset.py, which includes pre-training and instruction fine-tuning data processing. To add other datasets, only the transform function needs to be modified. 209 | 210 | The data loading process is as follows: 211 | 212 | 1. Use the transform function to unify data formats from different datasets to {'text': 'xxx'} 213 | 2. Tokenize using Tokenizer 214 | 3. Sample long sequences; currently, three modes are provided: truncation, sampling (refer to the Gopher paper), and splitting 215 | 4. Optional: concatenate texts from different docs, reducing padding in the data and accelerating training. In the v1 version, padding accounted for 30%; after concatenation, padding is reduced to 5%. 216 | 217 | Use the following command to view the output of DataLoader and check the correctness of tokenization: 218 | 219 | ```bash 220 | python3 dataset/dataset.py 221 | ``` 222 | 223 | ### Model Structure 224 | 225 | We modified according to the section 2.4 Efficient implementation of the [Llama](https://github.com/facebookresearch/llama) paper in the Transformers library, and also referenced other papers to introduce some optimizations. 
Specifically, we used the memory_efficient_attention operation from the [xformers library](https://github.com/facebookresearch/xformers) open-sourced by META for Self Attention computation, which has a significant performance improvement of approximately 30%. Further details can be found in [modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/open_llama/modeling_open_llama.py#L229). 226 | 227 | Additionally, we referred to [Bloom](https://huggingface.co/bigscience/bloom) and introduced Stable Embedding for Token Embedding to better stabilize training. 228 | 229 | Finally, we referenced [PALM](https://arxiv.org/abs/2204.02311) and employed Shared Input-Output Embeddings. 230 | 231 | ### Pre-training 232 | 233 | We use multi-GPU parallel training based on the Accelerate library, with the following start command: 234 | 235 | ```bash 236 | accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --train_config configs/pretrain_config.yaml --model_config configs/model_configs/7B.json 237 | ``` 238 | In some cases, you may need to specify the following parameters: 239 | 240 | ``` 241 | --main_process_ip 242 | --main_process_port 243 | --num_processes 244 | --num_machines 245 | --machine_rank 246 | ``` 247 | 248 | We use [Wandb](https://wandb.ai/) for visualizing training. You need to modify the WANDB_API_KEY environment variable yourself. 249 | 250 | Among them, we use DeepSpeed stage1 to reduce memory usage. For Accelerate-related configurations, see configs/accelerate_configs. 251 | 252 | Training related hyperparameters can be found in configs/pretrain_config.yaml. 253 | 254 | The default parameters use LlamaTokenizer with a supplemented 40k Chinese vocabulary tokenizer model, and the model size is 7B. The specific configuration is as follows: 255 | 256 | | max_length | batch_size | learning_rate | weight_decay | params | dimension | n heads | n layer | vocab_size | 257 | |------------|------------------|---------------|--------------|--------|-----------|---------|---------|------------| 258 | | 2048 | 2 | 2e-4 | 1e-1 | 7.03B | 4096 | 32 | 32 | 68762 | 259 | 260 | ``` 261 | ============================================================================================================== 262 | Layer (type:depth-idx) Output Shape Param # 263 | ============================================================================================================== 264 | OpenLlamaForCausalLM [1, 32, 64, 128] -- 265 | ├─OpenLlamaModel: 1-1 [1, 32, 64, 128] -- 266 | │ └─Embedding: 2-1 [1, 64, 4096] 281,649,152 267 | │ └─ModuleList: 2-2 -- -- 268 | │ │ └─OpenLlamaDecoderLayer: 3x32 [1, 64, 4096] 202,383,360 269 | │ └─OpenLlamaRMSNorm: 2-3 [1, 64, 4096] 4,096 270 | ├─Linear: 1-2 [1, 64, 68762] 281,649,152 271 | ============================================================================================================== 272 | Total params: 7,039,569,920 273 | Trainable params: 7,039,569,920 274 | Non-trainable params: 0 275 | Total mult-adds (G): 7.04 276 | ``` 277 | 278 | Pre-training loss from scratch is shown below: 279 | 280 | ![loss](assets/pretrain_loss.png) 281 | 282 | ### Instruction-Tuning 283 | 284 | We use the currently available seven datasets for Instruction-tuning, and more tasks and our own datasets will be added later. 
285 | 286 | - [yizhongw/self_instruct](https://huggingface.co/datasets/yizhongw/self_instruct) 287 | - [BelleGroup/train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN) 288 | - [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN) 289 | - [BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) 290 | - [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) 291 | - [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) 292 | - [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code) 293 | 294 | The ShareGPT_Vicuna_unfiltered dataset has some issues in the datastes processing, so we directly downloaded the original data and reprocessed it. 295 | We performed some preprocessing on the original data, with the format as follows: 296 | 297 | ``` 298 | user: {prompt}\nsystem: {completion} 299 | ``` 300 | 301 | The startup command is basically the same as pre-training: 302 | 303 | ```bash 304 | accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --train_config configs/instruct_config.yaml --model_config configs/model_configs/7B.json 305 | ``` 306 | 307 | In some cases, you may need to specify the following parameters: 308 | 309 | ``` 310 | --main_process_ip 311 | --main_process_port 312 | --num_processes 313 | --num_machines 314 | --machine_rank 315 | ``` 316 | 317 | The loss during the process is shown below, with a total of 3 epochs: 318 | 319 | ![loss](assets/instruct_loss.png) 320 | 321 | ### RLHF 322 | 323 | Not available yet. 324 | 325 | ### Server 326 | 327 | For multi-turn dialogue, use chat_server.py. 328 | 329 | Developed based on Gradio. 330 | 331 | ## Performance Comparison 332 | 333 | ### Training Framework 334 | 335 | In terms of training frameworks, we tested Hugging Face's open-source Accelerate library, PyTorch Lightning, and HPC-AI's open-source ColossalAI. We found that their performance differences are relatively small when fully utilizing GPUs. Therefore, we chose the relatively simple-to-implement Accelerate library as the training framework. 336 | 337 | The test code can be found in utils/speed_test.py. 338 | 339 | The model structure used during the testing process is: 340 | | Model | n gpu | n layer | n heads | hidden size | vocab size | seq length | 341 | |-------|-------|---------|---------|-------------|------------|------------| 342 | | GPT2 | 2 | 6 | heads | 4096 | 250100 | 1024 | 343 | 344 | The test results are shown below, indicating that when the GPUs are fully utilized, the differences in speed and memory consumption are not significant. 
345 | | | Hugging Face | Hugging Face | ColossalAI | ColossalAI | ColossalAI | 346 | |-----------------|-----------------------------------|------------------------------------|--------------------------------------------------------|--------------------------------------------------------|------------------------------------| 347 | | config | without activation ckpt, bs2 | without activation ckpt, max_bs=12 | with activation ckpt, bs2 | without activation ckpt, bs2 | without activation ckpt, max_bs=10 | 348 | | second pre step | 0.336, fw=0.033, bw=0.3, opt=5e-6 | 1.25 | 0.347 | 0.308, fw=0.067, bw=0.152, opt=0.088 | 1.055 | 349 | | gpu memory | nvidia-smi 45445 | | fw+bw+opt=21053.63+22064.12+17987.52, nvidia-smi 40961 | fw+bw+opt=24684.74+21087.13+17987.52, nvidia-smi 46821 | oom after 10 steps | 350 | 351 | ### Performance Optimization 352 | 353 | In the earliest version, we used the native Llama implementation from DeepSpeed stage2 + Transformers for training. However, the speed was significantly different from what was claimed in the paper. Therefore, we carried out a series of optimizations afterwards, and we list each step of the performance improvement below for reference. 354 | 355 | The paper mentioned that for the 6.7B model, 1T token was used for training and the final GPU time was 82432, from which the training speed was roughly calculated as 3370 token/s/gpu. After using the following optimizations, the speed is now basically consistent with what was claimed in the paper when tested on 20x8 A100-80G. It is expected that more fusion operators will be added in the future to achieve better performance. 356 | 357 | | | V1 | V2 | 358 | |---------------------|--------------|------------------------------------| 359 | | Dataset | self implemented | datasets | 360 | | Model | Transformers | Transformers+xformers | 361 | | Optimizer | Pytorch Adam | Fused Adam | 362 | | DeepSpeed | stage2 | stage1 | 363 | | Grad Accumulation | 4 | 12 | 364 | | Return Padding Mask | yes | no | 365 | | Speed token/s/gpu | 1378 | 3637 | 366 | 367 | ### Comparison with Other Open-source Models 368 | 369 | The following table summarizes the performance of currently available open-source models. In all cases, the GPU device used is A100. Due to differences in the size and structure of the models, it is difficult to make accurate performance comparisons. As a rough estimate, it can be assumed that the speed is generally inversely proportional to the size of the model parameters, which is confirmed by the performance of Llama with models of different sizes. Based on this rough estimate, it can be seen that the performance using our project is significantly better than that of other projects. 
370 | 371 | | Model | Open-Llama | LLAMA | LLAMA | LLAMA | OPT | Bloom | GLM | GPT-NEOX | CPM-ANT | CodeGeeX | 372 | |---------------------|------------|----------|---------|-----------|---------|--------------------|-------|----------|---------|-----------| 373 | | Model size | 7.0B | 6.7B | 13B | 65B | 175B | 175B | 130B | 20B | 10B | 13B | 374 | | Token | | 1T | 1T | 1.4T | 180B | 366B | 400B | 402B | 200B | 13.9B | 375 | | GPU Hour | | 82,432 | 135,168 | 1,022,362 | 809,472 | 1,082,990 | 43776 | 175680 | 47040 | 3072 | 376 | | speed token/s/gpu | 3637 | 3370 | 2055 | 380 | 61.8 | 93.9 | 105.7 | 635.6 | 1181 | 1257 | 377 | | 相关依赖 | xformers | xformers | | | measeq | Megatron-DeepSpeed | | | BMtrain | MindSpore | 378 | | speed token*params B/s/gpu | 25728 | 22579 | 26715 | 24700 | 10815 | 16432 | 13741 | 12712 | 11810 | 16341 | 379 | 380 | ## Future Plans 381 | 382 | 1. Integrate RLHF code. 383 | 2. Use Triton to add more high-performance operators to further improve performance. 384 | 3. Add code for building pre-training datasets based on Common Crawl and open related datasets. 385 | 4. Add code for multimodal training. 386 | 387 | ## Citation 388 | 389 | ``` 390 | @misc{openllama, 391 | title={Open-Llama}, 392 | author={s-JoL}, 393 | year={2023}, 394 | howpublished={\url{https://github.com/s-JoL/Open-Llama}}, 395 | } 396 | ``` 397 | 398 |
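## Appendix: Code Sketches

The snippets below are minimal, hedged sketches that illustrate three mechanisms described earlier in this README: the four-step data-loading pipeline, the instruction-tuning text format, and the LoRA option. They are for illustration only and are **not** the project's actual implementation in dataset/dataset.py or train_lm.py; any file paths, field names, or hyperparameters that do not appear elsewhere in this README are assumptions.

The first sketch walks through the data-loading steps (unify records to `{'text': 'xxx'}`, tokenize, handle long sequences, concatenate documents) for the WuDao shards. It assumes the `datasets`, `transformers`, `sentencepiece`, and `zstandard` packages are installed; the way `title` and `content` are joined is a simple choice for the example, not necessarily the project's.

```python
"""Illustrative only: a simplified version of the pipeline described in the
Data Loading section. The real logic lives in dataset/dataset.py."""
from datasets import load_dataset
from transformers import LlamaTokenizer

SEQ_LENGTH = 2048  # matches seq_length in configs/pretrain_config.yaml

tokenizer = LlamaTokenizer(
    vocab_file="configs/tokenizer_models/llama_tokenizer_extended.model"
)

# Step 1: unify records from different corpora into {'text': 'xxx'}.
def wudao_transform(record):
    # WuDao records look like {'id': ..., 'dataType': ..., 'title': ..., 'content': ...}.
    return {"text": record["title"] + "\n" + record["content"]}

raw = load_dataset(
    "json",
    data_files={"train": "data/pretrain_data/part-wudao*.jsonl.zst"},
    split="train",
    streaming=True,  # shards are jsonl.zst; reading them requires the zstandard package
)
unified = raw.map(wudao_transform)

# Step 2: tokenize.
tokenized = unified.map(lambda rec: {"input_ids": tokenizer(rec["text"])["input_ids"]})

# Steps 3-4: split over-long documents and concatenate short ones into full
# 2048-token blocks, so that almost no padding is needed.
def pack(token_streams, seq_length=SEQ_LENGTH):
    buffer = []
    for ids in token_streams:
        buffer.extend(ids)
        while len(buffer) >= seq_length:
            yield buffer[:seq_length]
            buffer = buffer[seq_length:]

if __name__ == "__main__":
    blocks = pack(rec["input_ids"] for rec in tokenized)
    print(len(next(blocks)))  # -> 2048
```

The second sketch shows how a single-turn instruction record is flattened into the `user: {prompt}\nsystem: {completion}` format used for Instruction-Tuning. The field names `instruction` and `output` are placeholders; each of the seven source datasets has its own schema and is normalized by its own transform in dataset/dataset.py.

```python
def instruct_transform(record):
    # Hypothetical field names, used only for this example.
    prompt = record["instruction"].strip()
    completion = record["output"].strip()
    return {"text": f"user: {prompt}\nsystem: {completion}"}


example = instruct_transform({"instruction": "1+1=?", "output": "2"})
print(example["text"])
# user: 1+1=?
# system: 2
```

The third sketch shows a generic way to turn on LoRA with the peft library, which the v2.1 release notes and the `use_lora` switch in the training configs refer to. It is a standard peft recipe rather than the exact wiring inside train_lm.py; the rank, alpha, and target module names are assumptions.

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

model = AutoModelForCausalLM.from_pretrained("s-JoL/Open-Llama-V2-pretrain")

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                                  # low-rank dimension, assumed for the example
    lora_alpha=32,                        # scaling factor, assumed for the example
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # assumed Llama-style attention projection names
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the LoRA adapters remain trainable
```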

399 | 400 | Star History Chart 401 | 402 |

403 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | 11 | [**中文**](./README_zh.md) | [**English**](./README.md) 12 | 13 | ![camel](assets/logo.png) 14 | 15 | # Open-Llama 16 | 17 |

18 | GitHub 19 | GitHub release (latest by date) 20 | GitHub top language 21 | GitHub last commit 22 |

23 | 24 | Open-Llama是一个开源项目,提供了一整套用于构建大型语言模型的训练流程,从数据集准备到分词、预训练、指令调优,lora, 以及强化学习技术 RLHF。 25 | 26 | **可从[Demo](http://home.ustc.edu.cn/~sl9292/)直接试用本模型。** 27 | 28 | 加入[discord](https://discord.gg/TrKxrTpnab)一起讨论大语言模型的发展。 29 | 30 | ## **主要内容** 31 | 32 | - **支持Transformers/Hugging Face直接调用。** 经过Instruct-tuning的CheckPoint已开源在[Hugging Face: s-JoL/Open-Llama-V2](https://huggingface.co/s-JoL/Open-Llama-V2)。 33 | 34 | - **采用FastChat项目相同方法测评Open-Llama的效果和GPT3.5的效果对比,经过测试在中文问题上可以达到GPT3.5 89%的水平。** 35 | 36 | - **训练速度达到3620 token/s,快于Llama原文中的3370 token/s,达到目前sota的水平。** 37 | 38 | 39 | ``` python 40 | from transformers import AutoModelForCausalLM, AutoTokenizer 41 | 42 | tokenizer = AutoTokenizer.from_pretrained("s-JoL/Open-Llama-V2", use_fast=False) 43 | model = AutoModelForCausalLM.from_pretrained("s-JoL/Open-Llama-V2", device_map="auto") 44 | 45 | inputs = tokenizer('user:implement quick sort in python\nsystem:', return_tensors='pt', return_attention_mask=False, add_special_tokens=False) 46 | for k, v in inputs.items(): 47 | inputs[k] = v.cuda() 48 | pred = model.generate(**inputs, max_new_tokens=512, do_sample=True) 49 | print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)) 50 | 51 | ``` 52 | 只经过预训练的CheckPoint也上传至[s-JoL/Open-Llama-V2-pretrain](https://huggingface.co/s-JoL/Open-Llama-V2-pretrain)。 53 | 54 | 我们完成了330B token的预训练,总共训练80 K step,Global Batch Size和Llama中一致为4M。 55 | 使用总共7部分数据构成Instruction-tuning数据,模型具有一定的编程能力、数学能力和多轮对话能力,具体数据见Instruction-Tuning部分。 56 | 57 | 如下是一个关于代码的多轮对话能力的展示 58 | 59 | ![image4](assets/multiturn_chat.jpeg) 60 | 61 | ## **更新** 62 | 63 | **[2023.5.8] Release v2.1** 64 | 65 | - 本次更新加入对更大模型训练的支持,使用DeepSpeed stage3 + offload + activation checkpoint可以在**A100-80G训练65B模型**。 66 | 67 | - 引入peft库**支持lora**等训练。 68 | 69 | - 下表对比了Open-Llama和Llama原文的训练速度,Llama性能数据引自Llama原文。 70 | 71 | 72 | | | DeepSpeed Stage | Offload | Activation Checkpoint | Total Token | GPU hours | Speed token/s/gpu | Batch Size | 73 | |----------------|-----------------|---------|-----------------------|-------------|-----------|-------------------|------------| 74 | | Open-Llama 7B | 1 | False | False | 173.7B | 13412 | 3620 | 2 | 75 | | Open-Llama 13B | 3 | False | True | - | - | 1856 | 24 | 76 | | Open-Llama 33B | 3 | False | True | - | - | 708 | 12 | 77 | | Open-Llama 65B | 3 | True | True | - | - | 369 | 12 | 78 | | Llama 7B | - | - | - | 1T | 82432 | 3370 | - | 79 | | Llama 13B | - | - | - | 1T | 135168 | 2055 | - | 80 | | Llama 33B | - | - | - | 1.4T | 530432 | 733 | - | 81 | | Llama 65B | - | - | - | 1.4T | 1022362 | 380 | - | 82 | 83 | **[2023.4.28] Release v2.0** 84 | 85 | 本次更新主要包含以下几个方面,相对于v1版本提升有效训练速度**50%**,其中pad从**30%**减少至**5%**,训练速度从**3200token/s**提升至**3620token/s**。0.95 * 3620/(0.7 * 3200)=1.521 86 | 1. 使用Hugging Face的datasets库进行数据读取,具体流程如下 87 | 1. 使用transform函数将不同数据集的数据统一格式为{'text': 'xxx'} 88 | 2. 使用Tokenizer进行分词 89 | 3. 对长序列进行采样,目前提供三种模式,分别是:截断/采样(参考[Gopher论文](https://arxiv.org/abs/2112.11446))/切分 90 | 4. 可选:对来自不同doc的文本进行拼接。减少了数据中的pad,加速训练;在v1版本中pad占比为**30%**,使用拼接后pad占比降低为**5%**。 91 | 2. 加入Trainer,对于预训练和指令微调都可以复用,见solver/trainer.py 92 | 3. 统一预训练和指令微调训练入口为train_lm.py 93 | 4. 提供更方便的配置,可见configs/pretrain_config.yaml 94 | 5. 提供基于其他预训练模型补充词表,继续预训练功能 95 | 6. 
支持从中断点继续训练,包括加载优化器参数/学习率和跳过重复数据 96 | 97 | [2023.4.16] Release v1.0 98 | 99 | 提供基础的预训练和指令微调代码,训练速度达到Llama原文速度。预训练和指令微调后的模型已经开源在Hugging Face。 100 | 101 | v1版代码可见https://github.com/s-JoL/Open-Llama/tree/v1.0 102 | 103 | ## **特性** 104 | 105 | ### 易用性 106 | 107 | 我们认为易用性是构建大型语言模型时最重要的特性之一。为了使 Open-LLAMA 更加易于使用,我们特别注重了以下几点: 108 | 109 | - **最简实现**:我们采用了最简单的实现方式,降低了入门的门槛,让初学者也能轻松上手。 110 | - **流程完整**:我们发布了从数据集构建到训练的完整代码,使得构建一个大语言模型的每一步流程都清晰可见。 111 | 112 | ### 高性能 113 | 114 | 由于训练大语言模型的成本高昂,因此在构建大型语言模型时,高性能也是非常重要的。为了实现高性能的训练,我们发布使用了以下技术: 115 | 116 | - **Fused CUDA kernel**:使用[xformers](https://github.com/facebookresearch/xformers)中提供的 fused CUDA kernel 可以将多个操作融合在一起,减少了 GPU 和 CPU 之间的数据传输,从而提高了训练效率。 117 | - **并行化训练**:我们使用[Accelerate](https://huggingface.co/docs/accelerate/index)库支持在多个 GPU 上进行并行化训练,以加快训练速度。 118 | 119 | 对于7B模型,使用Transformers中Pytorch原生版本的Llama模型训练训练速度为**1378 token/s/gpu**,使用本代码库训练速度达到**3626 token/s/gpu**,超过[Llama原文](https://arxiv.org/pdf/2302.13971.pdf)中的**3370 token/s/gpu**。 120 | 121 | 如果使用500B token进行预训练,需要训练38300 GPU时。按照Google Cloud上A100-80G Spot的价格计算,8卡每小时价格为12.6美元,则总价格为60300美元。 122 | 当使用未加速版本训练时,价格为158744美元。最终降低训练成本9.8万美元。 123 | 124 | 更多测试可见[和其他开源模型性能对比](https://github.com/s-JoL/Open-Llama#%E5%92%8C%E5%85%B6%E4%BB%96%E5%BC%80%E6%BA%90%E6%A8%A1%E5%9E%8B%E6%80%A7%E8%83%BD%E5%AF%B9%E6%AF%94)。 125 | ### 通用性 126 | 127 | 在训练语言模型时,我们希望能够构建一个通用的模型,可以适用于不同的语言和不同的领域。为了实现这一点,我们采用了以下策略: 128 | 129 | - **多语言支持**:我们支持多种语言的语料库,包括英语、中文、日语等多种语言,让用户可以根据自己的需求进行选择。 130 | - **领域通用性**:我们希望模型不仅能在日常问题上能产生帮助,同时希望在专业领域如科学、法律等也能帮助人类。 131 | - **和世界交互**:希望通过加入RL使得模型具备和世界交互的能力 132 | 133 | ## **要求** 134 | 135 | - Python 3.7 或更高版本 136 | - PyTorch 1.13 137 | - [Transformers库](https://github.com/huggingface/transformers) 138 | - [Accelerate库](https://huggingface.co/docs/accelerate/index) 139 | - CUDA 11.6 或更高版本(用于 GPU 加速) 140 | - 硬件配置:目前使用(64 CPU, 1000G Memory, 8xA100-80G) x N,有个比较神奇的现象当使用更多cpu时反而会慢一点,猜测这和dataloader的多进程有一定关系。 141 | 142 | ## **入门指南** 143 | ### 安装 144 | 145 | 使用下面的命令安装相关依赖 146 | 147 | ```bash 148 | pip install -r requirements.txt 149 | ``` 150 | 151 | ### 数据集准备 152 | 153 | 目前给出了智源开源的悟道数据集和EleutherAI开源的the pile数据集。数据集下载和处理代码在data目录下。 154 | 其中悟道数据集由于需要同意一些协议才能下载因此可能需要修改一下download_wudao中的链接,[悟道](https://data.baai.ac.cn/details/WuDaoCorporaText)。 155 | 156 | 感谢[@skepsun](https://github.com/skepsun)的建议,使用scidb下载wudao数据集不需要登陆,并且下载更稳定一些。https://github.com/s-JoL/Open-Llama/issues/42 157 | 158 | **注意数据下载可能出现失败,建议将script中的下载和处理分成两部分来运行,可以将下载多运行机会,会自动断点续传。** 159 | 160 | 运行下面的命令进行数据下载并进行分片 161 | ```bash 162 | bash data/download_the_pile.sh 163 | bash data/download_wudao.sh 164 | ``` 165 | 数据将按照每个文件最大16384行存储为小文件,便于后续使用多进程训练时进行读取。存储格式为jsonl.zst,使用zstd进行压缩,最终数据大小为519.5G,合计16466个文件。 166 | 167 | 其中the pile数据集包含210607728行json line,悟道数据集包含59132213行json line。 168 | 169 | 具体数据格式如下 170 | ``` 171 | WuDao 172 | {'id': 1, 'dataType': '百科', 'title': 'some title', 'content': 'some content'} 173 | 174 | The Pile 175 | {'text': 'some text', 'meta': {'pile_set_name': 'Github'}} 176 | ``` 177 | 验证数据完整性可见 [issue](https://github.com/s-JoL/Open-Llama/issues/5) 178 | 179 | ### 相关工具 180 | 在utils目录中提供了训练分词/补充现有分词模型和转换ckpt的代码。 181 | 182 | 使用SentencePiece训练分词器参考如下命令 183 | ```bash 184 | python3 utils/train_tokenizer.py 185 | ``` 186 | 在configs中提供了只使用wudao数据集训练的4w词表的分词模型 4w_cn_vocab_wudao15.model 187 | 188 | 根据已有分词模型补充词表参考 189 | ```bash 190 | python3 utils/merge_tokenizer.py 191 | ``` 192 | 根据META官方的分词模型和上面的4w中文合并为中英文双语的分词模型 llama_tokenizer_extended.model 193 | 194 | 转换现有的Llama模型ckpt参考 195 | ```bash 196 | python3 utils/convert_ckpt.py 197 | 
``` 198 | 199 | ### 数据读取 200 | 数据读取相关代码可见dataset/dataset.py,包含了预训练和指令微调数据的处理,如需加入其他数据集只需要修改其中的transform函数。 201 | 202 | 数据读取流程如下: 203 | 1. 使用transform函数将不同数据集的数据统一格式为{'text': 'xxx'} 204 | 2. 使用Tokenizer进行分词 205 | 3. 对长序列进行采样,目前提供三种模式,分别是:截断/采样(参考Gopher论文)/切分 206 | 4. 可选:对来自不同doc的文本进行拼接。减少了数据中的pad,加速训练;在v1版本中pad占比为30%,使用拼接后pad占比降低为5%。 207 | 208 | 使用如下命令查看DataLoader输出的结果,并检查分词正确性 209 | ```bash 210 | python3 dataset/dataset.py 211 | ``` 212 | 213 | ### 模型结构 214 | 我们基于Transformers库中的[Llama](https://github.com/facebookresearch/llama)参考论文原文中的2.4 Efficient implementation一节进行了修改, 215 | 同时还参考了一些其他论文引入了一些优化。具体来说,我们引入了由META开源的[xformers库](https://github.com/facebookresearch/xformers)中的memory_efficient_attention操作来进行 216 | Self Attention的计算,这对于性能有明显的提升,提升大约30%。 217 | 具体可以参见[modeling_llama.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/open_llama/modeling_open_llama.py#L229) 218 | 219 | 同时我们还参考了[Bloom](https://huggingface.co/bigscience/bloom),对于Token Embedding引入了Stable Embedding以更好的稳定训练。 220 | 221 | 最后我们参考[PALM](https://arxiv.org/abs/2204.02311),使用了Shared Input-Output Embeddings。 222 | 223 | ### 预训练 224 | 我们基于Accelerate库进行多GPU并行训练,启动命令如下 225 | ```bash 226 | accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --train_config configs/pretrain_config.yaml --model_config configs/model_configs/7B.json 227 | ``` 228 | 某些情况下可能需要指定下列参数 229 | ``` 230 | --main_process_ip 231 | --main_process_port 232 | --num_processes 233 | --num_machines 234 | --machine_rank 235 | ``` 236 | 我们使用[Wandb](https://wandb.ai/)进行训练的可视化,需要自行修改环境变量 WANDB_API_KEY 。 237 | 238 | 其中我们使用了DeepSpeed stage1以减少显存占用。accelerate相关配置可见configs/accelerate_configs。 239 | 240 | 训练相关超参数可见configs/pretrain_config.yaml 241 | 242 | 其中默认参数为使用LlamaTokenizer补充4w中文的词表的分词模型,模型大小为7B,具体配置如下 243 | 244 | | max_length | batch_size | learning_rate | weight_decay | params | dimension | n heads | n layer | vocab_size | 245 | |------------|------------------|---------------|--------------|--------|-----------|---------|---------|------------| 246 | | 2048 | 2 | 2e-4 | 1e-1 | 7.03B | 4096 | 32 | 32 | 68762 | 247 | 248 | ``` 249 | ============================================================================================================== 250 | Layer (type:depth-idx) Output Shape Param # 251 | ============================================================================================================== 252 | OpenLlamaForCausalLM [1, 32, 64, 128] -- 253 | ├─OpenLlamaModel: 1-1 [1, 32, 64, 128] -- 254 | │ └─Embedding: 2-1 [1, 64, 4096] 281,649,152 255 | │ └─ModuleList: 2-2 -- -- 256 | │ │ └─OpenLlamaDecoderLayer: 3x32 [1, 64, 4096] 202,383,360 257 | │ └─OpenLlamaRMSNorm: 2-3 [1, 64, 4096] 4,096 258 | ├─Linear: 1-2 [1, 64, 68762] 281,649,152 259 | ============================================================================================================== 260 | Total params: 7,039,569,920 261 | Trainable params: 7,039,569,920 262 | Non-trainable params: 0 263 | Total mult-adds (G): 7.04 264 | ``` 265 | 266 | 从头预训练Loss如下 267 | ![](assets/pretrain_loss.png) 268 | 269 | ### Instruction-Tuning 270 | 271 | 我们使用目前开源的七个数据集进行Instruction-tuning,后续会加入更多的任务以及自己构建的数据集。 272 | - [yizhongw/self_instruct](https://huggingface.co/datasets/yizhongw/self_instruct) 273 | - [BelleGroup/train_0.5M_CN](https://huggingface.co/datasets/BelleGroup/train_0.5M_CN) 274 | - [BelleGroup/train_1M_CN](https://huggingface.co/datasets/BelleGroup/train_1M_CN) 275 | - 
[BelleGroup/multiturn_chat_0.8M](https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M) 276 | - [BelleGroup/school_math_0.25M](https://huggingface.co/datasets/BelleGroup/school_math_0.25M) 277 | - [anon8231489123/ShareGPT_Vicuna_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) 278 | - [Graverman/Instruct-to-Code](https://huggingface.co/datasets/Graverman/Instruct-to-Code) 279 | 280 | 其中ShareGPT_Vicuna_unfiltered数据在datastes的处理有些问题,我们直接下载原数据重新进行了处理。 281 | 我们对原始数据进行了一些预处理,格式如下 282 | ``` 283 | user: {prompt}\nsystem: {completion} 284 | ``` 285 | 286 | 启动命令和预训练基本一致 287 | ```bash 288 | accelerate launch --config_file configs/accelerate_configs/ds_stage1.yaml train_lm.py --train_config configs/instruct_config.yaml --model_config configs/model_configs/7B.json 289 | ``` 290 | 某些情况下可能需要指定下列参数 291 | ``` 292 | --main_process_ip 293 | --main_process_port 294 | --num_processes 295 | --num_machines 296 | --machine_rank 297 | ``` 298 | 299 | 过程中Loss如下,总计使用3个epoch 300 | ![loss](assets/instruct_loss.png) 301 | ### RLHF 302 | 暂无 303 | ### Server 304 | 305 | 多轮对话使用chat_server.py 306 | 307 | 基于Gradio开发。 308 | ## 性能对比 309 | 310 | ### 训练框架 311 | 在训练框架方面我们测试了Hugging Face开源的Accelerate库pytorch-lightning和HPC-AI开源的ColossalAI,我们测试在打满显卡时性能差异较小。因此最终选择了实现相对简单的Accelerate库作为训练框架 312 | 313 | 测试代码可见utils/speed_test.py 314 | 315 | 测试过程中使用的模型结构为 316 | | Model | n gpu | n layer | n heads | hidden size | vocab size | seq length | 317 | |-------|-------|---------|---------|-------------|------------|------------| 318 | | GPT2 | 2 | 6 | heads | 4096 | 250100 | 1024 | 319 | 320 | 测试结果如下,可以看到当打满时速度和显存相差不大 321 | | | Hugging Face | Hugging Face | ColossalAI | ColossalAI | ColossalAI | 322 | |-----------------|-----------------------------------|------------------------------------|--------------------------------------------------------|--------------------------------------------------------|------------------------------------| 323 | | config | without activation ckpt, bs2 | without activation ckpt, max_bs=12 | with activation ckpt, bs2 | without activation ckpt, bs2 | without activation ckpt, max_bs=10 | 324 | | second pre step | 0.336, fw=0.033, bw=0.3, opt=5e-6 | 1.25 | 0.347 | 0.308, fw=0.067, bw=0.152, opt=0.088 | 1.055 | 325 | | gpu memory | nvidia-smi 45445 | | fw+bw+opt=21053.63+22064.12+17987.52, nvidia-smi 40961 | fw+bw+opt=24684.74+21087.13+17987.52, nvidia-smi 46821 | oom after 10 steps, 疑似有内存泄漏 | 326 | 327 | ### 性能优化 328 | 在最早版本中我们使用DeepSpeed stage2 + Transformers中的原生Llama实现进行训练但是速度和论文中所说的相差较大,因此后续我们进行了一系列的优化,我们将每一步的性能提升列在下面可供参考。 329 | 330 | 论文中提到对于6.7B模型使用了1T token进行训练,最终的gpu时为82432,因此可以计算出他的训练速度大致为3370 token/s/gpu。 331 | 当使用下面的优化后速度开源基本和论文中速度一致,使用20x8 A100-80G进行测试。预计加入更多融合算子开源取得更好的性能。 332 | 333 | | | V1 | V2 | 334 | |---------------------|--------------|------------------------------------| 335 | | Dataset | self implemented | datasets | 336 | | Model | Transformers | Transformers+xformers | 337 | | Optimizer | Pytorch Adam | Fused Adam | 338 | | DeepSpeed | stage2 | stage1 | 339 | | Grad Accumulation | 4 | 12 | 340 | | Return Padding Mask | yes | no | 341 | | Speed token/s/gpu | 1378 | 3637 | 342 | 343 | ### 和其他开源模型性能对比 344 | 下表是一个对目前开源模型性能的一个总结,使用GPU device均为A100,由于模型大小各不相同结构也有一定差异,难以准确的对比性能,作为一个粗略估计可以认为速度和模型参数量基本呈反比关系,这一点看Llama不同大小的模型可以得到印证。基于这个粗略估计可以看到使用本项目的性能明显由于其他项目。 345 | 346 | | Model | Open-Llama | LLAMA | LLAMA | LLAMA | OPT | Bloom | GLM | GPT-NEOX | CPM-ANT | CodeGeeX | 347 | 
|---------------------|------------|----------|---------|-----------|---------|--------------------|-------|----------|---------|-----------| 348 | | Model size | 7.0B | 6.7B | 13B | 65B | 175B | 175B | 130B | 20B | 10B | 13B | 349 | | Token | | 1T | 1T | 1.4T | 180B | 366B | 400B | 402B | 200B | 13.9B | 350 | | GPU Hour | | 82,432 | 135,168 | 1,022,362 | 809,472 | 1,082,990 | 43776 | 175680 | 47040 | 3072 | 351 | | speed token/s/gpu | 3637 | 3370 | 2055 | 380 | 61.8 | 93.9 | 105.7 | 635.6 | 1181 | 1257 | 352 | | 相关依赖 | xformers | xformers | | | measeq | Megatron-DeepSpeed | | | BMtrain | MindSpore | 353 | | speed token*params B/s/gpu | 25728 | 22579 | 26715 | 24700 | 10815 | 16432 | 13741 | 12712 | 11810 | 16341 | 354 | 355 | ## 后续计划 356 | 357 | 1. 加入RLHF代码 358 | 2. 使用[Triton](https://github.com/openai/triton)加入更多高性能算子,进一步提升性能 359 | 3. 加入根据Common Crawl构建预训练数据集相关代码,并开源相关数据集 360 | 4. 加入多模态训练代码 361 | 362 | ## 引用 363 | 364 | ``` 365 | @misc{openllama, 366 | title={Open-Llama}, 367 | author={s-JoL}, 368 | year={2023}, 369 | howpublished={\url{https://github.com/s-JoL/Open-Llama}}, 370 | } 371 | ``` 372 | 373 |

374 | 375 | Star History Chart 376 | 377 |

378 | 379 | 465 | -------------------------------------------------------------------------------- /assets/eng1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/eng1.png -------------------------------------------------------------------------------- /assets/eng2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/eng2.png -------------------------------------------------------------------------------- /assets/eng3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/eng3.png -------------------------------------------------------------------------------- /assets/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/image1.png -------------------------------------------------------------------------------- /assets/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/image2.png -------------------------------------------------------------------------------- /assets/image3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/image3.png -------------------------------------------------------------------------------- /assets/instruct_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/instruct_loss.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/logo.png -------------------------------------------------------------------------------- /assets/multiturn_chat.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/multiturn_chat.jpeg -------------------------------------------------------------------------------- /assets/multiturn_chat_en.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/multiturn_chat_en.jpeg -------------------------------------------------------------------------------- /assets/multiturn_chat_en.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/multiturn_chat_en.jpg -------------------------------------------------------------------------------- /assets/pretrain_loss.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/assets/pretrain_loss.png -------------------------------------------------------------------------------- /chat_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-04-06 22:30:10 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-05-12 15:07:36 6 | FilePath: /Open-Llama/chat_server.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 10 | """ 11 | import torch 12 | import logging 13 | import gradio as gr 14 | from transformers import AutoModelForCausalLM, AutoTokenizer 15 | 16 | 17 | tokenizer = AutoTokenizer.from_pretrained("s-JoL/Open-Llama-V2", use_fast=False) 18 | model = AutoModelForCausalLM.from_pretrained( 19 | "s-JoL/Open-Llama-V2", torch_dtype=torch.bfloat16, device_map="auto" 20 | ) 21 | logging.warning("ready") 22 | 23 | 24 | with gr.Blocks() as demo: 25 | gr.Markdown( 26 | """ 27 | # [Open-Llama](https://github.com/s-JoL/Open-Llama) 28 | 完全使用Open-Llama项目从0开始训练的Instruct-GPT模型,当长时间无响应(如20s以上)可刷新重试。 29 | 30 | Instruct-GPT model is trained from scratch using the Open-Llama project without relying on any other pre-trained models. If there is no response for a long time (such as more than 20 seconds), please refresh and try again. 31 | """ 32 | ) 33 | chatbot = gr.Chatbot() 34 | msg = gr.Textbox() 35 | clear = gr.Button("Clear") 36 | 37 | def user(user_message, history): 38 | logging.warning(user_message) 39 | return "", history + [[user_message, None]] 40 | 41 | def bot(history): 42 | context = [] 43 | round = 0 44 | for prompt, completion in history: 45 | round += 1 46 | if completion is None: 47 | inputs = "user:{}\nsystem:".format(prompt) 48 | inputs = tokenizer( 49 | inputs, 50 | return_tensors="pt", 51 | add_special_tokens=False, 52 | return_attention_mask=False, 53 | ) 54 | context.append(inputs["input_ids"]) 55 | else: 56 | inputs = "user:{}\nsystem:{}".format(prompt, completion) 57 | inputs = tokenizer( 58 | inputs, 59 | return_tensors="pt", 60 | add_special_tokens=True, 61 | return_attention_mask=False, 62 | ) 63 | context.append(inputs["input_ids"]) 64 | context = torch.cat(context, dim=-1) 65 | context = context[:, -1024:] 66 | inputs_len = context.shape[1] 67 | context = context.cuda() 68 | pred = model.generate(input_ids=context, max_new_tokens=1024, do_sample=True) 69 | pred = pred[:, inputs_len:] 70 | pred = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True) 71 | logging.warning(pred) 72 | bot_message = pred 73 | history[-1][1] = bot_message 74 | return history 75 | 76 | msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then( 77 | bot, chatbot, chatbot 78 | ) 79 | clear.click(lambda: None, None, chatbot, queue=False) 80 | gr.Markdown( 81 | """ 82 | 当前体验服务生成的所有内容都是由人工智能模型生成,我们对其生成内容的准确性、完整性和功能性不做任何保证,并且其生成的内容不代表我们的态度或观点。 83 | 84 | 联系方式: sl12160010@gmail.com 对于该项目有任何意见和建议都欢迎联系我. 85 | 86 | Contact information: sl12160010@gmail.com. Any opinions or suggestions regarding the project are welcome to be addressed to me through this email. 
87 | """ 88 | ) 89 | 90 | demo.launch() 91 | -------------------------------------------------------------------------------- /configs/accelerate_configs/ds_stage1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 1 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | use_cpu: false -------------------------------------------------------------------------------- /configs/accelerate_configs/ds_stage2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | use_cpu: false -------------------------------------------------------------------------------- /configs/accelerate_configs/ds_stage3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero_stage: 3 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | use_cpu: false -------------------------------------------------------------------------------- /configs/accelerate_configs/ds_stage3_offload.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | deepspeed_multinode_launcher: standard 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: cpu 6 | offload_param_device: cpu 7 | zero3_init_flag: true 8 | zero_stage: 3 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | use_cpu: false -------------------------------------------------------------------------------- /configs/instruct_config.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | mode: "instruct" 3 | data: 4 | mixed: "data/instruction_data/part-*.jsonl.zst" 5 | pad_to_max: False 6 | sequence_sample_mode: "none" 7 | concat_multiple_sequence: True 8 | num_sequences: 50 9 | seq_length: 2048 10 | tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model" 11 | split_by_shard: False 12 | train: 13 | train_batch_size: 2 14 | # 1B token for 1 epoch, 5epoch 15 | num_training_steps: 20000 16 | num_warmup_steps: 500 17 | initializer_range: 1.0e-2 18 | lr: 2.0e-4 19 | 
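  # same optimizer hyper-parameters as configs/pretrain_config.yaml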
weight_decay: 1.0e-1 20 | ckpt: "data/saved_model/ckpt.pth" 21 | train_num_workers: 16 22 | gradient_accumulation_steps: 1 23 | prefetch_factor: 100 24 | train_and_eval: False 25 | gradient_checkpointing_enable: False 26 | use_lora: False 27 | # global step 28 | log_interval: 50 29 | eval_interval: 500 30 | save_interval: 1000 31 | work_dir: "data/saved_ckpt/7B_instruction" 32 | project_name: "Llama Instruction" 33 | -------------------------------------------------------------------------------- /configs/model_configs/13B.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "OpenLlamaForCausalLM" 4 | ], 5 | "attention_dropout_prob": 0.1, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_dropout_prob": 0.1, 10 | "hidden_size": 5120, 11 | "initializer_range": 1e-2, 12 | "intermediate_size": 13824, 13 | "max_position_embeddings": 2048, 14 | "model_type": "open-llama", 15 | "num_attention_heads": 40, 16 | "num_hidden_layers": 40, 17 | "pad_token_id": 32000, 18 | "rms_norm_eps": 1e-05, 19 | "shared_input_output_embedding": false, 20 | "tie_word_embeddings": false, 21 | "torch_dtype": "float32", 22 | "use_cache": true, 23 | "use_memorry_efficient_attention": true, 24 | "use_stable_embedding": false, 25 | "vocab_size": 68762 26 | } 27 | -------------------------------------------------------------------------------- /configs/model_configs/33B.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "OpenLlamaForCausalLM" 4 | ], 5 | "attention_dropout_prob": 0.1, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_dropout_prob": 0.1, 10 | "hidden_size": 6656, 11 | "initializer_range": 1e-2, 12 | "intermediate_size": 17920, 13 | "max_position_embeddings": 2048, 14 | "model_type": "open-llama", 15 | "num_attention_heads": 52, 16 | "num_hidden_layers": 60, 17 | "pad_token_id": 32000, 18 | "rms_norm_eps": 1e-05, 19 | "shared_input_output_embedding": false, 20 | "tie_word_embeddings": false, 21 | "torch_dtype": "float32", 22 | "use_cache": true, 23 | "use_memorry_efficient_attention": true, 24 | "use_stable_embedding": false, 25 | "vocab_size": 68762 26 | } 27 | -------------------------------------------------------------------------------- /configs/model_configs/65B.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "OpenLlamaForCausalLM" 4 | ], 5 | "attention_dropout_prob": 0.1, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_dropout_prob": 0.1, 10 | "hidden_size": 8192, 11 | "initializer_range": 1e-2, 12 | "intermediate_size": 22016, 13 | "max_position_embeddings": 2048, 14 | "model_type": "open-llama", 15 | "num_attention_heads": 64, 16 | "num_hidden_layers": 80, 17 | "pad_token_id": 32000, 18 | "rms_norm_eps": 1e-05, 19 | "shared_input_output_embedding": false, 20 | "tie_word_embeddings": false, 21 | "torch_dtype": "float32", 22 | "use_cache": true, 23 | "use_memorry_efficient_attention": true, 24 | "use_stable_embedding": false, 25 | "vocab_size": 68762 26 | } 27 | -------------------------------------------------------------------------------- /configs/model_configs/7B.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "OpenLlamaForCausalLM" 4 | ], 5 | "attention_dropout_prob": 0.1, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 
| "hidden_act": "silu", 9 | "hidden_dropout_prob": 0.1, 10 | "hidden_size": 4096, 11 | "initializer_range": 1e-2, 12 | "intermediate_size": 11008, 13 | "max_position_embeddings": 2048, 14 | "model_type": "open-llama", 15 | "num_attention_heads": 32, 16 | "num_hidden_layers": 32, 17 | "pad_token_id": 32000, 18 | "rms_norm_eps": 1e-05, 19 | "shared_input_output_embedding": false, 20 | "tie_word_embeddings": false, 21 | "torch_dtype": "float32", 22 | "use_cache": true, 23 | "use_memorry_efficient_attention": true, 24 | "use_stable_embedding": false, 25 | "vocab_size": 68762 26 | } 27 | -------------------------------------------------------------------------------- /configs/pretrain_config.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | mode: "pretrain" 3 | data: 4 | wudao: "data/pretrain_data/part-wudao*.jsonl.zst" 5 | the_pile: "data/pretrain_data/part-pile-1*.jsonl.zst" 6 | pad_to_max: False 7 | sequence_sample_mode: "none" 8 | concat_multiple_sequence: True 9 | num_sequences: 10 10 | seq_length: 2048 11 | tokenizer_model_path: "configs/tokenizer_models/llama_tokenizer_extended.model" 12 | split_by_shard: False 13 | train: 14 | train_batch_size: 2 15 | num_training_steps: 500000 16 | num_warmup_steps: 2000 17 | initializer_range: 1.0e-2 18 | lr: 2.0e-4 19 | weight_decay: 1.0e-1 20 | ckpt: null 21 | train_num_workers: 16 22 | gradient_accumulation_steps: 12 23 | prefetch_factor: 100 24 | train_and_eval: False 25 | gradient_checkpointing_enable: False 26 | use_lora: False 27 | # global step 28 | log_interval: 5 29 | eval_interval: 500 30 | save_interval: 1000 31 | work_dir: "data/saved_ckpt/7B" 32 | project_name: "Llama Pretrain" 33 | -------------------------------------------------------------------------------- /configs/tokenizer_models/10w_vocab_wudao5_pile10.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/configs/tokenizer_models/10w_vocab_wudao5_pile10.model -------------------------------------------------------------------------------- /configs/tokenizer_models/4w_cn_vocab_wudao15.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/configs/tokenizer_models/4w_cn_vocab_wudao15.model -------------------------------------------------------------------------------- /configs/tokenizer_models/llama_tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/configs/tokenizer_models/llama_tokenizer.model -------------------------------------------------------------------------------- /configs/tokenizer_models/llama_tokenizer_extended.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/s-JoL/Open-Llama/0157b6938d547f64e257b55055b5aa3f6cfd3e7f/configs/tokenizer_models/llama_tokenizer_extended.model -------------------------------------------------------------------------------- /data/download_instruct.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### 3 | # @Author: s-JoL(sl12160010@gmail.com) 4 | # @Date: 2023-04-05 23:18:10 5 | # @LastEditors: s-JoL(sl12160010@gmail.com) 6 | # @LastEditTime: 2023-05-04 08:24:17 7 | # @FilePath: 
/Open-Llama/data/download_instruct.sh 8 | # @Description: 9 | # 10 | # Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 11 | ### 12 | mkdir data/instruction_data 13 | wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json' -O data/sg_90k_part1_html_cleaned.json 14 | wget -c --tries 3 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json' -O data/sg_90k_part2_html_cleaned.json 15 | python3 data/preprocess_instruction.py -------------------------------------------------------------------------------- /data/download_the_pile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### 3 | # @Author: s-JoL(sl12160010@gmail.com) 4 | # @Date: 2023-03-16 21:21:38 5 | # @LastEditors: s-JoL(sl12160010@gmail.com) 6 | # @LastEditTime: 2023-03-26 22:58:02 7 | # @FilePath: /Open-Llama/data/download_the_pile.sh 8 | # @Description: 9 | # download the pile dataset and preprocess 10 | # Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 11 | ### 12 | start=0 13 | end=29 14 | mkdir data/the_pile 15 | for (( i=$start; i<=$end; i++ )) 16 | do 17 | url="https://the-eye.eu/public/AI/pile/train/$(printf "%02d" $i).jsonl.zst" 18 | echo "Downloading file: $url" 19 | curl -C - $url -o data/the_pile/"$(printf "%02d" $i).jsonl.zst" 20 | done 21 | 22 | wait 23 | 24 | echo "All files downloaded successfully." 25 | mkdir data/pretrain_data 26 | python3 data/preprocess_the_pile.py -------------------------------------------------------------------------------- /data/download_wudao.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ### 3 | # @Author: s-JoL(sl12160010@gmail.com) 4 | # @Date: 2023-03-16 21:21:56 5 | # @LastEditors: s-JoL(sl12160010@gmail.com) 6 | # @LastEditTime: 2023-03-26 22:58:11 7 | # @FilePath: /Open-Llama/data/download_wudao.sh 8 | # @Description: 9 | # download wudao dataset and preprocess 10 | # Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 11 | ### 12 | apt install unrar 13 | 14 | wget -v -c 'https://download.scidb.cn/download?fileId=63a30383fed6a8a9e8454302&dataSetType=organization&fileName=WuDaoCorporaText-2.0-open.rar' -O data/WuDaoCorpus2.0_base_200G.rar 15 | 16 | # for i in {1..100} 17 | # do 18 | # curl -C - --retry 100 'https://dorc.baai.ac.cn/resources/data/WuDaoCorpora2.0/WuDaoCorpus2.0_base_200G.rar?AccessKeyId=AKLTNasiLRBBTcOgPqzlkPzu1w&Expires=1679127659&Signature=7jh%2FpnJyC2hAeumm9EjaeE5HN9E%3D' -o data/WuDaoCorpus2.0_base_200G.rar 19 | # done 20 | 21 | unrar x data/WuDaoCorpus2.0_base_200G.rar data/ 22 | mkdir data/pretrain_data 23 | python3 data/preprocess_wudao.py -------------------------------------------------------------------------------- /data/preprocess_instruction.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-03-30 20:52:10 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-05-04 08:32:04 6 | FilePath: /Open-Llama/data/preprocess_instruction.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 
10 | """ 11 | import json 12 | from tqdm import tqdm 13 | import zstandard as zstd 14 | from datasets import load_dataset 15 | 16 | 17 | root_dir = "data" 18 | write_path = "data/instruction_data/part-{}-{}.jsonl.zst" 19 | dataset_map = { 20 | "yizhongw/self_instruct": "self_instruct", 21 | "BelleGroup/train_0.5M_CN": "belle_0.5M", 22 | "BelleGroup/train_1M_CN": "belle_1M", 23 | "BelleGroup/train_2M_CN": "belle_2M", 24 | "BelleGroup/school_math_0.25M": "belle_school_math_0.25M", 25 | "BelleGroup/multiturn_chat_0.8M": "belle_multiturn_chat_0.8M", 26 | "Graverman/Instruct-to-Code": "instruct_to_code", 27 | "qwedsacf/grade-school-math-instructions": "grade_school_math", 28 | "camel-ai/math": "camel_ai_math", 29 | "camel-ai/physics": "camel_ai_physics", 30 | "camel-ai/chemistry": "camel_ai_chemistry", 31 | "camel-ai/biology": "camel_ai_biology", 32 | ("bigscience/xP3mt", "code"): "xP3mt_code", 33 | ("bigscience/xP3mt", "zh"): "xP3mt_zh", 34 | } 35 | 36 | 37 | def process_hf_dataset(name, local_name): 38 | if isinstance(name, str): 39 | dataset = load_dataset(name) 40 | else: 41 | dataset = load_dataset(*name) 42 | total_num = 0 43 | file_num = 1 44 | wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8") 45 | for line in tqdm(dataset["train"]): 46 | line = json.dumps(line) 47 | if total_num % 1024 == 0 and total_num > 0: 48 | file_num += 1 49 | wfp.close() 50 | wfp = zstd.open( 51 | write_path.format(local_name, file_num), "wb", encoding="utf-8" 52 | ) 53 | wfp.write(line.encode("utf-8")) 54 | wfp.write(b"\n") 55 | total_num += 1 56 | wfp.close() 57 | print( 58 | "{} preprocess done. Total line: {}, Total file: {}".format( 59 | name, total_num, file_num 60 | ) 61 | ) 62 | 63 | 64 | for k, v in dataset_map.items(): 65 | process_hf_dataset(k, v) 66 | 67 | local_name = "sharegpt_90K" 68 | total_num = 0 69 | file_num = 1 70 | wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8") 71 | with open("{}/sg_90k_part1_html_cleaned.json".format(root_dir), "r") as fp: 72 | data1 = json.load(fp) 73 | with open("{}/sg_90k_part2_html_cleaned.json".format(root_dir), "r") as fp: 74 | data2 = json.load(fp) 75 | data = data1 + data2 76 | for line in tqdm(data): 77 | line = json.dumps(line) 78 | if total_num % 1024 == 0 and total_num > 0: 79 | file_num += 1 80 | wfp.close() 81 | wfp = zstd.open(write_path.format(local_name, file_num), "wb", encoding="utf-8") 82 | wfp.write(line.encode("utf-8")) 83 | wfp.write(b"\n") 84 | total_num += 1 85 | wfp.close() 86 | print( 87 | "anon8231489123/ShareGPT_Vicuna_unfiltered preprocess done. Total line: {}, Total file: {}".format( 88 | total_num, file_num 89 | ) 90 | ) 91 | -------------------------------------------------------------------------------- /data/preprocess_the_pile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-03-16 22:35:38 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 22:59:38 6 | FilePath: /Open-Llama/data/preprocess_the_pile.py 7 | Description: 8 | Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines, 9 | making it easy for parallel training to perform streaming reads. 10 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 
11 | """ 12 | import json 13 | from glob import glob 14 | from tqdm import tqdm 15 | import zstandard as zstd 16 | 17 | paths = glob("data/the_pile/*.jsonl.zst") 18 | write_path = "data/pretrain_data/part-pile-{}.jsonl.zst" 19 | total_num = 0 20 | file_num = 1 21 | wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") 22 | for path in tqdm(paths, total=len(paths)): 23 | with zstd.open(path, "r", encoding="utf-8") as fp: 24 | for line in fp: 25 | if total_num % 16384 == 0 and total_num > 0: 26 | file_num += 1 27 | wfp.close() 28 | wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") 29 | wfp.write(line.encode("utf-8")) 30 | total_num += 1 31 | wfp.close() 32 | print("total line: {}\ntotal files: {}".format(total_num, file_num)) 33 | -------------------------------------------------------------------------------- /data/preprocess_wudao.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-03-16 22:10:44 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-26 22:59:55 6 | FilePath: /Open-Llama/data/preprocess_wudao.py 7 | Description: 8 | Parse the dataset from the raw files and split them into different jsonl files based on the preset maximum number of lines, 9 | making it easy for parallel training to perform streaming reads. 10 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 11 | """ 12 | import json 13 | from glob import glob 14 | from tqdm import tqdm 15 | import zstandard as zstd 16 | 17 | paths = glob("data/WuDaoCorpus2.0_base_200G/part*") 18 | write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst" 19 | total_num = 0 20 | file_num = 1 21 | wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") 22 | for path in tqdm(paths, total=len(paths)): 23 | with open(path, "r") as fp: 24 | data = json.load(fp) 25 | for line in data: 26 | if total_num % 16384 == 0 and total_num > 0: 27 | file_num += 1 28 | wfp.close() 29 | wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8") 30 | wfp.write(json.dumps(line).encode("utf-8")) 31 | wfp.write("\n".encode("utf-8")) 32 | total_num += 1 33 | wfp.close() 34 | print("total line: {}\ntotal files: {}".format(total_num, file_num)) 35 | -------------------------------------------------------------------------------- /dataset/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-04-24 20:05:21 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-05-06 23:30:37 6 | FilePath: /Open-Llama/dataset/dataset.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 
10 | """ 11 | import math 12 | import torch 13 | import random 14 | from glob import glob 15 | from datasets import load_dataset 16 | 17 | 18 | random.seed(42) 19 | 20 | 21 | def pretrain_transform(batch): 22 | # wudao preprocess 23 | if "title" in batch and "content" in batch: 24 | assert len(batch["title"]) == 1 25 | batch["text"] = [batch["title"][0] + "\n" + batch["content"][0]] 26 | elif "text" in batch: 27 | pass 28 | else: 29 | raise Exception("Unrecognized pretrain dataset format.") 30 | return batch 31 | 32 | 33 | def instruct_transform(batch): 34 | # self instruct preprocess 35 | if "prompt" in batch and "completion" in batch: 36 | prompt = batch["prompt"][0] 37 | completion = batch["completion"][0] 38 | if prompt.endswith("Output:"): 39 | prompt = prompt[:-7] 40 | text = "user:{}\nsystem:{}".format(prompt.strip(), completion.strip()) 41 | texts = [text] 42 | # belle preprocess 43 | elif "instruction" in batch and "output" in batch: 44 | prompt = batch["instruction"][0].replace("\\n", "") 45 | prompt = prompt.strip("") 46 | 47 | completion = batch["output"][0].replace("\\n", "") 48 | completion = completion.strip("") 49 | # multi turn chat 50 | if "Human:" in prompt: 51 | texts = [] 52 | chats = prompt + completion 53 | chats = chats.split("Human:") 54 | for chat in chats: 55 | if chat.strip() == "": 56 | continue 57 | res = chat.split("Assistant:") 58 | if len(res) != 2: 59 | continue 60 | prompt, completion = res 61 | prompt = prompt.strip() 62 | completion = completion.strip() 63 | chat = "user:{}\nsystem:{}".format(prompt, completion) 64 | texts.append(chat) 65 | texts = ["[multiturn_sep]".join(texts)] 66 | else: 67 | text = "user:{}\nsystem:{}".format(prompt, completion) 68 | texts = [text] 69 | # instruct code preprocess 70 | elif "instruction" in batch and "answer" in batch: 71 | prompt = batch["instruction"][0].replace("\\n", "") 72 | prompt = prompt.strip("") 73 | 74 | completion = batch["answer"][0].replace("\\n", "") 75 | completion = completion.strip("") 76 | text = "user:{}\nsystem:{}".format(prompt, completion) 77 | texts = [text] 78 | # share gpt preprocess 79 | elif "conversations" in batch: 80 | chats = batch["conversations"][0] 81 | if chats[0]["from"] != "human": 82 | chats = chats[1:] 83 | texts = [] 84 | for i in range(len(chats) // 2): 85 | prompt = chats[2 * i] 86 | completion = chats[2 * i + 1] 87 | if not (prompt["from"] == "human" and completion["from"] == "gpt"): 88 | continue 89 | prompt = prompt["value"] 90 | prompt = prompt.strip() 91 | completion = completion["value"] 92 | completion = completion.strip() 93 | chat = "user:{}\nsystem:{}".format(prompt, completion) 94 | texts.append(chat) 95 | texts = ["[multiturn_sep]".join(texts)] 96 | # xP3 preprocess 97 | elif "inputs" in batch and "targets" in batch: 98 | inputs = batch["inputs"][0] 99 | targets = batch["targets"][0] 100 | text = "user:{}\nsystem:{}".format(inputs.strip(), targets.strip()) 101 | texts = [text] 102 | # camel-ai preprocess 103 | elif "message_1" in batch and "message_2" in batch: 104 | inputs = batch["message_1"][0] 105 | targets = batch["message_2"][0] 106 | text = "user:{}\nsystem:{}".format(inputs.strip(), targets.strip()) 107 | texts = [text] 108 | # grade-school-math-instructions preprocess 109 | elif "INSTRUCTION" in batch and "RESPONSE" in batch: 110 | inputs = batch["INSTRUCTION"][0] 111 | targets = batch["RESPONSE"][0] 112 | text = "user:{}\nsystem:{}".format(inputs.strip(), targets.strip()) 113 | texts = [text] 114 | else: 115 | raise Exception("Unrecognized instruct 
dataset format.") 116 | return {"text": texts} 117 | 118 | 119 | def split_multiturn(batch): 120 | return {"text": batch["text"][0].split("[multiturn_sep]")} 121 | 122 | 123 | def sample_sequence_gen(seq_length, eos_token_id): 124 | def sample_sequence(line): 125 | doc_length = line["input_ids"].shape[0] 126 | if doc_length <= seq_length: 127 | start = 0 128 | else: 129 | if random.random() < 1 / 4: 130 | start = 0 131 | else: 132 | start = random.randint(0, doc_length - seq_length) 133 | input_ids = line["input_ids"][start : start + seq_length] 134 | if input_ids[-1] != eos_token_id: 135 | input_ids[-1] = eos_token_id 136 | return {"input_ids": input_ids} 137 | 138 | return sample_sequence 139 | 140 | 141 | def split_sequence_gen(seq_length): 142 | def split_sequence(batch): 143 | input_ids = batch["input_ids"][0] 144 | out = [] 145 | while len(input_ids) >= (1 + len(out)) * seq_length: 146 | out.append(input_ids[len(out) * seq_length : (1 + len(out)) * seq_length]) 147 | return {"input_ids": out} 148 | 149 | return split_sequence 150 | 151 | 152 | def concat_multiple_sequence_gen(seq_length, pad_token_id): 153 | def concat_multiple_sequence(batch): 154 | concat_input_ids = torch.cat(batch["input_ids"], dim=0) 155 | length = concat_input_ids.shape[0] 156 | chunks = math.ceil(length / seq_length) 157 | pad_length = chunks * seq_length - length 158 | pad = torch.ones(pad_length, dtype=concat_input_ids.dtype) * pad_token_id 159 | concat_input_ids = torch.cat([concat_input_ids, pad], dim=0) 160 | input_ids = torch.chunk(concat_input_ids, chunks) 161 | return {"input_ids": input_ids} 162 | 163 | return concat_multiple_sequence 164 | 165 | 166 | def get_labels_gen(pad_token_id): 167 | def get_labels(line): 168 | input_ids = line["input_ids"] 169 | labels = input_ids.clone() 170 | labels[labels == pad_token_id] = -100 171 | return {"labels": labels} 172 | 173 | return get_labels 174 | 175 | 176 | def construct_dataset( 177 | dataset_config, tokenizer, return_raw_text=False, world_size=None 178 | ): 179 | all_data_files = [] 180 | for name, pattern in dataset_config["data"].items(): 181 | data_files = glob(pattern) 182 | assert len(data_files) > 0 183 | all_data_files.extend(data_files) 184 | random.shuffle(all_data_files) 185 | # 当shard可以被world_size整除时 split_dataset_by_node 会直接按shard进行划分,否则会读所有数据然后跳过一部分,可能会慢一点 186 | # https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.distributed.split_dataset_by_node 187 | if world_size is not None: 188 | num_shards = len(all_data_files) 189 | all_data_files = all_data_files[: num_shards // world_size * world_size] 190 | dataset = load_dataset( 191 | "json", data_files=all_data_files, split="train", streaming=True 192 | ) 193 | # shuffle 194 | dataset = dataset.shuffle(seed=42) 195 | # 文本预处理转换为统一格式 196 | if dataset_config["mode"] == "pretrain": 197 | dataset = dataset.map(pretrain_transform, batched=True, batch_size=1) 198 | elif dataset_config["mode"] == "instruct": 199 | dataset = dataset.map(instruct_transform, batched=True, batch_size=1) 200 | dataset = dataset.select_columns("text") 201 | dataset = dataset.map(split_multiturn, batched=True, batch_size=1) 202 | else: 203 | raise Exception("Dataset mode: {} not found.".format(dataset_config["mode"])) 204 | 205 | full_dataset = dataset 206 | 207 | # to visualize 208 | if return_raw_text: 209 | return full_dataset 210 | 211 | seq_length = dataset_config["seq_length"] 212 | pad_to_max = dataset_config.get("pad_to_max", True) 213 | sequence_sample_mode = 
dataset_config.get("sequence_sample_mode", "truncation") 214 | truncation = sequence_sample_mode == "truncation" 215 | concat_multiple_sequence = dataset_config.get("concat_multiple_sequence", False) 216 | # tokenize 217 | if pad_to_max: 218 | full_dataset = full_dataset.map( 219 | lambda x: tokenizer( 220 | x["text"], 221 | return_tensors="pt", 222 | return_attention_mask=False, 223 | padding="max_length", 224 | max_length=seq_length, 225 | truncation=truncation, 226 | ) 227 | ) 228 | else: 229 | full_dataset = full_dataset.map( 230 | lambda x: tokenizer( 231 | x["text"], 232 | return_tensors="pt", 233 | return_attention_mask=False, 234 | truncation=truncation, 235 | ) 236 | ) 237 | 238 | # format 239 | full_dataset = full_dataset.map(lambda x: {"input_ids": x["input_ids"][0]}) 240 | full_dataset = full_dataset.select_columns("input_ids") 241 | 242 | # sequence_sample 243 | if sequence_sample_mode == "truncation": 244 | pass 245 | elif sequence_sample_mode == "none": 246 | pass 247 | elif sequence_sample_mode == "sample": 248 | assert pad_to_max or concat_multiple_sequence 249 | full_dataset = full_dataset.map( 250 | sample_sequence_gen(seq_length, tokenizer.eos_token_id) 251 | ) 252 | elif sequence_sample_mode == "split": 253 | assert not concat_multiple_sequence 254 | full_dataset = full_dataset.map( 255 | split_sequence_gen(seq_length), batched=True, batch_size=1 256 | ) 257 | else: 258 | raise Exception( 259 | "Unknown sequence_sample mode: {}.".format(sequence_sample_mode) 260 | ) 261 | 262 | # concat multiple sequence 263 | if concat_multiple_sequence: 264 | num_sequences = dataset_config["num_sequences"] 265 | full_dataset = full_dataset.map( 266 | concat_multiple_sequence_gen(seq_length, tokenizer.pad_token_id), 267 | batched=True, 268 | batch_size=num_sequences, 269 | drop_last_batch=True, 270 | ) 271 | 272 | # add label 273 | full_dataset = full_dataset.map(get_labels_gen(tokenizer.pad_token_id)) 274 | 275 | # shuffle 276 | full_dataset = full_dataset.shuffle(seed=42) 277 | return full_dataset 278 | 279 | 280 | if __name__ == "__main__": 281 | import time 282 | from unicodedata import normalize 283 | from torch.utils.data import DataLoader 284 | from transformers import LlamaTokenizer 285 | 286 | data_config = { 287 | "mode": "pretrain", 288 | "data": {"mixed": "data/pretrain_data/part-*.jsonl.zst"}, 289 | "pad_to_max": False, 290 | "sequence_sample_mode": "sample", 291 | "concat_multiple_sequence": True, 292 | "num_sequences": 10, 293 | "seq_length": 2048, 294 | } 295 | tokenizer = LlamaTokenizer( 296 | "configs/tokenizer_models/llama_tokenizer_extended.model", 297 | pad_token="", 298 | add_bos_token=False, 299 | add_eos_token=True, 300 | ) 301 | pretrain_dataset = construct_dataset(data_config, tokenizer, True) 302 | start = time.time() 303 | for i, line in enumerate(pretrain_dataset): 304 | raw_text = line["text"] 305 | # raw_text = normalize("NFKC", raw_text) 306 | input_ids = tokenizer( 307 | line["text"], return_tensors="pt", return_attention_mask=False 308 | )["input_ids"][0] 309 | decode_text = tokenizer.decode(input_ids, skip_special_tokens=True) 310 | if raw_text != decode_text and "▁" not in raw_text: 311 | print(raw_text, "\n", decode_text) 312 | if i == 3000: 313 | break 314 | print("all checked in {} seconds.".format(time.time() - start)) 315 | pretrain_dataset = construct_dataset(data_config, tokenizer) 316 | print(pretrain_dataset.n_shards) 317 | pretrain_loader = DataLoader(pretrain_dataset, batch_size=2, num_workers=16) 318 | for batch in pretrain_loader: 319 | 
for k, v in batch.items(): 320 | print(k, v.shape, "\n", v) 321 | break 322 | -------------------------------------------------------------------------------- /dataset/validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-03-18 00:06:41 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-03-27 01:09:20 6 | FilePath: /Open-Llama/dataset/validation.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 10 | """ 11 | val_set = [ 12 | "白日依山尽,", 13 | "君不见,黄河之水天上来,奔流到海不复回。君不见,", 14 | "秦孝公据崤函之固,拥雍州之地,君臣固守以窥周室,有席卷天下,包举宇内,囊括四海之意,并吞八荒之心。", 15 | "古之学者必有师。师者,所以传道受业解惑也。人非生而知之者,孰能无惑?", 16 | "当我醒来时,我发现自己在一个完全陌生的地方。我看到周围没有人,只有一张纸条。", 17 | "这是一个斗气决定一切的大陆。在加玛帝国乌坦城,有个天才少年萧炎打破了所有族人的修炼纪录,一时间万人敬仰,众人艳羡。但不知为何,", 18 | "人工智能技术在图像识别领域取得了很大的进展,然而在复杂场景下仍然存在一些问题,例如", 19 | "In recent years, there has been increasing interest in the use of machine learning to", 20 | "已知三个数分别为1, 2, 3,则它们的平均数是", 21 | "小明总共有15个苹果,他分别给了3个人两个苹果,然后自己又吃了一个苹果,那么它还剩几个苹果?", 22 | "根据牛顿第二定律,物体的加速度等于", 23 | "碳纳米管是一种新型的材料,具有非常独特的电学和光学性质。在过去的几年中,我们对碳纳", 24 | "下面是一段用python写的快速排序的代码:", 25 | "The quantum many-body problem is a fundamental problem in condensed matter physics. Despite decades of research, there is still no exact solution to this problem for large systems. In this paper, we propose a novel approach based on", 26 | "下面是一个使用 PyTorch 和 Transformer 的示例代码,用于训练一个文本分类模型:import torch\nimport torch.nn as nn\nfrom torch.utils.data import DataLoader, Dataset", 27 | ] 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.13.1 2 | torchvision 3 | torchaudio 4 | zstandard 5 | accelerate 6 | datasets 7 | wandb 8 | deepspeed 9 | absl-py 10 | torchinfo 11 | scikit-learn 12 | datasets==2.10.1 13 | matplotlib 14 | seaborn 15 | sentencepiece 16 | triton 17 | functorch==1.13.1 18 | xformers==0.0.16 19 | gradio 20 | peft 21 | transformers -------------------------------------------------------------------------------- /solver/trainer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-04-24 20:05:21 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-05-08 22:51:42 6 | FilePath: /Open-Llama/solver/trainer.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 
10 | """ 11 | import time 12 | import wandb 13 | import torch 14 | import logging 15 | from torchinfo import summary 16 | from deepspeed.ops.adam import FusedAdam 17 | from transformers import get_cosine_schedule_with_warmup 18 | 19 | from dataset.validation import val_set 20 | 21 | 22 | class Trainer: 23 | def __init__(self, config, raw_model, train_loader, tokenizer, accelerator): 24 | self.config = config 25 | self.raw_model = raw_model 26 | self.train_loader = train_loader 27 | self.tokenizer = tokenizer 28 | self.accelerator = accelerator 29 | self.train_and_eval = config["train"].get("train_and_eval", False) 30 | self.gradient_accumulation_steps = config["train"].get( 31 | "gradient_accumulation_steps", 1 32 | ) 33 | self.lr_scheduler_factor = ( 34 | accelerator.num_processes / accelerator.gradient_accumulation_steps 35 | ) 36 | self.log_interval = ( 37 | self.config["log_interval"] * accelerator.gradient_accumulation_steps 38 | ) 39 | self.eval_interval = ( 40 | self.config["eval_interval"] * accelerator.gradient_accumulation_steps 41 | ) 42 | self.save_interval = ( 43 | self.config["save_interval"] * accelerator.gradient_accumulation_steps 44 | ) 45 | self.work_dir = self.config["work_dir"] 46 | # self.get_model_info() 47 | if accelerator.is_main_process: 48 | wandb.init(project=self.config["project_name"]) 49 | 50 | def get_model_info(self): 51 | with torch.no_grad(): 52 | summary( 53 | self.raw_model.cuda(), 54 | input_data=torch.ones(1, 64, dtype=torch.int64).cuda(), 55 | ) 56 | 57 | def get_optimizer(self): 58 | no_decay = ["bias", "LayerNorm.weight", "layernorm.weight"] 59 | if self.config["train"].get("use_lora", False): 60 | optimizer_grouped_parameters = self.raw_model.parameters() 61 | else: 62 | optimizer_grouped_parameters = [ 63 | { 64 | "params": [ 65 | p 66 | for n, p in self.raw_model.named_parameters() 67 | if not any(nd in n for nd in no_decay) 68 | ], 69 | "weight_decay": self.config["train"]["weight_decay"], 70 | }, 71 | { 72 | "params": [ 73 | p 74 | for n, p in self.raw_model.named_parameters() 75 | if any(nd in n for nd in no_decay) 76 | ], 77 | "weight_decay": 0.0, 78 | }, 79 | ] 80 | self.optim = FusedAdam( 81 | optimizer_grouped_parameters, 82 | lr=self.config["train"]["lr"], 83 | betas=(0.9, 0.95), 84 | ) 85 | 86 | def get_lr_scheduler(self): 87 | self.scheduler = get_cosine_schedule_with_warmup( 88 | self.optim, 89 | num_warmup_steps=self.config["train"]["num_warmup_steps"] 90 | * self.lr_scheduler_factor, 91 | num_training_steps=self.config["train"]["num_training_steps"] 92 | * self.lr_scheduler_factor, 93 | ) 94 | 95 | def prepare(self): 96 | ( 97 | _, 98 | self.model, 99 | self.optim, 100 | self.scheduler, 101 | ) = self.accelerator.prepare( 102 | self.train_loader, self.raw_model, self.optim, self.scheduler 103 | ) 104 | self.optim.zero_grad() 105 | self.global_step = 0 106 | try: 107 | self.accelerator.load_state(self.work_dir) 108 | self.global_step = self.scheduler.scheduler._step_count - 1 109 | self.global_step = self.global_step // self.accelerator.num_processes 110 | logging.warning("Restored ckpt from {}".format(self.work_dir)) 111 | except: 112 | logging.warning("No ckpt found in {}".format(self.work_dir)) 113 | if self.global_step > 0: 114 | skip_steps = self.global_step * self.gradient_accumulation_steps 115 | logging.warning("Skiped {} steps.".format(skip_steps)) 116 | self.train_loader_skiped = self.accelerator.skip_first_batches( 117 | self.train_loader, num_batches=skip_steps 118 | ) 119 | else: 120 | self.train_loader_skiped = 
self.train_loader 121 | self.accelerator.wait_for_everyone() 122 | 123 | def train_step(self, batch): 124 | out = self.model(**batch) 125 | total_loss = out.loss 126 | losses = {"total_loss": total_loss} 127 | self.accelerator.backward(total_loss) 128 | self.optim.step() 129 | self.scheduler.step() 130 | self.optim.zero_grad() 131 | return losses 132 | 133 | def train(self): 134 | self.get_optimizer() 135 | self.get_lr_scheduler() 136 | self.prepare() 137 | self.start_time = time.time() 138 | self.epoch = 0 139 | self.data_step = 0 140 | while True: 141 | if self.data_step >= self.config["train"]["num_training_steps"]: 142 | break 143 | if self.epoch == 0: 144 | train_loader = self.train_loader_skiped 145 | else: 146 | train_loader = self.train_loader 147 | for batch in train_loader: 148 | # end training 149 | if self.data_step >= self.config["train"]["num_training_steps"]: 150 | break 151 | # data to device 152 | for k, v in batch.items(): 153 | batch[k] = v.to(self.accelerator.device, non_blocking=True) 154 | self.model.train() 155 | # train step 156 | with self.accelerator.accumulate(self.model): 157 | losses = self.train_step(batch) 158 | if self.accelerator.sync_gradients: 159 | self.global_step += 1 160 | # log 161 | if ( 162 | self.data_step % self.log_interval == 0 163 | and self.data_step > 0 164 | and self.accelerator.is_main_process 165 | ): 166 | self.log(losses) 167 | # eval/vis model output 168 | if ( 169 | self.data_step % self.eval_interval == 0 170 | and self.accelerator.is_main_process 171 | and self.train_and_eval 172 | ): 173 | self.eval() 174 | # save state 175 | if self.data_step % self.save_interval == 0 and self.data_step > 0: 176 | self.accelerator.save_state(self.work_dir) 177 | self.data_step += 1 178 | self.epoch += 1 179 | wandb.finish() 180 | 181 | def log(self, losses): 182 | cost_time = time.time() - self.start_time 183 | self.start_time = time.time() 184 | tokens = ( 185 | self.config["train"]["train_batch_size"] 186 | * self.log_interval 187 | * self.config["data"]["seq_length"] 188 | ) 189 | wandb.log({"Training/Token per second per gpu": tokens / cost_time}) 190 | for k, v in losses.items(): 191 | wandb.log({"Losses/{}".format(k): v}) 192 | current_lr = self.optim.param_groups[0]["lr"] 193 | wandb.log({"Training/LR": current_lr}) 194 | if self.optim.scaler is not None: 195 | wandb.log({"Training/Loss Scale": self.optim.scaler.get_scale()}) 196 | wandb.log({"Training/Data Step": self.data_step}) 197 | wandb.log({"Training/Global Step": self.global_step}) 198 | wandb.log({"Training/Epoch": self.epoch}) 199 | self.accelerator.print( 200 | "Epoch: {}, Global Step: {}, Data Step: {}, Loss: {}, Token per second per gpu: {}".format( 201 | self.epoch, 202 | self.global_step, 203 | self.data_step, 204 | losses["total_loss"], 205 | tokens / cost_time, 206 | ) 207 | ) 208 | 209 | def eval(self): 210 | text_table = wandb.Table(columns=["question", "pred"]) 211 | self.model.eval() 212 | with torch.no_grad(): 213 | for data in val_set: 214 | raw_inputs = data 215 | inputs = self.tokenizer( 216 | raw_inputs, 217 | return_tensors="pt", 218 | add_special_tokens=False, 219 | return_attention_mask=False, 220 | ) 221 | input_length = inputs["input_ids"].shape[1] 222 | for k, v in inputs.items(): 223 | inputs[k] = v.to(self.accelerator.device) 224 | pred = self.model.generate( 225 | **inputs, max_new_tokens=256, do_sample=True, repetition_penalty=2.0 226 | ) 227 | pred = pred[0, input_length:] 228 | pred = self.tokenizer.decode(pred.cpu(), skip_special_tokens=True) 229 | 
text_table.add_data(raw_inputs, pred) 230 | wandb.log({"Predictions on {}".format(self.global_step): text_table}) 231 | -------------------------------------------------------------------------------- /train_lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-04-12 19:12:42 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-05-17 22:20:32 6 | FilePath: /Open-Llama/train_lm.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 10 | """ 11 | import yaml 12 | import math 13 | import logging 14 | from absl import app 15 | from absl import flags 16 | from accelerate import Accelerator 17 | from torch.utils.data import DataLoader 18 | from peft import LoraConfig, TaskType, get_peft_model 19 | from datasets.distributed import split_dataset_by_node 20 | from transformers import AutoConfig, AutoModelForCausalLM, LlamaTokenizer 21 | 22 | from dataset.dataset import construct_dataset 23 | from solver.trainer import Trainer 24 | 25 | FLAGS = flags.FLAGS 26 | flags.DEFINE_string("train_config", None, "Training config path") 27 | flags.DEFINE_string( 28 | "model_config", "configs/model_configs/7B.json", "Model config path" 29 | ) 30 | 31 | 32 | def main(argv): 33 | with open(FLAGS.train_config, "r", encoding="utf-8") as fp: 34 | config = yaml.load(fp, Loader=yaml.FullLoader) 35 | 36 | accelerator = Accelerator( 37 | gradient_accumulation_steps=config["train"].get( 38 | "gradient_accumulation_steps", 1 39 | ) 40 | ) 41 | tokenizer = LlamaTokenizer( 42 | config["data"]["tokenizer_model_path"], 43 | pad_token="", 44 | add_bos_token=False, 45 | add_eos_token=True, 46 | ) 47 | data_config = config["data"] 48 | if data_config.get("split_by_shard", False): 49 | train_dataset = construct_dataset( 50 | data_config, tokenizer, world_size=accelerator.num_processes 51 | ) 52 | else: 53 | train_dataset = construct_dataset(data_config, tokenizer) 54 | train_dataset = split_dataset_by_node( 55 | train_dataset, 56 | rank=accelerator.process_index, 57 | world_size=accelerator.num_processes, 58 | ) 59 | train_loader = DataLoader( 60 | train_dataset, 61 | batch_size=config["train"]["train_batch_size"], 62 | num_workers=config["train"]["train_num_workers"], 63 | prefetch_factor=config["train"].get("prefetch_factor", 2), 64 | pin_memory=True, 65 | ) 66 | # smaller initializer_range make training more stable 67 | # add stabel embedding to token embedding 68 | model_config = AutoConfig.from_pretrained(FLAGS.model_config) 69 | # Make the vocab size divisible by 16 70 | # https://huggingface.co/docs/transformers/main_classes/deepspeed#how-to-choose-which-zero-stage-and-offloads-to-use-for-best-performance 71 | # https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/ 72 | # vocab_size = math.ceil(tokenizer.vocab_size / 16) * 16 73 | # logging.warning( 74 | # "Round vocab_size from {} to {}.".format(tokenizer.vocab_size, vocab_size) 75 | # ) 76 | vocab_size = tokenizer.vocab_size 77 | model_config.vocab_size = vocab_size 78 | model_config.pad_token_id = tokenizer.pad_token_id 79 | # 使用AutoModel可以在Deepspeed.zero.Init()下正确的生效,而直接使用如OpenLlamaModel不能正确生效,导致浪费大量内存空间 80 | # https://github.com/huggingface/accelerate/pull/932 81 | if config["train"]["ckpt"] is not None: 82 | raw_model = AutoModelForCausalLM.from_pretrained( 83 | config["train"]["ckpt"], config=model_config 84 | ) 85 | logging.warning("Loaded ckpt from: {}".format(config["train"]["ckpt"])) 86 
| else: 87 | raw_model = AutoModelForCausalLM.from_config(model_config) 88 | # lora 89 | if config["train"].get("use_lora", False): 90 | # gradient ckpt bug, https://github.com/huggingface/transformers/issues/23170 91 | if hasattr(raw_model, "enable_input_require_grads"): 92 | raw_model.enable_input_require_grads() 93 | else: 94 | 95 | def make_inputs_require_grad(module, input, output): 96 | output.requires_grad_(True) 97 | 98 | raw_model.get_input_embeddings().register_forward_hook( 99 | make_inputs_require_grad 100 | ) 101 | peft_config = LoraConfig( 102 | task_type=TaskType.CAUSAL_LM, 103 | target_modules=["q_proj", "v_proj"], 104 | inference_mode=False, 105 | r=1, 106 | lora_alpha=32, 107 | lora_dropout=0.1, 108 | ) 109 | raw_model = get_peft_model(raw_model, peft_config) 110 | raw_model.print_trainable_parameters() 111 | if config["train"].get("gradient_checkpointing_enable", False): 112 | raw_model.gradient_checkpointing_enable() 113 | trainer = Trainer(config, raw_model, train_loader, tokenizer, accelerator) 114 | trainer.train() 115 | 116 | 117 | if __name__ == "__main__": 118 | app.run(main) 119 | -------------------------------------------------------------------------------- /utils/convert_ckpt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-04-28 19:55:13 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-05-06 23:30:29 6 | FilePath: /Open-Llama/utils/convert_ckpt.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 10 | """ 11 | import torch 12 | import sentencepiece as spm 13 | 14 | 15 | sp_model = spm.SentencePieceProcessor( 16 | model_file="configs/tokenizer_models/llama_tokenizer_extended.model" 17 | ) 18 | merged_vocab_size = sp_model.vocab_size() 19 | ckpt = torch.load("data/llama_raw_ckpt/7B/consolidated.00.pth") 20 | 21 | raw_vocab_size, hidden_size = ckpt["tok_embeddings.weight"].shape 22 | extended_tok_embeddings = torch.randn(merged_vocab_size - raw_vocab_size, hidden_size) 23 | extended_tok_embeddings = extended_tok_embeddings * 0.001 24 | ckpt["tok_embeddings.weight"] = torch.cat( 25 | [ckpt["tok_embeddings.weight"], extended_tok_embeddings], dim=0 26 | ) 27 | 28 | extended_out_embeddings = torch.randn(merged_vocab_size - raw_vocab_size, hidden_size) 29 | extended_out_embeddings = extended_out_embeddings * 0.001 30 | ckpt["output.weight"] = torch.cat( 31 | [ckpt["output.weight"], extended_out_embeddings], dim=0 32 | ) 33 | 34 | rename_map = { 35 | "tok_embeddings.weight": "model.embed_tokens.weight", 36 | "norm.weight": "model.norm.weight", 37 | "output.weight": "lm_head.weight", 38 | } 39 | 40 | for f, t in rename_map.items(): 41 | v = ckpt.pop(f) 42 | ckpt[t] = v 43 | 44 | from_names = [ 45 | "layers.{}.attention.wq.weight", 46 | "layers.{}.attention.wk.weight", 47 | "layers.{}.attention.wv.weight", 48 | "layers.{}.attention.wo.weight", 49 | "layers.{}.feed_forward.w1.weight", 50 | "layers.{}.feed_forward.w2.weight", 51 | "layers.{}.feed_forward.w3.weight", 52 | "layers.{}.attention_norm.weight", 53 | "layers.{}.ffn_norm.weight", 54 | "layers.{}.attention.inner_attention.rope.freqs", 55 | ] 56 | 57 | to_names = [ 58 | "model.layers.{}.self_attn.q_proj.weight", 59 | "model.layers.{}.self_attn.k_proj.weight", 60 | "model.layers.{}.self_attn.v_proj.weight", 61 | "model.layers.{}.self_attn.o_proj.weight", 62 | "model.layers.{}.mlp.gate_proj.weight", 63 | "model.layers.{}.mlp.down_proj.weight", 64 | 
"model.layers.{}.mlp.up_proj.weight", 65 | "model.layers.{}.input_layernorm.weight", 66 | "model.layers.{}.post_attention_layernorm.weight", 67 | "model.layers.{}.self_attn.rotary_emb.inv_freq", 68 | ] 69 | 70 | for layer in range(32): 71 | for f, t in zip(from_names, to_names): 72 | f = f.format(layer) 73 | t = t.format(layer) 74 | v = ckpt.pop(f) 75 | ckpt[t] = v 76 | torch.save(ckpt, "data/llama_raw_ckpt/7B/extended.pth") 77 | -------------------------------------------------------------------------------- /utils/merge_tokenizer.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import sentencepiece as spm 3 | from sentencepiece import sentencepiece_model_pb2 as model 4 | 5 | raw_model = model.ModelProto() 6 | raw_model.ParseFromString( 7 | open("configs/tokenizer_models/llama_tokenizer.model", "rb").read() 8 | ) 9 | 10 | exist_pieces = set([p.piece for p in raw_model.pieces]) 11 | cn_model = model.ModelProto() 12 | cn_model.ParseFromString( 13 | open("configs/tokenizer_models/4w_cn_vocab_wudao15.model", "rb").read() 14 | ) 15 | 16 | for p in tqdm(cn_model.pieces, total=len(cn_model.pieces)): 17 | if p.piece not in exist_pieces: 18 | raw_model.pieces.append(p) 19 | 20 | with open("configs/tokenizer_models/llama_tokenizer_extended.model", "wb") as f: 21 | f.write(raw_model.SerializeToString()) 22 | 23 | sp_model = spm.SentencePieceProcessor( 24 | model_file="configs/tokenizer_models/llama_tokenizer_extended.model" 25 | ) 26 | 27 | print("merged vocab size: {}".format(sp_model.vocab_size())) 28 | -------------------------------------------------------------------------------- /utils/speed_test/accelerate/ddp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: MULTI_GPU 4 | fsdp_config: {} 5 | machine_rank: 0 6 | main_process_ip: null 7 | main_process_port: null 8 | main_training_function: main 9 | mixed_precision: bf16 10 | num_machines: 1 11 | num_processes: 2 12 | use_cpu: false -------------------------------------------------------------------------------- /utils/speed_test/accelerate/deepspeed_stage1.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero_stage: 1 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_process_ip: null 13 | main_process_port: null 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 2 18 | use_cpu: false -------------------------------------------------------------------------------- /utils/speed_test/accelerate/deepspeed_stage2.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | fsdp_config: {} 11 | machine_rank: 0 12 | main_process_ip: null 13 | main_process_port: null 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 2 18 | use_cpu: false 
-------------------------------------------------------------------------------- /utils/speed_test/accelerate/deepspeed_stage3.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | fsdp_config: {} 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: null 15 | main_training_function: main 16 | mixed_precision: bf16 17 | num_machines: 1 18 | num_processes: 2 19 | use_cpu: false -------------------------------------------------------------------------------- /utils/speed_test/accelerate/deepspeed_stage3_dynamo.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | fsdp_config: {} 12 | dynamo_config: 13 | dynamo_backend: INDUCTOR 14 | dynamo_mode: default 15 | dynamo_use_dynamic: false 16 | dynamo_use_fullgraph: false 17 | machine_rank: 0 18 | main_process_ip: null 19 | main_process_port: null 20 | main_training_function: main 21 | mixed_precision: bf16 22 | num_machines: 1 23 | num_processes: 2 24 | use_cpu: false -------------------------------------------------------------------------------- /utils/speed_test/accelerate/deepspeed_stage3_offload.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: cpu 6 | offload_param_device: cpu 7 | zero3_init_flag: true 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | fsdp_config: {} 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: null 15 | main_training_function: main 16 | mixed_precision: bf16 17 | num_machines: 1 18 | num_processes: 2 19 | use_cpu: false -------------------------------------------------------------------------------- /utils/speed_test/accelerate/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch_policy: BACKWARD_PRE 8 | fsdp_offload_params: false 9 | fsdp_sharding_strategy: 1 10 | fsdp_state_dict_type: FULL_STATE_DICT 11 | fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer 12 | machine_rank: 0 13 | main_process_ip: null 14 | main_process_port: null 15 | main_training_function: main 16 | mixed_precision: 'bf16' 17 | num_machines: 1 18 | num_processes: 2 19 | use_cpu: false -------------------------------------------------------------------------------- /utils/speed_test/accelerate/megatron.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: {} 3 | distributed_type: MEGATRON_LM 4 | downcast_bf16: 'no' 5 | fsdp_config: {} 6 | machine_rank: 0 7 | main_process_ip: null 8 | 
main_process_port: null 9 | main_training_function: main 10 | megatron_lm_config: 11 | megatron_lm_gradient_clipping: 1.0 12 | megatron_lm_num_micro_batches: 2 13 | megatron_lm_pp_degree: 2 14 | megatron_lm_recompute_activations: true 15 | megatron_lm_sequence_parallelism: true 16 | megatron_lm_tp_degree: 2 17 | megatron_lm_use_distributed_optimizer: true 18 | mixed_precision: bf16 19 | num_machines: 1 20 | num_processes: 4 21 | rdzv_backend: static 22 | same_network: true 23 | use_cpu: false -------------------------------------------------------------------------------- /utils/speed_test/accelerate/run.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-04-08 22:44:44 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-04-08 23:15:57 6 | FilePath: /Open-Llama/speed_test/accelerate/run.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 10 | """ 11 | import time 12 | import torch 13 | from deepspeed.ops.adam import FusedAdam 14 | from accelerate import Accelerator, DistributedType 15 | from transformers import LlamaForCausalLM, LlamaConfig 16 | 17 | batch_size = 32 18 | seq_length = 2048 19 | vocab_size = 32000 20 | total_step = 2 21 | use_activation_ckpt = True 22 | 23 | 24 | class FakeSet(torch.utils.data.Dataset): 25 | def __getitem__(self, idx): 26 | return torch.randint(0, vocab_size, (seq_length,)) 27 | 28 | def __len__(self): 29 | return 1000000000 30 | 31 | 32 | accelerator = Accelerator() 33 | raw_model = LlamaForCausalLM( 34 | LlamaConfig( 35 | vocab_size=vocab_size, 36 | ) 37 | ) 38 | if use_activation_ckpt: 39 | raw_model.gradient_checkpointing_enable() 40 | optimizer = FusedAdam(raw_model.parameters(), lr=1e-5) 41 | 42 | train_loader = torch.utils.data.DataLoader(FakeSet(), batch_size=batch_size) 43 | if accelerator.distributed_type == DistributedType.FSDP: 44 | accelerator.print("FSDP") 45 | model = accelerator.prepare(raw_model) 46 | optimizer, train_loader = accelerator.prepare(optimizer, train_loader) 47 | else: 48 | model, optimizer, train_loader = accelerator.prepare( 49 | raw_model, optimizer, train_loader 50 | ) 51 | 52 | 53 | def train(model, optimizer, train_loader): 54 | start_time = time.time() 55 | for i, batch in enumerate(train_loader): 56 | if i == total_step: 57 | break 58 | optimizer.zero_grad() 59 | out = model(input_ids=batch, labels=batch) 60 | loss = out.loss 61 | accelerator.backward(loss) 62 | optimizer.step() 63 | end_time = time.time() 64 | return end_time - start_time 65 | 66 | 67 | accelerator.print("total time: {}".format(train(model, optimizer, train_loader))) 68 | -------------------------------------------------------------------------------- /utils/speed_test/accelerate/run.sh: -------------------------------------------------------------------------------- 1 | ### 2 | # @Author: s-JoL(sl12160010@gmail.com) 3 | # @Date: 2023-04-08 22:44:27 4 | # @LastEditors: s-JoL(sl12160010@gmail.com) 5 | # @LastEditTime: 2023-04-11 21:58:43 6 | # @FilePath: /Open-Llama/speed_test/accelerate/run.sh 7 | # @Description: 8 | # 9 | # Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 
10 | ### 11 | total_gpu=8 12 | accelerate launch --config_file deepspeed_stage2.yaml --main_process_ip 127.0.0.1 --main_process_port 23335 --num_processes $total_gpu run.py -------------------------------------------------------------------------------- /utils/speed_test/colossal-ai/run.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-04-11 20:07:35 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-04-11 21:56:23 6 | FilePath: /Open-Llama/speed_test/colossal-ai/run.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 10 | """ 11 | import os 12 | from functools import partial 13 | from time import time 14 | 15 | import psutil 16 | import torch 17 | import torch.nn as nn 18 | from transformers import LlamaForCausalLM, LlamaConfig 19 | from utils import get_data, get_profile_context, get_tflops, get_time_stamp 20 | from packaging import version 21 | from torch.nn.parallel import DistributedDataParallel as DDP 22 | 23 | import colossalai 24 | from colossalai.logging import disable_existing_loggers, get_dist_logger 25 | from colossalai.nn.optimizer import HybridAdam 26 | from colossalai.tensor import ( 27 | ColoParameter, 28 | ComputePattern, 29 | ComputeSpec, 30 | ProcessGroup, 31 | ReplicaSpec, 32 | ShardSpec, 33 | ) 34 | from colossalai.utils import get_current_device 35 | from colossalai.zero import ColoInitContext, zero_model_wrapper, zero_optim_wrapper 36 | 37 | CAI_VERSION = colossalai.__version__ 38 | 39 | 40 | def parse_args(): 41 | parser = colossalai.get_default_parser() 42 | parser.add_argument( 43 | "--distplan", 44 | type=str, 45 | default="CAI_Gemini", 46 | help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].", 47 | ) 48 | parser.add_argument( 49 | "--tp_degree", 50 | type=int, 51 | default=1, 52 | help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.", 53 | ) 54 | parser.add_argument( 55 | "--placement", 56 | type=str, 57 | default="cpu", 58 | help="Placement Policy for Gemini. Valid when using colossalai as dist plan.", 59 | ) 60 | parser.add_argument( 61 | "--shardinit", 62 | action="store_true", 63 | help="Shard the tensors when init the model to shrink peak memory size on the assigned device. 
Valid when using colossalai as dist plan.", 64 | ) 65 | parser.add_argument( 66 | "--batch_size", 67 | type=int, 68 | default=8, 69 | help="batch size per DP group of training.", 70 | ) 71 | parser.add_argument( 72 | "--model_type", 73 | type=str, 74 | default="Llama-7B", 75 | help="model model scale", 76 | ) 77 | parser.add_argument( 78 | "--train_step", 79 | type=int, 80 | default=10, 81 | help="training iterations for test", 82 | ) 83 | 84 | args = parser.parse_args() 85 | return args 86 | 87 | 88 | def model_builder(VOCAB_SIZE, checkpoint=False): 89 | raw_model = LlamaForCausalLM( 90 | LlamaConfig( 91 | vocab_size=VOCAB_SIZE, 92 | ) 93 | ) 94 | if checkpoint: 95 | raw_model.gradient_checkpointing_enable() 96 | return raw_model 97 | 98 | 99 | # Parameter Sharding Strategies for Tensor Parallelism 100 | def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup): 101 | spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D)) 102 | param.set_tensor_spec(*spec) 103 | 104 | 105 | def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup): 106 | split_param_single_dim_tp1d(0, param, pg) 107 | 108 | 109 | def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup): 110 | split_param_single_dim_tp1d(-1, param, pg) 111 | 112 | 113 | class GPTLMLoss(nn.Module): 114 | def __init__(self): 115 | super().__init__() 116 | self.loss_fn = nn.CrossEntropyLoss() 117 | 118 | def forward(self, logits, labels): 119 | shift_logits = logits[..., :-1, :].contiguous() 120 | shift_labels = labels[..., 1:].contiguous() 121 | # Flatten the tokens 122 | return self.loss_fn( 123 | shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1) 124 | ) 125 | 126 | 127 | def get_cpu_mem(): 128 | return psutil.Process().memory_info().rss / 1024**2 129 | 130 | 131 | def get_gpu_mem(): 132 | return torch.cuda.memory_allocated() / 1024**2 133 | 134 | 135 | def get_mem_info(prefix=""): 136 | return f"{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB" 137 | 138 | 139 | def get_model_size(model: nn.Module): 140 | total_numel = 0 141 | for module in model.modules(): 142 | for p in module.parameters(recurse=False): 143 | total_numel += p.numel() 144 | return total_numel 145 | 146 | 147 | def model_size_formatter(numel: int) -> str: 148 | GB_SIZE = 10**9 149 | MB_SIZE = 10**6 150 | KB_SIZE = 10**3 151 | if numel >= GB_SIZE: 152 | return f"{numel / GB_SIZE:.1f}B" 153 | elif numel >= MB_SIZE: 154 | return f"{numel / MB_SIZE:.1f}M" 155 | elif numel >= KB_SIZE: 156 | return f"{numel / KB_SIZE:.1f}K" 157 | else: 158 | return str(numel) 159 | 160 | 161 | def set_cpu_maximum_parallelism(): 162 | conf_str = torch.__config__.parallel_info() 163 | inter_str = conf_str.split("hardware_concurrency() : ")[1] 164 | max_concurrency = inter_str.split("\n")[0] 165 | os.environ["OMP_NUM_THREADS"] = max_concurrency 166 | print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.") 167 | 168 | 169 | # Tensor Parallel 170 | def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup): 171 | """tensor_parallelize 172 | Sharding the Model Parameters. 
173 | 174 | Args: 175 | model (torch.nn.Module): a torch module to be sharded 176 | """ 177 | for mn, module in model.named_modules(): 178 | for pn, param in module.named_parameters(recurse=False): 179 | # NOTE() a param maybe shared by two modules 180 | if hasattr(param, "visited"): 181 | continue 182 | 183 | # if shard init, then convert param to replica and use the dp-only ProcessGroup 184 | param: ColoParameter = param 185 | param.set_dist_spec(ReplicaSpec()) 186 | param.set_process_group(pg) 187 | 188 | # shard it w.r.t tp pattern 189 | if "mlp.c_fc" in mn: 190 | if "weight" in pn or "bias" in pn: 191 | split_param_col_tp1d(param, pg) # colmn slice 192 | # keep the shape of the output from c_fc 193 | param.compute_spec.set_output_replicate(False) 194 | else: 195 | param.set_dist_spec(ReplicaSpec()) 196 | elif "mlp.c_proj" in mn: 197 | if "weight" in pn: 198 | split_param_row_tp1d(param, pg) # row slice 199 | else: 200 | param.set_dist_spec(ReplicaSpec()) 201 | elif "wte" in mn or "wpe" in mn: 202 | split_param_col_tp1d(param, pg) # colmn slice 203 | elif "c_attn" in mn or "c_proj" in mn: 204 | split_param_col_tp1d(param, pg) # colmn slice 205 | else: 206 | param.set_dist_spec(ReplicaSpec()) 207 | param.visited = True 208 | 209 | 210 | def main(): 211 | # version check 212 | # this example is supposed to work for versions greater than 0.2.0 213 | assert version.parse(CAI_VERSION) >= version.parse("0.2.0") 214 | 215 | set_cpu_maximum_parallelism() 216 | args = parse_args() 217 | 218 | # if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]: 219 | if args.distplan not in [ 220 | "CAI_ZeRO1", 221 | "CAI_ZeRO2", 222 | "CAI_Gemini", 223 | "Pytorch_DDP", 224 | "Pytorch_ZeRO", 225 | ]: 226 | raise TypeError(f"{args.distplan} is error") 227 | 228 | # batch size per DP degree 229 | BATCH_SIZE = args.batch_size 230 | SEQ_LEN = 2048 231 | VOCAB_SIZE = 32000 232 | 233 | NUM_STEPS = args.train_step 234 | 235 | WARMUP_STEPS = 1 236 | assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps" 237 | assert ( 238 | NUM_STEPS - WARMUP_STEPS 239 | ) % 2 == 1, "the number of valid steps should be odd to take the median" 240 | PROF_FLAG = False # The flag of profiling, False by default 241 | 242 | disable_existing_loggers() 243 | colossalai.launch_from_torch(config={}) 244 | 245 | logger = get_dist_logger() 246 | logger.info( 247 | f"{args.model_type}, {args.distplan}, batch size {BATCH_SIZE}", ranks=[0] 248 | ) 249 | 250 | # build criterion 251 | criterion = GPTLMLoss() 252 | 253 | torch.manual_seed(123) 254 | if args.distplan.startswith("CAI"): 255 | # all param must use the same process group. 
256 | world_size = torch.distributed.get_world_size() 257 | shard_pg = ProcessGroup(tp_degree=world_size) if args.shardinit else None 258 | default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None 259 | 260 | if args.shardinit and args.distplan != "CAI_Gemini": 261 | raise RuntimeError("You can only use shardinit with CAI_Gemini") 262 | 263 | # build the Llama model 264 | with ColoInitContext( 265 | device=get_current_device(), 266 | dtype=torch.half, 267 | default_dist_spec=default_dist_spec, 268 | default_pg=shard_pg, 269 | ): 270 | model = model_builder(VOCAB_SIZE, checkpoint=True) 271 | 272 | tp_pg = ProcessGroup(tp_degree=args.tp_degree) 273 | # Tensor Parallelism (TP) 274 | # note that Colossal-AI v0.1.10 is not compatible with TP degree > 1 275 | if args.tp_degree > 1: 276 | tensor_parallelize(model, tp_pg) 277 | 278 | # assign running configurations 279 | gemini_config = None 280 | if args.distplan.startswith("CAI_ZeRO"): 281 | optim_config = dict( 282 | reduce_bucket_size=12 * 1024 * 1024, 283 | overlap_communication=True, 284 | verbose=True, 285 | ) 286 | elif args.distplan == "CAI_Gemini": 287 | gemini_config = dict( 288 | strict_ddp_mode=args.tp_degree == 1, 289 | device=get_current_device(), 290 | placement_policy=args.placement, 291 | pin_memory=True, 292 | hidden_dim=model.model.config.hidden_size, 293 | search_range_mb=128, 294 | ) 295 | optim_config = dict(gpu_margin_mem_ratio=0.0) 296 | else: 297 | raise RuntimeError 298 | 299 | # build a highly optimized GPU/CPU optimizer 300 | optimizer = HybridAdam(model.parameters(), lr=1e-3) 301 | 302 | if args.distplan == "CAI_ZeRO1": 303 | zero_stage = 1 304 | elif args.distplan == "CAI_ZeRO2": 305 | zero_stage = 2 306 | elif args.distplan == "CAI_Gemini": 307 | zero_stage = 3 308 | else: 309 | raise RuntimeError 310 | 311 | # wrap your model and optimizer 312 | model = zero_model_wrapper(model, zero_stage, gemini_config) 313 | optimizer = zero_optim_wrapper(model, optimizer, optim_config=optim_config) 314 | 315 | logger.info(get_mem_info(prefix="After init optim, "), ranks=[0]) 316 | elif args.distplan.startswith("Pytorch"): 317 | assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples." 
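# PyTorch baselines: DDP keeps a full model replica and full optimizer state on every rank,
# while the ZeRO variant below additionally shards optimizer states across ranks via ZeroRedundancyOptimizer.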
318 | model = model_builder(VOCAB_SIZE, checkpoint=True).cuda() 319 | model = DDP(model) 320 | if args.distplan.endswith("DDP"): 321 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) 322 | elif args.distplan.endswith("ZeRO"): 323 | from torch.distributed.optim import ZeroRedundancyOptimizer 324 | 325 | optimizer = ZeroRedundancyOptimizer( 326 | model.parameters(), optimizer_class=torch.optim.Adam, lr=1e-3 327 | ) 328 | else: 329 | raise RuntimeError 330 | 331 | # model is sharded after TP 332 | numel = get_model_size(model) 333 | logger.info(f"the size of the testing model is {model_size_formatter(numel)}.") 334 | logger.info(get_mem_info(prefix="After init model, "), ranks=[0]) 335 | 336 | # Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu 337 | # = (batch_per_DP_group * dp_degree) * (numel * tp_degree) * seq_len * 8 / (tp_degree * dp_degree) 338 | # = batch_per_DP_group * numel * seq_len * 8 (factor 8 = 2 fwd + 4 bwd + 2 forward recompute from activation checkpointing) 339 | get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN) 340 | 341 | torch.cuda.synchronize() 342 | model.train() 343 | tflops_list = [] 344 | 345 | def train_step(): 346 | # we just use randomly generated data here 347 | input_ids, attn_mask = get_data(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE) 348 | optimizer.zero_grad() 349 | 350 | start = time() 351 | outputs = model(input_ids, attn_mask)[0] 352 | loss = criterion(outputs, input_ids) 353 | torch.cuda.synchronize() 354 | fwd_end = time() 355 | fwd_time = fwd_end - start 356 | logger.info(get_mem_info(prefix=f"[{n + 1}/{NUM_STEPS}] Forward "), ranks=[0]) 357 | 358 | if args.distplan.startswith("CAI"): 359 | optimizer.backward(loss) 360 | elif args.distplan.startswith("Pytorch"): 361 | loss.backward() 362 | else: 363 | raise RuntimeError 364 | 365 | torch.cuda.synchronize() 366 | bwd_end = time() 367 | bwd_time = bwd_end - fwd_end 368 | logger.info(get_mem_info(prefix=f"[{n + 1}/{NUM_STEPS}] Backward "), ranks=[0]) 369 | 370 | optimizer.step() 371 | torch.cuda.synchronize() 372 | optim_time = time() - bwd_end 373 | step_time = time() - start 374 | logger.info( 375 | get_mem_info(prefix=f"[{n + 1}/{NUM_STEPS}] Optimizer step "), ranks=[0] 376 | ) 377 | 378 | step_tflops = get_tflops_func(step_time) 379 | logger.info( 380 | f"[{n + 1}/{NUM_STEPS}] Loss:{loss.item():.3f}, Step time: {step_time:.3f}s, TFLOPS: {get_tflops_func(step_time):.3f}, FWD time: {fwd_time:.3f}s, BWD time: {bwd_time:.3f}s, OPTIM time: {optim_time:.3f}s", 381 | ranks=[0], 382 | ) 383 | if n >= WARMUP_STEPS: 384 | tflops_list.append(step_tflops) 385 | 386 | demo_profiler = get_profile_context( 387 | PROF_FLAG, 388 | WARMUP_STEPS, 389 | NUM_STEPS - WARMUP_STEPS, 390 | save_dir=f"profile/{get_time_stamp()}-demo", 391 | ) 392 | 393 | with demo_profiler as prof: 394 | start_time = time() 395 | for n in range(NUM_STEPS): 396 | train_step() 397 | prof.step() 398 | end_time = time() 399 | print("total time: {}".format(end_time - start_time)) 400 | 401 | tflops_list.sort() 402 | median_index = (NUM_STEPS - WARMUP_STEPS) >> 1 # tflops_list only holds post-warmup steps 403 | logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}") 404 | torch.cuda.synchronize() 405 | 406 | 407 | if __name__ == "__main__": 408 | main() 409 | -------------------------------------------------------------------------------- /utils/speed_test/colossal-ai/run.sh: -------------------------------------------------------------------------------- 1 | set -x 2 | # distplan in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"] 3 | export DISTPLAN=${DISTPLAN:-"CAI_Gemini"} 4 | 5 | # The 
following options are only valid when DISTPLAN is a Colossal-AI plan (CAI_*) 6 | export GPUNUM=${GPUNUM:-8} 7 | export TPDEGREE=${TPDEGREE:-1} 8 | export PLACEMENT=${PLACEMENT:-"auto"} 9 | export USE_SHARD_INIT=${USE_SHARD_INIT:-True} 10 | export BATCH_SIZE=${BATCH_SIZE:-40} 11 | export MODEL_TYPE=${MODEL_TYPE:-"Llama-7B"} 12 | export TRAIN_STEP=${TRAIN_STEP:-10} 13 | # export PYTHONPATH=$PWD:$PYTHONPATH 14 | 15 | if [ ${USE_SHARD_INIT} = "True" ]; then 16 | USE_SHARD_INIT="--shardinit" 17 | else 18 | USE_SHARD_INIT="" 19 | fi 20 | 21 | mkdir -p gemini_logs 22 | 23 | torchrun --nproc_per_node=${GPUNUM} --rdzv_endpoint=127.0.0.1:23335 run.py \ 24 | --tp_degree=${TPDEGREE} \ 25 | --model_type=${MODEL_TYPE} \ 26 | --batch_size=${BATCH_SIZE} \ 27 | --placement=${PLACEMENT} \ 28 | ${USE_SHARD_INIT} \ 29 | --distplan=${DISTPLAN} \ 30 | --train_step=${TRAIN_STEP} \ 31 | 2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log 32 | -------------------------------------------------------------------------------- /utils/speed_test/colossal-ai/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | from contextlib import nullcontext 3 | 4 | import torch 5 | from torch.profiler import ( 6 | ProfilerActivity, 7 | profile, 8 | schedule, 9 | tensorboard_trace_handler, 10 | ) 11 | 12 | 13 | class DummyProfiler: 14 | def __init__(self): 15 | self.step_number = 0 16 | 17 | def step(self): 18 | self.step_number += 1 19 | 20 | 21 | # Randomly Generated Data 22 | def get_data(batch_size, seq_len, vocab_size): 23 | input_ids = torch.randint( 24 | 0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device() 25 | ) 26 | attention_mask = torch.ones_like(input_ids) 27 | return input_ids, attention_mask 28 | 29 | 30 | def get_tflops(model_numel, batch_size, seq_len, step_time): 31 | return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12) 32 | 33 | 34 | def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir): 35 | if enable_flag: 36 | return profile( 37 | activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], 38 | schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps), 39 | on_trace_ready=tensorboard_trace_handler(save_dir), 40 | record_shapes=True, 41 | profile_memory=True, 42 | ) 43 | else: 44 | return nullcontext(DummyProfiler()) 45 | 46 | 47 | def get_time_stamp(): 48 | cur_time = time.strftime("%d-%H:%M", time.localtime()) 49 | return cur_time 50 | -------------------------------------------------------------------------------- /utils/speed_test/lightning/run.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-04-11 20:07:35 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-04-11 21:56:07 6 | FilePath: /Open-Llama/speed_test/lightning/run.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 
10 | """ 11 | import time 12 | import torch 13 | import lightning.pytorch as pl 14 | from deepspeed.ops.adam import FusedAdam 15 | from transformers import LlamaForCausalLM, LlamaConfig 16 | from lightning.pytorch.strategies import DeepSpeedStrategy 17 | 18 | 19 | batch_size = 2 20 | seq_length = 2048 21 | vocab_size = 32000 22 | total_step = 100 23 | use_activation_ckpt = False 24 | 25 | 26 | class FakeSet(torch.utils.data.Dataset): 27 | def __getitem__(self, idx): 28 | return torch.randint(0, vocab_size, (seq_length,)) 29 | 30 | def __len__(self): 31 | return 1000000000 32 | 33 | 34 | class SpeedTest(pl.LightningModule): 35 | def __init__(self): 36 | super().__init__() 37 | self.model = LlamaForCausalLM( 38 | LlamaConfig( 39 | vocab_size=vocab_size, 40 | ) 41 | ) 42 | if use_activation_ckpt: 43 | self.model.gradient_checkpointing_enable() 44 | self.start_time = None 45 | 46 | def training_step(self, batch, batch_idx): 47 | out = self.model(batch, labels=batch) 48 | loss = out.loss 49 | if self.start_time is None: 50 | print("start") 51 | self.start_time = time.time() 52 | return loss 53 | 54 | def configure_optimizers(self): 55 | optimizer = FusedAdam(self.trainer.model.parameters(), lr=1e-5) 56 | return optimizer 57 | 58 | 59 | model = SpeedTest() 60 | train_loader = torch.utils.data.DataLoader(FakeSet(), batch_size=batch_size) 61 | 62 | strategy = DeepSpeedStrategy( 63 | stage=2, 64 | offload_optimizer=False, 65 | offload_parameters=False, 66 | process_group_backend="nccl", 67 | ) 68 | trainer = pl.Trainer( 69 | limit_train_batches=total_step, 70 | max_epochs=1, 71 | devices=8, 72 | accelerator="gpu", 73 | strategy=strategy, 74 | precision=16, 75 | enable_checkpointing=False, 76 | ) 77 | 78 | 79 | def train(model, train_loader): 80 | start_time = time.time() 81 | trainer.fit(model=model, train_dataloaders=train_loader) 82 | end_time = time.time() 83 | return end_time - model.start_time 84 | 85 | 86 | print("total time: {}".format(train(model, train_loader))) 87 | -------------------------------------------------------------------------------- /utils/train_tokenizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: s-JoL(sl12160010@gmail.com) 3 | Date: 2023-03-24 20:49:03 4 | LastEditors: s-JoL(sl12160010@gmail.com) 5 | LastEditTime: 2023-05-06 23:34:14 6 | FilePath: /Open-Llama/utils/train_tokenizer.py 7 | Description: 8 | 9 | Copyright (c) 2023 by s-JoL(sl12160010@gmail.com), All Rights Reserved. 10 | """ 11 | import random 12 | from glob import glob 13 | from datasets import load_dataset 14 | 15 | 16 | random.seed(42) 17 | 18 | wudao_pattern = "data/pretrain_data/part-wudao-*.jsonl.zst" 19 | wudao_paths = glob(wudao_pattern) 20 | random.shuffle(wudao_paths) 21 | 22 | pile_pattern = "data/pretrain_data/part-pile-*.jsonl.zst" 23 | pile_paths = glob(pile_pattern) 24 | random.shuffle(pile_paths) 25 | 26 | paths = wudao_paths[:5] + pile_paths[:10] 27 | 28 | dataset = load_dataset("json", data_files=paths, split="train", streaming=True) 29 | dataset = dataset.shuffle(seed=42) 30 | 31 | 32 | def transform(dataset): 33 | for line in dataset: 34 | if "title" in line and "content" in line: 35 | yield line["title"] + "\n" + line["content"] 36 | else: 37 | yield line["text"] 38 | 39 | 40 | data_iter = transform(dataset) 41 | 42 | import io 43 | import sentencepiece as spm 44 | 45 | # Loads model from URL as iterator and stores the model to BytesIO. 
46 | model = io.BytesIO() 47 | spm.SentencePieceTrainer.train( 48 | sentence_iterator=data_iter, 49 | model_writer=model, 50 | shuffle_input_sentence=False, 51 | train_extremely_large_corpus=True, 52 | # tokenizer hyperparameters 53 | max_sentence_length=16384, 54 | pad_id=3, 55 | model_type="BPE", 56 | vocab_size=100000, 57 | # split digits and fall back to bytes, the same as Llama. 58 | # set split_by_unicode_script to True to avoid grouping punctuation and characters together. 59 | split_digits=True, 60 | split_by_unicode_script=True, 61 | byte_fallback=True, 62 | # preserve whitespace, \n, \t, etc. for code generation 63 | allow_whitespace_only_pieces=True, 64 | remove_extra_whitespaces=False, 65 | # Llama uses identity normalization instead of nfkc 66 | normalization_rule_name="nfkc", 67 | ) 68 | 69 | # Serialize the model to a file. 70 | with open("configs/tokenizer_models/10w_vocab_wudao5_pile10.model", "wb") as f: 71 | f.write(model.getvalue()) 72 | 73 | # Load the processor directly from the serialized model. 74 | sp = spm.SentencePieceProcessor(model_proto=model.getvalue()) 75 | print(sp.decode(sp.encode("只因你太美🤗▃ \n 1"))) 76 | --------------------------------------------------------------------------------