├── .gitignore ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── README_zh.md ├── benchmark.py ├── configs ├── model_glm_130b.sh ├── model_glm_130b_int4.sh ├── model_glm_130b_int8.sh └── model_glm_130b_v100.sh ├── cuda ├── Makefile └── quantization.cu ├── docs ├── evaluate-your-own-tasks.md ├── inference-with-fastertransformer.md ├── low-resource-inference.md ├── media │ └── 16613396005977.jpg └── quantization.md ├── evaluate.py ├── evaluation ├── __init__.py ├── configs.py ├── dataset.py ├── metrics.py ├── model.py ├── tasks.py └── utils.py ├── generate.py ├── generation ├── __init__.py └── strategies.py ├── initialize.py ├── kernels ├── __init__.py └── quantization.fatbin ├── logs ├── README.md ├── main-log-en.md └── main-log.md ├── quantization ├── __init__.py ├── functional.py └── layers.py ├── requirements.txt ├── resources ├── 03DF31017FE184DB45D41DFFC6F80EF0.png ├── 33872E48D3539EA132B74BCF5EFF458F.png ├── 49BF334CB352BAA19F7D55460B1DBCA9.gif ├── 7CB441707D1035B2890AA2164C5B6EAC.png ├── 7D6433A42D189E2E6FBC62BE066BCE91.png ├── 849024E93FA85347F7F6443932911922.png ├── AE18F14396E2D22BC0BC8DD77EFD3414.png ├── E42321373D22DE198231279B5856BB42.png ├── F48B69263360688CCA21E915F4B1A98B.png ├── WECHAT.md ├── multitask_list.txt └── wechat.jpg ├── scripts ├── benchmark.sh ├── evaluate.sh ├── evaluate_multiple_node.sh └── generate.sh ├── tasks ├── bloom │ ├── glue_cola.yaml │ ├── glue_mnli.yaml │ ├── glue_qnli.yaml │ ├── glue_wnli.yaml │ ├── math_qa.yaml │ ├── mc_taco.yaml │ ├── openbook_qa.yaml │ ├── pubmed_qa.yaml │ ├── superglue_axb.yaml │ └── superglue_axg.yaml ├── chinese │ ├── clue │ │ ├── afqmc.yaml │ │ ├── c3.yaml │ │ ├── cluewsc.yaml │ │ ├── cmnli.yaml │ │ ├── cmrc2018.yaml │ │ ├── csl.yaml │ │ ├── drcd.yaml │ │ └── ocnli.yaml │ └── fewclue │ │ ├── bustm.yaml │ │ ├── chidf.yaml │ │ ├── cluewscf.yaml │ │ ├── cslf.yaml │ │ ├── eprstmt.yaml │ │ └── ocnlif.yaml ├── ethnic │ ├── crows-pair │ │ ├── crows-pair.yaml │ │ └── tasks.py │ ├── ethos │ │ ├── ethos-fewshot-multi.yaml │ │ ├── ethos-fewshot-single.yaml │ │ ├── ethos-oneshot.yaml │ │ └── ethos-zeroshot.yaml │ └── stereoset │ │ ├── stereoset.yaml │ │ └── tasks.py ├── lambada │ ├── lambada-unidirectional.yaml │ ├── lambada.yaml │ ├── strategy.py │ └── task.py ├── language-modeling │ ├── pile.py │ ├── pile.yaml │ ├── ptb.yaml │ ├── wikitext-103.yaml │ └── wikitext-2.yaml └── mmlu │ ├── mmlu.yaml │ └── task.py └── tools ├── __init__.py ├── convert_tp.py └── tokenize_pile.py /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | __pycache__ 3 | samples 4 | .DS_Store 5 | .idea 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright Aohan Zeng 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MODEL_LICENSE: -------------------------------------------------------------------------------- 1 | The GLM-130B License 2 | 3 | 1. Definitions 4 | 5 | “Licensor” means the GLM-130B Model Team that distributes its Software. 6 | 7 | “Software” means the GLM-130B model parameters made available under this license. 8 | 9 | 2. License Grant 10 | 11 | Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 14 | 15 | 3. Restriction 16 | 17 | You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. 18 | 19 | You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. 20 | 21 | 4. Disclaimer 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | 25 | 5. Limitation of Liability 26 | 27 | EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 28 | 29 | 6. Dispute Resolution 30 | 31 | This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. 32 | 33 | Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 🌐 Blog • ⏬ Download Model • 🪧 Demo • ✉️ Email • 📃 Paper [ICLR 2023]
5 |

6 | 7 |

8 | 💬 Google Group (Updates) or Wechat Group or Slack channel (Discussions) 9 |

10 | 11 | # GLM-130B: An Open Bilingual Pre-Trained Model 12 | 13 | GLM-130B is an open bilingual (English & Chinese) bidirectional dense model with 130 billion parameters, pre-trained using the algorithm of [General Language Model (GLM)](https://aclanthology.org/2022.acl-long.26). It is designed to support inference with the full 130B parameters on **a single A100 (40G * 8)** or **V100 (32G * 8) server**. With INT4 quantization, the hardware requirements can be further reduced to **a single server with 4 * RTX 3090 (24G)** with **almost no performance degradation**. As of July 3rd, 2022, GLM-130B has been trained on over 400 billion text tokens (200B each for Chinese and English), and it has the following unique features: 14 | 15 | - **Bilingual:** supports both English and Chinese. 16 | - **Performance (EN):** better than GPT-3 175B (+4.0%), OPT-175B (+5.5%), and BLOOM-176B (+13.0%) on LAMBADA and slightly better than GPT-3 175B (+0.9%) on MMLU. 17 | - **Performance (CN):** significantly better than ERNIE TITAN 3.0 260B on 7 zero-shot CLUE datasets (+24.26%) and 5 zero-shot FewCLUE datasets (+12.75%). 18 | - **Fast Inference:** supports fast inference on both [SAT](https://github.com/THUDM/SwissArmyTransformer) and [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) (up to 2.5X faster) with a single A100 server. 19 | - **Reproducibility:** all results (30+ tasks) can be easily reproduced with open-sourced code and model checkpoints. 20 | - **Cross-Platform:** supports training and inference on NVIDIA, Hygon DCU, Ascend 910, and Sunway (will be released soon). 21 | 22 | This repository mainly focuses on the evaluation of GLM-130B. If you find our work and our open-sourced efforts useful, please give us a ⭐️ to encourage our future development! :) 23 | 24 | ## News 25 | - **[2023.06.25]** We released [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), an updated version of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), which introduces **Stronger Performance** (MMLU (+23%), CEval (+33%), GSM8K (+571%), BBH (+60%)), **Longer Context** (from 2K in ChatGLM-6B to 32K, and trained with a context length of 8K during the dialogue alignment), and **More Efficient Inference** (speeds up by 42% under the official implementation; the dialogue length supported by 6G GPU memory has increased from 1K to 8K). For more details, please refer to [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B). 26 | - **[2023.06.14]** We released [WebGLM](https://github.com/THUDM/WebGLM), which enables efficient and accurate web-enhanced question answering. All code and data are released! 27 | - **[2023.03.14]** We are happy to introduce [ChatGLM](https://chatglm.cn/blog), a bilingual dialogue language model based on GLM-130B, and its open-sourced version [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), which can be run under only **6GB** of GPU memory! 28 | - **[2023.01.21]** GLM-130B has been accepted to [ICLR 2023](https://iclr.cc/Conferences/2023)! 29 | - **[2022.10.06]** Our [paper](http://arxiv.org/abs/2210.02414) for GLM-130B is out! 30 | - **[2022.08.24]** We are proud to publish the quantized version of GLM-130B. While preserving the activation precision as FP16, the model weights can be quantized to as low as **INT4 with almost no degradation of performance**, further reducing the hardware requirements of GLM-130B to **a single server with 4 * RTX 3090 (24G)**! See [Quantization of GLM-130B](docs/quantization.md) for details. 
31 | 32 | For smaller models, please find [monolingual GLMs](https://github.com/THUDM/GLM) (English: 10B/2B/515M/410M/335M/110M, Chinese: 10B/335M) and a [1B multilingual GLM](https://github.com/THUDM/Multilingual-GLM) (104 languages). 33 | 34 | ## Getting Started 35 | 36 | ### Environment Setup 37 | 38 | #### Hardware 39 | 40 | | **Hardware** | **GPU Memory** | **Quantization** | **Weight Offload** | 41 | | --------------- | -------------- | ---------------- | ------------------ | 42 | | 8 * A100 | 40 GB | No | No | 43 | | 8 * V100 | 32 GB | No | Yes (BMInf) | 44 | | 8 * V100 | 32 GB | INT8 | No | 45 | | 8 * RTX 3090 | 24 GB | INT8 | No | 46 | | 4 * RTX 3090 | 24 GB | INT4 | No | 47 | | 8 * RTX 2080 Ti | 11 GB | INT4 | No | 48 | 49 | It is recommended to use an A100 (40G * 8) server, as all reported GLM-130B evaluation results (~30 tasks) can be easily reproduced with a single A100 server in about half a day. With INT8/INT4 quantization, efficient inference on **a single server with 4 * RTX 3090 (24G)** is possible; see [Quantization of GLM-130B](docs/quantization.md) for details. By combining quantization and weight-offloading techniques, GLM-130B can also run inference on servers with even less GPU memory; see [Low-Resource Inference](docs/low-resource-inference.md) for details. 50 | 51 | #### Software 52 | 53 | The GLM-130B code is built on top of [SAT](https://github.com/THUDM/SwissArmyTransformer). We recommend using [Miniconda](https://docs.conda.io/en/latest/miniconda.html) to manage your environment and installing additional dependencies via `pip install -r requirements.txt`. Here are the recommended environment configurations: 54 | 55 | - Python 3.9+ / CUDA 11+ / PyTorch 1.10+ / DeepSpeed 0.6+ / Apex (**installation with CUDA and C++ extensions is required, see [here](https://github.com/NVIDIA/apex/#linux)**) 56 | - SwissArmyTransformer>=0.2.11 is required for quantization 57 | 58 | #### Model weights 59 | 60 | Download the GLM-130B model checkpoint from [here](https://docs.google.com/forms/d/e/1FAIpQLSehr5Dh_i3TwACmFFi8QEgIVNYGmSPwV0GueIcsUev0NEfUug/viewform?usp=sf_link), make sure all 60 chunks are downloaded completely, then use the following command to merge them into a single archive file and extract it: 61 | 62 | ```bash 63 | cat glm-130b-sat.tar.part_* > glm-130b-sat.tar 64 | tar xvf glm-130b-sat.tar 65 | ``` 66 | 67 | Set `CHECKPOINT_PATH` in `configs/model_glm_130b.sh` to the path of the extracted folder. Since the checkpoint file is up to 260G, it is recommended to use an SSD or RAM disk to reduce the checkpoint loading time. Since the checkpoint we distribute is in 8-way tensor parallel form, a conversion script is also provided if you need to change the tensor parallel dimension. 68 | 69 | ```bash 70 | python tools/convert_tp.py \ 71 | --input-folder <SRC_CKPT_PATH> \ 72 | --output-folder <DST_CKPT_PATH> \ 73 | --target-tp <TARGET_TP> 74 | ``` 75 | 76 | ### Left-To-Right Generation / Blank Filling 77 | 78 | ```bash 79 | bash scripts/generate.sh --input-source interactive 80 | ``` 81 | 82 | You can also specify an input file by `--input-source input.txt`. 83 | 84 | GLM-130B uses two different mask tokens: `[MASK]` for short blank filling and `[gMASK]` for left-to-right long text generation. When the input does not contain any MASK token, `[gMASK]` will be automatically appended to the end of the text. 85 | 86 | 
87 | Left-To-Right Generation Examples 88 | 89 | #### Example 1 90 | Input: Who is the greatest artist? The greatest artist is [gMASK] 91 | 92 | Output: Who is the greatest artist? The greatest artist is not the one who paints the most beautiful picture, or writes the best poem, or plays the best symphony. The greatest artist is the one whose work has the greatest impact on the lives of others. The Greatest Artist is the artist who inspires us to be our best selves. The Great Artist is not the artist whose work is flawless. The great artist is not one who is perfect. The true great artist has the ability to make us feel something. The artist who makes us feel is the great artist. The art of the great artists is to make you feel something that you have never felt before. The greatness of an artist is in the ability of their work to touch the lives and hearts of others, and to make them feel something they have never experienced before. This is the true greatness of the artist. 2. The Significance of the Great Artist The significance of the greatest artist cannot be overstated. The significance and impact of the art of a great artist cannot even be measured. The impact of a single work of art can change the course of history. A single work can change a person’s life. A work of great art can make you laugh, make you cry, make your heart race, or make you think. A great work of artistic expression has the power to change the way you look at the world. The work of a truly great artist can change your life forever. 3. The Impact of the Greatest Artist The greatest artists have the ability and the power of their art to change people’s lives. They have the power and the ability, through their art, to make people feel something new. The power of the work of the truly great artists cannot be measured, but it can be felt. The greater the impact of an artists work, the greater the power it has to change lives. 4. The Influence of the Art of the Artist The art and the work created by the greatest artists has the potential to change a life. The influence of the artists work cannot be quantified, but is felt by those who experience it. The truly great work has an ability to touch people’ lives in a way that no other work can. 93 | 94 | #### Example 2 (Chinese) 95 | Input: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] 96 | 97 | Output: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:避寒的话,建议去海南三亚。 三亚位于海南岛的最南端,是中国最南部的热带滨海旅游城市,中国海滨城市,是中国空气质量最好的城市、全国最长寿地区(平均寿命80岁)。三亚市别称鹿城,又被称为"东方夏威夷",位居中国四大一线旅游城市"三威杭厦"之首,拥有全岛最美丽的海滨风光。三亚拥有被无数城市嫉妒的清新空气,柔和海滩。在沙滩悠闲散步、沐浴傍晚温和阳光,在海边玩耍,在雨林里面呼吸健康,欣赏自然奇观,一切都是那么令人享受。 气候 三亚地处热带,属热带季风气候区域,终年气温高,寒暑变化不大,年平均气温25.5度。三亚最热的月份是6月,平均气温为28.5°C,极端最高气温为35.7°C ,三亚没有明显的冬季,而且冬季更是到海滨旅游的最好时候,所以三亚是全年全天候避寒、消暑、度假、旅游的好地方。不过,三亚最旺的旅游时间是从10月份到农历春节,这个时候的人特别多,房间也不好订,建议最好避开高峰。三亚的三个旅游高峰期是"五一"、"十一"、"春节",这三个节日期间的房价都会上浮几倍,如果你选择了这三个假期来三亚的话要提前一个星期定房和咨询。 98 |
99 | 100 |
101 | Blank Filling Examples 102 | 103 | #### Example 1 104 | Input: Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai. 105 | 106 | Output: Ng is an adjunct professor at Stanford University (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai.
107 | 108 | #### Example 2 (Chinese) 109 | Input: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。 110 | 111 | Output: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念拿破仑胜利而建,门高25米,顶上矗立两武士青铜古兵车铸像。 112 |
113 | 114 |
115 | Arguments Useful in Generation 116 | 117 | - `--input-source [path] or "interactive"` The input file's path. It can also be "interactive", which will launch a CLI. 118 | - `--output-path [path]` The folder to save the results. 119 | - `--out-seq-length [int]` The maximum sequence length for generation (including context). 120 | - `--min-gen-length [int]` The minimum generation length for each MASK. 121 | - `--sampling-strategy "BaseStrategy" or "BeamSearchStrategy"` The sampling strategy to use. 122 | - For BeamSearchStrategy: 123 | - `--num-beams [int]` The number of beams. 124 | - `--length-penalty [float]` The length penalty applied in beam search. 125 | - `--no-repeat-ngram-size [int]` Prohibit repeated n-gram generation. 126 | - `--print-all-beam` Print the generated results for all beams. 127 | - For BaseStrategy: 128 | - `--top-k [int]` Top-k sampling. 129 | - `--top-p [float]` Top-p sampling. 130 | - `--temperature [float]` The sampling temperature. 131 | 
132 | 133 | ### Evaluation 134 | 135 | We use YAML files to define tasks. Specifically, you can add multiple tasks or folders at a time for evaluation, and the evaluation script will automatically collect all YAML files under those folders recursively. 136 | 137 | ``` 138 | bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ... 139 | ``` 140 | 141 | Download our evaluation dataset [here](https://cloud.tsinghua.edu.cn/f/826f0df4356f4022a264/), and set `DATA_PATH` in `scripts/evaluate.sh` to your local dataset directory. The task folder contains the YAML files for the 30+ tasks we evaluated for GLM-130B. Take the [CoLA](https://nyu-mll.github.io/CoLA/) task as an example: run `bash scripts/evaluate.sh tasks/bloom/glue_cola.yaml`, which outputs an accuracy of ~65% for the best prompt and ~57% for the median. 142 | 143 | 
144 | Expected Output 145 | 146 | ```plain 147 | MultiChoiceTaskConfig(name='glue_cola', type=, path='/thudm/LargeScale/data/zeroshot/bloom/glue_cola', module=None, metrics=['Accuracy'], use_task_mask=False, use_multitask_encoding=False, unidirectional=False, max_seq_length=2048, file_pattern={'validation': '**/validation.jsonl'}, micro_batch_size=8) 148 | Evaluating task glue_cola: 149 | Evaluating group validation: 150 | Finish Following_sentence_acceptable/mul/validation.jsonl, Accuracy = 42.665 151 | Finish Make_sense_yes_no/mul/validation.jsonl, Accuracy = 56.951 152 | Finish Previous_sentence_acceptable/mul/validation.jsonl, Accuracy = 65.197 153 | Finish editing/mul/validation.jsonl, Accuracy = 57.622 154 | Finish is_this_correct/mul/validation.jsonl, Accuracy = 65.197 155 | Evaluation results of task glue_cola: 156 | Group validation Accuracy: max = 65.197, median = 57.622, average = 57.526 157 | Finish task glue_cola in 101.2s. 158 | ``` 159 |
160 | 161 | Multi-node evaluation can be configured by setting `HOST_FILE_PATH` (required by the [DeepSpeed launcher](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node)) in `scripts/evaluate_multiple_node.sh`. Set `DATA_PATH` in `scripts/evaluate_multiple_node.sh` and run the following command to evaluate all the tasks in the `./tasks` directory. 162 | 163 | ``` 164 | bash scripts/evaluate_multiple_node.sh ./tasks 165 | ``` 166 | 167 | See [Evaluate Your Own Tasks](docs/evaluate-your-own-tasks.md) for details on how to add new tasks. 168 | 169 | ### 2.5X faster Inference using FasterTransformer 170 | 171 | By adapting the GLM-130B model to [FasterTransformer](https://github.com/NVIDIA/FasterTransformer), a highly optimized transformer model library by NVIDIA, we can reach a speedup of up to 2.5X on generation; see [Inference with FasterTransformer](docs/inference-with-fastertransformer.md) for details. 172 | 173 | 174 | 175 | ## License 176 | 177 | This repository is licensed under the [Apache-2.0 license](LICENSE). The use of GLM-130B model weights is subject to the [Model License](MODEL_LICENSE). 178 | 179 | ## Citation 180 | 181 | If you find our work useful, please consider citing GLM-130B: 182 | 183 | ``` 184 | @article{zeng2022glm, 185 | title={Glm-130b: An open bilingual pre-trained model}, 186 | author={Zeng, Aohan and Liu, Xiao and Du, Zhengxiao and Wang, Zihan and Lai, Hanyu and Ding, Ming and Yang, Zhuoyi and Xu, Yifan and Zheng, Wendi and Xia, Xiao and others}, 187 | journal={arXiv preprint arXiv:2210.02414}, 188 | year={2022} 189 | } 190 | ``` 191 | 192 | You may also consider citing GLM's original work: 193 | 194 | ``` 195 | @inproceedings{du2022glm, 196 | title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling}, 197 | author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie}, 198 | booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 199 | pages={320--335}, 200 | year={2022} 201 | } 202 | ``` 203 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | from initialize import initialize, initialize_model_and_tokenizer 4 | 5 | if __name__ == "__main__": 6 | args = initialize(extra_args_provider=lambda parser: None) 7 | model, tokenizer = initialize_model_and_tokenizer(args) 8 | 9 | for seq_len in [512, 1024, 2048]: 10 | torch.distributed.barrier() 11 | start = time.time() 12 | with torch.no_grad(): 13 | _, *_ = model( 14 | torch.ones(1, seq_len, device=torch.cuda.current_device(), dtype=torch.int64), 15 | torch.arange(seq_len, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1), 16 | torch.randn(1, 1, seq_len, seq_len, device=torch.cuda.current_device()) < 0.5, 17 | ) 18 | torch.distributed.barrier() 19 | if torch.distributed.get_rank() == 0: 20 | print(f"Encode {seq_len}: {(time.time() - start) * 1000:.2f} ms") 21 | -------------------------------------------------------------------------------- /configs/model_glm_130b.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="glm-130b" 2 | CHECKPOINT_PATH="" 3 | MP_SIZE=8 4 | MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ 5 | --num-layers 70 \ 6 | --hidden-size 12288 \ 7 | --inner-hidden-size 32768 \ 8 | --vocab-size 150528 \ 9 | 
--num-attention-heads 96 \ 10 | --max-sequence-length 2048 \ 11 | --tokenizer-type icetk-glm-130B \ 12 | --layernorm-order post \ 13 | --load ${CHECKPOINT_PATH} \ 14 | --skip-init \ 15 | --fp16" 16 | -------------------------------------------------------------------------------- /configs/model_glm_130b_int4.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="glm-130b" 2 | CHECKPOINT_PATH="" 3 | MP_SIZE=4 4 | MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ 5 | --num-layers 70 \ 6 | --hidden-size 12288 \ 7 | --inner-hidden-size 32768 \ 8 | --vocab-size 150528 \ 9 | --num-attention-heads 96 \ 10 | --max-sequence-length 2048 \ 11 | --tokenizer-type icetk-glm-130B \ 12 | --layernorm-order post \ 13 | --quantization-bit-width 4 \ 14 | --load ${CHECKPOINT_PATH} \ 15 | --skip-init \ 16 | --fp16" 17 | -------------------------------------------------------------------------------- /configs/model_glm_130b_int8.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="glm-130b" 2 | CHECKPOINT_PATH="" 3 | MP_SIZE=8 4 | MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ 5 | --num-layers 70 \ 6 | --hidden-size 12288 \ 7 | --inner-hidden-size 32768 \ 8 | --vocab-size 150528 \ 9 | --num-attention-heads 96 \ 10 | --max-sequence-length 2048 \ 11 | --tokenizer-type icetk-glm-130B \ 12 | --layernorm-order post \ 13 | --quantization-bit-width 8 \ 14 | --load ${CHECKPOINT_PATH} \ 15 | --skip-init \ 16 | --fp16" 17 | -------------------------------------------------------------------------------- /configs/model_glm_130b_v100.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="glm-130b" 2 | CHECKPOINT_PATH="" 3 | MP_SIZE=8 4 | MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ 5 | --num-layers 70 \ 6 | --hidden-size 12288 \ 7 | --inner-hidden-size 32768 \ 8 | --vocab-size 150528 \ 9 | --num-attention-heads 96 \ 10 | --max-sequence-length 2048 \ 11 | --tokenizer-type icetk-glm-130B \ 12 | --layernorm-order post \ 13 | --load ${CHECKPOINT_PATH} \ 14 | --skip-init \ 15 | --fp16 \ 16 | --bminf \ 17 | --bminf-memory-limit 25" 18 | -------------------------------------------------------------------------------- /cuda/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | OPTIONS=-gencode arch=compute_61,code=sm_61 \ 3 | -gencode arch=compute_62,code=sm_62 \ 4 | -gencode arch=compute_70,code=sm_70 \ 5 | -gencode arch=compute_72,code=sm_72 \ 6 | -gencode arch=compute_75,code=sm_75 \ 7 | -gencode arch=compute_80,code=sm_80 \ 8 | -gencode arch=compute_86,code=sm_86 9 | 10 | TARGETS=$(patsubst %.cu, %.fatbin, $(wildcard *.cu)) 11 | 12 | all: $(TARGETS) 13 | 14 | %.fatbin: %.cu 15 | $(NVCC) -fatbin $^ $(OPTIONS) -o $@ 16 | 17 | .PHONY : clean, copy 18 | clean: 19 | rm $(TARGETS) 20 | 21 | copy: 22 | cp $(TARGETS) ../kernels/ 23 | -------------------------------------------------------------------------------- /cuda/quantization.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template 4 | __device__ void 5 | int4WeightExtractionDevice(const int8_t* weight, 6 | const T* scale_list, 7 | T* output, 8 | const int n, 9 | const int k) 10 | { 11 | for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ 12 | int8_t original = weight[i]; 13 | int8_t high = original >> 4; 14 | int8_t low = original << 4; low = low >> 4; 15 | output[i * 2] = T(high) * 
scale_list[blockIdx.x]; 16 | output[i * 2 + 1] = T(low) * scale_list[blockIdx.x]; 17 | } 18 | } 19 | 20 | __device__ void 21 | int4WeightCompressionDevice(const int8_t* input, 22 | int8_t* output, 23 | const int n, 24 | const int k) 25 | { 26 | for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ 27 | output[i] = (input[i * 2] << 4) | (input[i * 2 + 1] & 0b00001111); 28 | } 29 | } 30 | 31 | template 32 | __device__ void 33 | int8WeightExtractionDevice(const int8_t* weight, 34 | const T* scale_list, 35 | T* output, 36 | const int n, 37 | const int k) 38 | { 39 | for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ 40 | output[i] = T(weight[i]) * scale_list[blockIdx.x]; 41 | } 42 | } 43 | 44 | extern "C" __global__ void int4WeightExtractionHalf(const int8_t* weight, 45 | const half* scale_list, 46 | half* output, 47 | const int n, 48 | const int k){ 49 | int4WeightExtractionDevice(weight, scale_list, output, n, k); 50 | } 51 | 52 | extern "C" __global__ void int4WeightExtractionFloat(const int8_t* weight, 53 | const float* scale_list, 54 | float* output, 55 | const int n, 56 | const int k){ 57 | int4WeightExtractionDevice(weight, scale_list, output, n, k); 58 | } 59 | 60 | extern "C" __global__ void int8WeightExtractionHalf(const int8_t* weight, 61 | const half* scale_list, 62 | half* output, 63 | const int n, 64 | const int k){ 65 | int8WeightExtractionDevice(weight, scale_list, output, n, k); 66 | } 67 | 68 | extern "C" __global__ void int8WeightExtractionFloat(const int8_t* weight, 69 | const float* scale_list, 70 | float* output, 71 | const int n, 72 | const int k){ 73 | int8WeightExtractionDevice(weight, scale_list, output, n, k); 74 | } 75 | 76 | extern "C" __global__ void int4WeightCompression(const int8_t* input, 77 | int8_t* output, 78 | const int n, 79 | const int k){ 80 | int4WeightCompressionDevice(input, output, n, k); 81 | } 82 | -------------------------------------------------------------------------------- /docs/evaluate-your-own-tasks.md: -------------------------------------------------------------------------------- 1 | # Evaluate Your Own Tasks 2 | 3 | ## YAML file for tasks 4 | 5 | We use the YAML file to define tasks, this allows us to easily evaluate multiple tasks at a single run and configure them independently. Specifically, you can add multiple tasks or folders at a time for evaluation, and the script will automatically collect all YAML files under those folders recursively. 6 | 7 | ``` 8 | # Single node 9 | bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ... 10 | # Multi node 11 | bash scripts/evaluate_multiple_node.sh task1.yaml task2.yaml dir1 dir2 ... 12 | ``` 13 | 14 | We support two types of evaluation tasks: multi-choice and generation. The YAML config options for both tasks are defined in `evaluation/configs.py`. 
Basically, all types of tasks share common configs defining task information: 15 | 16 | ```yaml 17 | name: 'glue_cola' # Task Name 18 | type: 'mul' # Task type, 'gen' (generate) or 'mul' (multiple choice) 19 | path: 'bloom/glue_cola' # Task data path relative to DATA_PATH in 'evaluate.sh' 20 | use_task_mask: False # Whether to use [gMASK] for evaluation 21 | unidirectional: False # Whether to use unidirectional attention 22 | max_seq_length: 2048 # Max sequence length 23 | file-pattern: # Organize jsonl files in groups 24 | validation: "**/validation.jsonl" # Will search for all files named 'validation.jsonl' in `DATA_PATH/bloom/glue_cola` using glob.glob() 25 | micro-batch-size: 30 # 'gen' tasks only support mbs = 1 for now 26 | ``` 27 | 28 | See configuration details for multi-choice and generation tasks in `evaluation/configs.py`. 29 | 30 | ## Data format for tasks 31 | 32 | We recommend organizing the task data in the following structure and setting up two groups named "validation" and "test" in the `file-pattern` config so that it becomes very easy to evaluate different prompts on both validation and test sets independently. 33 | 34 | ```bash 35 | DATA_PATH 36 | └── task_name 37 | ├── prompt_1 38 | │   ├── test.jsonl 39 | │   └── val.jsonl 40 | ├── prompt_2 41 | │   ├── test.jsonl 42 | │   └── val.jsonl 43 | └── prompt_3 44 | ├── test.jsonl 45 | └── val.jsonl 46 | ``` 47 | 48 | The evaluation data for each prompt are organized in JSON Lines (jsonl) format. For multi-choice tasks, the format of each line of JSON should be 49 | 50 | ```json 51 | { 52 | "inputs_pretokenized": "Context and question here", 53 | "choices_pretokenized": ["Choice 1", "Choice 2", "Choice 3"], 54 | "label": int 55 | } 56 | ``` 57 | 58 | The default metric for the multi-choice task is Accuracy. 59 | 60 | For the generation task, the format of each line of JSON should be 61 | 62 | ```json 63 | { 64 | "inputs_pretokenized": "Context and question here", 65 | "targets_pretokenized": ["Target 1", "Target 2", "Target 3"], 66 | "label": int 67 | } 68 | ``` 69 | 70 | The default metrics for the generation task are EM (Exact Match) and F1. Given the inputs, the sequence generated by the model is scored against each target separately, and the highest score is taken. 71 | 72 | 73 | ## Implement Your Metrics 74 | 75 | You can customize your evaluation metrics function and add it to `DEFAULT_METRICS` in `evaluation/metrics.py`, and then you can specify `metric: ['Your metric name']` in the task YAML file. 76 | 77 | ## Fully customize the evaluation process 78 | 79 | By default, we implement classes named `MultiChoiceTask` and `GenerationTask` in `evaluation/tasks.py` for multi-choice tasks and generation tasks, respectively. 80 | 81 | You can implement a new task class that inherits from one of these two classes, and implement the `process_single_batch` function to define how to process a batch of inputs and get the predictions. Following [Big-Bench](https://github.com/google/BIG-bench/#creating-the-task), we implemented two methods you can use for your evaluation: 82 | 83 | - `model.cond_log_prob()`: Compute the probabilities of provided model outputs for given inputs. 84 | - `model.generate_text()`: Generate text for given inputs. 85 | 86 | Once you have created the new task class, you need to specify the relative path to import the class in the `module` field of the task YAML file.
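For illustration, a minimal custom multi-choice task might look like the sketch below. The file path and class name are hypothetical, and the exact batch layout and return format follow the parent class; refer to `evaluation/tasks.py` and `evaluation/dataset.py` for the real signatures.

```python
# tasks/my_task/task.py -- hypothetical example, not part of the repository
from evaluation import MultiChoiceTask


class MyMultiChoiceTask(MultiChoiceTask):
    def process_single_batch(self, batch):
        # Score every candidate choice with the model and pick the most likely one.
        # `cond_log_prob` is the helper described above; it is assumed here to return
        # one list of per-choice scores for each sample in the batch.
        log_probs = self.model.cond_log_prob(batch)
        return [scores.index(max(scores)) for scores in log_probs]
```

The task YAML file would then point to this class with something like `module: "tasks.my_task.task.MyMultiChoiceTask"`.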
See `tasks/lambada/task.py` and `tasks/lambada/lambada.yaml` for how we customize the beam search generation strategy for LAMBADA tasks and configure the YAML file. 87 | -------------------------------------------------------------------------------- /docs/inference-with-fastertransformer.md: -------------------------------------------------------------------------------- 1 | # Inference with FasterTransformer 2 | 3 | [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) provides a script and recipe to run the highly optimized transformer-based encoder and decoder components, and it is tested and maintained by NVIDIA. 4 | 5 | We adapted GLM-130B to FasterTransformer for fast inference; see the [benchmark](#benchmark) section for details. 6 | 7 | ## Download the Model 8 | 9 | See [Get Model](/README.md#environment-setup). 10 | 11 | ## Recommended: Run With Docker 12 | 13 | Use Docker to quickly build a Flask API application for GLM-130B. 14 | 15 | ### Requirements 16 | 17 | - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) 18 | 19 | ### Build Container Image 20 | 21 | ```bash 22 | git clone https://github.com/THUDM/FasterTransformer.git 23 | cd FasterTransformer 24 | bash docker/build.sh 25 | ``` 26 | 27 | ### Run API With Checkpoints 28 | 29 | Set MPSIZE to the number of GPUs needed for the checkpoint, and DATA_TYPE to the checkpoint precision. The checkpoint we distribute is 8-way tensor parallel in FP16 precision; a conversion script is also provided if you need to change the tensor parallel dimension or the weight precision. 30 | 31 | ```bash 32 | # Convert the checkpoint to MP=4, DATA_TYPE=INT4 33 | python tools/convert_tp.py \ 34 | --input-folder <SRC_CKPT_PATH> \ 35 | --output-folder <DST_CKPT_PATH> \ 36 | --target-tp 4 \ 37 | --quantization-bit-width 4 38 | # Run API 39 | docker run -it --rm --gpus all --shm-size=10g -p 5000:5000 \ 40 | -v /49300:/checkpoints:ro \ 41 | -e MPSIZE=4 -e DATA_TYPE=int4 \ 42 | ftglm:latest 43 | ``` 44 | 45 | ### Test 46 | 47 | #### Benchmark 48 | 49 | ```bash 50 | python3 examples/pytorch/glm/glm_server_test.py 51 | ``` 52 | 53 | #### Web Demo 54 | 55 | ```bash 56 | pip install gradio 57 | python3 examples/pytorch/glm/glm_server_frontend_test.py 58 | ``` 59 | 60 | ## Manual Configuration 61 | 62 | ### Requirements 63 | 64 | - CMake >= 3.13 for PyTorch 65 | - CUDA 11.0 or newer version 66 | - NCCL 2.10 or newer version 67 | - Python 3 is recommended because some features are not supported in Python 2 68 | - PyTorch: verified on 1.10.1; >= 1.8.0 should work. 69 | 70 | ### Setup Using Docker 71 | 72 | ```bash 73 | docker run -it --rm --gpus all nvcr.io/nvidia/pytorch:22.09-py3 /bin/bash 74 | conda install -y pybind11 75 | ``` 76 | 77 | ### Setup Using Conda 78 | 79 | Alternatively, all the packages can be installed using conda. 80 | 81 | > Some of our current [structure](https://github.com/THUDM/FasterTransformer/blob/main/src/fastertransformer/th_op/glm/GlmOp.h#L30) requires that `g++` and `libtorch` produce the same results, so a pre-compiled `libtorch` may only work with `g++-7` or `g++-9`. And although GLM-130B itself does not rely on openmpi, FasterTransformer requires it during the build process. We are working on these issues. 
82 | 83 | ```bash 84 | conda install -y cmake pybind11 85 | conda install -y -c conda-forge cudatoolkit-dev cudnn 86 | cp -r $CONDA_PREFIX/lib/libcudnn* /usr/local/cuda/lib64/ 87 | cp -r $CONDA_PREFIX/include/cudnn*.h /usr/local/cuda/include/ 88 | ``` 89 | 90 | If it is hard to install cudatoolkit-dev and cudnn via conda, just install them from [NVIDIA Developer](https://developer.nvidia.com/cuda-downloads), and make sure CMake is able to find cudnn. 91 | 92 | ```bash 93 | cp cudnn/include/cudnn*.h /usr/local/cuda/include 94 | cp cudnn/lib/libcudnn* /usr/local/cuda/lib64 95 | chmod a+r /usr/local/cuda/include/cudnn*.h 96 | chmod a+r /usr/local/cuda/lib64/libcudnn* 97 | ``` 98 | 99 | GLM-130B is trained with FP16 precision, so a total of 260G of GPU memory is required to store the model weights. The model has been tested with 8 * 40G A100s. 100 | 101 | ### Build 102 | 103 | Get the code and install all dependencies: 104 | 105 | ```bash 106 | git clone https://github.com/THUDM/FasterTransformer.git 107 | mkdir -p FasterTransformer/build 108 | cd FasterTransformer/build 109 | pip3 install icetk transformers 110 | ``` 111 | 112 | Note: the `xx` in `-DSM=xx` in the following scripts is the compute capability of your GPU, e.g. 60 (P40), 61 (P4), 70 (V100), 75 (T4), 80 (A100), or 86 (RTX 3090). The default setting includes 70, 75, 80 and 86. 113 | 114 | ```bash 115 | cmake -DSM=80 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. 116 | make -j 117 | ``` 118 | 119 | ### Run GLM-130B 120 | 121 | Generate the `gemm_config.in` file: 122 | 123 | ```bash 124 | # ./bin/gpt_gemm 125 | ./bin/gpt_gemm 1 1 128 96 128 49152 150528 1 8 126 | ``` 127 | 128 | Run GLM-130B with PyTorch and Flask: 129 | 130 | ```bash 131 | bash ../examples/pytorch/glm/glm-server.sh 132 | ``` 133 | 134 | You need to check and edit this file to set arguments such as `CHECKPOINT_PATH`. 135 | 136 | ## Optimization methods 137 | 138 | Optimizations in GLM-130B are similar to those in GPT and GPT-J, as described in [FasterTransformer/gpt_guide.md](https://github.com/NVIDIA/FasterTransformer/blob/main/docs/gpt_guide.md). Meanwhile, some operators differ from GPT, such as the implementation of rotary embedding and the use of GeGLU, so we additionally added them to FasterTransformer. 139 | 140 | ## Benchmark 141 | 142 | - Hardware: DGX-A100 (8 * 40G) 143 | 144 | ## Encode 145 | 146 | | **Sequence Len** | 512 | 1024 | 2048 | 147 | | ---------- | ------ | ------ | ------ | 148 | | Megatron | 145 ms | 250 ms | 453 ms | 149 | | FasterTransformer | 120 ms | 220 ms | OOM | 150 | 151 | ## Decode 152 | 153 | | **Sequence Len** | 512 | 1024 | 2048 | 154 | | ---------- | ------- | ------- | -------- | 155 | | Megatron | 45.21 s | 89.00 s | 179.22 s | 156 | | FasterTransformer | 18.77 s | 39.81 s | 89.88 s | 157 | -------------------------------------------------------------------------------- /docs/low-resource-inference.md: -------------------------------------------------------------------------------- 1 | # Low-resource Inference with BMInf 2 | 3 | GLM-130B is trained with 4-way tensor parallelism and 8-way pipeline parallelism for efficiency. The checkpoint is then converted into an 8-way tensor parallel one so that inference can be performed on a single node. GLM-130B has 130 billion parameters in FP16 precision, so a total of 260G of GPU memory is required to store the model weights. The DGX-A100 server has 8 A100s and provides a total of 320G of GPU memory (640G for the 80G A100 version), so it suits GLM-130B well. 
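As a quick sanity check of these numbers, the memory requirement follows directly from the parameter count; the snippet below is only a back-of-the-envelope illustration and is not part of the repository.

```python
# Back-of-the-envelope memory math for GLM-130B weights (illustrative only, decimal GB).
params = 130e9                 # 130 billion parameters
total_gb = params * 2 / 1e9    # 2 bytes per parameter in FP16 -> 260 GB
per_gpu_gb = total_gb / 8      # split across 8 GPUs -> 32.5 GB each
print(f"{total_gb:.0f} GB total, {per_gpu_gb:.1f} GB per GPU")
```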
4 | 5 | However, a server with 8 * 32G V100 only provides a total of 256G of GPU memory, which means that fully loading the model weights is not possible. Fortunately, with the swap-in-and-out feature between CPU and GPU memory provided by the [BMInf](https://github.com/OpenBMB/BMInf) library, GLM-130B can still run on servers with a smaller amount of GPU memory. After joint debugging with the BMInf team, we achieved a reasonable evaluation efficiency on DGX-1 servers with 8 * 32G V100 by carefully overlapping computation and communication; see the [benchmark section](#benchmark) for details. 6 | 7 | We have integrated BMInf into our codebase: just install BMInf via `pip install bminf`, and change the model configuration file from `configs/model_glm_130b.sh` to `configs/model_glm_130b_v100.sh` in your launch shell script. The default BMInf config is for V100 servers; you can also adjust the maximum memory that the model weights may occupy on one GPU by setting `--bminf-memory-limit` in the model config file according to your GPU memory. 8 | 9 | ## Benchmark 10 | 11 | ### Evaluation 12 | 13 | - CoLA task on the validation set 14 | - Micro Batch Size = 30 15 | - BMInf: 25 GB model-weight limit per GPU, set by `--bminf-memory-limit 25` 16 | 17 | | | Peak GPU Memory | Time | 18 | | -------------- | ---------- | ------ | 19 | | A100-SAT | 40.3 G | 74.6 s | 20 | | V100-SAT | OOM | OOM | 21 | | V100-SAT-BMInf | 32.3 G | 196.0 s | 22 | 23 | The `micro-batch-size` config in task YAML files is configured according to the maximum utilization of the DGX-A100 server. If you encounter an OOM error on the V100 server, please adjust the `micro-batch-size` appropriately. 24 | 25 | ### Text generation 26 | 27 | In text generation, due to the small amount of computation per model forward (usually <10 tokens/forward using the beam search strategy), the communication between CPU and GPU memory becomes the bottleneck. With the help of the BMInf team, we did an in-depth profile on our V100 server. Given a 25GB model weight limit per GPU, a total of 13 layers need to be copied from CPU to GPU for a single forward, and each layer takes about 75 ms of IO, indicating that the real IO speed between CPU and GPU is `260GB / 70 / 8 / 75ms = 6.19GB/s`. Our V100 server uses PCI-E 3.0 and two V100s share a switch, so the theoretical bandwidth for each GPU is 8GB/s, close to our profiling results. A server with PCI-E 4.0 will greatly reduce the IO time. Even so, generating long text can still take several minutes, so **we do not recommend using V100 servers in text generation scenarios**. To address this, we are working on INT8 quantization so that GLM-130B can fit even a single RTX 3090 server (24G * 8). 28 | 29 | -------------------------------------------------------------------------------- /docs/media/16613396005977.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/docs/media/16613396005977.jpg -------------------------------------------------------------------------------- /docs/quantization.md: -------------------------------------------------------------------------------- 1 | # Quantization of GLM-130B 2 | 3 | ## Usage 4 | 5 | > Please note that SwissArmyTransformer>=0.2.11 is required for quantization 6 | 7 | Set `CHECKPOINT_PATH` in `configs/model_glm_130b_{int4/int8}.sh` to your local checkpoint folder. 
The model will first be initialized from the FP16 checkpoint in CPU memory, then dynamically quantized and transferred to GPU memory. So please make sure you have enough CPU memory (>260GB) to store the FP16 model weights. 8 | 9 | Pay attention to the tensor parallel dimension of the model checkpoint: we only provide the checkpoint in 8-way tensor parallel form, i.e. 8 GPUs store a whole model. If you need to do inference on a smaller number of GPUs, e.g. 4 * RTX 3090 GPUs with INT4 precision, you first need to convert the checkpoint to 4-way tensor parallel using the following command and modify `MP_SIZE` in the corresponding model config file. 10 | 11 | ```bash 12 | python tools/convert_tp.py \ 13 | --input-folder <SRC_CKPT_PATH> \ 14 | --output-folder <DST_CKPT_PATH> \ 15 | --target-tp 4 16 | ``` 17 | 18 | Finally, change the model config file from `configs/model_glm_130b.sh` to `configs/model_glm_130b_{int4/int8}.sh` in your scripts (e.g. `scripts/generate.sh`), then run your scripts as usual. 19 | 20 | By default, the full precision checkpoint is expected to be loaded. Running the conversion script with `--quantization-bit-width <4 or 8>` will produce quantized model weights. To load from a quantized checkpoint, you should add `--from-quantized-checkpoint` to your model config file. 21 | 22 | ## Evaluation Results 23 | 24 | | | **MMLU (Accuracy↑)** | **LAMBADA (Accuracy↑)** | **WikiText-2 (PPL↓)** | **WikiText-103 (PPL↓)** | **PTB (PPL↓)** | 25 | | ---- | -------- | ----------- | ------------------- | --------------------- | ------------ | 26 | | FP16 | 44.751 | 80.206 | 10.901 | 10.759 | 18.964 | 27 | | INT8 | 44.709 | 80.206 | 10.904 | 10.763 | 18.994 | 28 | | INT4 | 44.801 | 79.468 | 11.167 | 11.046 | 19.535 | 29 | 30 | ## Space and Speed Benchmark 31 | 32 | | **Hardware** | **GPU Memory** | **Precision** | **512** | **1024** | **2048** | 33 | | ------------ | -------------- | ------------ | -------- | -------- | -------- | 34 | | 8 * A100 | 40 GB | FP16 | 45.21 s | 89.00 s | 179.22 s | 35 | | 8 * V100 | 32 GB | INT8 | 106.35 s | 216.50 s | 449.17 s | 36 | | 4 * RTX 3090 | 24 GB | INT4 | 138.66 s | 292.69 s | 649.64 s | 37 | | 8 * RTX 2080 Ti | 11 GB | INT4 | 117.39 s | 240.96 s | 528.66 s | 38 | 39 | 40 | The results in the table above were tested with SAT. Using FasterTransformer can speed this up by more than 2X, as shown in the table below; detailed usage is described in [Inference with FasterTransformer](../docs/inference-with-fastertransformer.md). 
41 | 42 | | **Hardware** | **GPU Memory** | **Precison** | **128** Encode / Decode | **512** Encode / Decode | **1024** Encode / Decode | **2048** Encode / Decode | 43 | | --------------- | -------------- | ------------ | ----------------------- | ----------------------- | ------------------------ | ------------------------ | 44 | | 8 * A100 | 40 GB | INT4 | 145 ms / 4.29 s | 183 ms / 17.7 s | 313 ms / 37.8 s | 495 ms / 86.0 s | 45 | | 4 * A100 | 80 GB | INT4 | 174 ms / 6.62 s | 272 ms / 27.1 s | 439 ms / 56.2 s | 810 ms / 123 s | 46 | | 8 * V100 | 32 GB | INT4 | 309 ms / 6.97 s | 666 ms / 28.1 s | 1208 ms / 58.4 s | 2304 ms / 125 s | 47 | | 4 * V100 | 32 GB | INT4 | 448 ms / 11.4 s | 843 ms / 45.87 s | 1488 ms / 93.5 s | 2803 ms / 196 s | 48 | | 8 * RTX 3090 | 24 GB | INT4 | 283 ms / 5.07 s | 915 ms / 20.5 s | 1793 ms / 42.7 s | 3477 ms / 90.3 s | 49 | | 4 * RTX 3090 | 24 GB | INT4 | 374 ms / 8.16 s | 1300 ms / 32.3 s | OOM / 66.5 s | OOM / 150 s | 50 | | 8 * RTX 2080 Ti | 11 GB | INT4 | 392 ms / 6.77 s | 1044 ms / 27.29 s | OOM / 56.02 s | OOM / OOM | 51 | 52 | ## Details 53 | 54 | Typical methods quantize both model weights and activations to INT8, enabling the INT8 matrix multiplication kernel for efficiency. However, we found that there are outliers in GLM-130B's activations, making it hard to reduce the precision of activations. 55 | 56 | Concurrently, researchers from [Meta AI](https://arxiv.org/abs/2208.07339) also found the emergent outliers issue in large-scale transformers (>6.8B), which is consistent with our observations on GLM-130B. They conducted an in-depth analysis and found that the outliers make up only about 0.1% of all feature dimensions, so it's possible to make a decomposition for matrix multiplication that focuses on high precision multiplication for these particular dimensions. 57 | 58 | | ![](media/16613396005977.jpg) | 59 | |:--:| 60 | | *Distribution of outliers (the white ones) in GLM-130B's activation* | 61 | 62 | Unfortunately, the outliers in GLM-130B can sometimes make up at most 30% of the feature dimension, possibly because we used GLU as a variant of FFN. Therefore, a mixed-precision decomposition for matmul can be much less efficient than a single FP16 matmul. After a few weeks of trial, we finally decided to keep the precision of activations to FP16 and only consider the quantization of model weights. In that case, the quantized model parameters are dynamically converted to FP16 precision at runtime, introducing a small computational overhead but greatly reducing GPU memory requirements for storing model weights. 63 | 64 | We quantized all linear layers as they take up most of the model parameters. All model weights, excluding input/output embedding, layernorm and bias terms are quantized using vector-wise symmetric quantization. At the quantization precision of INT4, two INT4 weights are compressed into one INT8 weight for saving GPU memory usage, so that only 70GB of GPU memory approximately is required for INT4 model weights. 
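To make the scheme above concrete, the sketch below shows vector-wise (per-row) symmetric weight quantization with INT4 packing. It is illustrative only: the function name is made up, an even number of input features is assumed, and the repository's actual implementation lives in `quantization/functional.py` together with the CUDA kernels in `cuda/quantization.cu`, which unpack the weights back to FP16 at runtime.

```python
import torch

def quantize_linear_weight(weight: torch.Tensor, bit_width: int = 4):
    """Vector-wise symmetric quantization of a [out_features, in_features] FP16 weight.

    Hypothetical helper for illustration; assumes in_features is even when bit_width == 4.
    """
    qmax = 2 ** (bit_width - 1) - 1                                # 7 for INT4, 127 for INT8
    scale = weight.abs().max(dim=-1, keepdim=True).values / qmax   # one scale per output row
    q = torch.round(weight / scale).clamp(-qmax, qmax).to(torch.int8)
    if bit_width == 4:
        # Pack two INT4 values into one INT8 byte, mirroring int4WeightCompression:
        # even-indexed element -> high nibble, odd-indexed element -> low nibble.
        q = (q[:, 0::2] << 4) | (q[:, 1::2] & 0x0F)
    return q, scale.to(weight.dtype)

# At inference time, the packed weights are expanded back to FP16 as q * scale
# (see the int4/int8WeightExtraction kernels), so activations stay in FP16 throughout.
```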
65 | 66 | 67 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import time 2 | import importlib 3 | 4 | from os.path import join, isdir, isfile, relpath 5 | from glob import glob 6 | 7 | from evaluation import BaseConfig, ModelForEvaluation, DEFAULT_CLASS, print_rank_0 8 | from initialize import initialize, initialize_model_and_tokenizer 9 | 10 | 11 | def add_evaluation_specific_args(parser): 12 | """Arguments for evaluation""" 13 | group = parser.add_argument_group("evaluation", "Evaluation configurations") 14 | 15 | # Task 16 | group.add_argument("--task", nargs="+", default=[], help="All task config to evaluation") 17 | group.add_argument("--data-path", type=str, required=True, help="Data dir path for all tasks") 18 | return parser 19 | 20 | 21 | def find_all_tasks(all_task_config_path): 22 | tasks = [] 23 | for task in all_task_config_path: 24 | if isdir(task): 25 | tasks += [relpath(path, ".") for path in glob(join(task, "**/*.yaml"), recursive=True)] 26 | elif isfile(task): 27 | tasks.append(task) 28 | return tasks 29 | 30 | 31 | def evaluate_all_tasks(data_path, model, tokenizer, all_task_config_path, task_classes): 32 | for config_path, task_class in zip(all_task_config_path, task_classes): 33 | config = task_class.config_class().from_yaml_file(config_path) 34 | config.path = join(data_path, config.path) 35 | task = task_class(model, tokenizer, config) 36 | task.evaluate() 37 | 38 | 39 | def main(): 40 | args = initialize(extra_args_provider=add_evaluation_specific_args) 41 | args.task = find_all_tasks(args.task) 42 | 43 | task_classes = [] 44 | print_rank_0("> Loading task configs") 45 | for task_config_path in args.task: 46 | config = BaseConfig.from_yaml_file(task_config_path) 47 | if config.module: 48 | path = ".".join(config.module.split(".")[:-1]) 49 | module = importlib.import_module(path) 50 | class_name = config.module.split(".")[-1] 51 | task_class = getattr(module, class_name) 52 | task_classes.append(task_class) 53 | else: 54 | task_classes.append(DEFAULT_CLASS[config.type]) 55 | print_rank_0(f" Task {config.name} loaded from config {task_config_path}") 56 | print_rank_0(f"> Successfully load {len(task_classes)} task{'s' if len(task_classes) > 1 else ''}") 57 | 58 | model, tokenizer = initialize_model_and_tokenizer(args) 59 | model = ModelForEvaluation(model) 60 | 61 | start = time.time() 62 | evaluate_all_tasks(args.data_path, model, tokenizer, args.task, task_classes) 63 | print_rank_0(f"Finish {len(task_classes)} task{'s' if len(task_classes) > 1 else ''} in {time.time() - start:.1f}s") 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .configs import * 2 | from .model import ModelForEvaluation 3 | from .tasks import BaseTask, GenerationTask, MultiChoiceTask, LanguageModelTask 4 | from .dataset import GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset 5 | from .metrics import qa_evaluate 6 | from .utils import print_rank_0 7 | 8 | DEFAULT_CLASS = { 9 | TaskType.GENERATION: GenerationTask, 10 | TaskType.MULTICHOICE: MultiChoiceTask, 11 | TaskType.LANGUAGE_MODEL: LanguageModelTask, 12 | } 13 | -------------------------------------------------------------------------------- /evaluation/configs.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from dataclass_wizard import YAMLWizard 3 | from dataclasses import dataclass, field 4 | from enum import Enum 5 | from typing import Optional, List, Dict 6 | 7 | 8 | class TaskType(Enum): 9 | MULTICHOICE = "mul" 10 | GENERATION = "gen" 11 | LANGUAGE_MODEL = "lm" 12 | OTHER = "other" 13 | 14 | 15 | @dataclass 16 | class BaseConfig(YAMLWizard): 17 | name: str # Task name 18 | type: TaskType # Task type 19 | path: str # task data path relative to DATA_PATH 20 | 21 | module: Optional[str] = None # Custom task module file, optional 22 | metrics: List[str] = field(default_factory=list) # Evaluation metrics 23 | 24 | use_task_mask: bool = False # Whether to use [gMASK] for evaluation 25 | use_multitask_encoding: bool = False # Not supported now 26 | unidirectional: bool = False # Whether to use unidirectional attention 27 | max_seq_length: int = 2048 # Max sequence length 28 | file_pattern: str | Dict[str, str] = "**/*.json*" # Organize data file in groups 29 | 30 | micro_batch_size: int = 1 # 'gen' task only support mbs = 1 for now 31 | 32 | def __post_init__(self): 33 | assert self.use_task_mask or not self.unidirectional, "[MASK] doesn't support unidirectional attention" 34 | 35 | 36 | @dataclass 37 | class MultiChoiceTaskConfig(BaseConfig): 38 | module = "evaluation.MultiChoiceTask" 39 | metrics: List[str] = field(default_factory=lambda: ["Accuracy"]) 40 | 41 | 42 | @dataclass 43 | class GenerationTaskConfig(BaseConfig): 44 | module = "evaluation.GenerationTask" 45 | metrics: List[str] = field(default_factory=lambda: ["EM", "F1"]) 46 | sampling_strategy: str = "BaseStrategy" 47 | num_beams: int = 4 48 | length_penalty: float = 1.0 49 | no_repeat_ngram_size: int = 3 50 | min_gen_length: int = 0 51 | max_gen_length: int = 128 52 | 53 | 54 | @dataclass 55 | class LanguageModelTaskConfig(BaseConfig): 56 | module = "evaluation.LanguageModelTask" 57 | metrics: List[str] = field(default_factory=lambda: ["PPL"]) 58 | 59 | generation_length: int = 256 # Generated length in each window 60 | -------------------------------------------------------------------------------- /evaluation/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import json 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from typing import List, Union 9 | from abc import ABC, abstractmethod 10 | from scipy.linalg import block_diag 11 | from itertools import accumulate 12 | from bisect import bisect_right 13 | 14 | from SwissArmyTransformer import get_tokenizer 15 | 16 | from .configs import BaseConfig, MultiChoiceTaskConfig, GenerationTaskConfig, LanguageModelTaskConfig 17 | from .utils import get_tokenized_input 18 | 19 | 20 | def pad_batch(tokens, position_ids, attention_mask, max_seq_length): 21 | attention_mask = np.pad( 22 | attention_mask, 23 | pad_width=((0, max_seq_length - len(tokens)),), 24 | mode="constant", 25 | constant_values=0, 26 | ) 27 | tokens = np.concatenate((tokens, np.zeros(max_seq_length - len(tokens), dtype=np.int64))) 28 | position_ids = np.concatenate((position_ids, np.zeros(max_seq_length - len(position_ids), dtype=np.int64))) 29 | return tokens, position_ids, attention_mask 30 | 31 | 32 | class EvaluationDataset(torch.utils.data.Dataset, ABC): 33 | """ 34 | Jsonlines of { 35 | "text": context 36 | "choices": [choice_id1,...], if not None, len(target) == 1 37 | "label": If generation task -1, else [0, 
len(choices)) 38 | } 39 | If [MASK] not in context, will append [MASK] after text 40 | """ 41 | 42 | def __init__(self, path: Union[str, List[str]], config: BaseConfig): 43 | self.path = path if isinstance(path, list) else [path] 44 | self.config = config 45 | self.max_seq_length = self.config.max_seq_length 46 | self.dtype = np.int64 47 | 48 | self.tokenizer = get_tokenizer() 49 | self.mask_id = self.tokenizer.get_command("[MASK]") 50 | self.gmask_id = self.tokenizer.get_command("[gMASK]") 51 | 52 | self.data = [] 53 | for p in self.path: 54 | self.process_single_file(p) 55 | 56 | @property 57 | def has_collate_fn(self) -> bool: 58 | return False 59 | 60 | def collate_fn(self, samples): 61 | return None 62 | 63 | def process_single_file(self, path): 64 | with open(os.path.join(path), "r", encoding="utf-8") as file: 65 | for line in file: 66 | item = json.loads(line) 67 | self.data.append(self.process_single_item(item)) 68 | 69 | @abstractmethod 70 | def process_single_item(self, item) -> dict: 71 | pass 72 | 73 | def __len__(self): 74 | return len(self.data) 75 | 76 | 77 | class GenerationTaskDataset(EvaluationDataset): 78 | config: GenerationTaskConfig 79 | 80 | def process_single_item(self, item): 81 | text, targets = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "targets") 82 | if len(text) + self.config.max_gen_length + 2 > self.config.max_seq_length: 83 | text_length = self.config.max_seq_length - self.config.max_gen_length - 2 84 | text = text[len(text) - text_length : len(text)] 85 | return {"text": text, "targets": targets} 86 | 87 | @property 88 | def has_collate_fn(self) -> bool: 89 | return True 90 | 91 | def collate_fn(self, samples): 92 | TILE = 32 93 | length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE 94 | 95 | token_batch, position_id_batch, attention_mask_batch = [], [], [] 96 | context_length_batch, target_position_id_batch = [], [] 97 | 98 | for sample in samples: 99 | token, position_id, attention_mask = pad_batch( 100 | sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad 101 | ) 102 | token_batch.append(token) 103 | position_id_batch.append(position_id) 104 | attention_mask_batch.append(attention_mask) 105 | context_length_batch.append(sample['context_length']) 106 | target_position_id_batch.append(sample['target_position_id']) 107 | return { 108 | "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), 109 | "position_ids": torch.tensor(np.array(position_id_batch), dtype=torch.int64), 110 | "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64) < 0.5, 111 | "context_length": torch.tensor(context_length_batch, dtype=torch.int64), 112 | "target_position_ids": torch.tensor(np.array(target_position_id_batch), dtype=torch.int64), 113 | } 114 | 115 | @staticmethod 116 | def build_generation_sample(text, max_gen_length, use_task_mask, unidirectional=True): 117 | tokenizer = get_tokenizer() 118 | 119 | sop_id = tokenizer.get_command("sop") 120 | mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]") 121 | 122 | token = np.array(text, dtype=np.int64) 123 | 124 | blank_filling = mask_id in text 125 | if blank_filling: 126 | assert not unidirectional, "Unidirectional attention doesn't support blank filling" 127 | assert not use_task_mask, "Unidirectional attention doesn't support task mask" 128 | mask_position = text.index(mask_id) 129 | token = np.concatenate((token, [sop_id])) 130 | else: 131 | mask_position = 
len(token) 132 | if unidirectional: 133 | token = np.concatenate(([mask_id, sop_id], token)) 134 | else: 135 | token = np.concatenate((token, [mask_id, sop_id])) 136 | context_length = len(token) 137 | 138 | position_id = np.arange(0, context_length, dtype=np.int64) 139 | target_position_id = np.arange(context_length, context_length + max_gen_length, dtype=np.int64) 140 | if not use_task_mask: 141 | position_id[context_length - 1:] = mask_position 142 | target_position_id[:] = mask_position 143 | 144 | attention_mask = np.tril(np.ones((context_length, context_length), dtype=np.int64)) 145 | if not unidirectional: 146 | attention_mask[: context_length - 1, : context_length - 1] = 1 147 | 148 | item = { 149 | "token": token, 150 | "position_id": position_id, 151 | "target_position_id": target_position_id, 152 | "attention_mask": attention_mask, 153 | "context_length": context_length, 154 | } 155 | return item 156 | 157 | def __getitem__(self, idx): 158 | item = self.data[idx] 159 | sample = self.build_generation_sample( 160 | item["text"], 161 | max_gen_length=self.config.max_gen_length, 162 | use_task_mask=self.config.use_task_mask, 163 | unidirectional=self.config.unidirectional, 164 | ) 165 | sample["targets"] = [np.array(target, dtype=self.dtype) for target in item["targets"]] 166 | return sample 167 | 168 | 169 | class MultiChoiceTaskDataset(EvaluationDataset): 170 | config: MultiChoiceTaskConfig 171 | 172 | def __init__(self, path, config: MultiChoiceTaskConfig): 173 | self.is_single_token = True # set to False later in process_single_item func 174 | super().__init__(path, config) 175 | 176 | @property 177 | def has_collate_fn(self) -> bool: 178 | return True 179 | 180 | def collate_fn(self, samples): 181 | TILE = 32 182 | length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE 183 | 184 | token_batch, position_id_batch, attention_mask_batch = [], [], [] 185 | choices_batch, choice_target_ids_batch = [], [] 186 | 187 | for sample in samples: 188 | token, position_id, attention_mask = pad_batch( 189 | sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad 190 | ) 191 | token_batch.append(token) 192 | position_id_batch.append(position_id) 193 | attention_mask_batch.append(attention_mask) 194 | choices_batch.append(sample["choices"]) 195 | choice_target_ids_batch.append(sample["choice_target_ids"]) 196 | 197 | return { 198 | "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), 199 | "position_ids": torch.tensor(np.array(position_id_batch), dtype=torch.int64), 200 | "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64) < 0.5, 201 | "choices": choices_batch, 202 | "choice_target_ids": choice_target_ids_batch, 203 | "is_single_token": self.is_single_token, 204 | } 205 | 206 | def process_single_item(self, item): 207 | text, choices, label = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "choices"), item["label"] 208 | 209 | tgt_seq_length = sum([len(choice) for choice in choices]) 210 | if tgt_seq_length == len(choices): 211 | # For single token, we only insert one [sop] 212 | tgt_seq_length = 1 213 | 214 | assert tgt_seq_length < self.config.max_seq_length 215 | if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: 216 | text_length = self.config.max_seq_length - tgt_seq_length - 2 217 | text = text[len(text) - text_length : len(text)] 218 | 219 | assert not ( 220 | self.mask_id in text and self.config.use_multitask_encoding 221 | ), "Unified multitask encoding 
don't support blank filling" 222 | 223 | if tgt_seq_length != 1: 224 | self.is_single_token = False 225 | 226 | return { 227 | "text": text, 228 | "choices": choices, 229 | "label": label, 230 | } 231 | 232 | @staticmethod 233 | def build_multiple_choice_sample( 234 | text, choices, is_single_token, unified_multitask_encoding=False, use_task_mask=False 235 | ): 236 | tokenizer = get_tokenizer() 237 | 238 | sop_id = tokenizer.get_command("sop") 239 | mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]") 240 | 241 | token = np.array(text, dtype=np.int64) 242 | target = np.array(text, dtype=np.int64) 243 | position_id = np.arange(len(text), dtype=np.int64) 244 | choice_target_id = [] 245 | 246 | blank_filling = mask_id in text 247 | if not blank_filling: 248 | mask_position = len(token) 249 | token = np.concatenate((token, [mask_id])) 250 | target = np.concatenate((target, [mask_id])) 251 | position_id = np.concatenate((position_id, [mask_position])) 252 | else: 253 | mask_position = text.index(mask_id) 254 | 255 | division = len(token) 256 | attention_mask = [np.ones((len(token), len(token)), dtype=np.int64)] 257 | 258 | for choice in choices: 259 | if use_task_mask == False: 260 | position_id = np.concatenate( 261 | ( 262 | position_id, 263 | [mask_position] * len(choice) 264 | if blank_filling or not unified_multitask_encoding 265 | else np.arange(mask_position, mask_position + len(choice), dtype=np.int64), 266 | ) 267 | ) 268 | else: 269 | position_id = np.concatenate( 270 | ( 271 | position_id, 272 | np.arange(division, division + len(choice), dtype=np.int64), 273 | ) 274 | ) 275 | 276 | choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64)) 277 | attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64))) 278 | token = np.concatenate((token, [sop_id], choice[:-1])) 279 | target = np.concatenate((target, choice)) 280 | 281 | if is_single_token: 282 | break 283 | 284 | attention_mask = block_diag(*attention_mask) 285 | attention_mask[: len(token), :division] = 1 286 | 287 | if is_single_token: 288 | choices = np.array(choices, dtype=np.int64).squeeze().tolist() 289 | 290 | item = { 291 | "token": token, 292 | "position_id": position_id, 293 | "attention_mask": attention_mask, 294 | "choices": choices, 295 | "choice_target_ids": choice_target_id[0] if is_single_token else choice_target_id, 296 | } 297 | return item 298 | 299 | def __getitem__(self, idx): 300 | item = self.data[idx] 301 | sample = self.build_multiple_choice_sample( 302 | item["text"], 303 | item["choices"], 304 | is_single_token=self.is_single_token, 305 | unified_multitask_encoding=self.config.use_multitask_encoding, 306 | use_task_mask=self.config.use_task_mask, 307 | ) 308 | sample["label"] = item["label"] 309 | return sample 310 | 311 | 312 | class LanguageModelTaskDataset(EvaluationDataset): 313 | config: LanguageModelTaskConfig 314 | left_weights: List[int] 315 | weights: List[int] 316 | 317 | def process_single_file(self, path): 318 | num_sequences = [] 319 | with open(os.path.join(path), "r", encoding="utf-8") as file: 320 | raw_text = file.read() 321 | tokens = self.tokenizer.tokenize(raw_text) 322 | self.data.append( 323 | { 324 | "raw_text": tokens, 325 | "num_original_tokens": len(raw_text.strip().split(" ")), 326 | "num_sequences": max( 327 | math.ceil( 328 | max(len(tokens) - (self.config.max_seq_length - 1), 0) / self.config.generation_length 329 | ) 330 | + 1, 331 | 1, 332 | ), 333 | } 334 | ) 335 | 
num_sequences.append(self.data[-1]["num_sequences"]) 336 | self.weights = list(accumulate(num_sequences)) 337 | self.left_weights = [0] + self.weights[:-1] 338 | 339 | def process_single_item(self, item): 340 | pass 341 | 342 | def __len__(self): 343 | return self.data[0]["num_sequences"] 344 | 345 | def __getitem__(self, idx): 346 | document_idx = bisect_right(self.weights, idx) 347 | idx = idx - self.left_weights[document_idx] 348 | start_idx = idx * self.config.generation_length 349 | end_idx = start_idx + self.config.max_seq_length - 1 # for additional [gMASK] 350 | tokens = self.data[document_idx]["raw_text"][start_idx:end_idx] 351 | 352 | mask_id = self.gmask_id if self.config.use_task_mask else self.mask_id 353 | sop_id = self.tokenizer.get_command("sop") 354 | 355 | if idx == 0 or self.config.unidirectional: 356 | prompt, text = [], tokens 357 | else: 358 | prompt_length = self.config.max_seq_length - 1 - self.config.generation_length 359 | prompt, text = tokens[:prompt_length], tokens[prompt_length:] 360 | 361 | seq_length = len(prompt) + len(text) + 1 362 | attention_mask = np.tril(np.ones((seq_length, seq_length), dtype=np.int64)) 363 | attention_mask[: len(prompt) + 1, : len(prompt) + 1] = 1 364 | 365 | return { 366 | "tokens": np.array(prompt + [mask_id, sop_id] + text[:-1], dtype=np.int64), 367 | "targets": np.array(prompt + [mask_id] + text, dtype=np.int64), 368 | "position_ids": np.arange(0, seq_length, dtype=np.int64), 369 | "attention_mask": attention_mask < 0.5, 370 | "loss_masks": np.array([0] * (len(prompt) + 1) + [1] * len(text), dtype=np.int64), 371 | } 372 | -------------------------------------------------------------------------------- /evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | import string 4 | import functools 5 | 6 | import torch 7 | import numpy as np 8 | 9 | from typing import Tuple, List 10 | from collections import Counter 11 | from collections import defaultdict 12 | from SwissArmyTransformer import get_tokenizer 13 | 14 | from .utils import print_rank_0 15 | 16 | 17 | def accuracy_metric(predictions, examples): 18 | count = 0 19 | num_predictions = max(len(predictions), 1) 20 | assert len(predictions) == len(examples) 21 | for prediction, example in zip(predictions, examples): 22 | count += prediction == example["label"] 23 | return count * 100.0 / num_predictions 24 | 25 | 26 | def F1_metric(predictions, examples): 27 | assert len(predictions) == len(examples) 28 | from sklearn.metrics import f1_score 29 | 30 | truth = [] 31 | for prediction, example in zip(predictions, examples): 32 | truth.append(example["label"]) 33 | return f1_score(truth, predictions, average="micro") * 100.0 34 | 35 | 36 | def precision_metric(predictions, examples): 37 | assert len(predictions) == len(examples) 38 | from sklearn.metrics import precision_score 39 | 40 | truth = [] 41 | for prediction, example in zip(predictions, examples): 42 | truth.append(example["label"]) 43 | return precision_score(truth, predictions, average="micro") * 100.0 44 | 45 | 46 | def recall_metric(predictions, examples): 47 | assert len(predictions) == len(examples) 48 | from sklearn.metrics import recall_score 49 | 50 | truth = [] 51 | for prediction, example in zip(predictions, examples): 52 | truth.append(example["label"]) 53 | return recall_score(truth, predictions, average="micro") * 100.0 54 | 55 | 56 | def normalize_answer(s): 57 | """Lower text and remove punctuation, articles and extra 
whitespace.""" 58 | 59 | def remove_articles(text): 60 | return re.sub(r"\b(a|an|the)\b", " ", text) 61 | 62 | def white_space_fix(text): 63 | return " ".join(text.split()) 64 | 65 | def remove_punc(text): 66 | exclude = set(string.punctuation) 67 | return "".join(ch for ch in text if ch not in exclude) 68 | 69 | def lower(text): 70 | return text.lower() 71 | 72 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 73 | 74 | 75 | def f1_score(prediction, ground_truth): 76 | prediction_tokens = normalize_answer(prediction).split() 77 | ground_truth_tokens = normalize_answer(ground_truth).split() 78 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 79 | num_same = sum(common.values()) 80 | if num_same == 0: 81 | return 0 82 | precision = 1.0 * num_same / len(prediction_tokens) 83 | recall = 1.0 * num_same / len(ground_truth_tokens) 84 | f1 = (2 * precision * recall) / (precision + recall) 85 | return f1 86 | 87 | 88 | def exact_match_score(prediction, ground_truth): 89 | return normalize_answer(prediction) == normalize_answer(ground_truth) 90 | 91 | 92 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 93 | if not ground_truths: 94 | return 0.0 95 | scores_for_ground_truths = [] 96 | for ground_truth in ground_truths: 97 | score = metric_fn(prediction, ground_truth) 98 | scores_for_ground_truths.append(score) 99 | return max(scores_for_ground_truths) 100 | 101 | 102 | def qa_evaluate(predictions, examples, metric): 103 | assert len(examples) == len(predictions) 104 | tokenizer = get_tokenizer() 105 | 106 | score = 0.0 107 | for example, prediction in zip(examples, predictions): 108 | ground_truths = [tokenizer.tokenizer.decode(target) for target in example["targets"]] 109 | prediction = tokenizer.tokenizer.decode(prediction) 110 | if ground_truths: 111 | score += metric_max_over_ground_truths(metric, prediction, ground_truths) 112 | score = 100.0 * score / len(predictions) 113 | return score 114 | 115 | 116 | qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) 117 | qa_f1 = functools.partial(qa_evaluate, metric=f1_score) 118 | 119 | 120 | def calculate_perplexity(loss: List[float], data): 121 | return math.exp(min(20, np.sum(loss) / data[0]["num_original_tokens"])) 122 | 123 | 124 | def special_for_dataset(predictions, examples): 125 | print_rank_0("Metrics not found, maybe dataset special metric or metric name error") 126 | return True 127 | 128 | 129 | DEFAULT_METRICS = defaultdict(lambda: special_for_dataset) 130 | DEFAULT_METRICS.update( 131 | { 132 | "EM": qa_exact_match, 133 | "F1": qa_f1, 134 | "Accuracy": accuracy_metric, 135 | "PPL": calculate_perplexity, 136 | "Precision": precision_metric, 137 | "Recall": recall_metric, 138 | "F1_mul": F1_metric, 139 | } 140 | ) 141 | -------------------------------------------------------------------------------- /evaluation/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from typing import List, Union 4 | 5 | from SwissArmyTransformer.generation.autoregressive_sampling import update_mems, get_masks_and_position_ids_default 6 | from SwissArmyTransformer.mpu import vocab_parallel_cross_entropy 7 | 8 | 9 | def batch_filling_sequence( 10 | model, 11 | seqs, 12 | context_lengths, 13 | strategy, 14 | max_memory_length=100000, 15 | get_masks_and_position_ids=get_masks_and_position_ids_default, 16 | mems=None, 17 | **kw_args 18 | ): 19 | ''' 20 | seq: [2, 3, 5, ..., -1(to be generated), -1, ...] 
21 | mems: [num_layers, batch_size, len_mems(index), mem_hidden_size] 22 | cache, should be first mems.shape[1] parts of context_tokens. 23 | mems are the first-level citizens here, but we don't assume what is memorized. 24 | input mems are used when multi-phase generation. 25 | ''' 26 | assert len(seqs.shape) == 2 27 | 28 | # building the initial tokens, attention_mask, and position_ids 29 | batch_size, context_length = seqs.shape 30 | seqs, attention_mask, position_ids = get_masks_and_position_ids(seqs) 31 | tokens = seqs[..., :context_length] 32 | if attention_mask.dtype != torch.bool: 33 | attention_mask = attention_mask.type_as(next(model.parameters())) # if fp16 34 | # initialize generation 35 | counter = context_length - 1 # Last fixed index is ``counter'' 36 | index = 0 if mems is None else mems.shape[2] # Next forward starting index, also the length of cache. 37 | num_beams = 1 38 | # step-by-step generation 39 | while counter < seqs.shape[1] - 1: 40 | # Now, we want to generate seq[counter + 1], 41 | # token[:, index: counter+1] needs forwarding. 42 | # forward 43 | tokens = tokens.reshape(batch_size * num_beams, -1) 44 | mems = mems.reshape(mems.shape[0], batch_size * num_beams, mems.shape[-2], mems.shape[-1]) if mems is not None else None 45 | logits, *output_per_layers = model( 46 | tokens[:, index:], 47 | position_ids[..., index: counter+1], 48 | attention_mask[..., index: counter+1, :counter+1], # TODO memlen 49 | mems=mems, 50 | **kw_args 51 | ) 52 | mem_kv = [o['mem_kv'] for o in output_per_layers] 53 | mems = update_mems(mem_kv, mems, max_memory_length=max_memory_length) 54 | if counter == context_length - 1: 55 | logits = logits[torch.arange(batch_size), context_lengths - 1] 56 | else: 57 | logits = logits[:, -1] 58 | counter += 1 59 | index = counter 60 | # if torch.distributed.get_rank() == 0: 61 | # print(f"counter: {counter}: logits: {logits.float().abs().mean()}") 62 | # sampling 63 | logits = logits.reshape(batch_size, num_beams, -1) 64 | tokens = tokens.reshape(batch_size, num_beams, -1) 65 | mems = mems.reshape(mems.shape[0], batch_size, num_beams, mems.shape[-2], mems.shape[-1]) 66 | tokens, mems = strategy.forward(logits, tokens, mems) 67 | if len(tokens.shape) == 3 and num_beams == 1: 68 | num_beams = tokens.shape[1] 69 | position_ids = position_ids.unsqueeze(1).expand(batch_size, num_beams, -1).reshape(batch_size * num_beams, -1) 70 | attention_mask_shape = attention_mask.shape[-3:] 71 | attention_mask = attention_mask.unsqueeze(1).expand(batch_size, num_beams, -1, -1, -1).reshape( 72 | batch_size * num_beams, *attention_mask_shape) 73 | if strategy.is_done: 74 | break 75 | return strategy.finalize(tokens, mems) 76 | 77 | 78 | class ModelForEvaluation(torch.nn.Module): 79 | def __init__(self, model): 80 | super().__init__() 81 | 82 | self.model = model 83 | self.device = next(self.model.parameters()).device 84 | 85 | @staticmethod 86 | def process_data(batch, device): 87 | return ( 88 | batch["tokens"].to(device=device).long(), 89 | batch["position_ids"].to(device=device).long(), 90 | batch["attention_mask"].to(device=device).bool().unsqueeze(1), 91 | ) 92 | 93 | def cond_log_prob(self, batch) -> List[List[float]]: 94 | """ 95 | @return: Conditional log probability of each option 96 | """ 97 | tokens, position_ids, attention_mask = self.process_data(batch, self.device) 98 | choices_batch, choice_target_ids_batch = batch["choices"], batch["choice_target_ids"] 99 | is_single_token = batch["is_single_token"] 100 | 101 | self.model.eval() 102 | with 
torch.no_grad(): 103 | logits, *output_per_layers = self.model(tokens, position_ids, attention_mask, log_attention_weights=None) 104 | logits_batch = torch.nn.functional.log_softmax(logits, dim=-1) 105 | 106 | # output: [b, sq, vocab] 107 | log_probs = [] 108 | 109 | if is_single_token: # Single token 110 | for logits, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch): 111 | log_probs.append(logits[choice_target_ids[0], choices].tolist()) 112 | else: # Multi token 113 | for output, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch): 114 | log_probs_single = [] 115 | for choice, choice_target_id in zip(choices, choice_target_ids): 116 | tmp = output[choice_target_id, choice] 117 | log_probs_single.append(tmp.sum().tolist()) 118 | log_probs.append(log_probs_single) 119 | return log_probs 120 | 121 | def generate_text(self, sample, strategy, return_all_beams=False) -> Union[ 122 | List[List[int]], List[List[List[int]]]]: 123 | """ 124 | @return: A list of text model generated, sorted by score in descending order 125 | """ 126 | 127 | seqs = sample["tokens"].to(device=self.device).long() 128 | context_lengths = sample["context_length"].long() 129 | 130 | def get_masks_and_position_ids(seq): 131 | batch_size = seq.shape[0] 132 | max_gen_length = sample['target_position_ids'].shape[-1] 133 | tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode='constant', value=-1) 134 | position_ids = torch.cat((sample['position_ids'], sample['target_position_ids']), dim=-1) 135 | position_ids = position_ids.to(device=self.device).long() 136 | attention_mask = sample["attention_mask"].to(device=self.device) 137 | context_mask = attention_mask[torch.arange(batch_size), context_lengths - 1].unsqueeze(1).repeat(1, 138 | max_gen_length, 139 | 1) 140 | causal_mask = torch.tril(context_mask.new_ones((batch_size, max_gen_length, max_gen_length))) < 0.5 141 | generation_mask = torch.cat( 142 | (context_mask, causal_mask), dim=-1) 143 | attention_mask = torch.nn.functional.pad(attention_mask, (0, max_gen_length), mode='constant', value=1) 144 | attention_mask = torch.cat((attention_mask, generation_mask), dim=1) 145 | attention_mask = attention_mask.bool().unsqueeze(1) 146 | return tokens, attention_mask, position_ids 147 | 148 | self.model.eval() 149 | with torch.no_grad(): 150 | output = batch_filling_sequence( 151 | self.model, 152 | seqs, 153 | context_lengths, 154 | get_masks_and_position_ids=get_masks_and_position_ids, 155 | strategy=strategy, 156 | )[0] 157 | 158 | if isinstance(output, torch.Tensor): # different strategies 159 | output = output.tolist() 160 | 161 | output_targets = [] 162 | context_length = seqs.shape[1] 163 | for lines in output: 164 | lines = lines.tolist() if isinstance(lines, torch.Tensor) else lines 165 | output_target = [] 166 | if not isinstance(lines, list): 167 | lines = [lines] 168 | for line in lines: 169 | unfinished = line.index(-1) if -1 in line else len(line) 170 | if line[unfinished - 1] in strategy.end_tokens: 171 | unfinished -= 1 172 | line = line[context_length:unfinished] 173 | output_target.append(line) 174 | if not return_all_beams: 175 | output_targets.append(output_target[0]) 176 | else: 177 | output_targets.append(output_target) 178 | return output_targets 179 | 180 | 181 | def calculate_loss(self, batch) -> List[float]: 182 | tokens, position_ids, attention_mask = self.process_data(batch, self.device) 183 | targets, loss_masks = ( 184 | batch["targets"].to(device=self.device).long(), 
185 | batch["loss_masks"].to(device=self.device).long(), 186 | ) 187 | 188 | original_parallel_output = self.model.transformer.parallel_output 189 | self.model.transformer.parallel_output = True 190 | self.model.eval() 191 | 192 | with torch.no_grad(): 193 | logits, *output_per_layers = self.model(tokens, position_ids, attention_mask, log_attention_weights=None) 194 | losses = vocab_parallel_cross_entropy(logits.contiguous().float(), targets) 195 | loss = torch.sum(losses * loss_masks, dim=-1) 196 | 197 | self.model.transformer.parallel_output = original_parallel_output 198 | 199 | return loss.tolist() 200 | -------------------------------------------------------------------------------- /evaluation/tasks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import numpy as np 4 | import torch.distributed as dist 5 | 6 | from typing import Dict, Callable, Type, Tuple, List, Any 7 | from abc import ABC, abstractmethod 8 | from glob import glob 9 | from os.path import join, relpath 10 | from collections import defaultdict 11 | 12 | from SwissArmyTransformer.tokenization.icetk_glm_130B.ice_tokenizer import _IceTokenizer 13 | 14 | from generation import BaseStrategy, BeamSearchStrategy 15 | from .configs import BaseConfig, GenerationTaskConfig, MultiChoiceTaskConfig, LanguageModelTaskConfig 16 | from .model import ModelForEvaluation 17 | from .dataset import EvaluationDataset, GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset 18 | from .utils import build_data_loader, gather_result, print_rank_0 19 | from .metrics import DEFAULT_METRICS 20 | 21 | 22 | class BaseTask(ABC): 23 | model: ModelForEvaluation 24 | tokenizer: _IceTokenizer 25 | config: BaseConfig 26 | file_groups: Dict[str, List[str]] 27 | 28 | @classmethod 29 | def config_class(cls) -> Type[BaseConfig]: 30 | return BaseConfig 31 | 32 | @property 33 | def metrics(self) -> Dict[str, Callable]: 34 | return {metric: DEFAULT_METRICS[metric] for metric in self.config.metrics} 35 | 36 | def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: BaseConfig): 37 | self.model = model 38 | self.tokenizer = tokenizer 39 | self.config = config 40 | self.config.metrics = list(self.metrics.keys()) 41 | 42 | self.file_groups = self.get_file_groups() 43 | self.verbose = dist.get_rank() == 0 44 | 45 | def get_file_groups(self): 46 | pattern_group = {} 47 | if isinstance(self.config.file_pattern, str): 48 | pattern_group["all"] = self.config.file_pattern 49 | else: 50 | pattern_group = self.config.file_pattern 51 | return { 52 | name: [ 53 | relpath(path, start=self.config.path) 54 | for path in sorted(glob(join(self.config.path, pattern), recursive=True)) 55 | ] 56 | for name, pattern in pattern_group.items() 57 | } 58 | 59 | def evaluate(self): 60 | dist.barrier() 61 | start = time.time() 62 | print_rank_0("\n") 63 | print_rank_0(f"{self.config}") 64 | print_rank_0(f"Evaluating task {self.config.name}:") 65 | 66 | result_dict_all = {} 67 | 68 | for group_name, filelist in self.file_groups.items(): 69 | print_rank_0(f" Evaluating group {group_name}:") 70 | 71 | result_dict_group = {} 72 | for file in filelist: 73 | dataset = self.build_dataset(file) 74 | dataloader = build_data_loader( 75 | dataset, 76 | micro_batch_size=self.config.micro_batch_size, 77 | num_workers=1, 78 | drop_last=False, 79 | collate_fn=dataset.collate_fn if dataset.has_collate_fn else None, 80 | ) 81 | 82 | prediction = [] 83 | with torch.no_grad(): 84 | for _, batch in 
enumerate(dataloader): 85 | prediction.append(self.predict_single_batch(batch)) 86 | 87 | prediction = gather_result(prediction, len(dataset), self.config.micro_batch_size) 88 | result_dict = {key: metric(prediction, dataset.data) for key, metric in self.metrics.items()} 89 | result_dict_group[file] = (result_dict, len(dataset)) 90 | 91 | if self.verbose: 92 | self.report_single_metrics(file, result_dict) 93 | 94 | result_dict_all[group_name] = result_dict_group 95 | 96 | print_rank_0(f"Evaluation results of task {self.config.name}:") 97 | 98 | if self.verbose: 99 | for group_name, result_dict_group in result_dict_all.items(): 100 | self.report_group_metrics(group_name, result_dict_group) 101 | self.report_overall_metrics( 102 | {k: v for result_dict_group in result_dict_all.values() for k, v in result_dict_group.items()}, 103 | ) 104 | 105 | print_rank_0(f"Finish task {self.config.name} in {time.time() - start:.1f}s.") 106 | 107 | def report_single_metrics(self, file: str, result_dict: Dict[str, float]): 108 | output_str = f" Finish {file}" 109 | for key, value in result_dict.items(): 110 | output_str += f", {key} = {value:.3f}" 111 | print_rank_0(output_str) 112 | 113 | @staticmethod 114 | def calc_group_metrics(result_dict_group: Dict[str, Tuple[Dict[str, float], int]]): 115 | metrics_dict = defaultdict(lambda: []) 116 | weight = [] 117 | for file, (result_dict, length) in result_dict_group.items(): 118 | for key, value in result_dict.items(): 119 | metrics_dict[key].append(value) 120 | weight.append(length) 121 | return { 122 | name: { 123 | "max": np.max(value), 124 | "median": np.median(value), 125 | "average": np.average(value, weights=weight), 126 | } 127 | for name, value in metrics_dict.items() 128 | } 129 | 130 | def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): 131 | stats_dict = self.calc_group_metrics(result_dict_group) 132 | if len(stats_dict) == 1: 133 | name, stats = next(iter(stats_dict.items())) 134 | print_rank_0( 135 | " " * level + f"Group {group_name} {name}: max = {stats['max']:.3f}, " 136 | f"median = {stats['median']:.3f}, average = {stats['average']:.3f}" 137 | ) 138 | else: 139 | print_rank_0(" " * level + f" Group {group_name}: ") 140 | for name, stats in stats_dict.items(): 141 | print( 142 | " " * (level + 1) + f"Metric {name}: max = {stats['max']:.3f}, " 143 | f"median = {stats['median']:.3f}, average = {stats['average']:.3f}" 144 | ) 145 | 146 | def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): 147 | pass 148 | 149 | @abstractmethod 150 | def predict_single_batch(self, batch) -> List[Any]: 151 | pass 152 | 153 | @abstractmethod 154 | def build_dataset(self, relative_path: str) -> EvaluationDataset: 155 | pass 156 | 157 | 158 | class GenerationTask(BaseTask, ABC): 159 | config: GenerationTaskConfig 160 | 161 | @classmethod 162 | def config_class(cls): 163 | return GenerationTaskConfig 164 | 165 | def build_dataset(self, relative_path): 166 | return GenerationTaskDataset(join(self.config.path, relative_path), self.config) 167 | 168 | def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: GenerationTaskConfig): 169 | super(GenerationTask, self).__init__(model, tokenizer, config) 170 | 171 | end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")] 172 | if self.config.sampling_strategy == "BaseStrategy": 173 | self.strategy = BaseStrategy(batch_size=self.config.micro_batch_size, temperature=1.0, top_k=1, 174 | 
end_tokens=end_tokens) 175 | elif self.config.sampling_strategy == "BeamSearchStrategy": 176 | self.strategy = BeamSearchStrategy( 177 | self.config.micro_batch_size, 178 | self.config.num_beams, 179 | length_penalty=self.config.length_penalty, 180 | consider_end=True, 181 | end_tokens=end_tokens, 182 | no_repeat_ngram_size=self.config.no_repeat_ngram_size, 183 | min_gen_length=self.config.min_gen_length, 184 | deterministic=True, # For evaluation, we need a determined generation strategy 185 | ) 186 | else: 187 | raise ValueError(f"unknown strategy {self.config.sampling_strategy}") 188 | 189 | def predict_single_batch(self, batch) -> List[List[int]]: 190 | output = self.model.generate_text(batch, self.strategy, return_all_beams=False) 191 | return output 192 | 193 | 194 | class MultiChoiceTask(BaseTask, ABC): 195 | config: MultiChoiceTaskConfig 196 | 197 | @classmethod 198 | def config_class(cls): 199 | return MultiChoiceTaskConfig 200 | 201 | def build_dataset(self, relative_path): 202 | return MultiChoiceTaskDataset(join(self.config.path, relative_path), self.config) 203 | 204 | def predict_single_batch(self, batch) -> List[int]: 205 | log_probs = self.model.cond_log_prob(batch) 206 | return [np.argmax(log_probs_single).item() for log_probs_single in log_probs] 207 | 208 | 209 | class LanguageModelTask(BaseTask, ABC): 210 | config: LanguageModelTaskConfig 211 | 212 | @classmethod 213 | def config_class(cls): 214 | return LanguageModelTaskConfig 215 | 216 | def build_dataset(self, relative_path): 217 | return LanguageModelTaskDataset(join(self.config.path, relative_path), self.config) 218 | 219 | def predict_single_batch(self, batch) -> List[float]: 220 | return self.model.calculate_loss(batch) 221 | -------------------------------------------------------------------------------- /evaluation/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | from SwissArmyTransformer import mpu, get_tokenizer 5 | 6 | 7 | def print_rank_0(*args, **kwargs): 8 | if torch.distributed.get_rank() == 0: 9 | print(*args, **kwargs) 10 | 11 | 12 | def build_data_loader(dataset, micro_batch_size, num_workers, drop_last, collate_fn=None): 13 | # Sampler. 14 | world_size = mpu.get_data_parallel_world_size() 15 | rank = mpu.get_data_parallel_rank() 16 | sampler = torch.utils.data.distributed.DistributedSampler( 17 | dataset, num_replicas=world_size, rank=rank, shuffle=False 18 | ) 19 | 20 | # Data loader. Note that batch size is the per GPU batch size. 
21 | data_loader = torch.utils.data.DataLoader( 22 | dataset, 23 | batch_size=micro_batch_size, 24 | sampler=sampler, 25 | shuffle=False, 26 | num_workers=num_workers, 27 | drop_last=drop_last, 28 | pin_memory=True, 29 | collate_fn=collate_fn, 30 | ) 31 | 32 | return data_loader 33 | 34 | 35 | def gather_result(prediction, total_length, micro_batch_size): 36 | """ 37 | @param prediction: Local predictions with order defined by distributed sampler 38 | @param total_length: Total sample num 39 | @return: [sample_0, sample_1, ..., sample_{total_length-1}] 40 | """ 41 | torch.cuda.empty_cache() 42 | world_size = mpu.get_data_parallel_world_size() 43 | prediction_gathered = [None for _ in range(world_size)] 44 | dist.all_gather_object(prediction_gathered, prediction, group=mpu.get_data_parallel_group()) 45 | prediction = [] 46 | for i in range(len(prediction_gathered[0])): 47 | for j in range(micro_batch_size): 48 | for k in range(world_size): 49 | if j < len(prediction_gathered[k][i]): 50 | prediction.append(prediction_gathered[k][i][j]) 51 | prediction = prediction[:total_length] 52 | return prediction 53 | 54 | 55 | def get_tokenized_input(item, key): 56 | if key in item: 57 | return item[key] 58 | tokenizer = get_tokenizer() 59 | pretokenized_key = key + "_pretokenized" 60 | assert pretokenized_key in item 61 | if isinstance(item[pretokenized_key], list): 62 | result = [] 63 | for raw in item[pretokenized_key]: 64 | result.append(tokenizer.tokenize(raw)) 65 | return result 66 | else: 67 | return tokenizer.tokenize(item[pretokenized_key]) 68 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import stat 4 | import re 5 | 6 | from functools import partial 7 | from typing import List, Tuple 8 | 9 | from SwissArmyTransformer import mpu 10 | from evaluation.model import batch_filling_sequence 11 | from generation import BeamSearchStrategy, BaseStrategy 12 | from SwissArmyTransformer.generation.utils import timed_name, generate_continually 13 | from initialize import initialize, initialize_model_and_tokenizer 14 | 15 | 16 | def add_generation_specific_args(parser): 17 | parser.add_argument("--sampling-strategy", type=str, default="BaseStrategy", help="Type of sampling strategy.") 18 | parser.add_argument("--min-gen-length", type=int, default=0, help="The minimum length each blank should generate.") 19 | parser.add_argument( 20 | "--print-all-beams", action="store_true", help="Print all output generated by beam search strategy." 
21 | ) 22 | 23 | 24 | def isEnglish(s): 25 | try: 26 | s.encode(encoding="utf-8").decode("ascii") 27 | except UnicodeDecodeError: 28 | return False 29 | else: 30 | return True 31 | 32 | 33 | def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False): 34 | context_length = seq.shape[1] 35 | tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode="constant", value=-1) 36 | attention_mask = torch.ones((1, tokens.shape[-1], tokens.shape[-1]), device=tokens.device) 37 | attention_mask.tril_() 38 | attention_mask[..., : context_length - 1] = 1 39 | attention_mask.unsqueeze_(1) 40 | attention_mask = (attention_mask < 0.5).bool() 41 | 42 | position_ids = torch.arange(tokens.shape[-1], dtype=torch.long, device=tokens.device) 43 | if not gmask: 44 | position_ids[context_length - 1 :] = mask_position 45 | 46 | position_ids = position_ids.unsqueeze(0) 47 | 48 | return tokens, attention_mask, position_ids 49 | 50 | 51 | def fill_blanks(raw_text: str, model, tokenizer, strategy) -> Tuple[List[str], List[str], List[List[str]]]: 52 | # add MASK 53 | generation_mask = "[gMASK]" 54 | if "[MASK]" in raw_text: 55 | generation_mask = "[MASK]" 56 | elif "[sMASK]" in raw_text: 57 | generation_mask = "[sMASK]" 58 | use_gmask = "[MASK]" not in raw_text and "[sMASK]" not in raw_text 59 | 60 | mask_pattern = r"\[[sg]?MASK\]" 61 | text_list = re.split(mask_pattern, raw_text) 62 | pattern_list = re.compile(mask_pattern).findall(raw_text) 63 | seq = [] 64 | for i in range(len(pattern_list)): 65 | pattern = pattern_list[i] 66 | sub_text = text_list[i] 67 | seq.extend(tokenizer.tokenize(sub_text)) 68 | seq.append(tokenizer.get_command(pattern)) 69 | 70 | seq.extend(tokenizer.tokenize(text_list[-1])) 71 | 72 | if "MASK]" not in raw_text: 73 | seq += [tokenizer.get_command(generation_mask)] 74 | raw_text += " " + generation_mask 75 | if not raw_text.endswith("MASK]"): 76 | seq = seq + [tokenizer.get_command("eos")] 77 | if mpu.get_model_parallel_rank() == 0: 78 | print("\nInput: {}\n".format(raw_text)) 79 | if len(seq) > args.max_sequence_length: 80 | raise ValueError("text too long.") 81 | 82 | # generation 83 | is_english = isEnglish(raw_text) 84 | output_list = [seq] 85 | num_output = args.num_beams if args.sampling_strategy == "BeamSearchStrategy" else 1 86 | last_pos, answers, answers_with_style, blanks = ( 87 | [0] * num_output, 88 | ["" for _ in range(num_output)], 89 | ["" for _ in range(num_output)], 90 | [[] for _ in range(num_output)], 91 | ) 92 | 93 | # continually detect the first mark position 94 | while True: 95 | seq = output_list[0] 96 | # detect mask position 97 | mask_token = tokenizer.get_command(generation_mask) 98 | if mask_token not in seq: 99 | break 100 | mask_position = seq.index(mask_token) 101 | 102 | output_list = [] 103 | 104 | input_seq = torch.cuda.LongTensor( 105 | [seq + [tokenizer.get_command("sop")]], 106 | device=args.device, 107 | ) 108 | output, _ = batch_filling_sequence( 109 | model, 110 | input_seq, 111 | torch.cuda.LongTensor([input_seq.shape[-1]], device=args.device), 112 | strategy=strategy, 113 | get_masks_and_position_ids=partial( 114 | get_masks_and_position_ids, 115 | mask_position=mask_position, 116 | max_gen_length=args.out_seq_length - input_seq.shape[-1], 117 | gmask=use_gmask, 118 | ), 119 | ) 120 | if isinstance(output, torch.Tensor): # different strategies 121 | output = output.tolist() 122 | output = output[0] # batch_size = 1 123 | output_list.extend(output) 124 | 125 | # clip -1s and fill back generated things into seq 126 | for i in 
range(len(output_list)): 127 | output = output_list[i].tolist() if isinstance(output_list[i], torch.Tensor) else output_list[i] 128 | try: 129 | unfinished = output.index(-1) 130 | except ValueError: 131 | unfinished = len(output) 132 | if output[unfinished - 1] in strategy.end_tokens: 133 | unfinished -= 1 134 | bog = output.index(tokenizer.get_command("sop")) 135 | 136 | prefix = tokenizer.detokenize(output[last_pos[i] : mask_position]) 137 | blank = tokenizer.detokenize(output[bog + 1 : unfinished]) 138 | answers_with_style[i] += ( 139 | prefix 140 | + (" " if is_english else "") 141 | + ("\033[4m" if use_gmask else "\x1b[0;32m\033[4m") 142 | + blank 143 | + ("\033[0m" if use_gmask else "\033[0m\x1b[0m") 144 | + (" " if is_english else "") 145 | ) 146 | blanks[i].append(blank) 147 | last_pos[i] = mask_position + unfinished - (bog + 1) 148 | output_list[i] = output[:mask_position] + output[bog + 1 : unfinished] + output[mask_position + 1 : bog] 149 | 150 | for i, output in enumerate(output_list): 151 | if output[-1] == tokenizer.get_command("eos"): 152 | output = output[:-1] 153 | answers_with_style[i] += tokenizer.detokenize(output[last_pos[i] :]) 154 | answers[i] = tokenizer.detokenize(output) 155 | 156 | return answers, answers_with_style, blanks 157 | 158 | 159 | def main(args): 160 | model, tokenizer = initialize_model_and_tokenizer(args) 161 | 162 | end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")] 163 | 164 | if args.sampling_strategy == "BaseStrategy": 165 | strategy = BaseStrategy( 166 | batch_size=1, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, end_tokens=end_tokens 167 | ) 168 | elif args.sampling_strategy == "BeamSearchStrategy": 169 | strategy = BeamSearchStrategy( 170 | 1, 171 | args.num_beams, 172 | length_penalty=args.length_penalty, 173 | consider_end=True, 174 | end_tokens=end_tokens, 175 | no_repeat_ngram_size=args.no_repeat_ngram_size, 176 | min_gen_length=args.min_gen_length, 177 | ) 178 | else: 179 | raise ValueError(f"unknown strategy {args.sampling_strategy}") 180 | 181 | def process(raw_text): 182 | if args.with_id: 183 | query_id, raw_text = raw_text.split("\t") 184 | 185 | answers, answers_with_style, blanks = fill_blanks(raw_text, model, tokenizer, strategy) 186 | 187 | # save 188 | if args.with_id: 189 | full_path = os.path.join(args.output_path, query_id + ".txt") 190 | else: 191 | prefix = raw_text.replace("/", "")[:20] 192 | full_path = timed_name(prefix, ".txt", args.output_path) 193 | if mpu.get_model_parallel_rank() == 0: 194 | if args.print_all_beams and len(answers) > 1: 195 | for idx, answer_with_style in enumerate(answers_with_style): 196 | print(f"Output beam {idx}:", answer_with_style) # print the first. 197 | if len(answer_with_style) > 120: 198 | print("") 199 | else: 200 | print(f"Output:", answers_with_style[0]) # print the first. 
201 | with open(full_path, "w", encoding="utf-8") as fout: 202 | for answer in answers: 203 | fout.write(answer + "\n") 204 | 205 | os.chmod(full_path, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU) 206 | 207 | os.makedirs(args.output_path, exist_ok=True) 208 | generate_continually(process, args.input_source) 209 | 210 | 211 | if __name__ == "__main__": 212 | args = initialize(extra_args_provider=add_generation_specific_args) 213 | 214 | with torch.no_grad(): 215 | main(args) 216 | -------------------------------------------------------------------------------- /generation/__init__.py: -------------------------------------------------------------------------------- 1 | from .strategies import BaseStrategy, BeamSearchStrategy 2 | -------------------------------------------------------------------------------- /generation/strategies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from SwissArmyTransformer.generation.sampling_strategies.base_strategy import top_k_logits 5 | 6 | class BaseStrategy: 7 | def __init__(self, batch_size, invalid_slices=[], temperature=1., top_k=200, eps=1e-4, top_p=0.0, end_tokens=None): 8 | self.batch_size = batch_size 9 | self.invalid_slices = invalid_slices 10 | self.temperature = temperature 11 | self.topk = top_k 12 | self.top_p = top_p 13 | self.eps = eps 14 | if end_tokens is None: 15 | end_tokens = [] 16 | self.end_tokens = end_tokens 17 | self._is_done = np.zeros(self.batch_size, dtype=np.bool) 18 | 19 | @property 20 | def is_done(self) -> bool: 21 | return self._is_done.all() 22 | 23 | def forward(self, logits, tokens, mems, temperature=None): 24 | logits = logits.view(-1, logits.size(-1)) 25 | batch_size = tokens.shape[0] 26 | if temperature is None: 27 | temperature = self.temperature 28 | logits = logits / temperature 29 | for invalid_slice in self.invalid_slices: 30 | logits[..., invalid_slice] = -65504 31 | 32 | logits = top_k_logits(logits, self.topk, self.top_p) 33 | probs = F.softmax(logits.float(), dim=-1) # float is essetial, due to a bug in Pytorch 34 | pred = torch.multinomial(probs, num_samples=1) 35 | for i in range(self.batch_size): 36 | if i >= batch_size: 37 | self._is_done[i] = True 38 | elif self._is_done[i]: 39 | pred[i] = -1 40 | elif pred[i].item() in self.end_tokens: 41 | self._is_done[i] = True 42 | tokens = torch.cat((tokens, pred.view(tokens.shape[:-1] + (1,))), dim=-1) 43 | return tokens, mems 44 | 45 | def finalize(self, tokens, mems): 46 | self._is_done = np.zeros(self.batch_size, dtype=np.bool) 47 | return tokens, mems 48 | 49 | 50 | class BeamSearchStrategy: 51 | def __init__( 52 | self, 53 | batch_size, 54 | num_beams, 55 | length_penalty=1.0, 56 | consider_end=False, 57 | end_tokens=[], 58 | invalid_slices=[], 59 | no_repeat_ngram_size=0, 60 | min_gen_length=0, 61 | deterministic=False, 62 | ): 63 | self.batch_size = batch_size 64 | self.num_beams = num_beams 65 | self.length_penalty = length_penalty 66 | self.end_tokens = end_tokens 67 | self.ngram = no_repeat_ngram_size 68 | self.min_gen_length = min_gen_length 69 | self.invalid_slices = invalid_slices 70 | self.consider_end = consider_end 71 | self.deterministic = deterministic 72 | self._init_cache() 73 | 74 | def _init_cache(self): 75 | self.end_beams = [[] for _ in range(self.batch_size)] # list of LongTensors 76 | self.end_beams_penalized_scores = [[] for _ in range(self.batch_size)] # list of LongTensors 77 | self.cached_beam_scores = 0 # [batch_size] 
78 | self.cached_beam_ngram_bans = [[{} for _ in range(self.num_beams)] for _ in range(self.batch_size)] 79 | self.length_generated = 0 80 | self._is_done = np.zeros(self.batch_size, dtype=np.bool) 81 | 82 | def _add_end_beams(self, score, beam, batch_idx): 83 | score = score / ((5.0 + len(beam)) / 6) ** self.length_penalty # Magic number for OpenNMT 84 | for i in range(len(self.end_beams[batch_idx]), -1, -1): 85 | if i == 0 or score < self.end_beams_penalized_scores[batch_idx][i - 1]: 86 | break 87 | self.end_beams[batch_idx].insert(i, beam) 88 | self.end_beams_penalized_scores[batch_idx].insert(i, score) 89 | 90 | self.end_beams[batch_idx] = self.end_beams[batch_idx][: self.num_beams] 91 | self.end_beams_penalized_scores[batch_idx] = self.end_beams_penalized_scores[batch_idx][: self.num_beams] 92 | 93 | @property 94 | def is_done(self) -> bool: 95 | return self._is_done.all() 96 | 97 | def forward(self, logits, tokens, mems): 98 | batch_size, num_beams, vocab_size = logits.shape 99 | seq_len = tokens.shape[-1] 100 | logits = logits.float() 101 | for invalid_slice in self.invalid_slices: 102 | logits[..., invalid_slice] = -65504 103 | if self.min_gen_length > self.length_generated: 104 | for end_token in self.end_tokens: 105 | logits[..., end_token] = -65504 106 | if self.ngram > 0 and seq_len > self.ngram: 107 | for batch_idx in range(batch_size): 108 | for i in range(num_beams): 109 | ngram_prefix = tokens[batch_idx, i, -(self.ngram - 1) :].tolist() # TODO ngram=1 110 | for banned_index in self.cached_beam_ngram_bans[batch_idx][i].get(tuple(ngram_prefix), []): 111 | logits[batch_idx, i, banned_index] = -65504 112 | 113 | next_token_scores = F.log_softmax(logits, dim=-1) # [batch_size, vocab_size] 114 | prev_scores = self.cached_beam_scores 115 | if isinstance(prev_scores, torch.Tensor): 116 | prev_scores = prev_scores[..., None].expand_as(next_token_scores) 117 | next_token_scores = next_token_scores + prev_scores 118 | 119 | next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) 120 | 121 | probs = F.softmax(next_token_scores, dim=-1) 122 | if num_beams < self.num_beams: # First token 123 | probs = probs[..., :vocab_size] 124 | if self.deterministic: 125 | next_tokens = torch.topk(probs, k=(max(1, len(self.end_tokens)) + 1) * self.num_beams).indices # [2*nb] 126 | else: 127 | next_tokens = torch.multinomial( 128 | probs, num_samples=(max(1, len(self.end_tokens)) + 1) * self.num_beams 129 | ) # [2*nb] 130 | next_token_scores = next_token_scores[torch.arange(batch_size).unsqueeze(1), next_tokens] 131 | next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1) 132 | next_tokens = next_tokens[torch.arange(batch_size).unsqueeze(1), _indices] 133 | 134 | next_indices = torch.div(next_tokens, vocab_size, rounding_mode="trunc") 135 | next_tokens = next_tokens % vocab_size 136 | 137 | # select out end beams or continue beams 138 | beam_continue_batch, score_continue_batch, mems_continue_batch = [], [], [] 139 | for batch_idx in range(batch_size): 140 | beam_continue = [] 141 | scores_continue = [] 142 | bans_continue = [] 143 | mems_contiue = [] 144 | for i in range(len(next_tokens[batch_idx])): 145 | beam = torch.cat((tokens[batch_idx, next_indices[batch_idx, i]], next_tokens[batch_idx, i : i + 1])) 146 | if not self._is_done[batch_idx] and int(next_tokens[batch_idx, i]) in self.end_tokens: 147 | self._add_end_beams(next_token_scores[batch_idx, i], beam, batch_idx) 148 | elif len(beam_continue) < self.num_beams: 149 | beam_continue.append(beam) 
150 | mems_continue.append(mems[:, batch_idx, next_indices[batch_idx, i]]) 151 | # update caches 152 | scores_continue.append(next_token_scores[batch_idx, i]) 153 | if self.ngram > 0: 154 | bans = self.cached_beam_ngram_bans[batch_idx][next_indices[batch_idx, i]].copy() 155 | # TODO ngram=1 156 | ngram_prefix = tuple(tokens[batch_idx, next_indices[batch_idx, i], -(self.ngram - 1):].tolist()) 157 | bans[ngram_prefix] = bans.get(ngram_prefix, tuple()) + (next_tokens[batch_idx, i],) 158 | bans_continue.append(bans) 159 | else: 160 | break 161 | beam_continue_batch.append(torch.stack(beam_continue)) 162 | mems_continue_batch.append(torch.stack(mems_continue, dim=1)) 163 | score_continue_batch.append(scores_continue) 164 | self.cached_beam_ngram_bans[batch_idx] = bans_continue 165 | tokens = torch.stack(beam_continue_batch) 166 | mems = torch.stack(mems_continue_batch, dim=1) 167 | self.cached_beam_scores = torch.tensor(score_continue_batch, device=logits.device) 168 | self.length_generated += 1 169 | for batch_idx in range(self.batch_size): 170 | if batch_idx >= batch_size: 171 | self._is_done[batch_idx] = True 172 | elif ( 173 | len(self.end_beams[batch_idx]) == self.num_beams 174 | and self.end_beams_penalized_scores[batch_idx][-1] 175 | >= self.cached_beam_scores[batch_idx].max() / ((5.0 + (seq_len + 1)) / 6) ** self.length_penalty 176 | ): # We're done if none of the current beams can become better than the worst beam in end_beams 177 | self._is_done[batch_idx] = True 178 | 179 | return tokens, mems 180 | 181 | def finalize(self, tokens, mems): 182 | if self.consider_end: 183 | batch_size, num_beams = tokens.shape[:2] 184 | for batch_idx in range(batch_size): 185 | if not self._is_done[batch_idx]: 186 | for i in range(num_beams): 187 | self._add_end_beams(self.cached_beam_scores[batch_idx, i], tokens[batch_idx, i], batch_idx) 188 | mems = None 189 | ret = self.end_beams[:batch_size] 190 | else: 191 | ret = tokens 192 | self._init_cache() 193 | return ret, mems 194 | -------------------------------------------------------------------------------- /initialize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import time 4 | 5 | from quantization import quantize 6 | 7 | from SwissArmyTransformer import get_args, get_tokenizer 8 | from SwissArmyTransformer.arguments import initialize_distributed 9 | from SwissArmyTransformer.training import load_checkpoint 10 | from SwissArmyTransformer.model import GLM130B 11 | from SwissArmyTransformer.mpu import get_model_parallel_world_size, get_model_parallel_rank, get_model_parallel_group 12 | 13 | 14 | def add_bminf_args(parser): 15 | """Arguments for BMInf""" 16 | group = parser.add_argument_group("BMInf") 17 | 18 | group.add_argument("--bminf", action="store_true", help="Use BMInf to support low resource evaluation") 19 | group.add_argument("--bminf-memory-limit", type=int, default=20, help="Max memory for model per GPU (in GB)") 20 | return parser 21 | 22 | 23 | def add_quantization_args(parser): 24 | group = parser.add_argument_group("Quantization") 25 | 26 | group.add_argument("--quantization-bit-width", type=int, default=None) 27 | group.add_argument("--from-quantized-checkpoint", action="store_true", help="Loading from a quantized checkpoint") 28 | 29 | 30 | def add_initialization_args(parser): 31 | group = parser.add_argument_group("Initialization") 32 | 33 | group.add_argument( 34 | "--sequential-initialization", 35 | action="store_true", 36 | help="Initialize sequentially in tensor
parallel group (reduce CPU RAM for initialization)", 37 | ) 38 | 39 | 40 | def initialize(extra_args_provider): 41 | parser = argparse.ArgumentParser(add_help=False) 42 | add_bminf_args(parser) 43 | add_quantization_args(parser) 44 | add_initialization_args(parser) 45 | GLM130B.add_model_specific_args(parser) 46 | extra_args_provider(parser) 47 | known, args_list = parser.parse_known_args() 48 | args = get_args(args_list) 49 | args = argparse.Namespace(**vars(args), **vars(known)) 50 | args.do_train = False 51 | initialize_distributed(args) 52 | return args 53 | 54 | 55 | def initialize_model_and_tokenizer(args): 56 | tokenizer = get_tokenizer(args) 57 | 58 | torch.distributed.barrier() 59 | start = time.time() 60 | 61 | for i in range(get_model_parallel_world_size()): 62 | if get_model_parallel_rank() == i: 63 | # Initialize model 64 | model = GLM130B(args).half() 65 | 66 | if args.from_quantized_checkpoint: 67 | assert args.quantization_bit_width is not None 68 | # Quantize model before moving to GPU 69 | model = quantize(model, args.quantization_bit_width) 70 | 71 | # Load checkpoint 72 | load_checkpoint(model, args) 73 | 74 | if args.quantization_bit_width is not None and not args.from_quantized_checkpoint: 75 | # Quantize model before moving to GPU 76 | model = quantize(model, args.quantization_bit_width) 77 | 78 | if args.bminf: 79 | import bminf 80 | 81 | if torch.distributed.get_rank() == 0: 82 | print(f"> BMInf activated, memory limit: {args.bminf_memory_limit} GB") 83 | with torch.cuda.device(args.device): 84 | model = bminf.wrapper(model, quantization=False, memory_limit=args.bminf_memory_limit << 30) 85 | else: 86 | model = model.to(args.device) 87 | if args.sequential_initialization: 88 | torch.distributed.barrier(group=get_model_parallel_group()) 89 | 90 | torch.distributed.barrier() 91 | if torch.distributed.get_rank() == 0: 92 | print(f"> Model initialized in {time.time() - start:.1f}s") 93 | 94 | torch.cuda.empty_cache() 95 | model.eval() 96 | 97 | # generate rotary embedding cache 98 | original_parallel_output = model.transformer.parallel_output 99 | model.transformer.parallel_output = True 100 | with torch.no_grad(): 101 | _, *_ = model( 102 | torch.ones(1, args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64), 103 | torch.arange(args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1), 104 | torch.randn( 105 | 1, 106 | 1, 107 | args.max_sequence_length, 108 | args.max_sequence_length, 109 | device=torch.cuda.current_device(), 110 | ) 111 | < 0.5, 112 | ) 113 | model.transformer.parallel_output = original_parallel_output 114 | torch.distributed.barrier() 115 | 116 | return model, tokenizer 117 | -------------------------------------------------------------------------------- /kernels/__init__.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | import torch 3 | import ctypes 4 | 5 | from typing import List 6 | from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up 7 | 8 | RESOURCE_PACKAGE_NAME = __name__ 9 | 10 | 11 | class Kernel: 12 | def __init__(self, filename: str, function_names: List[str]): 13 | filename = filename + ".fatbin" 14 | if not pkg_resources.resource_exists(RESOURCE_PACKAGE_NAME, filename): 15 | raise RuntimeError("File `%s` not found in `%s`" % (filename, RESOURCE_PACKAGE_NAME)) 16 | self.filename = filename 17 | self.code = pkg_resources.resource_string(RESOURCE_PACKAGE_NAME, filename) 18 | 
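# Compile the embedded fatbin lazily and expose each named CUDA function
# (e.g. kernels.int4WeightCompression) as a callable attribute of this Kernel instance.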
self._function_names = function_names 19 | self._cmodule = LazyKernelCModule(self.code) 20 | 21 | for name in self._function_names: 22 | setattr(self, name, KernelFunction(self._cmodule, name)) 23 | 24 | 25 | kernels = Kernel( 26 | "quantization", 27 | [ 28 | "int4WeightCompression", 29 | "int4WeightExtractionFloat", 30 | "int4WeightExtractionHalf", 31 | "int8WeightExtractionFloat", 32 | "int8WeightExtractionHalf", 33 | ], 34 | ) 35 | 36 | 37 | def compress_int4_weight(weight: torch.Tensor): # (n, m) 38 | with torch.cuda.device(weight.device): 39 | n, m = weight.size(0), weight.size(1) 40 | assert m % 2 == 0 41 | m = m // 2 42 | out = torch.empty(n, m, dtype=torch.int8, device="cuda") 43 | stream = torch.cuda.current_stream() 44 | 45 | gridDim = (n, 1, 1) 46 | blockDim = (min(round_up(m, 32), 1024), 1, 1) 47 | 48 | kernels.int4WeightCompression( 49 | gridDim, 50 | blockDim, 51 | 0, 52 | stream, 53 | [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], 54 | ) 55 | return out 56 | 57 | 58 | def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): 59 | if source_bit_width == 8: 60 | func = kernels.int8WeightExtractionHalf 61 | elif source_bit_width == 4: 62 | func = kernels.int4WeightExtractionHalf 63 | else: 64 | assert False, "Unsupported bit-width" 65 | 66 | with torch.cuda.device(weight.device): 67 | n, m = weight.size(0), weight.size(1) 68 | out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda") 69 | stream = torch.cuda.current_stream() 70 | 71 | gridDim = (n, 1, 1) 72 | blockDim = (min(round_up(m, 32), 1024), 1, 1) 73 | 74 | func( 75 | gridDim, 76 | blockDim, 77 | 0, 78 | stream, 79 | [ 80 | ctypes.c_void_p(weight.data_ptr()), 81 | ctypes.c_void_p(scale_list.data_ptr()), 82 | ctypes.c_void_p(out.data_ptr()), 83 | ctypes.c_int32(n), 84 | ctypes.c_int32(m), 85 | ], 86 | ) 87 | return out 88 | 89 | 90 | if __name__ == "__main__": 91 | weight = torch.randn(4, 32).to(torch.int8).cuda() 92 | scale = torch.ones(weight.size(0)).to(torch.half).cuda() 93 | 94 | print(weight) 95 | b = compress_int4_weight(weight) 96 | print(b) 97 | 98 | a = extract_weight_to_half(b, scale, source_bit_width=4) 99 | print(a) 100 | -------------------------------------------------------------------------------- /kernels/quantization.fatbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/kernels/quantization.fatbin -------------------------------------------------------------------------------- /logs/README.md: -------------------------------------------------------------------------------- 1 | # Training Logs 2 | 3 | `main-log.md` contains detailed information about each restart of training during GLM-130B training. 4 | 5 | Tensorboard logs is available at [here](https://cloud.tsinghua.edu.cn/f/503ef9fa85b84fbba9ef/). 
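A note on the quantization kernels in `kernels/__init__.py` above: `compress_int4_weight` and `extract_weight_to_half` only pack and unpack integer weight codes on the GPU; the codes themselves come from the per-row symmetric scheme implemented in `quantization/layers.py` further down (row scale = max |w| / (2^(bit_width-1) - 1), code = round(w / scale)). Below is a minimal CPU sketch of that scheme for the int8 case; the function names are illustrative and not part of the repo, and the int4 nibble packing done by the CUDA kernel is not reproduced here.

```python
import torch


def quantize_rowwise_int8(weight: torch.Tensor):
    """Per-row symmetric int8 quantization (mirrors QuantizedColumnParallelLinear.__init__)."""
    scale = weight.abs().max(dim=-1).values / 127.0  # 127 == 2 ** (8 - 1) - 1
    codes = torch.round(weight / scale[:, None]).to(torch.int8)
    return codes, scale.half()


def dequantize_rowwise(codes: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """CPU stand-in for extract_weight_to_half with source_bit_width=8."""
    return codes.half() * scale[:, None]


if __name__ == "__main__":
    w = torch.randn(4, 8)
    codes, scale = quantize_rowwise_int8(w)
    w_hat = dequantize_rowwise(codes, scale)
    # Per-element error is bounded by roughly half a quantization step (scale / 2).
    print((w - w_hat.float()).abs().max())
```

The CUDA path performs the same dequantization on the fly inside `W8A16Linear`, so only the int8/int4 codes and the per-row fp16 scales are kept in GPU memory.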
6 | -------------------------------------------------------------------------------- /logs/main-log-en.md: -------------------------------------------------------------------------------- 1 | # The training notes of GLM-130B 2 | 3 | ## Basic Information about GLM-130B 4 | 5 | - 130B:70 layers,12288 hidden size,32768 ffn hidden size, 150000 vocab size 6 | - MP = 4, PP = 8 7 | - GLM + Rotary Positional Embedding + GeGLU + DeepNorm 8 | - FP32 softmax with QKV scaling(no PB-Relax) 9 | - Shrink embedding gradient with $\alpha=0.1$ 10 | - Global batch size: 4224 11 | 12 | ## Environment 13 | 14 | - PyTorch 1.11 / CUDA 11.3 15 | - LargeScale@400893da37bb5cbe22c29e41c02a052369cc72ce 16 | - DeepSpeed 0.6.1 17 | - apex@master 18 | 19 | ## Speed Testing (with Different Batch Sizes) 20 | 21 | - 96 nodes, BSZ=176 * 24=4224 22 | - glm-130B-2022.05.05-19:34:16:134TFLOPS, 88.5s/iter, 48samples/s, 23 | - 96 nodes, BSZ=256 * 24=6144 24 | - glm-130B-2022.05.05-19:43:13:141TFLOPS, 122.5s/iter, 50samples/s 25 | 26 | ## 2022-05-06 04:00 Training starts 27 | 28 | - glm-130B-2022.05.05-19:53:15 29 | 30 | ## 2022-05-07 20:14 Node failure 31 | 32 | n30041, n30157 break down, changing saving interval to 100 steps (originally 500 steps, too long), restart from 4000 step 33 | 34 | - glm-130B-2022.05.07-13:44:59 35 | 36 | ## 2022-05-10 00:00 Increase alpha for embedding shrink, as we think the original alpha is too small (originally 0.1) 37 | 38 | add `--shrink-embedding-gradient-steps 6000 500` to warmup alpha to 1 from 6000 step within 500 steps 39 | 40 | - glm-130B-2022.05.09-16:02:04 41 | 42 | ## 2022-05-11 12:13 Node failure 43 | 44 | n30115 breaks down, restart from 7300 step 45 | 46 | - glm-130B-2022.05.11-05:55:32 47 | 48 | ## 2022-05-20 00:03 Node failure 49 | 50 | n30066 breaks down, restart from 15400 step 51 | 52 | - glm-130B-2022.05.19-19:56:19 53 | 54 | Switch to another node pool, and restart from 15600 step 55 | 56 | - glm-130B-2022.05.20-01:58:57 57 | 58 | ## 2022-05-21 12:40 Replace node 59 | 60 | Finding that the training flop is only 127T, smaller than before; suspecting that the n30076 we have replaced in has some unknown errors and kicking it out from 16600 step; nothing changes 61 | 62 | ## 2022-05-22 19:27 Node failure 63 | 64 | n30126 loses connection 65 | 66 | - glm-130B-2022.05.22-14:15:41 67 | 68 | ## 2022-05-26 04:30 Node failure 69 | 70 | n30039 reports missing GPUs 71 | 72 | - glm-130B-2022.05.25-22:23:12 73 | 74 | 75 | ## 2022-05-28 11:50 Change Multi-task Instruction Pre-training (MIP) data (abolished) 76 | 77 | Restarts from 22800 step, change MIP data to the correct one (English & Chinese) 78 | 79 | - glm-130B-2022.05.28-03:52:26 80 | - events.out.tfevents.1653709957.9droa42ltcad5-0.1858.0 (abolished) 81 | 82 | ## 2022-05-28 16:50 Change MIP data 83 | 84 | New MIP data (English & Chinese) leads to NaN loss at 22900 step; finding too much noises in Chinese multi-task data; switch to vanilla T0 training datasets 85 | 86 | - glm-130B-2022.05.28-09:18:12 87 | - events.out.tfevents.1653729502.9droa42ltcad5-0.5648.0(移除) 88 | 89 | ## 2022-05-28 20:50 Add warmup (abolished) 90 | 91 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C850748B-92A4-4F9F-932F-AD22330895D6_2/E8MboG8vrTTb2N51FRhkb6wsB4eyrD77USmM992obQgz/Image.png) 92 | 93 | Vanilla T0 datasets still lead to disconvergence; suspecting a changed task ratio leads to the instability; add argument `--warmup-samples-after-loading 2112000` to warmup 500 steps from 
22800 step 94 | 95 | - glm-130B-2022.05.28-12:57:24 96 | - events.out.tfevents.1653742654.9droa42ltcad5-0.7942.0(移除) 97 | 98 | ## 2022-05-29 01:30 Disconverges again, switch to self-supervised pre-training only (abolished) 99 | 100 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/028DE014-00FE-4521-BEEB-EF3F61BB8DA1_2/mgYybTR1OLgPkBysqMiUgGYNyIg8OQnf1yXI66grBeMz/Image.png) 101 | 102 | - Disconverges after warmup; suspecting that the distribution change is still too large; trying to restart using self-supervised pre-training only with data reshuffle, loading from 22800 step 103 | - glm-130B-2022.05.28-18:05:33 104 | - events.out.tfevents.1653761143.9droa42ltcad5-0.9744.0 (abolished) 105 | - global_step23200_text 106 | + Configuration file 107 | 108 | ## 2022-05-29 Smoothing distribution shift (abolished) 109 | 110 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E2BC463F-E519-461E-B1B0-99551DA940BE_2/0ZqN22TLyqRTvqOy6JNLeixEy4TarDJEF7DOvdh3saIz/Image.png) 111 | 112 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/9C7AC4B3-59AB-471A-872E-41CCBAE7E90D_2/0rpEmyAOcIkLyDGR2R4RQiBeUwbWIWiaHbHcwosx6yAz/Image.png) 113 | 114 | Self-supervised pre-training only seems to be stable; trying to smooth the distribution shift via a warmed-up ratio of correct T0 data from 22800 step 115 | 116 | - glm-130B-2022.05.29-05:17:06 117 | - events.out.tfevents.1653801436.9droa42ltcad5-0.13868.0 (abolished) 118 | 119 | ## 2022-05-29 22:40 Smoothing data distribution shift & warmup learning rate 120 | 121 | - Disconverges; suspecting that learning rate requires warmup in this process, too 122 | 123 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/F5532A86-3AAC-4CCE-AC9B-A976B7736D7F_2/M4JZx5GYzNPuysPHXrn0R5Oo54rBhDwQxdErkOpFOhEz/Image.png) 124 | 125 | - Restart from 22800, warmup correct MIP data ratio and learning rate for 2000 steps; warmup embedding gradient shrink alpha from 0.2 to 1 by 6000 steps 126 | - glm-130B-2022.05.29-17:35:45 127 | 128 | ## 2022-05-30 14:00 Node and file system failure 129 | 130 | Finding the warmup steps for embedding gradient shrink to be wrong (26850 steps instead of 6000 steps); changing the warmup steps implementation (according to the absolute number of samples); restarting from global_step23200 131 | 132 | We discover that the restart is stacked in the data loading, which turns out to be an error of the Lustre file system. 
The result is that we cannot read the 2.3T text corpora and the engineers cannot help recover the data, so we have to copy the data from a backup disk to the file system again (which takes a few days) 133 | 134 | - glm-130B-2022.05.31-02:18:24 135 | 136 | ## 2022.05.03 20:00 Add DeepStruct data to MIP 137 | 138 | - Keeping the original warmup process; adding DeepStruct data to the MIP portion; restart from 23500 step 139 | 140 | ## 2022-06-01 22:22 Replace MIP data with a cleaner version 141 | 142 | Finding one noisy prompt in the task data for T0 (qqp) and DeepStruct respectively; removing them and restarting from 24500 step 143 | 144 | - glm-130B-2022.06.01-14:24:33 145 | 146 | ## 2022-06-02 12:00 Node failure 147 | 148 | - n30145 CPU error, restarting from 25000 step; removing the warmup process as it has ended 149 | - glm-130B-2022.06.02-04:35:05 150 | 151 | ## 2022-06-02 09:30 Start to print multitask loss 152 | 153 | From 25800 step, we print the multitask loss 154 | 155 | - glm-130B-2022.06.03-01:40:12 156 | 157 | ## 2022-06-02 15:00 Reduce learning rate and print gpt/bert loss 158 | 159 | The loss decreases slowly, and we think it might be attributed to an overly large learning rate; from 26000 step, we halve the learning rate 160 | 161 | - glm-130B-2022.06.03-07:26:16 162 | 163 | ## 2022-06-06 17:00 Node cluster maintenance 164 | 165 | The node cluster needs an upgrade from 9 am to 5 am 166 | 167 | - glm-130B-2022.06.06-10:00:39 168 | 169 | PS: we observe a significant improvement in the file system's read speed; loading a checkpoint now takes only about 1 minute 170 | 171 | ## 2022-06-08 08:00 Node failure 172 | 173 | - glm-130B-2022.06.08-00:00:37 174 | 175 | ## 2022-06-09 13:30 Unexpected termination of the training 176 | 177 | Restarting from 23100 step; suspecting a network communication problem 178 | 179 | - glm-130B-2022.06.09-05:27:54 180 | 181 | ## 2022-06-12 10:00 Loss explodes 182 | 183 | From 33700 step, the training loss explodes.
The loss-scale reduces drastically around 33710 step, and the loss explodes at 33740 step 184 | 185 | - tensorboard record:glm-130B-33700 186 | 187 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C46C7CFE-1B79-491C-90FC-5A88AE90E9DF_2/7ICMyH8v6GhAgngz5bVaDKwzYjFPyk99Ax27R5w56wMz/Image.png) 188 | 189 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E56BCDE0-C798-429F-81E0-1A07CCB9BC0E_2/Ig2rfKnPmLadg39Jc38UEdK90LDxlAxoH0AxmAygxzAz/Image.png) 190 | 191 | - Restaring from 33600 step, reduce shrink embedding gradient from 1.0 to 0.5 192 | - glm-130B-2022.06.12-02:20:49 193 | 194 | ## 2022-06-14 03:00 Loss explodes 195 | 196 | At 35250 step, the loss explodes again; almost the same behavior as it is in 33700 step; breaking down without any signs 197 | 198 | tensorboard record:glm-130B-35250 199 | 200 | - Restarting from 35200 step, and shrinking embedding gradient from 0.5 to 0.1 201 | - glm-130B-2022.06.14-02:28:21 202 | 203 | ## 2022-06-19 00:10 Node failure 204 | 205 | n30085 breaks down, restarting from 39600 step 206 | 207 | - glm-130B-2022.06.18-17:49:53 208 | 209 | ## 2022-06-20 09:10 Loss explodes 210 | 211 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/CA344108-3B01-469C-9ABE-C41002F76484_2/oEvBST5MP0I7S4qHmQUeE7DoPCsGFSrveAOOSyitSUwz/Image.png) 212 | 213 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/FED0DE40-A710-4259-AE98-26BCB9568C7A_2/kH4FijsPDVJFzkbaxz7BiX0RZrul1Wrye6cE5EV8ZG0z/Image.png) 214 | 215 | - tensorboard record:glm-130B-40800 216 | - `--skip-train-iteration-range 40701-40900` 217 | - Restarting from 40700 step and skipping the noisy data in 40701-40900 steps 218 | - glm-130B-2022.06.20-03:36:13 219 | 220 | ## 2022-06-22 10:40 Gradient spikes 221 | 222 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/0B7E0A0C-4B11-4F52-BF10-E6B11A533BEF_2/yb1zC07di9zux8jbAi15gpqlstGHXZyjyMBEjO0gNKUz/Image.png) 223 | 224 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/1C60424A-0290-4070-9327-DF9DFD135020_2/XyVoPs1yMLIuzUyrDixSYfgjc2Y2Nuor20GCz0nSPkAz/Image.png) 225 | 226 | - The gradient norm experiences a spike, which seems to recover automatically; but the training loss experiences a drastic change 227 | - `--skip-train-iteration-range 40701-40900` 228 | - Restarting from 42400 and skipping data in 42401-42600 steps 229 | - glm-130B-2022.06.22-02:38:20 230 | 231 | ## 2022-06-22 21:00 Gradient spikes 232 | 233 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/1D7D801C-3226-4CB0-978C-F19B4DA46721_2/nmg9r87OFrdErZvY9xjiDIHvgPVLv39vy8ZVtGkj2H0z/Image.png) 234 | 235 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/5F5CA3D6-AF58-4087-9806-1529D3A2EF6C_2/WSQqyBdv1rvzvNloXE6Ssql7GxMDoULU38FAQCv3778z/Image.png) 236 | 237 | - The gradient norm experiences a spike again, but the loss-scale seems stable. We think it might recover automatically. 
238 | - Rethinking on the repeating gradient spikes in recent days, we speculate it might be attributed to a too-slow learning rate decay in the late stage of pre-training; reducing minimum lr from 8e-6 to 4e-6 239 | - `--min-lr 4e-6` 240 | - Restarting from 42700 step 241 | - glm-130B-2022.06.22-13:03:53 242 | 243 | ## 2022.06.26 16:00 Node failure 244 | 245 | - Unexpected NVLink Error; restarting training 246 | - glm-130B-2022.06.26-13:13:51 247 | 248 | ## 2022.06.29 00:00 Recover position_id 249 | 250 | - Restarting training from 48100 step; using another more consistent positional encoding (the original one has a different implementation for \[MASK\] and \[gMASK\]) 251 | - glm-130B-2022.06.29-13:53:21 252 | -------------------------------------------------------------------------------- /logs/main-log.md: -------------------------------------------------------------------------------- 1 | # GLM-130B 训练日志 2 | 3 | ## 模型信息 4 | 5 | - 130B:70 layers,12288 hidden size,32768 ffn hidden size, 150000 vocab size 6 | - MP = 4, PP = 8 7 | - GLM + Rotary Positional Embedding + GeGLU + DeepNorm 8 | - FP32 softmax with QKV scaling(no PB-Relax) 9 | - Shrink embedding gradient with $\alpha=0.1$ 10 | - Global batch size: 4224 11 | 12 | ## 环境版本 13 | 14 | - PyTorch 1.11 / CUDA 11.3 15 | - LargeScale@400893da37bb5cbe22c29e41c02a052369cc72ce 16 | - DeepSpeed 0.6.1 17 | - apex@master 18 | 19 | ## 测速 20 | 21 | - 96 nodes, BSZ=176 * 24=4224 22 | - glm-130B-2022.05.05-19:34:16:134TFLOPS, 88.5s/iter, 48samples/s, 23 | - 96 nodes, BSZ=256 * 24=6144 24 | - glm-130B-2022.05.05-19:43:13:141TFLOPS, 122.5s/iter, 50samples/s 25 | 26 | ## 2022-05-06 04:00 开始训练 27 | 28 | - glm-130B-2022.05.05-19:53:15 29 | 30 | ## 2022-05-07 20:14 节点故障 31 | 32 | 坏掉 n30041, n30157 两个点,更改保存间隔为 100step,从 4000 step 开始训练 33 | 34 | - glm-130B-2022.05.07-13:44:59 35 | 36 | ## 2022-05-10 00:00 提升 alpha 37 | 38 | 加入 `--shrink-embedding-gradient-steps 6000 500` 从 6000 step 开始训练 39 | 40 | - glm-130B-2022.05.09-16:02:04 41 | 42 | ## 2022-05-11 12:13 节点故障 43 | 44 | 坏掉 n30115 节点,从 7300 step 开始训练 45 | 46 | - glm-130B-2022.05.11-05:55:32 47 | 48 | ## 2022-05-20 00:03 节点故障 49 | 50 | 坏掉 n30066 节点,从 15400 step 开始训练 51 | 52 | - glm-130B-2022.05.19-19:56:19 53 | 54 | 再换一批节点,从 15600 step 开始训练 55 | 56 | - glm-130B-2022.05.20-01:58:57 57 | 58 | ## 2022-05-21 12:40 换节点 59 | 60 | 训练效率一直只有 127T 左右,怀疑之前加入的 n30076 存在问题,踢出后从 16600 step 开始训练,似乎不解决问题。 61 | 62 | ## 2022-05-22 19:27 节点故障 63 | 64 | n30126 失联 65 | 66 | - glm-130B-2022.05.22-14:15:41 67 | 68 | ## 2022-05-26 04:30 节点故障 69 | 70 | n30039 掉卡 71 | 72 | - glm-130B-2022.05.25-22:23:12 73 | 74 | 75 | ## 2022-05-28 11:50 更换中英多任务数据(废除) 76 | 77 | 从 22800 开始训练,换中英多任务数据 78 | 79 | - glm-130B-2022.05.28-03:52:26 80 | - events.out.tfevents.1653709957.9droa42ltcad5-0.1858.0(移除) 81 | 82 | ## 2022-05-28 16:50 更换英文多任务数据(废除) 83 | 84 | 换新的多任务数据 22900 左右出现 nan,挂掉训练,检查发现中文多任务数据噪声极大,从 22800 换成平衡后的 t0 原始数据开始训练 85 | 86 | - glm-130B-2022.05.28-09:18:12 87 | - events.out.tfevents.1653729502.9droa42ltcad5-0.5648.0(移除) 88 | 89 | ## 2022-05-28 20:50 加入 warmup(废除) 90 | 91 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C850748B-92A4-4F9F-932F-AD22330895D6_2/E8MboG8vrTTb2N51FRhkb6wsB4eyrD77USmM992obQgz/Image.png) 92 | 93 | 换上平衡后且不泄漏的 t0 原始数据开始训练仍然有问题,推测是平衡后一些任务占比变大,其实等价于加入新任务的情况,加入参数 `--warmup-samples-after-loading 2112000` warmup 500 步从 22800 开始训练 94 | 95 | - glm-130B-2022.05.28-12:57:24 96 | - events.out.tfevents.1653742654.9droa42ltcad5-0.7942.0(移除) 97 | 98 | ## 
2022-05-29 01:30 再次爆炸,换纯文本(废除) 99 | 100 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/028DE014-00FE-4521-BEEB-EF3F61BB8DA1_2/mgYybTR1OLgPkBysqMiUgGYNyIg8OQnf1yXI66grBeMz/Image.png) 101 | 102 | - warmup 以后还是炸了,分析可能是 distribution 变动仍然太过剧烈,先换纯文本 + reshuffle 尝试训练,从 22800 加载 103 | - glm-130B-2022.05.28-18:05:33 104 | - events.out.tfevents.1653761143.9droa42ltcad5-0.9744.0(废除) 105 | - global_step23200_text 106 | + 配置文件 107 | 108 | ## 2022-05-29 逐渐修改数据分布(废除) 109 | 110 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E2BC463F-E519-461E-B1B0-99551DA940BE_2/0ZqN22TLyqRTvqOy6JNLeixEy4TarDJEF7DOvdh3saIz/Image.png) 111 | 112 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/9C7AC4B3-59AB-471A-872E-41CCBAE7E90D_2/0rpEmyAOcIkLyDGR2R4RQiBeUwbWIWiaHbHcwosx6yAz/Image.png) 113 | 114 | 文本似乎能稳定,那么尝试逐渐平滑修改数据分布, 从 22800 开始,逐渐修改数据分布到 t0 平衡数据 115 | 116 | - glm-130B-2022.05.29-05:17:06 117 | - events.out.tfevents.1653801436.9droa42ltcad5-0.13868.0(废除) 118 | 119 | ## 2022-05-29 22:40 逐渐修改数据分布并全面 warmup 120 | 121 | - 又挂了,分析可能是换新分布学习率也需要 warmup 122 | 123 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/F5532A86-3AAC-4CCE-AC9B-A976B7736D7F_2/M4JZx5GYzNPuysPHXrn0R5Oo54rBhDwQxdErkOpFOhEz/Image.png) 124 | 125 | - 从 22800 开始训练,数据和 lr 都 warmup 2000 步,shrink embbeding graident 从 0.2 warmup 6000 步到 1 126 | - glm-130B-2022.05.29-17:35:45 127 | 128 | ## 2022-05-30 14:00 挂节点 129 | 130 | 更改了一下参数配置,发现之前 shrink embedding 的步数写错了(26850 步),现在改成 6000 步。升级了一下 lr auto warmup 的逻辑,写成绝对 samples 数量。从 global_step23200 开始 131 | 132 | 我们发现这次训练卡在了数据加载,排查后发现是 Lustre 文件系统的故障,导致 2.3T 文本数据读不出来,且工程师无法修复;最终重新从移动硬盘拷贝了一次数据 133 | 134 | - glm-130B-2022.05.31-02:18:24 135 | 136 | ## 2022.05.03 20:00 加 DeepStruct 数据 137 | 138 | - 维持原有 transform 过程不变,但直接加入 DeepStruct 数据,从 23500 开始 139 | 140 | ## 2022-06-01 22:22 换清洗数据 141 | 142 | 之前的多任务数据 t0 和 deepsturct 各有一个任务的 target 异常,重新清洗后更换,从 24500 开始 143 | 144 | - glm-130B-2022.06.01-14:24:33 145 | 146 | ## 2022-06-02 12:00 节点故障 147 | 148 | - n30145 CPU 故障,从 25000 重启训练,lr 和 数据集已经 transfromer 完毕,所以配置直接去掉 warmup 149 | - glm-130B-2022.06.02-04:35:05 150 | 151 | ## 2022-06-02 09:30 加入 multitask loss 打印 152 | 153 | 25800steps 开始,加入 multitask loss 打印 154 | 155 | - glm-130B-2022.06.03-01:40:12 156 | 157 | ## 2022-06-02 15:00 降低学习率,加入 gpt/bert loss 打印 158 | 159 | loss 降低比较慢,讨论可能是学习率太大了,26000steps 开始,学习率砍半 160 | 161 | - glm-130B-2022.06.03-07:26:16 162 | 163 | ## 2022-06-06 17:00 集群维护 164 | 165 | 集群从 9 点到 5 点升级驱动,从 开始训练 166 | 167 | - glm-130B-2022.06.06-10:00:39 168 | 169 | PS:观察到共享文件系统读取速度显著改善,现在加载 ckpt 几乎只需要 1 分钟 170 | 171 | ## 2022-06-08 08:00 坏点 172 | 173 | - glm-130B-2022.06.08-00:00:37 174 | 175 | ## 2022-06-09 13:30 训练卡住 176 | 177 | 23100 开始恢复 178 | 179 | - glm-130B-2022.06.09-05:27:54 180 | 181 | ## 2022-06-12 10:00 loss 爆炸 182 | 183 | 33700 开始 loss 炸了,loss-scale 在 33710 左右突然下跌然后 loss 在 33740 左右爆炸 184 | 185 | - tensorboard 记录:glm-130B-33700 186 | 187 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C46C7CFE-1B79-491C-90FC-5A88AE90E9DF_2/7ICMyH8v6GhAgngz5bVaDKwzYjFPyk99Ax27R5w56wMz/Image.png) 188 | 189 | 
![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E56BCDE0-C798-429F-81E0-1A07CCB9BC0E_2/Ig2rfKnPmLadg39Jc38UEdK90LDxlAxoH0AxmAygxzAz/Image.png) 190 | 191 | - 从 33600 开始加载,shrink embedding gradient 1 → 0.5 192 | - glm-130B-2022.06.12-02:20:49 193 | 194 | ## 2022-06-14 03:00 loss 爆炸 195 | 196 | 35250 loss 又炸了,和 33700 的表现几乎一样,都是完全没有征兆突然爆炸 197 | 198 | tensorboard 记录:glm-130B-35250 199 | 200 | - 从 35200 开始加载,shrink embedding gradient 0.5 → 0.1 201 | - glm-130B-2022.06.14-02:28:21 202 | 203 | ## 2022-06-19 00:10 节点故障 204 | 205 | n30085 挂了,从 39600 恢复 206 | 207 | - glm-130B-2022.06.18-17:49:53 208 | 209 | ## 2022-06-20 09:10 loss 爆炸 210 | 211 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/CA344108-3B01-469C-9ABE-C41002F76484_2/oEvBST5MP0I7S4qHmQUeE7DoPCsGFSrveAOOSyitSUwz/Image.png) 212 | 213 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/FED0DE40-A710-4259-AE98-26BCB9568C7A_2/kH4FijsPDVJFzkbaxz7BiX0RZrul1Wrye6cE5EV8ZG0z/Image.png) 214 | 215 | - tensorboard 记录:glm-130B-40800 216 | - `--skip-train-iteration-range 40701-40900` 217 | - 从 40700 开始重新加载并跳过 40701-40900 数据 218 | - glm-130B-2022.06.20-03:36:13 219 | 220 | ## 2022-06-22 10:40 梯度 spike 221 | 222 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/0B7E0A0C-4B11-4F52-BF10-E6B11A533BEF_2/yb1zC07di9zux8jbAi15gpqlstGHXZyjyMBEjO0gNKUz/Image.png) 223 | 224 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/1C60424A-0290-4070-9327-DF9DFD135020_2/XyVoPs1yMLIuzUyrDixSYfgjc2Y2Nuor20GCz0nSPkAz/Image.png) 225 | 226 | - grad 有点小 spike,看起来后续恢复了,但 loss 似乎遇到了比较大的波动 227 | - `--skip-train-iteration-range 40701-40900` 228 | - 从 42400 开始重新加载并跳过 42401-42600 数据 229 | - glm-130B-2022.06.22-02:38:20 230 | 231 | ## 2022-06-22 21:00 梯度 spike 232 | 233 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/1D7D801C-3226-4CB0-978C-F19B4DA46721_2/nmg9r87OFrdErZvY9xjiDIHvgPVLv39vy8ZVtGkj2H0z/Image.png) 234 | 235 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/5F5CA3D6-AF58-4087-9806-1529D3A2EF6C_2/WSQqyBdv1rvzvNloXE6Ssql7GxMDoULU38FAQCv3778z/Image.png) 236 | 237 | - grad 又有 spike,但是 loss-scale 没有一降到底,推测应该可以恢复 238 | - 这几天的反复 spike,我们分析可能是后期 learning rate 降低太慢,将 min-lr 从 8e-6 调整到 4e-6 239 | - `--min-lr 4e-6` 240 | - 从 42700 加载开始训练 241 | - glm-130B-2022.06.22-13:03:53 242 | 243 | ## 2022.06.26 16:00 节点故障 244 | 245 | - 节点 NVLink Error,重启训练 246 | - glm-130B-2022.06.26-13:13:51 247 | 248 | ## 2022.06.29 00:00 恢复 position_id 249 | 250 | - 48100 从原先配置开始训练 251 | - glm-130B-2022.06.29-13:53:21 252 | -------------------------------------------------------------------------------- /quantization/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .layers import QuantizedColumnParallelLinear 4 | from .layers import QuantizedRowParallelLinear 5 | 6 | 7 | def quantize(model, weight_bit_width): 8 | """Replace fp16 linear with quantized linear""" 9 | 10 | if torch.distributed.get_rank() == 0: 11 | print(f"> Quantizing model weight to {weight_bit_width} bits") 12 | 13 | for layer in model.transformer.layers: 14 | 
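# Four parallel linears per transformer layer are rebuilt below as quantized drop-ins: the fused
# query_key_value projection, the attention output projection (dense), and the two MLP projections
# (dense_h_to_4h and dense_4h_to_h). Each reuses the existing fp16 weight, moved to the current GPU
# so the quantization kernels can operate on it.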
layer.attention.query_key_value = QuantizedColumnParallelLinear( 15 | weight_bit_width=weight_bit_width, 16 | weight=layer.attention.query_key_value.weight.to(torch.cuda.current_device()), 17 | input_size=layer.attention.query_key_value.input_size, 18 | output_size=layer.attention.query_key_value.output_size, 19 | bias=True, 20 | gather_output=False, 21 | params_dtype=torch.half, 22 | name="query_key_value", 23 | skip_init=True, 24 | device=layer.attention.query_key_value.weight.device, 25 | ) 26 | layer.attention.dense = QuantizedRowParallelLinear( 27 | weight_bit_width=weight_bit_width, 28 | weight=layer.attention.dense.weight.to(torch.cuda.current_device()), 29 | input_size=layer.attention.dense.input_size, 30 | output_size=layer.attention.dense.output_size, 31 | bias=True, 32 | input_is_parallel=True, 33 | params_dtype=torch.half, 34 | name="dense", 35 | skip_init=True, 36 | device=layer.attention.dense.weight.device, 37 | ) 38 | layer.mlp.dense_h_to_4h = QuantizedColumnParallelLinear( 39 | weight_bit_width=weight_bit_width, 40 | weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()), 41 | input_size=layer.mlp.dense_h_to_4h.input_size, 42 | output_size=layer.mlp.dense_h_to_4h.output_size, 43 | bias=True, 44 | gather_output=False, 45 | params_dtype=torch.half, 46 | name="dense_h_to_4h", 47 | skip_init=True, 48 | device=layer.mlp.dense_h_to_4h.weight.device, 49 | ) 50 | layer.mlp.dense_4h_to_h = QuantizedRowParallelLinear( 51 | weight_bit_width=weight_bit_width, 52 | weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()), 53 | input_size=layer.mlp.dense_4h_to_h.input_size, 54 | output_size=layer.mlp.dense_4h_to_h.output_size, 55 | bias=True, 56 | input_is_parallel=True, 57 | params_dtype=torch.half, 58 | name="dense_h_to_4h", 59 | skip_init=True, 60 | device=layer.mlp.dense_4h_to_h.weight.device, 61 | ) 62 | 63 | return model 64 | -------------------------------------------------------------------------------- /quantization/functional.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from kernels import extract_weight_to_half 4 | 5 | 6 | class W8A16Linear(torch.autograd.Function): 7 | @staticmethod 8 | def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): 9 | ctx.inp_shape = inp.size() 10 | ctx.weight_shape = quant_w.size() 11 | ctx.weight_bit_width = weight_bit_width 12 | out_features = quant_w.size(0) 13 | inp = inp.contiguous().view(-1, inp.size(-1)) 14 | weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) 15 | output = inp.mm(weight.t()) 16 | ctx.save_for_backward(inp, quant_w, scale_w) 17 | return output.view(*(ctx.inp_shape[:-1] + (out_features,))) 18 | 19 | @staticmethod 20 | def backward(ctx, grad_output: torch.Tensor): 21 | inp, quant_w, scale_w = ctx.saved_tensors 22 | weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) 23 | grad_output = grad_output.contiguous().view(-1, weight.size(0)) 24 | grad_input = grad_output.mm(weight) 25 | grad_weight = grad_output.t().mm(inp) 26 | return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None 27 | -------------------------------------------------------------------------------- /quantization/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.parameter import Parameter 3 | 4 | from SwissArmyTransformer.mpu import copy_to_model_parallel_region 5 | from 
SwissArmyTransformer.mpu import gather_from_model_parallel_region 6 | from SwissArmyTransformer.mpu import reduce_from_model_parallel_region 7 | from SwissArmyTransformer.mpu import scatter_to_model_parallel_region 8 | from SwissArmyTransformer.mpu import ColumnParallelLinear, RowParallelLinear 9 | 10 | from .functional import W8A16Linear 11 | from kernels import compress_int4_weight 12 | 13 | 14 | class QuantizedColumnParallelLinear(ColumnParallelLinear): 15 | def __init__(self, weight_bit_width: int, weight=None, *args, **kwargs): 16 | super(QuantizedColumnParallelLinear, self).__init__(*args, **kwargs) 17 | self.weight_bit_width = weight_bit_width 18 | 19 | shape = self.weight.shape 20 | del self.weight 21 | 22 | if weight is None: 23 | self.weight = torch.empty( 24 | shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"] 25 | ) 26 | self.weight_scale = torch.empty(shape[0], dtype=kwargs["params_dtype"], device=kwargs["device"]) 27 | else: 28 | self.weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half() 29 | self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) 30 | if weight_bit_width == 4: 31 | self.weight = compress_int4_weight(self.weight) 32 | 33 | self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False) 34 | self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False) 35 | 36 | def forward(self, input_): 37 | # Set up backprop all-reduce. 38 | input_parallel = copy_to_model_parallel_region(input_) 39 | # Matrix multiply. 40 | output_parallel = W8A16Linear.apply(input_parallel, self.weight, self.weight_scale, self.weight_bit_width) 41 | if self.bias is not None: 42 | output_parallel = output_parallel + self.bias 43 | if self.gather_output: 44 | # All-gather across the partitions. 45 | output = gather_from_model_parallel_region(output_parallel) 46 | else: 47 | output = output_parallel 48 | return output 49 | 50 | 51 | class QuantizedRowParallelLinear(RowParallelLinear): 52 | def __init__(self, weight_bit_width: int, weight=None, *args, **kwargs): 53 | super(QuantizedRowParallelLinear, self).__init__(*args, **kwargs) 54 | self.weight_bit_width = weight_bit_width 55 | 56 | shape = self.weight.shape 57 | del self.weight 58 | 59 | if weight is None: 60 | self.weight = torch.empty( 61 | shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"] 62 | ) 63 | self.weight_scale = torch.empty(shape[0], dtype=kwargs["params_dtype"], device=kwargs["device"]) 64 | else: 65 | self.weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half() 66 | self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) 67 | if weight_bit_width == 4: 68 | self.weight = compress_int4_weight(self.weight) 69 | 70 | self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False) 71 | self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False) 72 | 73 | def forward(self, input_): 74 | # Set up backprop all-reduce. 75 | if self.input_is_parallel: 76 | input_parallel = input_ 77 | else: 78 | input_parallel = scatter_to_model_parallel_region(input_) 79 | # Matrix multiply. 80 | output_parallel = W8A16Linear.apply(input_parallel, self.weight, self.weight_scale, self.weight_bit_width) 81 | # All-reduce across all the partitions. 
82 | output_ = reduce_from_model_parallel_region(output_parallel) 83 | if self.bias is not None: 84 | output = output_ + self.bias 85 | else: 86 | output = output_ 87 | return output 88 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | SwissArmyTransformer>=0.2.12,<0.3 2 | icetk 3 | apex 4 | scipy 5 | dataclass_wizard 6 | cpm_kernels 7 | -------------------------------------------------------------------------------- /resources/03DF31017FE184DB45D41DFFC6F80EF0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/03DF31017FE184DB45D41DFFC6F80EF0.png -------------------------------------------------------------------------------- /resources/33872E48D3539EA132B74BCF5EFF458F.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/33872E48D3539EA132B74BCF5EFF458F.png -------------------------------------------------------------------------------- /resources/49BF334CB352BAA19F7D55460B1DBCA9.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/49BF334CB352BAA19F7D55460B1DBCA9.gif -------------------------------------------------------------------------------- /resources/7CB441707D1035B2890AA2164C5B6EAC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/7CB441707D1035B2890AA2164C5B6EAC.png -------------------------------------------------------------------------------- /resources/7D6433A42D189E2E6FBC62BE066BCE91.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/7D6433A42D189E2E6FBC62BE066BCE91.png -------------------------------------------------------------------------------- /resources/849024E93FA85347F7F6443932911922.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/849024E93FA85347F7F6443932911922.png -------------------------------------------------------------------------------- /resources/AE18F14396E2D22BC0BC8DD77EFD3414.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/AE18F14396E2D22BC0BC8DD77EFD3414.png -------------------------------------------------------------------------------- /resources/E42321373D22DE198231279B5856BB42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/E42321373D22DE198231279B5856BB42.png -------------------------------------------------------------------------------- /resources/F48B69263360688CCA21E915F4B1A98B.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/F48B69263360688CCA21E915F4B1A98B.png -------------------------------------------------------------------------------- /resources/WECHAT.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |

扫码关注公众号,加入「GLM-130B 交流群」

5 |

Scan the QR code to follow the official account and join the "GLM-130B Discussion Group"

6 |
7 | 8 | -------------------------------------------------------------------------------- /resources/multitask_list.txt: -------------------------------------------------------------------------------- 1 | super_glue/wsc.fixed 2 | winogrande/winogrande_xl 3 | super_glue/rte 4 | glue/mrpc 5 | glue/qqp 6 | paws/labeled_final 7 | ai2_arc/ARC_Challenge 8 | ai2_arc/ARC_Easy 9 | kilt_tasks/hotpot_qa 10 | trivia_qa/unfiltered 11 | web_questions 12 | wiki_qa 13 | adversarial_qa/dbidaf 14 | adversarial_qa/dbert 15 | adversarial_qa/droberta 16 | duorc/SelfRC 17 | duorc/ParaphraseRC 18 | ropes 19 | squad_v2 20 | super_glue/record 21 | quoref 22 | tydiqa 23 | cos_e/v1.11 24 | cosmos_qa 25 | dream 26 | openbookqa/main 27 | qasc 28 | quail 29 | quarel 30 | quartz 31 | race/high 32 | race/middle 33 | sciq 34 | social_i_qa 35 | super_glue/boolq 36 | super_glue/multirc 37 | wiki_hop/original 38 | wiqa 39 | piqa 40 | amazon_polarity 41 | app_reviews 42 | imdb 43 | rotten_tomatoes 44 | yelp_review_full 45 | super_glue/copa 46 | hellaswag 47 | common_gen 48 | wiki_bio 49 | cnndailymail/3.0.0 50 | gigaword 51 | multi_news 52 | samsum 53 | xsum 54 | ag_news 55 | dbpedia_14 56 | trec 57 | super_glue/wic 58 | tacred 59 | conll04 (joint entity relation extraction) 60 | nyt29 (joint entity relation extraction) 61 | ace2005 (joint entity relation extraction) 62 | ade (joint entity relation extraction) 63 | conll03 (named entity recognition) 64 | ontonotes (named entity recognition) 65 | genia (named entity recognition) 66 | conll05 (semantic role labeling) 67 | conll12 (semantic role labeling) 68 | propbank (semantic role labeling) 69 | ace05 (event extraction) 70 | multi_woz_2.1 (dialogue state tracking) 71 | -------------------------------------------------------------------------------- /resources/wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/wechat.jpg -------------------------------------------------------------------------------- /scripts/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $0) 4 | script_dir=$(dirname $script_path) 5 | main_dir=$(dirname $script_dir) 6 | 7 | source "${main_dir}/configs/model_glm_130b.sh" 8 | 9 | ARGS="${main_dir}/benchmark.py \ 10 | --mode inference \ 11 | $MODEL_ARGS" 12 | 13 | TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') 14 | EXP_NAME=${TIMESTAMP} 15 | 16 | mkdir -p logs 17 | 18 | run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" 19 | echo $run_cmd 20 | eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log 21 | -------------------------------------------------------------------------------- /scripts/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $0) 4 | script_dir=$(dirname $script_path) 5 | main_dir=$(dirname $script_dir) 6 | 7 | source "${main_dir}/configs/model_glm_130b.sh" 8 | 9 | DATA_PATH="" 10 | 11 | ARGS="${main_dir}/evaluate.py \ 12 | --mode inference \ 13 | --data-path $DATA_PATH \ 14 | --task $* \ 15 | $MODEL_ARGS" 16 | 17 | TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') 18 | EXP_NAME=${TIMESTAMP} 19 | 20 | mkdir -p logs 21 | 22 | run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" 23 | eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log 24 | -------------------------------------------------------------------------------- /scripts/evaluate_multiple_node.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_WORKERS=16 4 | NUM_GPUS_PER_WORKER=8 5 | HOST_FILE_PATH="" 6 | OPTIONS_NCCL="NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2 CUDA_LAUNCH_BLOCKING=0" 7 | 8 | script_path=$(realpath $0) 9 | script_dir=$(dirname $script_path) 10 | main_dir=$(dirname $script_dir) 11 | 12 | source "${main_dir}/configs/model_glm_130b.sh" 13 | 14 | DATA_PATH="" 15 | 16 | ARGS="${main_dir}/evaluate.py \ 17 | --mode inference \ 18 | --data-path $DATA_PATH \ 19 | --task $* \ 20 | $MODEL_ARGS" 21 | 22 | TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') 23 | EXP_NAME=${TIMESTAMP} 24 | 25 | mkdir -p logs 26 | 27 | run_cmd="${OPTIONS_NCCL} deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} ${ARGS}" 28 | eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log 29 | -------------------------------------------------------------------------------- /scripts/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $0) 4 | script_dir=$(dirname $script_path) 5 | main_dir=$(dirname $script_dir) 6 | 7 | source "${main_dir}/configs/model_glm_130b.sh" 8 | 9 | SEED=1234 10 | MAX_OUTPUT_LENGTH=256 11 | MIN_GEN_LENGTH=0 12 | # BeamSearchStrategy args 13 | NUM_BEAMS=4 14 | LENGTH_PENALTY=1.0 15 | NO_REPEAT_NGRAM=3 16 | # BaseStrategy args 17 | TEMP=1.0 18 | TOPK=0 19 | TOPP=0.7 20 | 21 | ARGS="${main_dir}/generate.py \ 22 | --seed $SEED \ 23 | --mode inference \ 24 | --sampling-strategy BaseStrategy \ 25 | --out-seq-length $MAX_OUTPUT_LENGTH \ 26 | --min-gen-length $MIN_GEN_LENGTH \ 27 | --num-beams $NUM_BEAMS \ 28 | --length-penalty $LENGTH_PENALTY \ 29 | --no-repeat-ngram-size $NO_REPEAT_NGRAM \ 30 | --temperature $TEMP \ 31 | --top_k $TOPK \ 32 | --top_p $TOPP \ 33 | --output-path samples \ 34 | $MODEL_ARGS \ 35 | $*" 36 | 37 | run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" 38 | eval ${run_cmd} 39 | -------------------------------------------------------------------------------- /tasks/bloom/glue_cola.yaml: -------------------------------------------------------------------------------- 1 | name: 'glue_cola' 2 | type: 'mul' 3 | path: 'bloom/glue_cola' 4 | file-pattern: 5 | validation: "**/validation.jsonl" 6 | micro-batch-size: 30 -------------------------------------------------------------------------------- /tasks/bloom/glue_mnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'glue_mnli' 2 | type: 'mul' 3 | path: 'bloom/glue_mnli' 4 | file-pattern: 5 | validation-matched: "**/validation_matched.jsonl" 6 | validation-mismatched: "**/validation_mismatched.jsonl" 7 | micro_batch_size: 8 -------------------------------------------------------------------------------- /tasks/bloom/glue_qnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'glue_qnli' 2 | type: 'mul' 3 | path: 'bloom/glue_qnli' 4 | file-pattern: 5 | validation: "**/validation.jsonl" 6 | micro_batch_size: 6 -------------------------------------------------------------------------------- /tasks/bloom/glue_wnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'glue_wnli' 2 | type: 'mul' 3 | path: 'bloom/glue_wnli' 4 | file-pattern: 5 | validation: "**/validation.jsonl" 6 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/bloom/math_qa.yaml: 
-------------------------------------------------------------------------------- 1 | name: 'math_qa' 2 | type: 'mul' 3 | path: 'bloom/math_qa' 4 | file-pattern: 5 | validation: "**/validation.jsonl" 6 | test: "**/test.jsonl" 7 | micro_batch_size: 6 -------------------------------------------------------------------------------- /tasks/bloom/mc_taco.yaml: -------------------------------------------------------------------------------- 1 | name: 'mc_taco' 2 | type: 'gen' 3 | path: 'bloom/mc_taco' 4 | file-pattern: 5 | validation: "**/validation_pp.jsonl" 6 | test: "**/test_pp.jsonl" -------------------------------------------------------------------------------- /tasks/bloom/openbook_qa.yaml: -------------------------------------------------------------------------------- 1 | name: 'openbook_qa' 2 | type: 'mul' 3 | path: 'bloom/openbookqa_main' 4 | file-pattern: 5 | test: "**/test.jsonl" 6 | validation: "**/validation.jsonl" 7 | micro_batch_size: 18 -------------------------------------------------------------------------------- /tasks/bloom/pubmed_qa.yaml: -------------------------------------------------------------------------------- 1 | name: 'pubmed_qa' 2 | type: 'mul' 3 | path: 'bloom/pubmed_qa_pqa_labeled' 4 | file-pattern: 5 | train: "**/train.jsonl" 6 | micro_batch_size: 2 -------------------------------------------------------------------------------- /tasks/bloom/superglue_axb.yaml: -------------------------------------------------------------------------------- 1 | name: 'superglue_axb' 2 | type: 'mul' 3 | path: 'bloom/super_glue_axb' 4 | file-pattern: 5 | test: "**/test.jsonl" 6 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/bloom/superglue_axg.yaml: -------------------------------------------------------------------------------- 1 | name: 'superglue_axg' 2 | type: 'mul' 3 | path: 'bloom/super_glue_axg' 4 | file-pattern: 5 | test: "**/test.jsonl" 6 | micro_batch_size: 34 -------------------------------------------------------------------------------- /tasks/chinese/clue/afqmc.yaml: -------------------------------------------------------------------------------- 1 | name: 'AFQMC' 2 | type: 'mul' 3 | path: 'CLUE/afqmc' 4 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/chinese/clue/c3.yaml: -------------------------------------------------------------------------------- 1 | name: 'C3' 2 | type: 'mul' 3 | path: 'CLUE/c3' 4 | micro_batch_size: 2 -------------------------------------------------------------------------------- /tasks/chinese/clue/cluewsc.yaml: -------------------------------------------------------------------------------- 1 | name: 'CLUEWSC2020' 2 | type: 'mul' 3 | path: 'CLUE/cluewsc' 4 | micro_batch_size: 18 -------------------------------------------------------------------------------- /tasks/chinese/clue/cmnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'CMNLI' 2 | type: 'mul' 3 | path: 'CLUE/cmnli' 4 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/chinese/clue/cmrc2018.yaml: -------------------------------------------------------------------------------- 1 | name: "CMRC2018" 2 | type: "gen" 3 | path: "CLUE/cmrc2018" 4 | -------------------------------------------------------------------------------- /tasks/chinese/clue/csl.yaml: -------------------------------------------------------------------------------- 1 | name: 'CSL' 
2 | type: 'mul' 3 | path: 'CLUE/csl' 4 | micro_batch_size: 3 -------------------------------------------------------------------------------- /tasks/chinese/clue/drcd.yaml: -------------------------------------------------------------------------------- 1 | name: "DRCD" 2 | type: "gen" 3 | path: "CLUE/drcd" 4 | -------------------------------------------------------------------------------- /tasks/chinese/clue/ocnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'OCNLI_50K' 2 | type: 'mul' 3 | path: 'CLUE/ocnli' 4 | micro_batch_size: 24 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/bustm.yaml: -------------------------------------------------------------------------------- 1 | name: 'BUSTM' 2 | type: 'mul' 3 | path: 'CLUE/bustm' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 56 8 | -------------------------------------------------------------------------------- /tasks/chinese/fewclue/chidf.yaml: -------------------------------------------------------------------------------- 1 | name: 'CHIDF' 2 | type: 'mul' 3 | path: 'CLUE/chid-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/cluewscf.yaml: -------------------------------------------------------------------------------- 1 | name: 'CLUEWSCF' 2 | type: 'mul' 3 | path: 'CLUE/cluewsc-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/cslf.yaml: -------------------------------------------------------------------------------- 1 | name: 'CSLF' 2 | type: 'mul' 3 | path: 'CLUE/csl-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 2 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/eprstmt.yaml: -------------------------------------------------------------------------------- 1 | name: 'EPRSTMT' 2 | type: 'mul' 3 | path: 'CLUE/eprstmt-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 6 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/ocnlif.yaml: -------------------------------------------------------------------------------- 1 | name: 'OCNLIF' 2 | type: 'mul' 3 | path: 'CLUE/ocnli-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 24 -------------------------------------------------------------------------------- /tasks/ethnic/crows-pair/crows-pair.yaml: -------------------------------------------------------------------------------- 1 | name: "CROWS" 2 | type: "mul" 3 | path: "data" 4 | module: "tasks.ethnic.crows-pair.tasks.CrowsPairTask" 5 | file-pattern: 6 | test: "**/crows-pair-dataset.jsonl" 7 | 8 | micro-batch-size: 1 -------------------------------------------------------------------------------- /tasks/ethnic/crows-pair/tasks.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from typing import Dict, Tuple, List 3 | from abc import ABC 4 | from collections import defaultdict 5 | from evaluation import ( 6 
| MultiChoiceTask, 7 | MultiChoiceTaskConfig, 8 | ) 9 | from evaluation.dataset import ( 10 | MultiChoiceTaskDataset, 11 | ) 12 | from evaluation.utils import ( 13 | print_rank_0, 14 | get_tokenized_input, 15 | ) 16 | 17 | 18 | class CrowsPairTask(MultiChoiceTask, ABC): 19 | config: MultiChoiceTaskConfig 20 | 21 | def build_dataset(self, relative_path): 22 | return CrowsPairDataset(join(self.config.path, relative_path), self.config) 23 | 24 | def predict_single_batch(self, batch) -> List[List[float]]: 25 | log_probs = self.model.cond_log_prob(batch) 26 | return log_probs 27 | 28 | def CrowsPairMetric(self, predictions, examples): 29 | print_rank_0("Special metric for CrowsPair") 30 | results = defaultdict(float) 31 | labels = defaultdict() 32 | for prediction, example in zip(predictions, examples): 33 | prediction = prediction[0] 34 | if example["sent_ID"] == 1: 35 | results[example["pair_ID"]] = results[example["pair_ID"]] + prediction 36 | else: 37 | results[example["pair_ID"]] = results[example["pair_ID"]] - prediction 38 | labels[example["pair_ID"]] = example["bias_type"] 39 | cat_positive = defaultdict(int) # pairs where sentence 1 scores at least as high, per bias category 40 | cat_tt = defaultdict(int) 41 | final = defaultdict(int) 42 | for val1, val2 in zip(results.values(), labels.values()): 43 | if val1 >= 0: 44 | cat_positive[val2] = cat_positive[val2] + 1 45 | else: 46 | cat_positive[val2] = cat_positive[val2] 47 | cat_tt[val2] = cat_tt[val2] + 1 48 | for key, val in cat_positive.items(): 49 | final[key] = val / cat_tt[key] 50 | return final 51 | 52 | def report_single_metrics(self, file: str, result_dict: Dict[str, float]): 53 | pass 54 | 55 | @property 56 | def metrics(self): 57 | return {"CP": self.CrowsPairMetric} 58 | 59 | def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): 60 | for result in result_dict_group.values(): 61 | result = result[0] 62 | for value1 in result.items(): 63 | value1 = value1[1] 64 | for key, value in value1.items(): 65 | print_rank_0("category:{cat} score:{score}".format(cat=key, score=round(value * 100, 2))) 66 | 67 | 68 | class CrowsPairDataset(MultiChoiceTaskDataset): 69 | 70 | config: MultiChoiceTaskConfig 71 | 72 | def __init__(self, path, config: MultiChoiceTaskConfig): 73 | self.is_single_token = True # set to False later in process_single_item func 74 | self.eval_data = [] 75 | super().__init__(path, config) 76 | 77 | def process_single_item(self, item): 78 | text, choices, label = ( 79 | get_tokenized_input(item, "inputs"), 80 | get_tokenized_input(item, "choices"), 81 | item["label"], 82 | ) 83 | pair_ID, sent_ID, bias_type = ( 84 | item["pair_ID"], 85 | item["sent_ID"], 86 | item["bias_type"], 87 | ) 88 | tgt_seq_length = sum([len(choice) for choice in choices]) 89 | if tgt_seq_length == len(choices): 90 | # For single token, we only insert one [sop] 91 | tgt_seq_length = 1 92 | 93 | assert tgt_seq_length < self.config.max_seq_length 94 | if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: 95 | text_length = self.config.max_seq_length - tgt_seq_length - 2 96 | text = text[len(text) - text_length : len(text)] 97 | 98 | assert not ( 99 | self.mask_id in text and self.config.use_multitask_encoding 100 | ), "Unified multitask encoding doesn't support blank filling" 101 | 102 | if tgt_seq_length != 1: 103 | self.is_single_token = False 104 | 105 | dataset = { 106 | "text": text, 107 | "choices": choices, 108 | "label": label, 109 | "pair_ID": pair_ID, 110 | "sent_ID": sent_ID, 111 | "bias_type": bias_type, 112 | } 113 | 114 | return dataset 115 |
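The CrowsPairMetric above reduces to a small pairwise computation: each CrowS-Pairs item arrives as two sentences sharing a pair_ID, and the score reported per bias category is the fraction of pairs whose first sentence (sent_ID == 1) receives the higher log-probability. Below is a minimal standalone sketch of that computation under simplifying assumptions: the function name crows_pair_score and the scalar toy log-probabilities are illustrative only and are not part of the repository (the task itself takes the first element of each cond_log_prob result).

from collections import defaultdict


def crows_pair_score(predictions, examples):
    # Accumulate the (sentence 1 minus sentence 2) log-probability gap per pair_ID.
    gap = defaultdict(float)
    bias_type = {}
    for pred, example in zip(predictions, examples):
        sign = 1 if example["sent_ID"] == 1 else -1
        gap[example["pair_ID"]] += sign * pred
        bias_type[example["pair_ID"]] = example["bias_type"]
    # Per bias category: fraction of pairs where sentence 1 is preferred (gap >= 0).
    preferred, total = defaultdict(int), defaultdict(int)
    for pair_id, g in gap.items():
        category = bias_type[pair_id]
        total[category] += 1
        if g >= 0:
            preferred[category] += 1
    return {category: preferred[category] / total[category] for category in total}


# Toy usage: one "race" pair where sentence 1 is scored higher than sentence 2.
toy_examples = [
    {"pair_ID": 0, "sent_ID": 1, "bias_type": "race"},
    {"pair_ID": 0, "sent_ID": 2, "bias_type": "race"},
]
print(crows_pair_score([-1.2, -1.5], toy_examples))  # {'race': 1.0}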
-------------------------------------------------------------------------------- /tasks/ethnic/ethos/ethos-fewshot-multi.yaml: -------------------------------------------------------------------------------- 1 | name: "ETHOS_fewshot_multi" 2 | type: "mul" 3 | path: "data" 4 | file-pattern: 5 | test: "**/ethos-few-shot-multi.jsonl" 6 | 7 | micro-batch-size: 32 -------------------------------------------------------------------------------- /tasks/ethnic/ethos/ethos-fewshot-single.yaml: -------------------------------------------------------------------------------- 1 | name: "ETHOS_fewshot_single" 2 | type: "mul" 3 | path: "data" 4 | file-pattern: 5 | test: "**/ethos-few-shot-single.jsonl" 6 | 7 | micro-batch-size: 32 -------------------------------------------------------------------------------- /tasks/ethnic/ethos/ethos-oneshot.yaml: -------------------------------------------------------------------------------- 1 | name: "ETHOS_oneshot" 2 | type: "mul" 3 | path: "data" 4 | file-pattern: 5 | test: "**/ethos-one-shot.jsonl" 6 | 7 | micro-batch-size: 64 -------------------------------------------------------------------------------- /tasks/ethnic/ethos/ethos-zeroshot.yaml: -------------------------------------------------------------------------------- 1 | name: "ETHOS_zeroshot" 2 | type: "mul" 3 | path: "data" 4 | file-pattern: 5 | test: "**/ethos-zero-shot.jsonl" 6 | 7 | micro-batch-size: 128 -------------------------------------------------------------------------------- /tasks/ethnic/stereoset/stereoset.yaml: -------------------------------------------------------------------------------- 1 | name: "StereoSet" 2 | type: "mul" 3 | path: "data" 4 | module: "tasks.ethnic.stereoset.tasks.StereoSetTask" 5 | use_task_mask: True 6 | file-pattern: 7 | test: "**/stereoset-dataset.jsonl" 8 | 9 | micro-batch-size: 64 10 | -------------------------------------------------------------------------------- /tasks/ethnic/stereoset/tasks.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from collections import defaultdict 3 | from abc import ABC 4 | import numpy as np 5 | from typing import Dict, Tuple, List 6 | from evaluation import ( 7 | MultiChoiceTask, 8 | MultiChoiceTaskConfig, 9 | ) 10 | from evaluation.dataset import ( 11 | MultiChoiceTaskDataset, 12 | ) 13 | from evaluation.utils import ( 14 | print_rank_0, 15 | get_tokenized_input, 16 | ) 17 | 18 | 19 | class StereoSetTask(MultiChoiceTask, ABC): 20 | config: MultiChoiceTaskConfig 21 | 22 | def build_dataset(self, relative_path): 23 | return StereoSetDataset(join(self.config.path, relative_path), self.config) 24 | 25 | def predict_single_batch(self, batch) -> List[int]: 26 | log_probs = self.model.cond_log_prob(batch) 27 | normalize_log_probs = [] 28 | for origin_datas, predicts in zip(batch.get("choices"), log_probs): 29 | normalize_log_probs_single = [] 30 | for origin_data, predict in zip(origin_datas, predicts): 31 | normalize_log_probs_single.append(predict / len(origin_data)) 32 | normalize_log_probs.append(normalize_log_probs_single) 33 | return [np.argmax(log_probs_single).item() for log_probs_single in normalize_log_probs] 34 | 35 | def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): 36 | for tmp1 in result_dict_group.values(): 37 | tmp1 = tmp1[0] 38 | for result in tmp1.values(): 39 | print("LMS") 40 | for key, val in result[0].items(): 41 | print_rank_0("cat:{key} score:{score}".format(key=key, 
score=round(val, 2))) 42 | print("SS") 43 | for key, val in result[1].items(): 44 | print_rank_0("cat:{key} score:{score}".format(key=key, score=round(val, 2))) 45 | print("ICAT") 46 | for key, val in result[2].items(): 47 | print_rank_0("cat:{key} score:{score}".format(key=key, score=round(val, 2))) 48 | 49 | def StereoSetMetric(self, predictions, examples): 50 | print_rank_0("Special metric for StereoSet") 51 | assert len(predictions) == len(examples) 52 | results = defaultdict(list) 53 | for prediction, example in zip(predictions, examples): 54 | # print(prediction,example["goal_label"],example["goal_label"][prediction]) 55 | predict_label = example["goal_label"][prediction] 56 | example_type = example["bias_type"] 57 | results[example_type].append(predict_label) 58 | results["overall"].append(predict_label) 59 | ss = {} # stereotype score (SS) per bias category 60 | lms = {} # language modeling score (LMS) per bias category 61 | icat = {} # idealized CAT score (ICAT) per bias category 62 | for key, values in results.items(): 63 | label_dict = {"unrelated": 0, "stereotype": 0, "anti-stereotype": 0} 64 | for label_keys in values: 65 | label_dict[label_keys] = label_dict.get(label_keys, 0) + 1 66 | tt = len(values) 67 | lms[key] = (label_dict["anti-stereotype"] + label_dict["stereotype"]) / tt * 100 68 | if label_dict["stereotype"] + label_dict["anti-stereotype"] == 0: 69 | ss[key] = 0 70 | else: 71 | ss[key] = label_dict["stereotype"] / (label_dict["anti-stereotype"] + label_dict["stereotype"]) * 100 72 | 73 | icat[key] = lms[key] * (min(ss[key], 100.0 - ss[key]) / 50.0) 74 | return [lms, ss, icat] 75 | 76 | def report_single_metrics(self, file: str, result_dict: Dict[str, float]): 77 | pass 78 | 79 | @property 80 | def metrics(self): 81 | return {"SS_ICAT": self.StereoSetMetric} 82 | 83 | 84 | class StereoSetDataset(MultiChoiceTaskDataset): 85 | config: MultiChoiceTaskConfig 86 | 87 | def __init__(self, path, config: MultiChoiceTaskConfig): 88 | self.is_single_token = True # set to False later in process_single_item func 89 | self.eval_data = [] 90 | super().__init__(path, config) 91 | 92 | def process_single_item(self, item): 93 | text, choices, label = ( 94 | get_tokenized_input(item, "inputs"), 95 | get_tokenized_input(item, "choices"), 96 | item["label"], 97 | ) 98 | # "ID":example.ID,"bias_type":example.bias_type,"goal_label":goal_label 99 | ID, bias_type, goal_label = item["ID"], item["bias_type"], item["goal_label"] 100 | tgt_seq_length = sum([len(choice) for choice in choices]) 101 | if tgt_seq_length == len(choices): 102 | # For single token, we only insert one [sop] 103 | tgt_seq_length = 1 104 | 105 | assert tgt_seq_length < self.config.max_seq_length 106 | if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: 107 | text_length = self.config.max_seq_length - tgt_seq_length - 2 108 | text = text[len(text) - text_length : len(text)] 109 | 110 | assert not ( 111 | self.mask_id in text and self.config.use_multitask_encoding 112 | ), "Unified multitask encoding doesn't support blank filling" 113 | 114 | if tgt_seq_length != 1: 115 | self.is_single_token = False 116 | 117 | dataset = { 118 | "text": text, 119 | "choices": choices, 120 | "label": label, 121 | "ID": ID, 122 | "bias_type": bias_type, 123 | "goal_label": goal_label, 124 | } 125 | 126 | return dataset 127 | -------------------------------------------------------------------------------- /tasks/lambada/lambada-unidirectional.yaml: -------------------------------------------------------------------------------- 1 | name: "LAMBADA-unidirectional" 2 | type: "gen" 3 | module: "tasks.lambada.task.LAMBADA" 4 | path: "lambada/lambada" 5 | file-pattern: 6 |
test: "**/test.jsonl" 7 | validation: "**/validation.jsonl" 8 | 9 | sampling_strategy: "BeamSearchStrategy" 10 | num_beams: 16 11 | max_gen_length: 5 12 | use_task_mask: true 13 | unidirectional: true 14 | -------------------------------------------------------------------------------- /tasks/lambada/lambada.yaml: -------------------------------------------------------------------------------- 1 | name: "LAMBADA" 2 | type: "gen" 3 | module: "tasks.lambada.task.LAMBADA" 4 | path: "lambada/lambada" 5 | file-pattern: 6 | test: "**/test.jsonl" 7 | validation: "**/validation.jsonl" 8 | 9 | sampling_strategy: "BeamSearchStrategy" 10 | num_beams: 16 11 | max_gen_length: 5 12 | use_task_mask: true 13 | -------------------------------------------------------------------------------- /tasks/lambada/strategy.py: -------------------------------------------------------------------------------- 1 | from generation import BeamSearchStrategy 2 | 3 | 4 | class BeamSearchStrategyForLAMBADA(BeamSearchStrategy): 5 | def __init__(self, *args, banned_prefix=[], **kwargs): 6 | super().__init__(*args, **kwargs) 7 | self.banned_prefix = banned_prefix 8 | 9 | def forward(self, logits, tokens, mems): 10 | batch_size, num_beams, vocab_size = logits.shape 11 | logits = logits.float() 12 | for prefix in self.banned_prefix: 13 | if self.length_generated == len(prefix) - 1: 14 | if len(prefix) == 1: 15 | logits[..., prefix[0]] = -65504 16 | else: 17 | for i in range(batch_size): 18 | for j in range(num_beams): 19 | if tokens[i, j, -(len(prefix) - 1) :].tolist() == prefix[:-1]: 20 | logits[i, j, prefix[-1]] = -65504 21 | return super().forward(logits, tokens, mems) 22 | -------------------------------------------------------------------------------- /tasks/lambada/task.py: -------------------------------------------------------------------------------- 1 | from string import punctuation 2 | from functools import partial 3 | from typing import List 4 | 5 | from evaluation import qa_evaluate, GenerationTask 6 | 7 | from .strategy import BeamSearchStrategyForLAMBADA 8 | 9 | 10 | def exact_match_score(prediction, ground_truth): 11 | return prediction.strip() == ground_truth.strip() 12 | 13 | 14 | class LAMBADA(GenerationTask): 15 | @property 16 | def metrics(self): 17 | return {"Accuracy": partial(qa_evaluate, metric=exact_match_score)} 18 | 19 | def __init__(self, model, tokenizer, config_path): 20 | super(LAMBADA, self).__init__(model, tokenizer, config_path) 21 | 22 | if self.config.sampling_strategy == "BeamSearchStrategy": 23 | banned_prefix = [[46010], [146337]] # "'" and "``" 24 | invalid_slices = [20068, 146010, 146337] 25 | for p in punctuation: 26 | pp = tokenizer.tokenize(p) 27 | if len(pp) == 1: 28 | invalid_slices.append(pp[0]) 29 | banned_prefix.append(pp) 30 | self.strategy = BeamSearchStrategyForLAMBADA( 31 | batch_size=self.config.micro_batch_size, 32 | num_beams=self.config.num_beams, 33 | length_penalty=self.config.length_penalty, 34 | consider_end=True, 35 | end_tokens=self.strategy.end_tokens, 36 | invalid_slices=invalid_slices, 37 | banned_prefix=banned_prefix, 38 | no_repeat_ngram_size=self.config.no_repeat_ngram_size, 39 | min_gen_length=self.config.min_gen_length, 40 | deterministic=True, 41 | ) 42 | 43 | def get_first_word_tokens(self, tokens): 44 | text = self.tokenizer.tokenizer.decode(tokens).strip() 45 | return self.tokenizer.tokenize(text.split(" ")[0]) 46 | 47 | def predict_single_batch(self, batch): 48 | outputs_batch: List[List[List[int]]] = self.model.generate_text(batch, self.strategy, 
return_all_beams=True) 49 | predictions = [] 50 | for outputs in outputs_batch: 51 | found = False 52 | for output in outputs: 53 | text = self.tokenizer.tokenizer.decode(output).strip() 54 | spl = text.split(" ") 55 | if len(spl) >= 2 and spl[1] in punctuation: 56 | predictions.append(self.get_first_word_tokens(output)) 57 | found = True 58 | break 59 | if not found: 60 | predictions.append(self.get_first_word_tokens(outputs[0])) 61 | return predictions 62 | -------------------------------------------------------------------------------- /tasks/language-modeling/pile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import json 4 | 5 | from typing import * 6 | from os.path import join 7 | from bisect import bisect_right 8 | from itertools import accumulate 9 | from collections import defaultdict 10 | 11 | from evaluation import LanguageModelTask, LanguageModelTaskDataset, print_rank_0 12 | 13 | 14 | def calculate_bpb_score(loss: List[float], data: List[Dict]): 15 | loss_per_category = defaultdict(lambda: 0.0) 16 | utf8_length_per_category = defaultdict(lambda: 0.0) 17 | weights = [] 18 | for item in data: 19 | weights.append(item["num_sequences"]) 20 | utf8_length_per_category[item["meta"]["pile_set_name"]] += item["utf8_length"] 21 | weights = list(accumulate(weights)) 22 | for idx in range(len(loss)): 23 | document_idx = bisect_right(weights, idx) 24 | loss_per_category[data[document_idx]["meta"]["pile_set_name"]] += loss[idx] 25 | return { 26 | name: (loss_per_category[name] / utf8_length_per_category[name] / math.log(2)) for name in loss_per_category 27 | } 28 | 29 | 30 | class Pile(LanguageModelTask): 31 | @property 32 | def metrics(self) -> Dict[str, Callable]: 33 | return {"BPB": calculate_bpb_score} 34 | 35 | def build_dataset(self, relative_path): 36 | return PileDataset(join(self.config.path, relative_path), self.config) 37 | 38 | def report_single_metrics(self, file: str, result_dict: Dict[str, float]): 39 | pass 40 | 41 | def report_group_metrics( 42 | self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, Dict[str, float]], int]], level=1 43 | ): 44 | output_str = f" Finish group {group_name}:\n" 45 | result = list(result_dict_group.values())[0][0]["BPB"] 46 | for key, value in result.items(): 47 | output_str += f" {key} = {value:.3f}\n" 48 | print_rank_0(output_str) 49 | pass 50 | 51 | def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): 52 | pass 53 | 54 | 55 | class PileDataset(LanguageModelTaskDataset): 56 | def __len__(self): 57 | return self.weights[-1] 58 | 59 | def process_single_file(self, path): 60 | num_sequences = [] 61 | with open(os.path.join(path), "r", encoding="utf-8") as file: 62 | for line in file: 63 | item = json.loads(line) 64 | if len(item["text"]) == 0: 65 | continue 66 | self.data.append( 67 | { 68 | "raw_text": item["text"], 69 | "utf8_length": len(item["text_pretokenized"].encode("utf-8")), 70 | "num_sequences": max( 71 | math.ceil( 72 | max(len(item["text"]) - (self.config.max_seq_length - 1), 0) 73 | / self.config.generation_length 74 | ) 75 | + 1, 76 | 1, 77 | ), 78 | "meta": item["meta"], 79 | } 80 | ) 81 | num_sequences.append(self.data[-1]["num_sequences"]) 82 | self.weights = list(accumulate(num_sequences)) 83 | self.left_weights = [0] + self.weights[:-1] 84 | -------------------------------------------------------------------------------- /tasks/language-modeling/pile.yaml: 
-------------------------------------------------------------------------------- 1 | name: "Pile" 2 | type: "lm" 3 | module: "tasks.language-modeling.pile.Pile" 4 | path: "pile" 5 | file-pattern: 6 | test: "**/test_tokenized.jsonl" 7 | # validation: "**/val_tokenized.jsonl" 8 | 9 | generation-length: 1024 10 | use_task_mask: true 11 | -------------------------------------------------------------------------------- /tasks/language-modeling/ptb.yaml: -------------------------------------------------------------------------------- 1 | name: "Penn Treebank" 2 | type: "lm" 3 | path: "ptbdataset" 4 | file-pattern: 5 | test: "**/ptb.test.txt" 6 | 7 | generation-length: 256 8 | use_task_mask: true 9 | -------------------------------------------------------------------------------- /tasks/language-modeling/wikitext-103.yaml: -------------------------------------------------------------------------------- 1 | name: "WikiText-103" 2 | type: "lm" 3 | path: "wikitext-103" 4 | file-pattern: 5 | test: "**/wiki.test.tokens" 6 | 7 | generation-length: 256 8 | use_task_mask: true 9 | -------------------------------------------------------------------------------- /tasks/language-modeling/wikitext-2.yaml: -------------------------------------------------------------------------------- 1 | name: "WikiText-2" 2 | type: "lm" 3 | path: "wikitext-2" 4 | file-pattern: 5 | test: "**/wiki.test.tokens" 6 | 7 | generation-length: 256 8 | use_task_mask: true 9 | -------------------------------------------------------------------------------- /tasks/mmlu/mmlu.yaml: -------------------------------------------------------------------------------- 1 | name: "MMLU" 2 | type: "mul" 3 | module: "tasks.mmlu.task.MMLU" 4 | path: "MMLU" 5 | file-pattern: 6 | stem: "stem/*.json" 7 | social_sciences: "social_sciences/*.json" 8 | humanities: "humanities/*.json" 9 | other: "other/*.json" 10 | micro-batch-size: 1 -------------------------------------------------------------------------------- /tasks/mmlu/task.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import Dict, Tuple 4 | 5 | from evaluation import MultiChoiceTask 6 | 7 | categories = { 8 | "STEM": [ 9 | "Abstract Algebra", 10 | "Anatomy", 11 | "Astronomy", 12 | "College Biology", 13 | "College Chemistry", 14 | "College Computer Science", 15 | "College Mathematics", 16 | "College Physics", 17 | "Computer Security", 18 | "Conceptual Physics", 19 | "Electrical Engineering", 20 | "Elementary Mathematics", 21 | "High School Biology", 22 | "High School Chemistry", 23 | "High School Computer Science", 24 | "High School Mathematics", 25 | "High School Physics", 26 | "High School Statistics", 27 | "Machine Learning", 28 | ], 29 | "Other": [ 30 | "Business Ethics", 31 | "Clinical Knowledge", 32 | "College Medicine", 33 | "Global Facts", 34 | "Human Aging", 35 | "Management", 36 | "Marketing", 37 | "Medical Genetics", 38 | "Miscellaneous", 39 | "Nutrition", 40 | "Professional Accounting", 41 | "Professional Medicine", 42 | "Virology", 43 | ], 44 | "Social Sciences": [ 45 | "Econometrics", 46 | "High School Geography", 47 | "High School Government and Politics", 48 | "High School Macroeconomics", 49 | "High School Microeconomics", 50 | "High School Psychology", 51 | "Human Sexuality", 52 | "Professional Psychology", 53 | "Public Relations", 54 | "Security Studies", 55 | "Sociology", 56 | "US Foreign Policy", 57 | ], 58 | "Humanities": [ 59 | "Formal Logic", 60 | "High School European History", 61 | "High 
School US History", 62 | "High School World History", 63 | "International Law", 64 | "Jurisprudence", 65 | "Logical Fallacies", 66 | "Moral Disputes", 67 | "Moral Scenarios", 68 | "Philosophy", 69 | "Prehistory", 70 | "Professional Law", 71 | "World Religions", 72 | ], 73 | } 74 | 75 | 76 | class MMLU(MultiChoiceTask): 77 | def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): 78 | self.report_group_metrics("Overall", result_dict_all, level=0) 79 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/tools/__init__.py -------------------------------------------------------------------------------- /tools/convert_tp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import argparse 5 | import glob 6 | 7 | from typing import * 8 | 9 | sys.path.append(".") 10 | 11 | SEQUENTIAL_LAYERS = [ 12 | "input_layernorm.weight", 13 | "input_layernorm.bias", 14 | "attention.dense.bias", 15 | "post_attention_layernorm.weight", 16 | "post_attention_layernorm.bias", 17 | "mlp.dense_4h_to_h.bias", 18 | "attention.rotary_emb.inv_freq", 19 | "final_layernorm.weight", 20 | "final_layernorm.bias", 21 | ] 22 | 23 | GLU_LAYERS = [ 24 | "mlp.dense_h_to_4h.weight", 25 | "mlp.dense_h_to_4h.bias", 26 | ] 27 | 28 | QUANTIZED_LAYERS = [ 29 | "attention.dense.weight", 30 | "attention.query_key_value.weight", 31 | "mlp.dense_h_to_4h.weight", 32 | "mlp.dense_4h_to_h.weight", 33 | ] 34 | 35 | LAYER_CONCAT_DIM = {"attention.dense.weight": 1, "mlp.dense_4h_to_h.weight": 1} 36 | 37 | 38 | def parse_arguments(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--input-folder", default=None, type=str, help="Input SAT checkpoint folder") 41 | parser.add_argument("--output-folder", default=None, type=str, help="Output SAT checkpoint folder") 42 | parser.add_argument("--target-tp", default=4, type=int, help="Target TP degree") 43 | parser.add_argument("--quantization-bit-width", default=None, type=int, help="Quantization bit width") 44 | 45 | args = parser.parse_args() 46 | if args.quantization_bit_width is not None: 47 | assert args.quantization_bit_width in [4, 8] 48 | 49 | return args 50 | 51 | 52 | def merge_weights( 53 | key: str, 54 | sd_list: List[Dict], 55 | tp_index: int, 56 | original_tp: int, 57 | target_tp: int, 58 | cat_dim: int, 59 | is_glu: bool, 60 | quantization_bit_width: Optional[int], 61 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 62 | if original_tp >= target_tp: 63 | if is_glu: 64 | if original_tp > target_tp: 65 | num_part = original_tp // target_tp 66 | assert len(sd_list) == num_part 67 | part1, part2 = [], [] 68 | for i in range(len(sd_list)): 69 | chunks = torch.chunk(sd_list[i][key], 2, dim=cat_dim) 70 | part1.append(chunks[0]) 71 | part2.append(chunks[1]) 72 | merged_sd = torch.cat(part1 + part2, dim=cat_dim) 73 | else: 74 | merged_sd = sd_list[0][key] 75 | else: 76 | merged_sd = torch.cat([sd[key] for sd in sd_list], dim=cat_dim) 77 | else: 78 | assert len(sd_list) == 1 79 | num_part = target_tp // original_tp 80 | if is_glu: 81 | offset = tp_index % num_part 82 | chunks = torch.chunk(sd_list[0][key], num_part * 2, dim=cat_dim) 83 | merged_sd = torch.cat([chunks[offset], chunks[num_part + offset]], dim=cat_dim) 84 | else: 85 | # 
without clone, torch will save entire tensor 86 | merged_sd = torch.chunk(sd_list[0][key], num_part, dim=cat_dim)[tp_index % num_part].clone() 87 | 88 | if quantization_bit_width is not None: 89 | from kernels import compress_int4_weight 90 | 91 | weight = merged_sd.cuda() 92 | weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (quantization_bit_width - 1)) - 1)).half() 93 | weight = torch.round(weight / weight_scale[:, None]).to(torch.int8) 94 | if quantization_bit_width == 4: 95 | weight = compress_int4_weight(weight) 96 | return weight.cpu(), weight_scale.cpu() 97 | 98 | return merged_sd 99 | 100 | 101 | def create_checkpoint( 102 | sd_list: List[Dict], tp_index: int, original_tp: int, target_tp: int, quantization_bit_width: Optional[int] 103 | ) -> Dict: 104 | new_sd = {} 105 | for key in sd_list[0].keys(): 106 | name = ".".join(key.split(".")[3 if key.startswith("transformer.layers") else 1 :]) 107 | if name in SEQUENTIAL_LAYERS: 108 | new_sd[key] = sd_list[0][key] 109 | else: 110 | new_sd[key] = merge_weights( 111 | key, 112 | sd_list, 113 | tp_index=tp_index, 114 | original_tp=original_tp, 115 | target_tp=target_tp, 116 | cat_dim=LAYER_CONCAT_DIM.get(name, 0), 117 | is_glu=name in GLU_LAYERS, 118 | quantization_bit_width=quantization_bit_width if name in QUANTIZED_LAYERS else None, 119 | ) 120 | if quantization_bit_width is not None and name in QUANTIZED_LAYERS: 121 | new_sd[key], new_sd[f"{key}_scale"] = new_sd[key] 122 | new_sd = {"module": new_sd} 123 | return new_sd 124 | 125 | 126 | def main(args): 127 | iteration = open(os.path.join(args.input_folder, "latest"), "r").read() 128 | original_tp = len(glob.glob(os.path.join(args.input_folder, iteration, "mp_rank_*_model_states.pt"))) 129 | print(f"Iteration {iteration} from {args.input_folder} to {args.output_folder}") 130 | os.makedirs(args.output_folder, exist_ok=True) 131 | with open(os.path.join(args.output_folder, "latest"), "w") as file: 132 | file.write(str(iteration)) 133 | os.makedirs(os.path.join(args.output_folder, iteration), exist_ok=True) 134 | 135 | for i in range(0, args.target_tp): 136 | save_path = os.path.join(args.output_folder, iteration, f"mp_rank_{i:02}_model_states.pt") 137 | print(f"Processing {save_path}") 138 | num_parts = original_tp // args.target_tp 139 | sd_list = [ 140 | torch.load( 141 | os.path.join(args.input_folder, iteration, f"mp_rank_{j:02}_model_states.pt"), map_location="cpu" 142 | )["module"] 143 | for j in ( 144 | range(i * num_parts, (i + 1) * num_parts) 145 | if args.target_tp <= original_tp 146 | else [i // (args.target_tp // original_tp)] 147 | ) 148 | ] 149 | torch.save(create_checkpoint(sd_list, i, original_tp, args.target_tp, args.quantization_bit_width), save_path) 150 | 151 | 152 | if __name__ == "__main__": 153 | args = parse_arguments() 154 | main(args) 155 | -------------------------------------------------------------------------------- /tools/tokenize_pile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tqdm 3 | from icetk import icetk 4 | from multiprocessing import Pool 5 | 6 | DATA_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val.jsonl" 7 | OUTPUT_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val_tokenized.jsonl" 8 | 9 | 10 | def get_data(line): 11 | item = json.loads(line) 12 | item["text_pretokenized"] = item["text"] 13 | item["text"] = icetk.encode(item["text_pretokenized"]) 14 | return json.dumps(item) + "\n" 15 | 16 | 17 | with open(DATA_PATH, "r") as file: 18 | data = file.readlines() 19 | 20 | 
with Pool(16) as p: 21 | result = list(tqdm.tqdm(p.imap(get_data, data), total=len(data))) 22 | 23 | with open(OUTPUT_PATH, "w") as file: 24 | file.writelines(result) 25 | --------------------------------------------------------------------------------
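When --quantization-bit-width is passed to tools/convert_tp.py above, merge_weights stores each quantized weight matrix together with one scale per output row, computed by absmax scaling. The sketch below is an illustrative round-trip of that per-row int8 scheme only, under simplifying assumptions: it keeps the scales in float32 rather than casting them to half precision, runs on CPU, and omits the int4 packing done by kernels.compress_int4_weight; the function names quantize_rows and dequantize_rows are not part of the repository.

import torch


def quantize_rows(weight: torch.Tensor, bit_width: int = 8):
    # One scale per output row, chosen so the largest |value| maps to the integer limit.
    scale = weight.abs().max(dim=-1).values / (2 ** (bit_width - 1) - 1)
    quantized = torch.round(weight / scale[:, None]).to(torch.int8)
    return quantized, scale


def dequantize_rows(quantized: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Reconstruct an approximation of the original weights.
    return quantized.to(scale.dtype) * scale[:, None]


weight = torch.randn(4, 8)
quantized, scale = quantize_rows(weight, bit_width=8)
error = (weight - dequantize_rows(quantized, scale)).abs().max().item()
print(f"max reconstruction error: {error:.4f}")  # bounded by half a quantization step per row

In the repository this code path is reached through convert_tp.py's --quantization-bit-width flag (4 or 8), alongside --input-folder, --output-folder and --target-tp.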