├── .gitignore ├── LICENSE ├── MODEL_LICENSE ├── README.md ├── README_zh.md ├── benchmark.py ├── configs ├── model_glm_130b.sh ├── model_glm_130b_int4.sh ├── model_glm_130b_int8.sh └── model_glm_130b_v100.sh ├── cuda ├── Makefile └── quantization.cu ├── docs ├── evaluate-your-own-tasks.md ├── inference-with-fastertransformer.md ├── low-resource-inference.md ├── media │ └── 16613396005977.jpg └── quantization.md ├── evaluate.py ├── evaluation ├── __init__.py ├── configs.py ├── dataset.py ├── metrics.py ├── model.py ├── tasks.py └── utils.py ├── generate.py ├── generation ├── __init__.py └── strategies.py ├── initialize.py ├── kernels ├── __init__.py └── quantization.fatbin ├── logs ├── README.md ├── main-log-en.md └── main-log.md ├── quantization ├── __init__.py ├── functional.py └── layers.py ├── requirements.txt ├── resources ├── 03DF31017FE184DB45D41DFFC6F80EF0.png ├── 33872E48D3539EA132B74BCF5EFF458F.png ├── 49BF334CB352BAA19F7D55460B1DBCA9.gif ├── 7CB441707D1035B2890AA2164C5B6EAC.png ├── 7D6433A42D189E2E6FBC62BE066BCE91.png ├── 849024E93FA85347F7F6443932911922.png ├── AE18F14396E2D22BC0BC8DD77EFD3414.png ├── E42321373D22DE198231279B5856BB42.png ├── F48B69263360688CCA21E915F4B1A98B.png ├── WECHAT.md ├── multitask_list.txt └── wechat.jpg ├── scripts ├── benchmark.sh ├── evaluate.sh ├── evaluate_multiple_node.sh └── generate.sh ├── tasks ├── bloom │ ├── glue_cola.yaml │ ├── glue_mnli.yaml │ ├── glue_qnli.yaml │ ├── glue_wnli.yaml │ ├── math_qa.yaml │ ├── mc_taco.yaml │ ├── openbook_qa.yaml │ ├── pubmed_qa.yaml │ ├── superglue_axb.yaml │ └── superglue_axg.yaml ├── chinese │ ├── clue │ │ ├── afqmc.yaml │ │ ├── c3.yaml │ │ ├── cluewsc.yaml │ │ ├── cmnli.yaml │ │ ├── cmrc2018.yaml │ │ ├── csl.yaml │ │ ├── drcd.yaml │ │ └── ocnli.yaml │ └── fewclue │ │ ├── bustm.yaml │ │ ├── chidf.yaml │ │ ├── cluewscf.yaml │ │ ├── cslf.yaml │ │ ├── eprstmt.yaml │ │ └── ocnlif.yaml ├── ethnic │ ├── crows-pair │ │ ├── crows-pair.yaml │ │ └── tasks.py │ ├── ethos │ │ ├── ethos-fewshot-multi.yaml │ │ ├── ethos-fewshot-single.yaml │ │ ├── ethos-oneshot.yaml │ │ └── ethos-zeroshot.yaml │ └── stereoset │ │ ├── stereoset.yaml │ │ └── tasks.py ├── lambada │ ├── lambada-unidirectional.yaml │ ├── lambada.yaml │ ├── strategy.py │ └── task.py ├── language-modeling │ ├── pile.py │ ├── pile.yaml │ ├── ptb.yaml │ ├── wikitext-103.yaml │ └── wikitext-2.yaml └── mmlu │ ├── mmlu.yaml │ └── task.py └── tools ├── __init__.py ├── convert_tp.py └── tokenize_pile.py /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | __pycache__ 3 | samples 4 | .DS_Store 5 | .idea 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright Aohan Zeng 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MODEL_LICENSE: -------------------------------------------------------------------------------- 1 | The GLM-130B License 2 | 3 | 1. Definitions 4 | 5 | “Licensor” means the GLM-130B Model Team that distributes its Software. 6 | 7 | “Software” means the GLM-130B model parameters made available under this license. 8 | 9 | 2. License Grant 10 | 11 | Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 14 | 15 | 3. Restriction 16 | 17 | You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. 18 | 19 | You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. 20 | 21 | 4. Disclaimer 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 24 | 25 | 5. Limitation of Liability 26 | 27 | EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 28 | 29 | 6. Dispute Resolution 30 | 31 | This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. 32 | 33 | Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 🌐 Blog • ⏬ Download Model • 🪧 Demo • ✉️ Email • 📃 Paper [ICLR 2023]
5 |

6 | 7 |

8 | 💬 Google Group (Updates) or Wechat Group or Slack channel (Discussions) 9 |

10 | 11 | # GLM-130B: An Open Bilingual Pre-Trained Model 12 | 13 | GLM-130B is an open bilingual (English & Chinese) bidirectional dense model with 130 billion parameters, pre-trained using the algorithm of [General Language Model (GLM)](https://aclanthology.org/2022.acl-long.26). It is designed to support inference with the full 130B parameters on **a single A100 (40G * 8)** or **V100 (32G * 8) server**. With INT4 quantization, the hardware requirements can be further reduced to **a single server with 4 * RTX 3090 (24G)** with **almost no performance degradation**. As of July 3rd, 2022, GLM-130B has been trained on over 400 billion text tokens (200B each for Chinese and English), and it has the following unique features: 14 | 15 | - **Bilingual:** supports both English and Chinese. 16 | - **Performance (EN):** better than GPT-3 175B (+4.0%), OPT-175B (+5.5%), and BLOOM-176B (+13.0%) on LAMBADA and slightly better than GPT-3 175B (+0.9%) on MMLU. 17 | - **Performance (CN):** significantly better than ERNIE TITAN 3.0 260B on 7 zero-shot CLUE datasets (+24.26%) and 5 zero-shot FewCLUE datasets (+12.75%). 18 | - **Fast Inference:** supports fast inference on both [SAT](https://github.com/THUDM/SwissArmyTransformer) and [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) (up to 2.5X faster) with a single A100 server. 19 | - **Reproducibility:** all results (30+ tasks) can be easily reproduced with open-sourced code and model checkpoints. 20 | - **Cross-Platform:** supports training and inference on NVIDIA, Hygon DCU, Ascend 910, and Sunway (will be released soon). 21 | 22 | This repository mainly focuses on the evaluation of GLM-130B. If you find our work and our open-sourced efforts useful, please give us a ⭐️ to encourage our future development! :) 23 | 24 | ## News 25 | - **[2023.06.25]** We released [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), an updated version of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), which introduces **Stronger Performance** (MMLU (+23%), CEval (+33%), GSM8K (+571%), BBH (+60%)), **Longer Context** (from 2K in ChatGLM-6B to 32K, and trained with a context length of 8K during the dialogue alignment), and **More Efficient Inference** (speeds up by 42% under the official implementation; the dialogue length supported by 6G GPU memory has increased from 1K to 8K). For more details, please refer to [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B). 26 | - **[2023.06.14]** We released [WebGLM](https://github.com/THUDM/WebGLM), which enables efficient and accurate web-enhanced question answering. All code and data are released! 27 | - **[2023.03.14]** We are happy to introduce [ChatGLM](https://chatglm.cn/blog), a bilingual dialogue language model based on GLM-130B, and its open-sourced version [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), which can be run under only **6GB** of GPU memory! 28 | - **[2023.01.21]** GLM-130B has been accepted to [ICLR 2023](https://iclr.cc/Conferences/2023)! 29 | - **[2022.10.06]** Our [paper](http://arxiv.org/abs/2210.02414) for GLM-130B is out! 30 | - **[2022.08.24]** We are proud to publish the quantized version of GLM-130B. While preserving the activation precision as FP16, the model weights can be quantized to as low as **INT4 with almost no degradation of performance**, further reducing the hardware requirements of GLM-130B to **a single server with 4 * RTX 3090 (24G)**! See [Quantization of GLM-130B](docs/quantization.md) for details. 
31 | 32 | For smaller models, please find [monolingual GLMs](https://github.com/THUDM/GLM) (English: 10B/2B/515M/410M/335M/110M, Chinese: 10B/335M) and a [1B multilingual GLM](https://github.com/THUDM/Multilingual-GLM) (104 languages). 33 | 34 | ## Getting Started 35 | 36 | ### Environment Setup 37 | 38 | #### Hardware 39 | 40 | | **Hardware** | **GPU Memory** | **Quantization** | **Weight Offload** | 41 | | --------------- | -------------- | ---------------- | ------------------ | 42 | | 8 * A100 | 40 GB | No | No | 43 | | 8 * V100 | 32 GB | No | Yes (BMInf) | 44 | | 8 * V100 | 32 GB | INT8 | No | 45 | | 8 * RTX 3090 | 24 GB | INT8 | No | 46 | | 4 * RTX 3090 | 24 GB | INT4 | No | 47 | | 8 * RTX 2080 Ti | 11 GB | INT4 | No | 48 | 49 | It is recommended to use an A100 (40G * 8) server, as all reported GLM-130B evaluation results (~30 tasks) can be easily reproduced with a single A100 server in about half a day. With INT8/INT4 quantization, efficient inference on **a single server with 4 * RTX 3090 (24G)** is possible; see [Quantization of GLM-130B](docs/quantization.md) for details. By combining quantization and weight-offloading techniques, GLM-130B can also run inference on servers with even less GPU memory; see [Low-Resource Inference](docs/low-resource-inference.md) for details. 50 | 51 | #### Software 52 | 53 | The GLM-130B code is built on top of [SAT](https://github.com/THUDM/SwissArmyTransformer). We recommend using [Miniconda](https://docs.conda.io/en/latest/miniconda.html) to manage your environment and installing additional dependencies via `pip install -r requirements.txt`. Here are the recommended environment configurations: 54 | 55 | - Python 3.9+ / CUDA 11+ / PyTorch 1.10+ / DeepSpeed 0.6+ / Apex (**installation with CUDA and C++ extensions is required, see [here](https://github.com/NVIDIA/apex/#linux)**) 56 | - SwissArmyTransformer>=0.2.11 is required for quantization 57 | 58 | #### Model weights 59 | 60 | Download the GLM-130B model checkpoint from [here](https://docs.google.com/forms/d/e/1FAIpQLSehr5Dh_i3TwACmFFi8QEgIVNYGmSPwV0GueIcsUev0NEfUug/viewform?usp=sf_link), make sure all 60 chunks are downloaded completely, then use the following command to merge them into a single archive file and extract it: 61 | 62 | ```bash 63 | cat glm-130b-sat.tar.part_* > glm-130b-sat.tar 64 | tar xvf glm-130b-sat.tar 65 | ``` 66 | 67 | Set `CHECKPOINT_PATH` in `configs/model_glm_130b.sh` to the path of the extracted folder. Since the checkpoint file is up to 260G, it is recommended to use an SSD or RAM disk to reduce the checkpoint loading time. Since the checkpoint we distribute is in 8-way tensor parallel form, a conversion script is also provided if you need to change the tensor parallel dimension. 68 | 69 | ```bash 70 | python tools/convert_tp.py \ 71 | --input-folder <SRC_CKPT_PATH> \ 72 | --output-folder <DST_CKPT_PATH> \ 73 | --target-tp <TARGET_TP> 74 | ``` 75 | 76 | ### Left-To-Right Generation / Blank Filling 77 | 78 | ```bash 79 | bash scripts/generate.sh --input-source interactive 80 | ``` 81 | 82 | You can also specify an input file by `--input-source input.txt`. 83 | 84 | GLM-130B uses two different mask tokens: `[MASK]` for short blank filling and `[gMASK]` for left-to-right long text generation. When the input does not contain any MASK token, `[gMASK]` will be automatically appended to the end of the text. 85 | 86 | 
87 | Left-To-Right Generation Examples 88 | 89 | #### Example 1 90 | Input: Who is the greatest artist? The greatest artist is [gMASK] 91 | 92 | Output: Who is the greatest artist? The greatest artist is not the one who paints the most beautiful picture, or writes the best poem, or plays the best symphony. The greatest artist is the one whose work has the greatest impact on the lives of others. The Greatest Artist is the artist who inspires us to be our best selves. The Great Artist is not the artist whose work is flawless. The great artist is not one who is perfect. The true great artist has the ability to make us feel something. The artist who makes us feel is the great artist. The art of the great artists is to make you feel something that you have never felt before. The greatness of an artist is in the ability of their work to touch the lives and hearts of others, and to make them feel something they have never experienced before. This is the true greatness of the artist. 2. The Significance of the Great Artist The significance of the greatest artist cannot be overstated. The significance and impact of the art of a great artist cannot even be measured. The impact of a single work of art can change the course of history. A single work can change a person’s life. A work of great art can make you laugh, make you cry, make your heart race, or make you think. A great work of artistic expression has the power to change the way you look at the world. The work of a truly great artist can change your life forever. 3. The Impact of the Greatest Artist The greatest artists have the ability and the power of their art to change people’s lives. They have the power and the ability, through their art, to make people feel something new. The power of the work of the truly great artists cannot be measured, but it can be felt. The greater the impact of an artists work, the greater the power it has to change lives. 4. The Influence of the Art of the Artist The art and the work created by the greatest artists has the potential to change a life. The influence of the artists work cannot be quantified, but is felt by those who experience it. The truly great work has an ability to touch people’ lives in a way that no other work can. 93 | 94 | #### Example 2 (Chinese) 95 | Input: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] 96 | 97 | Output: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:避寒的话,建议去海南三亚。 三亚位于海南岛的最南端,是中国最南部的热带滨海旅游城市,中国海滨城市,是中国空气质量最好的城市、全国最长寿地区(平均寿命80岁)。三亚市别称鹿城,又被称为"东方夏威夷",位居中国四大一线旅游城市"三威杭厦"之首,拥有全岛最美丽的海滨风光。三亚拥有被无数城市嫉妒的清新空气,柔和海滩。在沙滩悠闲散步、沐浴傍晚温和阳光,在海边玩耍,在雨林里面呼吸健康,欣赏自然奇观,一切都是那么令人享受。 气候 三亚地处热带,属热带季风气候区域,终年气温高,寒暑变化不大,年平均气温25.5度。三亚最热的月份是6月,平均气温为28.5°C,极端最高气温为35.7°C ,三亚没有明显的冬季,而且冬季更是到海滨旅游的最好时候,所以三亚是全年全天候避寒、消暑、度假、旅游的好地方。不过,三亚最旺的旅游时间是从10月份到农历春节,这个时候的人特别多,房间也不好订,建议最好避开高峰。三亚的三个旅游高峰期是"五一"、"十一"、"春节",这三个节日期间的房价都会上浮几倍,如果你选择了这三个假期来三亚的话要提前一个星期定房和咨询。 98 |
99 | 100 |
101 | Blank Filling Examples 102 | 103 | #### Example 1 104 | Input: Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai. 105 | 106 | Output: Ng is an adjunct professor at Stanford University (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai.
107 | 108 | #### Example 2 (Chinese) 109 | Input: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。 110 | 111 | Output: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念拿破仑胜利而建,门高25米,顶上矗立两武士青铜古兵车铸像。 112 |
113 | 114 |
115 | Arguments Useful in Generation 116 | 117 | - `--input-source [path] or "interactive"` The input file's path. It can also be "interactive", which will launch a CLI. 118 | - `--output-path [path]` The folder to save the results. 119 | - `--out-seq-length [int]` The maximum sequence length for generation (including context). 120 | - `--min-gen-length [int]` The minimum generation length for each MASK. 121 | - `--sampling-strategy "BaseStrategy" or "BeamSearchStrategy"` The sampling strategy to use. 122 | - For BeamSearchStrategy: 123 | - `--num-beams [int]` The number of beams. 124 | - `--length-penalty [float]` The length penalty applied in beam search. 125 | - `--no-repeat-ngram-size [int]` Prohibit repeated n-gram generation. 126 | - `--print-all-beam` Print the generated results for all beams. 127 | - For BaseStrategy: 128 | - `--top-k [int]` Top-k sampling. 129 | - `--top-p [float]` Top-p sampling. 130 | - `--temperature [float]` The sampling temperature. 131 | 
132 | 133 | ### Evaluation 134 | 135 | We use YAML files to define tasks. Specifically, you can add multiple tasks or folders at a time for evaluation, and the evaluation script will automatically collect all YAML files under those folders recursively. 136 | 137 | ``` 138 | bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ... 139 | ``` 140 | 141 | Download our evaluation dataset [here](https://cloud.tsinghua.edu.cn/f/826f0df4356f4022a264/), and set `DATA_PATH` in `scripts/evaluate.sh` to your local dataset directory. The task folder contains the YAML files for the 30+ tasks we evaluated for GLM-130B. Take the [CoLA](https://nyu-mll.github.io/CoLA/) task as an example: run `bash scripts/evaluate.sh tasks/bloom/glue_cola.yaml`, which outputs an accuracy of ~65% for the best prompt and ~57% for the median. 142 | 143 | 
144 | Expected Output 145 | 146 | ```plain 147 | MultiChoiceTaskConfig(name='glue_cola', type=, path='/thudm/LargeScale/data/zeroshot/bloom/glue_cola', module=None, metrics=['Accuracy'], use_task_mask=False, use_multitask_encoding=False, unidirectional=False, max_seq_length=2048, file_pattern={'validation': '**/validation.jsonl'}, micro_batch_size=8) 148 | Evaluating task glue_cola: 149 | Evaluating group validation: 150 | Finish Following_sentence_acceptable/mul/validation.jsonl, Accuracy = 42.665 151 | Finish Make_sense_yes_no/mul/validation.jsonl, Accuracy = 56.951 152 | Finish Previous_sentence_acceptable/mul/validation.jsonl, Accuracy = 65.197 153 | Finish editing/mul/validation.jsonl, Accuracy = 57.622 154 | Finish is_this_correct/mul/validation.jsonl, Accuracy = 65.197 155 | Evaluation results of task glue_cola: 156 | Group validation Accuracy: max = 65.197, median = 57.622, average = 57.526 157 | Finish task glue_cola in 101.2s. 158 | ``` 159 |
160 | 161 | Multi-node evaluation can be configured by setting `HOST_FILE_PATH` (required by the [DeepSpeed launcher](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node)) in `scripts/evaluate_multiple_node.sh`. Set `DATA_PATH` in `scripts/evaluate_multiple_node.sh` and run the following command to evaluate all the tasks in the `./tasks` directory. 162 | 163 | ``` 164 | bash scripts/evaluate_multiple_node.sh ./tasks 165 | ``` 166 | 167 | See [Evaluate Your Own Tasks](docs/evaluate-your-own-tasks.md) for details on how to add new tasks. 168 | 169 | ### 2.5X faster Inference using FasterTransformer 170 | 171 | By adapting the GLM-130B model to [FasterTransformer](https://github.com/NVIDIA/FasterTransformer), a highly optimized transformer model library by NVIDIA, we can reach a speedup of up to 2.5X on generation; see [Inference with FasterTransformer](docs/inference-with-fastertransformer.md) for details. 172 | 173 | 174 | 175 | ## License 176 | 177 | This repository is licensed under the [Apache-2.0 license](LICENSE). The use of GLM-130B model weights is subject to the [Model License](MODEL_LICENSE). 178 | 179 | ## Citation 180 | 181 | If you find our work useful, please consider citing GLM-130B: 182 | 183 | ``` 184 | @article{zeng2022glm, 185 | title={Glm-130b: An open bilingual pre-trained model}, 186 | author={Zeng, Aohan and Liu, Xiao and Du, Zhengxiao and Wang, Zihan and Lai, Hanyu and Ding, Ming and Yang, Zhuoyi and Xu, Yifan and Zheng, Wendi and Xia, Xiao and others}, 187 | journal={arXiv preprint arXiv:2210.02414}, 188 | year={2022} 189 | } 190 | ``` 191 | 192 | You may also consider citing GLM's original work: 193 | 194 | ``` 195 | @inproceedings{du2022glm, 196 | title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling}, 197 | author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie}, 198 | booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 199 | pages={320--335}, 200 | year={2022} 201 | } 202 | ``` 203 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | from initialize import initialize, initialize_model_and_tokenizer 4 | 5 | if __name__ == "__main__": 6 | args = initialize(extra_args_provider=lambda parser: None) 7 | model, tokenizer = initialize_model_and_tokenizer(args) 8 | 9 | for seq_len in [512, 1024, 2048]: 10 | torch.distributed.barrier() 11 | start = time.time() 12 | with torch.no_grad(): 13 | _, *_ = model( 14 | torch.ones(1, seq_len, device=torch.cuda.current_device(), dtype=torch.int64), 15 | torch.arange(seq_len, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1), 16 | torch.randn(1, 1, seq_len, seq_len, device=torch.cuda.current_device()) < 0.5, 17 | ) 18 | torch.distributed.barrier() 19 | if torch.distributed.get_rank() == 0: 20 | print(f"Encode {seq_len}: {(time.time() - start) * 1000:.2f} ms") 21 | -------------------------------------------------------------------------------- /configs/model_glm_130b.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="glm-130b" 2 | CHECKPOINT_PATH="" 3 | MP_SIZE=8 4 | MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ 5 | --num-layers 70 \ 6 | --hidden-size 12288 \ 7 | --inner-hidden-size 32768 \ 8 | --vocab-size 150528 \ 9 | 
--num-attention-heads 96 \ 10 | --max-sequence-length 2048 \ 11 | --tokenizer-type icetk-glm-130B \ 12 | --layernorm-order post \ 13 | --load ${CHECKPOINT_PATH} \ 14 | --skip-init \ 15 | --fp16" 16 | -------------------------------------------------------------------------------- /configs/model_glm_130b_int4.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="glm-130b" 2 | CHECKPOINT_PATH="" 3 | MP_SIZE=4 4 | MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ 5 | --num-layers 70 \ 6 | --hidden-size 12288 \ 7 | --inner-hidden-size 32768 \ 8 | --vocab-size 150528 \ 9 | --num-attention-heads 96 \ 10 | --max-sequence-length 2048 \ 11 | --tokenizer-type icetk-glm-130B \ 12 | --layernorm-order post \ 13 | --quantization-bit-width 4 \ 14 | --load ${CHECKPOINT_PATH} \ 15 | --skip-init \ 16 | --fp16" 17 | -------------------------------------------------------------------------------- /configs/model_glm_130b_int8.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="glm-130b" 2 | CHECKPOINT_PATH="" 3 | MP_SIZE=8 4 | MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ 5 | --num-layers 70 \ 6 | --hidden-size 12288 \ 7 | --inner-hidden-size 32768 \ 8 | --vocab-size 150528 \ 9 | --num-attention-heads 96 \ 10 | --max-sequence-length 2048 \ 11 | --tokenizer-type icetk-glm-130B \ 12 | --layernorm-order post \ 13 | --quantization-bit-width 8 \ 14 | --load ${CHECKPOINT_PATH} \ 15 | --skip-init \ 16 | --fp16" 17 | -------------------------------------------------------------------------------- /configs/model_glm_130b_v100.sh: -------------------------------------------------------------------------------- 1 | MODEL_TYPE="glm-130b" 2 | CHECKPOINT_PATH="" 3 | MP_SIZE=8 4 | MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ 5 | --num-layers 70 \ 6 | --hidden-size 12288 \ 7 | --inner-hidden-size 32768 \ 8 | --vocab-size 150528 \ 9 | --num-attention-heads 96 \ 10 | --max-sequence-length 2048 \ 11 | --tokenizer-type icetk-glm-130B \ 12 | --layernorm-order post \ 13 | --load ${CHECKPOINT_PATH} \ 14 | --skip-init \ 15 | --fp16 \ 16 | --bminf \ 17 | --bminf-memory-limit 25" 18 | -------------------------------------------------------------------------------- /cuda/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | OPTIONS=-gencode arch=compute_61,code=sm_61 \ 3 | -gencode arch=compute_62,code=sm_62 \ 4 | -gencode arch=compute_70,code=sm_70 \ 5 | -gencode arch=compute_72,code=sm_72 \ 6 | -gencode arch=compute_75,code=sm_75 \ 7 | -gencode arch=compute_80,code=sm_80 \ 8 | -gencode arch=compute_86,code=sm_86 9 | 10 | TARGETS=$(patsubst %.cu, %.fatbin, $(wildcard *.cu)) 11 | 12 | all: $(TARGETS) 13 | 14 | %.fatbin: %.cu 15 | $(NVCC) -fatbin $^ $(OPTIONS) -o $@ 16 | 17 | .PHONY : clean, copy 18 | clean: 19 | rm $(TARGETS) 20 | 21 | copy: 22 | cp $(TARGETS) ../kernels/ 23 | -------------------------------------------------------------------------------- /cuda/quantization.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | template 4 | __device__ void 5 | int4WeightExtractionDevice(const int8_t* weight, 6 | const T* scale_list, 7 | T* output, 8 | const int n, 9 | const int k) 10 | { 11 | for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ 12 | int8_t original = weight[i]; 13 | int8_t high = original >> 4; 14 | int8_t low = original << 4; low = low >> 4; 15 | output[i * 2] = T(high) * 
scale_list[blockIdx.x]; 16 | output[i * 2 + 1] = T(low) * scale_list[blockIdx.x]; 17 | } 18 | } 19 | 20 | __device__ void 21 | int4WeightCompressionDevice(const int8_t* input, 22 | int8_t* output, 23 | const int n, 24 | const int k) 25 | { 26 | for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ 27 | output[i] = (input[i * 2] << 4) | (input[i * 2 + 1] & 0b00001111); 28 | } 29 | } 30 | 31 | template 32 | __device__ void 33 | int8WeightExtractionDevice(const int8_t* weight, 34 | const T* scale_list, 35 | T* output, 36 | const int n, 37 | const int k) 38 | { 39 | for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ 40 | output[i] = T(weight[i]) * scale_list[blockIdx.x]; 41 | } 42 | } 43 | 44 | extern "C" __global__ void int4WeightExtractionHalf(const int8_t* weight, 45 | const half* scale_list, 46 | half* output, 47 | const int n, 48 | const int k){ 49 | int4WeightExtractionDevice(weight, scale_list, output, n, k); 50 | } 51 | 52 | extern "C" __global__ void int4WeightExtractionFloat(const int8_t* weight, 53 | const float* scale_list, 54 | float* output, 55 | const int n, 56 | const int k){ 57 | int4WeightExtractionDevice(weight, scale_list, output, n, k); 58 | } 59 | 60 | extern "C" __global__ void int8WeightExtractionHalf(const int8_t* weight, 61 | const half* scale_list, 62 | half* output, 63 | const int n, 64 | const int k){ 65 | int8WeightExtractionDevice(weight, scale_list, output, n, k); 66 | } 67 | 68 | extern "C" __global__ void int8WeightExtractionFloat(const int8_t* weight, 69 | const float* scale_list, 70 | float* output, 71 | const int n, 72 | const int k){ 73 | int8WeightExtractionDevice(weight, scale_list, output, n, k); 74 | } 75 | 76 | extern "C" __global__ void int4WeightCompression(const int8_t* input, 77 | int8_t* output, 78 | const int n, 79 | const int k){ 80 | int4WeightCompressionDevice(input, output, n, k); 81 | } 82 | -------------------------------------------------------------------------------- /docs/evaluate-your-own-tasks.md: -------------------------------------------------------------------------------- 1 | # Evaluate Your Own Tasks 2 | 3 | ## YAML file for tasks 4 | 5 | We use the YAML file to define tasks, this allows us to easily evaluate multiple tasks at a single run and configure them independently. Specifically, you can add multiple tasks or folders at a time for evaluation, and the script will automatically collect all YAML files under those folders recursively. 6 | 7 | ``` 8 | # Single node 9 | bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ... 10 | # Multi node 11 | bash scripts/evaluate_multiple_node.sh task1.yaml task2.yaml dir1 dir2 ... 12 | ``` 13 | 14 | We support two types of evaluation tasks: multi-choice and generation. The YAML config options for both tasks are defined in `evaluation/configs.py`. 
Basically, all types of tasks share common configs defining task information: 15 | 16 | ```yaml 17 | name: 'glue_cola' # Task Name 18 | type: 'mul' # Task type, 'gen' (generate) or 'mul' (multiple choice) 19 | path: 'bloom/glue_cola' # Task data path relative to DATA_PATH in 'evaluate.sh' 20 | use_task_mask: False # Whether to use [gMASK] for evaluation 21 | unidirectional: False # Whether to use unidirectional attention 22 | max_seq_length: 2048 # Max sequence length 23 | file-pattern: # Organize jsonl files in groups 24 | validation: "**/validation.jsonl" # Will search for all files named 'validation.jsonl' in `DATA_PATH/bloom/glue_cola` using glob.glob() 25 | micro-batch-size: 30 # 'gen' tasks only support mbs = 1 for now 26 | ``` 27 | 28 | See configuration details for multi-choice and generation tasks in `evaluation/configs.py`. 29 | 30 | ## Data format for tasks 31 | 32 | We recommend organizing the task data in the following structure and setting up two groups named "validation" and "test" in the `file-pattern` config so that it becomes very easy to evaluate different prompts on both validation and test sets independently. 33 | 34 | ```bash 35 | DATA_PATH 36 | └── task_name 37 | ├── prompt_1 38 | │   ├── test.jsonl 39 | │   └── val.jsonl 40 | ├── prompt_2 41 | │   ├── test.jsonl 42 | │   └── val.jsonl 43 | └── prompt_3 44 | ├── test.jsonl 45 | └── val.jsonl 46 | ``` 47 | 48 | The evaluation data for each prompt are organized in JSON Lines (jsonl) format. For multi-choice tasks, the format of each line of JSON should be 49 | 50 | ```json 51 | { 52 | "inputs_pretokenized": "Context and question here", 53 | "choices_pretokenized": ["Choice 1", "Choice 2", "Choice 3"], 54 | "label": int 55 | } 56 | ``` 57 | 58 | The default metric for the multi-choice task is Accuracy. 59 | 60 | For the generation task, the format of each line of JSON should be 61 | 62 | ```json 63 | { 64 | "inputs_pretokenized": "Context and question here", 65 | "targets_pretokenized": ["Target 1", "Target 2", "Target 3"], 66 | "label": int 67 | } 68 | ``` 69 | 70 | The default metrics for the generation task are EM (Exact Match) and F1. Given the inputs, the sequence generated by the model is scored against each target separately, and the highest score is taken. 71 | 72 | 73 | ## Implement Your Metrics 74 | 75 | You can customize your evaluation metrics function and add it to `DEFAULT_METRICS` in `evaluation/metrics.py`, and then you can specify `metric: ['Your metric name']` in the task YAML file. 76 | 77 | ## Fully customize the evaluation process 78 | 79 | By default, we implement classes named `MultiChoiceTask` and `GenerationTask` in `evaluation/tasks.py` for multi-choice tasks and generation tasks, respectively. 80 | 81 | You can implement a new task class that inherits from one of these two classes, and implement the `process_single_batch` function to define how to process a batch of inputs and get the predictions. Following [Big-Bench](https://github.com/google/BIG-bench/#creating-the-task), we implemented two methods you can use for your evaluation: 82 | 83 | - `model.cond_log_prob()`: Compute the probabilities of provided model outputs for given inputs. 84 | - `model.generate_text()`: Generate text for given inputs. 85 | 86 | Once you have created the new task class, you need to specify the relative path to import the class in the `module` field of the task YAML file.
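For illustration, a minimal custom multi-choice task might look like the sketch below. The file path and class name are hypothetical, and the exact batch layout and return format follow the parent class; refer to `evaluation/tasks.py` and `evaluation/dataset.py` for the real signatures.

```python
# tasks/my_task/task.py -- hypothetical example, not part of the repository
from evaluation import MultiChoiceTask


class MyMultiChoiceTask(MultiChoiceTask):
    def process_single_batch(self, batch):
        # Score every candidate choice with the model and pick the most likely one.
        # `cond_log_prob` is the helper described above; it is assumed here to return
        # one list of per-choice scores for each sample in the batch.
        log_probs = self.model.cond_log_prob(batch)
        return [scores.index(max(scores)) for scores in log_probs]
```

The task YAML file would then point to this class with something like `module: "tasks.my_task.task.MyMultiChoiceTask"`.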
See `tasks/lambada/task.py` and `tasks/lambada/lambada.yaml` for how we customize the beam search generation strategy for LAMBADA tasks and configure the YAML file. 87 | -------------------------------------------------------------------------------- /docs/inference-with-fastertransformer.md: -------------------------------------------------------------------------------- 1 | # Inference with FasterTransformer 2 | 3 | [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) provides a script and recipe to run the highly optimized transformer-based encoder and decoder components, and it is tested and maintained by NVIDIA. 4 | 5 | We adapted GLM-130B to FasterTransformer for fast inference; see the [benchmark](#benchmark) section for details. 6 | 7 | ## Download the Model 8 | 9 | See [Get Model](/README.md#environment-setup). 10 | 11 | ## Recommended: Run With Docker 12 | 13 | Use Docker to quickly build a Flask API application for GLM-130B. 14 | 15 | ### Requirements 16 | 17 | - [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) 18 | 19 | ### Build Container Image 20 | 21 | ```bash 22 | git clone https://github.com/THUDM/FasterTransformer.git 23 | cd FasterTransformer 24 | bash docker/build.sh 25 | ``` 26 | 27 | ### Run API With Checkpoints 28 | 29 | Set MPSIZE to the number of GPUs needed for the checkpoint, and DATA_TYPE to the checkpoint precision. The checkpoint we distribute is 8-way tensor parallel in FP16 precision; a conversion script is also provided if you need to change the tensor parallel dimension or the weight precision. 30 | 31 | ```bash 32 | # Convert the checkpoint to MP=4, DATA_TYPE=INT4 33 | python tools/convert_tp.py \ 34 | --input-folder <SRC_CKPT_PATH> \ 35 | --output-folder <DST_CKPT_PATH> \ 36 | --target-tp 4 \ 37 | --quantization-bit-width 4 38 | # Run API 39 | docker run -it --rm --gpus all --shm-size=10g -p 5000:5000 \ 40 | -v /49300:/checkpoints:ro \ 41 | -e MPSIZE=4 -e DATA_TYPE=int4 \ 42 | ftglm:latest 43 | ``` 44 | 45 | ### Test 46 | 47 | #### Benchmark 48 | 49 | ```bash 50 | python3 examples/pytorch/glm/glm_server_test.py 51 | ``` 52 | 53 | #### Web Demo 54 | 55 | ```bash 56 | pip install gradio 57 | python3 examples/pytorch/glm/glm_server_frontend_test.py 58 | ``` 59 | 60 | ## Manual Configuration 61 | 62 | ### Requirements 63 | 64 | - CMake >= 3.13 for PyTorch 65 | - CUDA 11.0 or newer version 66 | - NCCL 2.10 or newer version 67 | - Python 3 is recommended because some features are not supported in Python 2 68 | - PyTorch: verified on 1.10.1; >= 1.8.0 should work. 69 | 70 | ### Setup Using Docker 71 | 72 | ```bash 73 | docker run -it --rm --gpus all nvcr.io/nvidia/pytorch:22.09-py3 /bin/bash 74 | conda install -y pybind11 75 | ``` 76 | 77 | ### Setup Using Conda 78 | 79 | Alternatively, all the packages can be installed using conda. 80 | 81 | > Some of our current [structure](https://github.com/THUDM/FasterTransformer/blob/main/src/fastertransformer/th_op/glm/GlmOp.h#L30) requires that `g++` and `libtorch` produce the same results, so a pre-compiled `libtorch` may only work with `g++-7` or `g++-9`. And although GLM-130B itself does not rely on openmpi, FasterTransformer requires it during the build process. We are working on these issues. 
82 | 83 | ```bash 84 | conda install -y cmake pybind11 85 | conda install -y -c conda-forge cudatoolkit-dev cudnn 86 | cp -r $CONDA_PREFIX/lib/libcudnn* /usr/local/cuda/lib64/ 87 | cp -r $CONDA_PREFIX/include/cudnn*.h /usr/local/cuda/include/ 88 | ``` 89 | 90 | If it is hard to install cudatoolkit-dev and cudnn via conda, just install them from [NVIDIA Developer](https://developer.nvidia.com/cuda-downloads), and make sure CMake is able to find cudnn. 91 | 92 | ```bash 93 | cp cudnn/include/cudnn*.h /usr/local/cuda/include 94 | cp cudnn/lib/libcudnn* /usr/local/cuda/lib64 95 | chmod a+r /usr/local/cuda/include/cudnn*.h 96 | chmod a+r /usr/local/cuda/lib64/libcudnn* 97 | ``` 98 | 99 | GLM-130B is trained with FP16 precision, so a total of 260G of GPU memory is required to store the model weights. The model has been tested with 8 * 40G A100s. 100 | 101 | ### Build 102 | 103 | Get the code and install all dependencies: 104 | 105 | ```bash 106 | git clone https://github.com/THUDM/FasterTransformer.git 107 | mkdir -p FasterTransformer/build 108 | cd FasterTransformer/build 109 | pip3 install icetk transformers 110 | ``` 111 | 112 | Note: the `xx` in `-DSM=xx` in the following scripts is the compute capability of your GPU, e.g. 60 (P40), 61 (P4), 70 (V100), 75 (T4), 80 (A100), or 86 (RTX 3090). The default setting includes 70, 75, 80 and 86. 113 | 114 | ```bash 115 | cmake -DSM=80 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. 116 | make -j 117 | ``` 118 | 119 | ### Run GLM-130B 120 | 121 | Generate the `gemm_config.in` file: 122 | 123 | ```bash 124 | # ./bin/gpt_gemm 125 | ./bin/gpt_gemm 1 1 128 96 128 49152 150528 1 8 126 | ``` 127 | 128 | Run GLM-130B with PyTorch and Flask: 129 | 130 | ```bash 131 | bash ../examples/pytorch/glm/glm-server.sh 132 | ``` 133 | 134 | You need to check and edit this file to set arguments such as `CHECKPOINT_PATH`. 135 | 136 | ## Optimization methods 137 | 138 | Optimizations in GLM-130B are similar to those in GPT and GPT-J, as described in [FasterTransformer/gpt_guide.md](https://github.com/NVIDIA/FasterTransformer/blob/main/docs/gpt_guide.md). Meanwhile, some operators differ from GPT, such as the implementation of rotary embedding and the use of GeGLU, so we additionally added them to FasterTransformer. 139 | 140 | ## Benchmark 141 | 142 | - Hardware: DGX-A100 (8 * 40G) 143 | 144 | ## Encode 145 | 146 | | **Sequence Len** | 512 | 1024 | 2048 | 147 | | ---------- | ------ | ------ | ------ | 148 | | Megatron | 145 ms | 250 ms | 453 ms | 149 | | FasterTransformer | 120 ms | 220 ms | OOM | 150 | 151 | ## Decode 152 | 153 | | **Sequence Len** | 512 | 1024 | 2048 | 154 | | ---------- | ------- | ------- | -------- | 155 | | Megatron | 45.21 s | 89.00 s | 179.22 s | 156 | | FasterTransformer | 18.77 s | 39.81 s | 89.88 s | 157 | -------------------------------------------------------------------------------- /docs/low-resource-inference.md: -------------------------------------------------------------------------------- 1 | # Low-resource Inference with BMInf 2 | 3 | GLM-130B is trained with 4-way tensor parallelism and 8-way pipeline parallelism for efficiency. The checkpoint is then converted into an 8-way tensor parallel one so that inference can be performed on a single node. GLM-130B has 130 billion parameters in FP16 precision, so a total of 260G of GPU memory is required to store the model weights. The DGX-A100 server has 8 A100s and provides a total of 320G of GPU memory (640G for the 80G A100 version), so it suits GLM-130B well. 
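As a quick sanity check of these numbers, the memory requirement follows directly from the parameter count; the snippet below is only a back-of-the-envelope illustration and is not part of the repository.

```python
# Back-of-the-envelope memory math for GLM-130B weights (illustrative only, decimal GB).
params = 130e9                 # 130 billion parameters
total_gb = params * 2 / 1e9    # 2 bytes per parameter in FP16 -> 260 GB
per_gpu_gb = total_gb / 8      # split across 8 GPUs -> 32.5 GB each
print(f"{total_gb:.0f} GB total, {per_gpu_gb:.1f} GB per GPU")
```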
4 | 5 | However, a server with 8 * 32G V100 only provides a total of 256G of GPU memory, which means that fully loading the model weights is not possible. Fortunately, with the swap-in-and-out feature between CPU and GPU memory provided by the [BMInf](https://github.com/OpenBMB/BMInf) library, GLM-130B can still run on servers with a smaller amount of GPU memory. After joint debugging with the BMInf team, we achieved a reasonable evaluation efficiency on DGX-1 servers with 8 * 32G V100 by carefully overlapping computation and communication; see the [benchmark section](#benchmark) for details. 6 | 7 | We have integrated BMInf into our codebase: just install BMInf via `pip install bminf`, and change the model configuration file from `configs/model_glm_130b.sh` to `configs/model_glm_130b_v100.sh` in your launch shell script. The default BMInf config is for V100 servers; you can also adjust the maximum memory that the model weights may occupy on one GPU by setting `--bminf-memory-limit` in the model config file according to your GPU memory. 8 | 9 | ## Benchmark 10 | 11 | ### Evaluation 12 | 13 | - CoLA task on the validation set 14 | - Micro Batch Size = 30 15 | - BMInf: 25 GB model-weight limit per GPU, set by `--bminf-memory-limit 25` 16 | 17 | | | Peak GPU Memory | Time | 18 | | -------------- | ---------- | ------ | 19 | | A100-SAT | 40.3 G | 74.6 s | 20 | | V100-SAT | OOM | OOM | 21 | | V100-SAT-BMInf | 32.3 G | 196.0 s | 22 | 23 | The `micro-batch-size` config in task YAML files is configured according to the maximum utilization of the DGX-A100 server. If you encounter an OOM error on the V100 server, please adjust the `micro-batch-size` appropriately. 24 | 25 | ### Text generation 26 | 27 | In text generation, due to the small amount of computation per model forward (usually <10 tokens/forward using the beam search strategy), the communication between CPU and GPU memory becomes the bottleneck. With the help of the BMInf team, we did an in-depth profile on our V100 server. Given a 25GB model weight limit per GPU, a total of 13 layers need to be copied from CPU to GPU for a single forward, and each layer takes about 75 ms of IO, indicating that the real IO speed between CPU and GPU is `260GB / 70 / 8 / 75ms = 6.19GB/s`. Our V100 server uses PCI-E 3.0 and two V100s share a switch, so the theoretical bandwidth for each GPU is 8GB/s, close to our profiling results. A server with PCI-E 4.0 will greatly reduce the IO time. Even so, generating long text can still take several minutes, so **we do not recommend using V100 servers in text generation scenarios**. To address this, we are working on INT8 quantization so that GLM-130B can fit even a single RTX 3090 server (24G * 8). 28 | 29 | -------------------------------------------------------------------------------- /docs/media/16613396005977.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/docs/media/16613396005977.jpg -------------------------------------------------------------------------------- /docs/quantization.md: -------------------------------------------------------------------------------- 1 | # Quantization of GLM-130B 2 | 3 | ## Usage 4 | 5 | > Please note that SwissArmyTransformer>=0.2.11 is required for quantization 6 | 7 | Set `CHECKPOINT_PATH` in `configs/model_glm_130b_{int4/int8}.sh` to your local checkpoint folder. 
The model will first be initialized from the FP16 checkpoint in CPU memory, then dynamically quantized and transferred to GPU memory. So please make sure you have enough CPU memory (>260GB) to store the FP16 model weights. 8 | 9 | Pay attention to the tensor parallel dimension of the model checkpoint: we only provide the checkpoint in 8-way tensor parallel form, i.e. 8 GPUs store a whole model. If you need to do inference on a smaller number of GPUs, e.g. 4 * RTX 3090 GPUs with INT4 precision, you first need to convert the checkpoint to 4-way tensor parallel using the following command and modify `MP_SIZE` in the corresponding model config file. 10 | 11 | ```bash 12 | python tools/convert_tp.py \ 13 | --input-folder <SRC_CKPT_PATH> \ 14 | --output-folder <DST_CKPT_PATH> \ 15 | --target-tp 4 16 | ``` 17 | 18 | Finally, change the model config file from `configs/model_glm_130b.sh` to `configs/model_glm_130b_{int4/int8}.sh` in your scripts (e.g. `scripts/generate.sh`), then run your scripts as usual. 19 | 20 | By default, the full precision checkpoint is expected to be loaded. Running the conversion script with `--quantization-bit-width <4 or 8>` will produce quantized model weights. To load from a quantized checkpoint, you should add `--from-quantized-checkpoint` to your model config file. 21 | 22 | ## Evaluation Results 23 | 24 | | | **MMLU (Accuracy↑)** | **LAMBADA (Accuracy↑)** | **WikiText-2 (PPL↓)** | **WikiText-103 (PPL↓)** | **PTB (PPL↓)** | 25 | | ---- | -------- | ----------- | ------------------- | --------------------- | ------------ | 26 | | FP16 | 44.751 | 80.206 | 10.901 | 10.759 | 18.964 | 27 | | INT8 | 44.709 | 80.206 | 10.904 | 10.763 | 18.994 | 28 | | INT4 | 44.801 | 79.468 | 11.167 | 11.046 | 19.535 | 29 | 30 | ## Space and Speed Benchmark 31 | 32 | | **Hardware** | **GPU Memory** | **Precision** | **512** | **1024** | **2048** | 33 | | ------------ | -------------- | ------------ | -------- | -------- | -------- | 34 | | 8 * A100 | 40 GB | FP16 | 45.21 s | 89.00 s | 179.22 s | 35 | | 8 * V100 | 32 GB | INT8 | 106.35 s | 216.50 s | 449.17 s | 36 | | 4 * RTX 3090 | 24 GB | INT4 | 138.66 s | 292.69 s | 649.64 s | 37 | | 8 * RTX 2080 Ti | 11 GB | INT4 | 117.39 s | 240.96 s | 528.66 s | 38 | 39 | 40 | The results in the table above were tested with SAT. Using FasterTransformer can speed this up by more than 2X, as shown in the table below; detailed usage is described in [Inference with FasterTransformer](../docs/inference-with-fastertransformer.md). 
41 | 42 | | **Hardware** | **GPU Memory** | **Precison** | **128** Encode / Decode | **512** Encode / Decode | **1024** Encode / Decode | **2048** Encode / Decode | 43 | | --------------- | -------------- | ------------ | ----------------------- | ----------------------- | ------------------------ | ------------------------ | 44 | | 8 * A100 | 40 GB | INT4 | 145 ms / 4.29 s | 183 ms / 17.7 s | 313 ms / 37.8 s | 495 ms / 86.0 s | 45 | | 4 * A100 | 80 GB | INT4 | 174 ms / 6.62 s | 272 ms / 27.1 s | 439 ms / 56.2 s | 810 ms / 123 s | 46 | | 8 * V100 | 32 GB | INT4 | 309 ms / 6.97 s | 666 ms / 28.1 s | 1208 ms / 58.4 s | 2304 ms / 125 s | 47 | | 4 * V100 | 32 GB | INT4 | 448 ms / 11.4 s | 843 ms / 45.87 s | 1488 ms / 93.5 s | 2803 ms / 196 s | 48 | | 8 * RTX 3090 | 24 GB | INT4 | 283 ms / 5.07 s | 915 ms / 20.5 s | 1793 ms / 42.7 s | 3477 ms / 90.3 s | 49 | | 4 * RTX 3090 | 24 GB | INT4 | 374 ms / 8.16 s | 1300 ms / 32.3 s | OOM / 66.5 s | OOM / 150 s | 50 | | 8 * RTX 2080 Ti | 11 GB | INT4 | 392 ms / 6.77 s | 1044 ms / 27.29 s | OOM / 56.02 s | OOM / OOM | 51 | 52 | ## Details 53 | 54 | Typical methods quantize both model weights and activations to INT8, enabling the INT8 matrix multiplication kernel for efficiency. However, we found that there are outliers in GLM-130B's activations, making it hard to reduce the precision of activations. 55 | 56 | Concurrently, researchers from [Meta AI](https://arxiv.org/abs/2208.07339) also found the emergent outliers issue in large-scale transformers (>6.8B), which is consistent with our observations on GLM-130B. They conducted an in-depth analysis and found that the outliers make up only about 0.1% of all feature dimensions, so it's possible to make a decomposition for matrix multiplication that focuses on high precision multiplication for these particular dimensions. 57 | 58 | | ![](media/16613396005977.jpg) | 59 | |:--:| 60 | | *Distribution of outliers (the white ones) in GLM-130B's activation* | 61 | 62 | Unfortunately, the outliers in GLM-130B can sometimes make up at most 30% of the feature dimension, possibly because we used GLU as a variant of FFN. Therefore, a mixed-precision decomposition for matmul can be much less efficient than a single FP16 matmul. After a few weeks of trial, we finally decided to keep the precision of activations to FP16 and only consider the quantization of model weights. In that case, the quantized model parameters are dynamically converted to FP16 precision at runtime, introducing a small computational overhead but greatly reducing GPU memory requirements for storing model weights. 63 | 64 | We quantized all linear layers as they take up most of the model parameters. All model weights, excluding input/output embedding, layernorm and bias terms are quantized using vector-wise symmetric quantization. At the quantization precision of INT4, two INT4 weights are compressed into one INT8 weight for saving GPU memory usage, so that only 70GB of GPU memory approximately is required for INT4 model weights. 
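To make the scheme above concrete, the sketch below shows vector-wise (per-row) symmetric weight quantization with INT4 packing. It is illustrative only: the function name is made up, an even number of input features is assumed, and the repository's actual implementation lives in `quantization/functional.py` together with the CUDA kernels in `cuda/quantization.cu`, which unpack the weights back to FP16 at runtime.

```python
import torch

def quantize_linear_weight(weight: torch.Tensor, bit_width: int = 4):
    """Vector-wise symmetric quantization of a [out_features, in_features] FP16 weight.

    Hypothetical helper for illustration; assumes in_features is even when bit_width == 4.
    """
    qmax = 2 ** (bit_width - 1) - 1                                # 7 for INT4, 127 for INT8
    scale = weight.abs().max(dim=-1, keepdim=True).values / qmax   # one scale per output row
    q = torch.round(weight / scale).clamp(-qmax, qmax).to(torch.int8)
    if bit_width == 4:
        # Pack two INT4 values into one INT8 byte, mirroring int4WeightCompression:
        # even-indexed element -> high nibble, odd-indexed element -> low nibble.
        q = (q[:, 0::2] << 4) | (q[:, 1::2] & 0x0F)
    return q, scale.to(weight.dtype)

# At inference time, the packed weights are expanded back to FP16 as q * scale
# (see the int4/int8WeightExtraction kernels), so activations stay in FP16 throughout.
```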
65 | 66 | 67 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | import time 2 | import importlib 3 | 4 | from os.path import join, isdir, isfile, relpath 5 | from glob import glob 6 | 7 | from evaluation import BaseConfig, ModelForEvaluation, DEFAULT_CLASS, print_rank_0 8 | from initialize import initialize, initialize_model_and_tokenizer 9 | 10 | 11 | def add_evaluation_specific_args(parser): 12 | """Arguments for evaluation""" 13 | group = parser.add_argument_group("evaluation", "Evaluation configurations") 14 | 15 | # Task 16 | group.add_argument("--task", nargs="+", default=[], help="All task config to evaluation") 17 | group.add_argument("--data-path", type=str, required=True, help="Data dir path for all tasks") 18 | return parser 19 | 20 | 21 | def find_all_tasks(all_task_config_path): 22 | tasks = [] 23 | for task in all_task_config_path: 24 | if isdir(task): 25 | tasks += [relpath(path, ".") for path in glob(join(task, "**/*.yaml"), recursive=True)] 26 | elif isfile(task): 27 | tasks.append(task) 28 | return tasks 29 | 30 | 31 | def evaluate_all_tasks(data_path, model, tokenizer, all_task_config_path, task_classes): 32 | for config_path, task_class in zip(all_task_config_path, task_classes): 33 | config = task_class.config_class().from_yaml_file(config_path) 34 | config.path = join(data_path, config.path) 35 | task = task_class(model, tokenizer, config) 36 | task.evaluate() 37 | 38 | 39 | def main(): 40 | args = initialize(extra_args_provider=add_evaluation_specific_args) 41 | args.task = find_all_tasks(args.task) 42 | 43 | task_classes = [] 44 | print_rank_0("> Loading task configs") 45 | for task_config_path in args.task: 46 | config = BaseConfig.from_yaml_file(task_config_path) 47 | if config.module: 48 | path = ".".join(config.module.split(".")[:-1]) 49 | module = importlib.import_module(path) 50 | class_name = config.module.split(".")[-1] 51 | task_class = getattr(module, class_name) 52 | task_classes.append(task_class) 53 | else: 54 | task_classes.append(DEFAULT_CLASS[config.type]) 55 | print_rank_0(f" Task {config.name} loaded from config {task_config_path}") 56 | print_rank_0(f"> Successfully load {len(task_classes)} task{'s' if len(task_classes) > 1 else ''}") 57 | 58 | model, tokenizer = initialize_model_and_tokenizer(args) 59 | model = ModelForEvaluation(model) 60 | 61 | start = time.time() 62 | evaluate_all_tasks(args.data_path, model, tokenizer, args.task, task_classes) 63 | print_rank_0(f"Finish {len(task_classes)} task{'s' if len(task_classes) > 1 else ''} in {time.time() - start:.1f}s") 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .configs import * 2 | from .model import ModelForEvaluation 3 | from .tasks import BaseTask, GenerationTask, MultiChoiceTask, LanguageModelTask 4 | from .dataset import GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset 5 | from .metrics import qa_evaluate 6 | from .utils import print_rank_0 7 | 8 | DEFAULT_CLASS = { 9 | TaskType.GENERATION: GenerationTask, 10 | TaskType.MULTICHOICE: MultiChoiceTask, 11 | TaskType.LANGUAGE_MODEL: LanguageModelTask, 12 | } 13 | -------------------------------------------------------------------------------- /evaluation/configs.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from dataclass_wizard import YAMLWizard 3 | from dataclasses import dataclass, field 4 | from enum import Enum 5 | from typing import Optional, List, Dict 6 | 7 | 8 | class TaskType(Enum): 9 | MULTICHOICE = "mul" 10 | GENERATION = "gen" 11 | LANGUAGE_MODEL = "lm" 12 | OTHER = "other" 13 | 14 | 15 | @dataclass 16 | class BaseConfig(YAMLWizard): 17 | name: str # Task name 18 | type: TaskType # Task type 19 | path: str # task data path relative to DATA_PATH 20 | 21 | module: Optional[str] = None # Custom task module file, optional 22 | metrics: List[str] = field(default_factory=list) # Evaluation metrics 23 | 24 | use_task_mask: bool = False # Whether to use [gMASK] for evaluation 25 | use_multitask_encoding: bool = False # Not supported now 26 | unidirectional: bool = False # Whether to use unidirectional attention 27 | max_seq_length: int = 2048 # Max sequence length 28 | file_pattern: str | Dict[str, str] = "**/*.json*" # Organize data file in groups 29 | 30 | micro_batch_size: int = 1 # 'gen' task only support mbs = 1 for now 31 | 32 | def __post_init__(self): 33 | assert self.use_task_mask or not self.unidirectional, "[MASK] doesn't support unidirectional attention" 34 | 35 | 36 | @dataclass 37 | class MultiChoiceTaskConfig(BaseConfig): 38 | module = "evaluation.MultiChoiceTask" 39 | metrics: List[str] = field(default_factory=lambda: ["Accuracy"]) 40 | 41 | 42 | @dataclass 43 | class GenerationTaskConfig(BaseConfig): 44 | module = "evaluation.GenerationTask" 45 | metrics: List[str] = field(default_factory=lambda: ["EM", "F1"]) 46 | sampling_strategy: str = "BaseStrategy" 47 | num_beams: int = 4 48 | length_penalty: float = 1.0 49 | no_repeat_ngram_size: int = 3 50 | min_gen_length: int = 0 51 | max_gen_length: int = 128 52 | 53 | 54 | @dataclass 55 | class LanguageModelTaskConfig(BaseConfig): 56 | module = "evaluation.LanguageModelTask" 57 | metrics: List[str] = field(default_factory=lambda: ["PPL"]) 58 | 59 | generation_length: int = 256 # Generated length in each window 60 | -------------------------------------------------------------------------------- /evaluation/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import json 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from typing import List, Union 9 | from abc import ABC, abstractmethod 10 | from scipy.linalg import block_diag 11 | from itertools import accumulate 12 | from bisect import bisect_right 13 | 14 | from SwissArmyTransformer import get_tokenizer 15 | 16 | from .configs import BaseConfig, MultiChoiceTaskConfig, GenerationTaskConfig, LanguageModelTaskConfig 17 | from .utils import get_tokenized_input 18 | 19 | 20 | def pad_batch(tokens, position_ids, attention_mask, max_seq_length): 21 | attention_mask = np.pad( 22 | attention_mask, 23 | pad_width=((0, max_seq_length - len(tokens)),), 24 | mode="constant", 25 | constant_values=0, 26 | ) 27 | tokens = np.concatenate((tokens, np.zeros(max_seq_length - len(tokens), dtype=np.int64))) 28 | position_ids = np.concatenate((position_ids, np.zeros(max_seq_length - len(position_ids), dtype=np.int64))) 29 | return tokens, position_ids, attention_mask 30 | 31 | 32 | class EvaluationDataset(torch.utils.data.Dataset, ABC): 33 | """ 34 | Jsonlines of { 35 | "text": context 36 | "choices": [choice_id1,...], if not None, len(target) == 1 37 | "label": If generation task -1, else [0, 
len(choices)) 38 | } 39 | If [MASK] not in context, will append [MASK] after text 40 | """ 41 | 42 | def __init__(self, path: Union[str, List[str]], config: BaseConfig): 43 | self.path = path if isinstance(path, list) else [path] 44 | self.config = config 45 | self.max_seq_length = self.config.max_seq_length 46 | self.dtype = np.int64 47 | 48 | self.tokenizer = get_tokenizer() 49 | self.mask_id = self.tokenizer.get_command("[MASK]") 50 | self.gmask_id = self.tokenizer.get_command("[gMASK]") 51 | 52 | self.data = [] 53 | for p in self.path: 54 | self.process_single_file(p) 55 | 56 | @property 57 | def has_collate_fn(self) -> bool: 58 | return False 59 | 60 | def collate_fn(self, samples): 61 | return None 62 | 63 | def process_single_file(self, path): 64 | with open(os.path.join(path), "r", encoding="utf-8") as file: 65 | for line in file: 66 | item = json.loads(line) 67 | self.data.append(self.process_single_item(item)) 68 | 69 | @abstractmethod 70 | def process_single_item(self, item) -> dict: 71 | pass 72 | 73 | def __len__(self): 74 | return len(self.data) 75 | 76 | 77 | class GenerationTaskDataset(EvaluationDataset): 78 | config: GenerationTaskConfig 79 | 80 | def process_single_item(self, item): 81 | text, targets = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "targets") 82 | if len(text) + self.config.max_gen_length + 2 > self.config.max_seq_length: 83 | text_length = self.config.max_seq_length - self.config.max_gen_length - 2 84 | text = text[len(text) - text_length : len(text)] 85 | return {"text": text, "targets": targets} 86 | 87 | @property 88 | def has_collate_fn(self) -> bool: 89 | return True 90 | 91 | def collate_fn(self, samples): 92 | TILE = 32 93 | length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE 94 | 95 | token_batch, position_id_batch, attention_mask_batch = [], [], [] 96 | context_length_batch, target_position_id_batch = [], [] 97 | 98 | for sample in samples: 99 | token, position_id, attention_mask = pad_batch( 100 | sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad 101 | ) 102 | token_batch.append(token) 103 | position_id_batch.append(position_id) 104 | attention_mask_batch.append(attention_mask) 105 | context_length_batch.append(sample['context_length']) 106 | target_position_id_batch.append(sample['target_position_id']) 107 | return { 108 | "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), 109 | "position_ids": torch.tensor(np.array(position_id_batch), dtype=torch.int64), 110 | "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64) < 0.5, 111 | "context_length": torch.tensor(context_length_batch, dtype=torch.int64), 112 | "target_position_ids": torch.tensor(np.array(target_position_id_batch), dtype=torch.int64), 113 | } 114 | 115 | @staticmethod 116 | def build_generation_sample(text, max_gen_length, use_task_mask, unidirectional=True): 117 | tokenizer = get_tokenizer() 118 | 119 | sop_id = tokenizer.get_command("sop") 120 | mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]") 121 | 122 | token = np.array(text, dtype=np.int64) 123 | 124 | blank_filling = mask_id in text 125 | if blank_filling: 126 | assert not unidirectional, "Unidirectional attention doesn't support blank filling" 127 | assert not use_task_mask, "Unidirectional attention doesn't support task mask" 128 | mask_position = text.index(mask_id) 129 | token = np.concatenate((token, [sop_id])) 130 | else: 131 | mask_position = 
len(token) 132 | if unidirectional: 133 | token = np.concatenate(([mask_id, sop_id], token)) 134 | else: 135 | token = np.concatenate((token, [mask_id, sop_id])) 136 | context_length = len(token) 137 | 138 | position_id = np.arange(0, context_length, dtype=np.int64) 139 | target_position_id = np.arange(context_length, context_length + max_gen_length, dtype=np.int64) 140 | if not use_task_mask: 141 | position_id[context_length - 1:] = mask_position 142 | target_position_id[:] = mask_position 143 | 144 | attention_mask = np.tril(np.ones((context_length, context_length), dtype=np.int64)) 145 | if not unidirectional: 146 | attention_mask[: context_length - 1, : context_length - 1] = 1 147 | 148 | item = { 149 | "token": token, 150 | "position_id": position_id, 151 | "target_position_id": target_position_id, 152 | "attention_mask": attention_mask, 153 | "context_length": context_length, 154 | } 155 | return item 156 | 157 | def __getitem__(self, idx): 158 | item = self.data[idx] 159 | sample = self.build_generation_sample( 160 | item["text"], 161 | max_gen_length=self.config.max_gen_length, 162 | use_task_mask=self.config.use_task_mask, 163 | unidirectional=self.config.unidirectional, 164 | ) 165 | sample["targets"] = [np.array(target, dtype=self.dtype) for target in item["targets"]] 166 | return sample 167 | 168 | 169 | class MultiChoiceTaskDataset(EvaluationDataset): 170 | config: MultiChoiceTaskConfig 171 | 172 | def __init__(self, path, config: MultiChoiceTaskConfig): 173 | self.is_single_token = True # set to False later in process_single_item func 174 | super().__init__(path, config) 175 | 176 | @property 177 | def has_collate_fn(self) -> bool: 178 | return True 179 | 180 | def collate_fn(self, samples): 181 | TILE = 32 182 | length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE 183 | 184 | token_batch, position_id_batch, attention_mask_batch = [], [], [] 185 | choices_batch, choice_target_ids_batch = [], [] 186 | 187 | for sample in samples: 188 | token, position_id, attention_mask = pad_batch( 189 | sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad 190 | ) 191 | token_batch.append(token) 192 | position_id_batch.append(position_id) 193 | attention_mask_batch.append(attention_mask) 194 | choices_batch.append(sample["choices"]) 195 | choice_target_ids_batch.append(sample["choice_target_ids"]) 196 | 197 | return { 198 | "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), 199 | "position_ids": torch.tensor(np.array(position_id_batch), dtype=torch.int64), 200 | "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64) < 0.5, 201 | "choices": choices_batch, 202 | "choice_target_ids": choice_target_ids_batch, 203 | "is_single_token": self.is_single_token, 204 | } 205 | 206 | def process_single_item(self, item): 207 | text, choices, label = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "choices"), item["label"] 208 | 209 | tgt_seq_length = sum([len(choice) for choice in choices]) 210 | if tgt_seq_length == len(choices): 211 | # For single token, we only insert one [sop] 212 | tgt_seq_length = 1 213 | 214 | assert tgt_seq_length < self.config.max_seq_length 215 | if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: 216 | text_length = self.config.max_seq_length - tgt_seq_length - 2 217 | text = text[len(text) - text_length : len(text)] 218 | 219 | assert not ( 220 | self.mask_id in text and self.config.use_multitask_encoding 221 | ), "Unified multitask encoding 
don't support blank filling" 222 | 223 | if tgt_seq_length != 1: 224 | self.is_single_token = False 225 | 226 | return { 227 | "text": text, 228 | "choices": choices, 229 | "label": label, 230 | } 231 | 232 | @staticmethod 233 | def build_multiple_choice_sample( 234 | text, choices, is_single_token, unified_multitask_encoding=False, use_task_mask=False 235 | ): 236 | tokenizer = get_tokenizer() 237 | 238 | sop_id = tokenizer.get_command("sop") 239 | mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]") 240 | 241 | token = np.array(text, dtype=np.int64) 242 | target = np.array(text, dtype=np.int64) 243 | position_id = np.arange(len(text), dtype=np.int64) 244 | choice_target_id = [] 245 | 246 | blank_filling = mask_id in text 247 | if not blank_filling: 248 | mask_position = len(token) 249 | token = np.concatenate((token, [mask_id])) 250 | target = np.concatenate((target, [mask_id])) 251 | position_id = np.concatenate((position_id, [mask_position])) 252 | else: 253 | mask_position = text.index(mask_id) 254 | 255 | division = len(token) 256 | attention_mask = [np.ones((len(token), len(token)), dtype=np.int64)] 257 | 258 | for choice in choices: 259 | if use_task_mask == False: 260 | position_id = np.concatenate( 261 | ( 262 | position_id, 263 | [mask_position] * len(choice) 264 | if blank_filling or not unified_multitask_encoding 265 | else np.arange(mask_position, mask_position + len(choice), dtype=np.int64), 266 | ) 267 | ) 268 | else: 269 | position_id = np.concatenate( 270 | ( 271 | position_id, 272 | np.arange(division, division + len(choice), dtype=np.int64), 273 | ) 274 | ) 275 | 276 | choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64)) 277 | attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64))) 278 | token = np.concatenate((token, [sop_id], choice[:-1])) 279 | target = np.concatenate((target, choice)) 280 | 281 | if is_single_token: 282 | break 283 | 284 | attention_mask = block_diag(*attention_mask) 285 | attention_mask[: len(token), :division] = 1 286 | 287 | if is_single_token: 288 | choices = np.array(choices, dtype=np.int64).squeeze().tolist() 289 | 290 | item = { 291 | "token": token, 292 | "position_id": position_id, 293 | "attention_mask": attention_mask, 294 | "choices": choices, 295 | "choice_target_ids": choice_target_id[0] if is_single_token else choice_target_id, 296 | } 297 | return item 298 | 299 | def __getitem__(self, idx): 300 | item = self.data[idx] 301 | sample = self.build_multiple_choice_sample( 302 | item["text"], 303 | item["choices"], 304 | is_single_token=self.is_single_token, 305 | unified_multitask_encoding=self.config.use_multitask_encoding, 306 | use_task_mask=self.config.use_task_mask, 307 | ) 308 | sample["label"] = item["label"] 309 | return sample 310 | 311 | 312 | class LanguageModelTaskDataset(EvaluationDataset): 313 | config: LanguageModelTaskConfig 314 | left_weights: List[int] 315 | weights: List[int] 316 | 317 | def process_single_file(self, path): 318 | num_sequences = [] 319 | with open(os.path.join(path), "r", encoding="utf-8") as file: 320 | raw_text = file.read() 321 | tokens = self.tokenizer.tokenize(raw_text) 322 | self.data.append( 323 | { 324 | "raw_text": tokens, 325 | "num_original_tokens": len(raw_text.strip().split(" ")), 326 | "num_sequences": max( 327 | math.ceil( 328 | max(len(tokens) - (self.config.max_seq_length - 1), 0) / self.config.generation_length 329 | ) 330 | + 1, 331 | 1, 332 | ), 333 | } 334 | ) 335 | 
num_sequences.append(self.data[-1]["num_sequences"]) 336 | self.weights = list(accumulate(num_sequences)) 337 | self.left_weights = [0] + self.weights[:-1] 338 | 339 | def process_single_item(self, item): 340 | pass 341 | 342 | def __len__(self): 343 | return self.data[0]["num_sequences"] 344 | 345 | def __getitem__(self, idx): 346 | document_idx = bisect_right(self.weights, idx) 347 | idx = idx - self.left_weights[document_idx] 348 | start_idx = idx * self.config.generation_length 349 | end_idx = start_idx + self.config.max_seq_length - 1 # for additional [gMASK] 350 | tokens = self.data[document_idx]["raw_text"][start_idx:end_idx] 351 | 352 | mask_id = self.gmask_id if self.config.use_task_mask else self.mask_id 353 | sop_id = self.tokenizer.get_command("sop") 354 | 355 | if idx == 0 or self.config.unidirectional: 356 | prompt, text = [], tokens 357 | else: 358 | prompt_length = self.config.max_seq_length - 1 - self.config.generation_length 359 | prompt, text = tokens[:prompt_length], tokens[prompt_length:] 360 | 361 | seq_length = len(prompt) + len(text) + 1 362 | attention_mask = np.tril(np.ones((seq_length, seq_length), dtype=np.int64)) 363 | attention_mask[: len(prompt) + 1, : len(prompt) + 1] = 1 364 | 365 | return { 366 | "tokens": np.array(prompt + [mask_id, sop_id] + text[:-1], dtype=np.int64), 367 | "targets": np.array(prompt + [mask_id] + text, dtype=np.int64), 368 | "position_ids": np.arange(0, seq_length, dtype=np.int64), 369 | "attention_mask": attention_mask < 0.5, 370 | "loss_masks": np.array([0] * (len(prompt) + 1) + [1] * len(text), dtype=np.int64), 371 | } 372 | -------------------------------------------------------------------------------- /evaluation/metrics.py: -------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | import string 4 | import functools 5 | 6 | import torch 7 | import numpy as np 8 | 9 | from typing import Tuple, List 10 | from collections import Counter 11 | from collections import defaultdict 12 | from SwissArmyTransformer import get_tokenizer 13 | 14 | from .utils import print_rank_0 15 | 16 | 17 | def accuracy_metric(predictions, examples): 18 | count = 0 19 | num_predictions = max(len(predictions), 1) 20 | assert len(predictions) == len(examples) 21 | for prediction, example in zip(predictions, examples): 22 | count += prediction == example["label"] 23 | return count * 100.0 / num_predictions 24 | 25 | 26 | def F1_metric(predictions, examples): 27 | assert len(predictions) == len(examples) 28 | from sklearn.metrics import f1_score 29 | 30 | truth = [] 31 | for prediction, example in zip(predictions, examples): 32 | truth.append(example["label"]) 33 | return f1_score(truth, predictions, average="micro") * 100.0 34 | 35 | 36 | def precision_metric(predictions, examples): 37 | assert len(predictions) == len(examples) 38 | from sklearn.metrics import precision_score 39 | 40 | truth = [] 41 | for prediction, example in zip(predictions, examples): 42 | truth.append(example["label"]) 43 | return precision_score(truth, predictions, average="micro") * 100.0 44 | 45 | 46 | def recall_metric(predictions, examples): 47 | assert len(predictions) == len(examples) 48 | from sklearn.metrics import recall_score 49 | 50 | truth = [] 51 | for prediction, example in zip(predictions, examples): 52 | truth.append(example["label"]) 53 | return recall_score(truth, predictions, average="micro") * 100.0 54 | 55 | 56 | def normalize_answer(s): 57 | """Lower text and remove punctuation, articles and extra 
whitespace.""" 58 | 59 | def remove_articles(text): 60 | return re.sub(r"\b(a|an|the)\b", " ", text) 61 | 62 | def white_space_fix(text): 63 | return " ".join(text.split()) 64 | 65 | def remove_punc(text): 66 | exclude = set(string.punctuation) 67 | return "".join(ch for ch in text if ch not in exclude) 68 | 69 | def lower(text): 70 | return text.lower() 71 | 72 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 73 | 74 | 75 | def f1_score(prediction, ground_truth): 76 | prediction_tokens = normalize_answer(prediction).split() 77 | ground_truth_tokens = normalize_answer(ground_truth).split() 78 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 79 | num_same = sum(common.values()) 80 | if num_same == 0: 81 | return 0 82 | precision = 1.0 * num_same / len(prediction_tokens) 83 | recall = 1.0 * num_same / len(ground_truth_tokens) 84 | f1 = (2 * precision * recall) / (precision + recall) 85 | return f1 86 | 87 | 88 | def exact_match_score(prediction, ground_truth): 89 | return normalize_answer(prediction) == normalize_answer(ground_truth) 90 | 91 | 92 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 93 | if not ground_truths: 94 | return 0.0 95 | scores_for_ground_truths = [] 96 | for ground_truth in ground_truths: 97 | score = metric_fn(prediction, ground_truth) 98 | scores_for_ground_truths.append(score) 99 | return max(scores_for_ground_truths) 100 | 101 | 102 | def qa_evaluate(predictions, examples, metric): 103 | assert len(examples) == len(predictions) 104 | tokenizer = get_tokenizer() 105 | 106 | score = 0.0 107 | for example, prediction in zip(examples, predictions): 108 | ground_truths = [tokenizer.tokenizer.decode(target) for target in example["targets"]] 109 | prediction = tokenizer.tokenizer.decode(prediction) 110 | if ground_truths: 111 | score += metric_max_over_ground_truths(metric, prediction, ground_truths) 112 | score = 100.0 * score / len(predictions) 113 | return score 114 | 115 | 116 | qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) 117 | qa_f1 = functools.partial(qa_evaluate, metric=f1_score) 118 | 119 | 120 | def calculate_perplexity(loss: List[float], data): 121 | return math.exp(min(20, np.sum(loss) / data[0]["num_original_tokens"])) 122 | 123 | 124 | def special_for_dataset(predictions, examples): 125 | print_rank_0("Metrics not found, maybe dataset special metric or metric name error") 126 | return True 127 | 128 | 129 | DEFAULT_METRICS = defaultdict(lambda: special_for_dataset) 130 | DEFAULT_METRICS.update( 131 | { 132 | "EM": qa_exact_match, 133 | "F1": qa_f1, 134 | "Accuracy": accuracy_metric, 135 | "PPL": calculate_perplexity, 136 | "Precision": precision_metric, 137 | "Recall": recall_metric, 138 | "F1_mul": F1_metric, 139 | } 140 | ) 141 | -------------------------------------------------------------------------------- /evaluation/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from typing import List, Union 4 | 5 | from SwissArmyTransformer.generation.autoregressive_sampling import update_mems, get_masks_and_position_ids_default 6 | from SwissArmyTransformer.mpu import vocab_parallel_cross_entropy 7 | 8 | 9 | def batch_filling_sequence( 10 | model, 11 | seqs, 12 | context_lengths, 13 | strategy, 14 | max_memory_length=100000, 15 | get_masks_and_position_ids=get_masks_and_position_ids_default, 16 | mems=None, 17 | **kw_args 18 | ): 19 | ''' 20 | seq: [2, 3, 5, ..., -1(to be generated), -1, ...] 
21 | mems: [num_layers, batch_size, len_mems(index), mem_hidden_size] 22 | cache, should be first mems.shape[1] parts of context_tokens. 23 | mems are the first-level citizens here, but we don't assume what is memorized. 24 | input mems are used when multi-phase generation. 25 | ''' 26 | assert len(seqs.shape) == 2 27 | 28 | # building the initial tokens, attention_mask, and position_ids 29 | batch_size, context_length = seqs.shape 30 | seqs, attention_mask, position_ids = get_masks_and_position_ids(seqs) 31 | tokens = seqs[..., :context_length] 32 | if attention_mask.dtype != torch.bool: 33 | attention_mask = attention_mask.type_as(next(model.parameters())) # if fp16 34 | # initialize generation 35 | counter = context_length - 1 # Last fixed index is ``counter'' 36 | index = 0 if mems is None else mems.shape[2] # Next forward starting index, also the length of cache. 37 | num_beams = 1 38 | # step-by-step generation 39 | while counter < seqs.shape[1] - 1: 40 | # Now, we want to generate seq[counter + 1], 41 | # token[:, index: counter+1] needs forwarding. 42 | # forward 43 | tokens = tokens.reshape(batch_size * num_beams, -1) 44 | mems = mems.reshape(mems.shape[0], batch_size * num_beams, mems.shape[-2], mems.shape[-1]) if mems is not None else None 45 | logits, *output_per_layers = model( 46 | tokens[:, index:], 47 | position_ids[..., index: counter+1], 48 | attention_mask[..., index: counter+1, :counter+1], # TODO memlen 49 | mems=mems, 50 | **kw_args 51 | ) 52 | mem_kv = [o['mem_kv'] for o in output_per_layers] 53 | mems = update_mems(mem_kv, mems, max_memory_length=max_memory_length) 54 | if counter == context_length - 1: 55 | logits = logits[torch.arange(batch_size), context_lengths - 1] 56 | else: 57 | logits = logits[:, -1] 58 | counter += 1 59 | index = counter 60 | # if torch.distributed.get_rank() == 0: 61 | # print(f"counter: {counter}: logits: {logits.float().abs().mean()}") 62 | # sampling 63 | logits = logits.reshape(batch_size, num_beams, -1) 64 | tokens = tokens.reshape(batch_size, num_beams, -1) 65 | mems = mems.reshape(mems.shape[0], batch_size, num_beams, mems.shape[-2], mems.shape[-1]) 66 | tokens, mems = strategy.forward(logits, tokens, mems) 67 | if len(tokens.shape) == 3 and num_beams == 1: 68 | num_beams = tokens.shape[1] 69 | position_ids = position_ids.unsqueeze(1).expand(batch_size, num_beams, -1).reshape(batch_size * num_beams, -1) 70 | attention_mask_shape = attention_mask.shape[-3:] 71 | attention_mask = attention_mask.unsqueeze(1).expand(batch_size, num_beams, -1, -1, -1).reshape( 72 | batch_size * num_beams, *attention_mask_shape) 73 | if strategy.is_done: 74 | break 75 | return strategy.finalize(tokens, mems) 76 | 77 | 78 | class ModelForEvaluation(torch.nn.Module): 79 | def __init__(self, model): 80 | super().__init__() 81 | 82 | self.model = model 83 | self.device = next(self.model.parameters()).device 84 | 85 | @staticmethod 86 | def process_data(batch, device): 87 | return ( 88 | batch["tokens"].to(device=device).long(), 89 | batch["position_ids"].to(device=device).long(), 90 | batch["attention_mask"].to(device=device).bool().unsqueeze(1), 91 | ) 92 | 93 | def cond_log_prob(self, batch) -> List[List[float]]: 94 | """ 95 | @return: Conditional log probability of each option 96 | """ 97 | tokens, position_ids, attention_mask = self.process_data(batch, self.device) 98 | choices_batch, choice_target_ids_batch = batch["choices"], batch["choice_target_ids"] 99 | is_single_token = batch["is_single_token"] 100 | 101 | self.model.eval() 102 | with 
torch.no_grad(): 103 | logits, *output_per_layers = self.model(tokens, position_ids, attention_mask, log_attention_weights=None) 104 | logits_batch = torch.nn.functional.log_softmax(logits, dim=-1) 105 | 106 | # output: [b, sq, vocab] 107 | log_probs = [] 108 | 109 | if is_single_token: # Single token 110 | for logits, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch): 111 | log_probs.append(logits[choice_target_ids[0], choices].tolist()) 112 | else: # Multi token 113 | for output, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch): 114 | log_probs_single = [] 115 | for choice, choice_target_id in zip(choices, choice_target_ids): 116 | tmp = output[choice_target_id, choice] 117 | log_probs_single.append(tmp.sum().tolist()) 118 | log_probs.append(log_probs_single) 119 | return log_probs 120 | 121 | def generate_text(self, sample, strategy, return_all_beams=False) -> Union[ 122 | List[List[int]], List[List[List[int]]]]: 123 | """ 124 | @return: A list of text model generated, sorted by score in descending order 125 | """ 126 | 127 | seqs = sample["tokens"].to(device=self.device).long() 128 | context_lengths = sample["context_length"].long() 129 | 130 | def get_masks_and_position_ids(seq): 131 | batch_size = seq.shape[0] 132 | max_gen_length = sample['target_position_ids'].shape[-1] 133 | tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode='constant', value=-1) 134 | position_ids = torch.cat((sample['position_ids'], sample['target_position_ids']), dim=-1) 135 | position_ids = position_ids.to(device=self.device).long() 136 | attention_mask = sample["attention_mask"].to(device=self.device) 137 | context_mask = attention_mask[torch.arange(batch_size), context_lengths - 1].unsqueeze(1).repeat(1, 138 | max_gen_length, 139 | 1) 140 | causal_mask = torch.tril(context_mask.new_ones((batch_size, max_gen_length, max_gen_length))) < 0.5 141 | generation_mask = torch.cat( 142 | (context_mask, causal_mask), dim=-1) 143 | attention_mask = torch.nn.functional.pad(attention_mask, (0, max_gen_length), mode='constant', value=1) 144 | attention_mask = torch.cat((attention_mask, generation_mask), dim=1) 145 | attention_mask = attention_mask.bool().unsqueeze(1) 146 | return tokens, attention_mask, position_ids 147 | 148 | self.model.eval() 149 | with torch.no_grad(): 150 | output = batch_filling_sequence( 151 | self.model, 152 | seqs, 153 | context_lengths, 154 | get_masks_and_position_ids=get_masks_and_position_ids, 155 | strategy=strategy, 156 | )[0] 157 | 158 | if isinstance(output, torch.Tensor): # different strategies 159 | output = output.tolist() 160 | 161 | output_targets = [] 162 | context_length = seqs.shape[1] 163 | for lines in output: 164 | lines = lines.tolist() if isinstance(lines, torch.Tensor) else lines 165 | output_target = [] 166 | if not isinstance(lines, list): 167 | lines = [lines] 168 | for line in lines: 169 | unfinished = line.index(-1) if -1 in line else len(line) 170 | if line[unfinished - 1] in strategy.end_tokens: 171 | unfinished -= 1 172 | line = line[context_length:unfinished] 173 | output_target.append(line) 174 | if not return_all_beams: 175 | output_targets.append(output_target[0]) 176 | else: 177 | output_targets.append(output_target) 178 | return output_targets 179 | 180 | 181 | def calculate_loss(self, batch) -> List[float]: 182 | tokens, position_ids, attention_mask = self.process_data(batch, self.device) 183 | targets, loss_masks = ( 184 | batch["targets"].to(device=self.device).long(), 
185 | batch["loss_masks"].to(device=self.device).long(), 186 | ) 187 | 188 | original_parallel_output = self.model.transformer.parallel_output 189 | self.model.transformer.parallel_output = True 190 | self.model.eval() 191 | 192 | with torch.no_grad(): 193 | logits, *output_per_layers = self.model(tokens, position_ids, attention_mask, log_attention_weights=None) 194 | losses = vocab_parallel_cross_entropy(logits.contiguous().float(), targets) 195 | loss = torch.sum(losses * loss_masks, dim=-1) 196 | 197 | self.model.transformer.parallel_output = original_parallel_output 198 | 199 | return loss.tolist() 200 | -------------------------------------------------------------------------------- /evaluation/tasks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | import numpy as np 4 | import torch.distributed as dist 5 | 6 | from typing import Dict, Callable, Type, Tuple, List, Any 7 | from abc import ABC, abstractmethod 8 | from glob import glob 9 | from os.path import join, relpath 10 | from collections import defaultdict 11 | 12 | from SwissArmyTransformer.tokenization.icetk_glm_130B.ice_tokenizer import _IceTokenizer 13 | 14 | from generation import BaseStrategy, BeamSearchStrategy 15 | from .configs import BaseConfig, GenerationTaskConfig, MultiChoiceTaskConfig, LanguageModelTaskConfig 16 | from .model import ModelForEvaluation 17 | from .dataset import EvaluationDataset, GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset 18 | from .utils import build_data_loader, gather_result, print_rank_0 19 | from .metrics import DEFAULT_METRICS 20 | 21 | 22 | class BaseTask(ABC): 23 | model: ModelForEvaluation 24 | tokenizer: _IceTokenizer 25 | config: BaseConfig 26 | file_groups: Dict[str, List[str]] 27 | 28 | @classmethod 29 | def config_class(cls) -> Type[BaseConfig]: 30 | return BaseConfig 31 | 32 | @property 33 | def metrics(self) -> Dict[str, Callable]: 34 | return {metric: DEFAULT_METRICS[metric] for metric in self.config.metrics} 35 | 36 | def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: BaseConfig): 37 | self.model = model 38 | self.tokenizer = tokenizer 39 | self.config = config 40 | self.config.metrics = list(self.metrics.keys()) 41 | 42 | self.file_groups = self.get_file_groups() 43 | self.verbose = dist.get_rank() == 0 44 | 45 | def get_file_groups(self): 46 | pattern_group = {} 47 | if isinstance(self.config.file_pattern, str): 48 | pattern_group["all"] = self.config.file_pattern 49 | else: 50 | pattern_group = self.config.file_pattern 51 | return { 52 | name: [ 53 | relpath(path, start=self.config.path) 54 | for path in sorted(glob(join(self.config.path, pattern), recursive=True)) 55 | ] 56 | for name, pattern in pattern_group.items() 57 | } 58 | 59 | def evaluate(self): 60 | dist.barrier() 61 | start = time.time() 62 | print_rank_0("\n") 63 | print_rank_0(f"{self.config}") 64 | print_rank_0(f"Evaluating task {self.config.name}:") 65 | 66 | result_dict_all = {} 67 | 68 | for group_name, filelist in self.file_groups.items(): 69 | print_rank_0(f" Evaluating group {group_name}:") 70 | 71 | result_dict_group = {} 72 | for file in filelist: 73 | dataset = self.build_dataset(file) 74 | dataloader = build_data_loader( 75 | dataset, 76 | micro_batch_size=self.config.micro_batch_size, 77 | num_workers=1, 78 | drop_last=False, 79 | collate_fn=dataset.collate_fn if dataset.has_collate_fn else None, 80 | ) 81 | 82 | prediction = [] 83 | with torch.no_grad(): 84 | for _, batch in 
enumerate(dataloader): 85 | prediction.append(self.predict_single_batch(batch)) 86 | 87 | prediction = gather_result(prediction, len(dataset), self.config.micro_batch_size) 88 | result_dict = {key: metric(prediction, dataset.data) for key, metric in self.metrics.items()} 89 | result_dict_group[file] = (result_dict, len(dataset)) 90 | 91 | if self.verbose: 92 | self.report_single_metrics(file, result_dict) 93 | 94 | result_dict_all[group_name] = result_dict_group 95 | 96 | print_rank_0(f"Evaluation results of task {self.config.name}:") 97 | 98 | if self.verbose: 99 | for group_name, result_dict_group in result_dict_all.items(): 100 | self.report_group_metrics(group_name, result_dict_group) 101 | self.report_overall_metrics( 102 | {k: v for result_dict_group in result_dict_all.values() for k, v in result_dict_group.items()}, 103 | ) 104 | 105 | print_rank_0(f"Finish task {self.config.name} in {time.time() - start:.1f}s.") 106 | 107 | def report_single_metrics(self, file: str, result_dict: Dict[str, float]): 108 | output_str = f" Finish {file}" 109 | for key, value in result_dict.items(): 110 | output_str += f", {key} = {value:.3f}" 111 | print_rank_0(output_str) 112 | 113 | @staticmethod 114 | def calc_group_metrics(result_dict_group: Dict[str, Tuple[Dict[str, float], int]]): 115 | metrics_dict = defaultdict(lambda: []) 116 | weight = [] 117 | for file, (result_dict, length) in result_dict_group.items(): 118 | for key, value in result_dict.items(): 119 | metrics_dict[key].append(value) 120 | weight.append(length) 121 | return { 122 | name: { 123 | "max": np.max(value), 124 | "median": np.median(value), 125 | "average": np.average(value, weights=weight), 126 | } 127 | for name, value in metrics_dict.items() 128 | } 129 | 130 | def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): 131 | stats_dict = self.calc_group_metrics(result_dict_group) 132 | if len(stats_dict) == 1: 133 | name, stats = next(iter(stats_dict.items())) 134 | print_rank_0( 135 | " " * level + f"Group {group_name} {name}: max = {stats['max']:.3f}, " 136 | f"median = {stats['median']:.3f}, average = {stats['average']:.3f}" 137 | ) 138 | else: 139 | print_rank_0(" " * level + f" Group {group_name}: ") 140 | for name, stats in stats_dict.items(): 141 | print( 142 | " " * (level + 1) + f"Metric {name}: max = {stats['max']:.3f}, " 143 | f"median = {stats['median']:.3f}, average = {stats['average']:.3f}" 144 | ) 145 | 146 | def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): 147 | pass 148 | 149 | @abstractmethod 150 | def predict_single_batch(self, batch) -> List[Any]: 151 | pass 152 | 153 | @abstractmethod 154 | def build_dataset(self, relative_path: str) -> EvaluationDataset: 155 | pass 156 | 157 | 158 | class GenerationTask(BaseTask, ABC): 159 | config: GenerationTaskConfig 160 | 161 | @classmethod 162 | def config_class(cls): 163 | return GenerationTaskConfig 164 | 165 | def build_dataset(self, relative_path): 166 | return GenerationTaskDataset(join(self.config.path, relative_path), self.config) 167 | 168 | def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: GenerationTaskConfig): 169 | super(GenerationTask, self).__init__(model, tokenizer, config) 170 | 171 | end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")] 172 | if self.config.sampling_strategy == "BaseStrategy": 173 | self.strategy = BaseStrategy(batch_size=self.config.micro_batch_size, temperature=1.0, top_k=1, 174 | 
end_tokens=end_tokens) 175 | elif self.config.sampling_strategy == "BeamSearchStrategy": 176 | self.strategy = BeamSearchStrategy( 177 | self.config.micro_batch_size, 178 | self.config.num_beams, 179 | length_penalty=self.config.length_penalty, 180 | consider_end=True, 181 | end_tokens=end_tokens, 182 | no_repeat_ngram_size=self.config.no_repeat_ngram_size, 183 | min_gen_length=self.config.min_gen_length, 184 | deterministic=True, # For evaluation, we need a determined generation strategy 185 | ) 186 | else: 187 | raise ValueError(f"unknown strategy {self.config.sampling_strategy}") 188 | 189 | def predict_single_batch(self, batch) -> List[List[int]]: 190 | output = self.model.generate_text(batch, self.strategy, return_all_beams=False) 191 | return output 192 | 193 | 194 | class MultiChoiceTask(BaseTask, ABC): 195 | config: MultiChoiceTaskConfig 196 | 197 | @classmethod 198 | def config_class(cls): 199 | return MultiChoiceTaskConfig 200 | 201 | def build_dataset(self, relative_path): 202 | return MultiChoiceTaskDataset(join(self.config.path, relative_path), self.config) 203 | 204 | def predict_single_batch(self, batch) -> List[int]: 205 | log_probs = self.model.cond_log_prob(batch) 206 | return [np.argmax(log_probs_single).item() for log_probs_single in log_probs] 207 | 208 | 209 | class LanguageModelTask(BaseTask, ABC): 210 | config: LanguageModelTaskConfig 211 | 212 | @classmethod 213 | def config_class(cls): 214 | return LanguageModelTaskConfig 215 | 216 | def build_dataset(self, relative_path): 217 | return LanguageModelTaskDataset(join(self.config.path, relative_path), self.config) 218 | 219 | def predict_single_batch(self, batch) -> List[float]: 220 | return self.model.calculate_loss(batch) 221 | -------------------------------------------------------------------------------- /evaluation/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as dist 3 | 4 | from SwissArmyTransformer import mpu, get_tokenizer 5 | 6 | 7 | def print_rank_0(*args, **kwargs): 8 | if torch.distributed.get_rank() == 0: 9 | print(*args, **kwargs) 10 | 11 | 12 | def build_data_loader(dataset, micro_batch_size, num_workers, drop_last, collate_fn=None): 13 | # Sampler. 14 | world_size = mpu.get_data_parallel_world_size() 15 | rank = mpu.get_data_parallel_rank() 16 | sampler = torch.utils.data.distributed.DistributedSampler( 17 | dataset, num_replicas=world_size, rank=rank, shuffle=False 18 | ) 19 | 20 | # Data loader. Note that batch size is the per GPU batch size. 
21 | data_loader = torch.utils.data.DataLoader( 22 | dataset, 23 | batch_size=micro_batch_size, 24 | sampler=sampler, 25 | shuffle=False, 26 | num_workers=num_workers, 27 | drop_last=drop_last, 28 | pin_memory=True, 29 | collate_fn=collate_fn, 30 | ) 31 | 32 | return data_loader 33 | 34 | 35 | def gather_result(prediction, total_length, micro_batch_size): 36 | """ 37 | @param prediction: Local predictions with order defined by distributed sampler 38 | @param total_length: Total sample num 39 | @return: [sample_0, sample_1, ..., sample_{total_length-1}] 40 | """ 41 | torch.cuda.empty_cache() 42 | world_size = mpu.get_data_parallel_world_size() 43 | prediction_gathered = [None for _ in range(world_size)] 44 | dist.all_gather_object(prediction_gathered, prediction, group=mpu.get_data_parallel_group()) 45 | prediction = [] 46 | for i in range(len(prediction_gathered[0])): 47 | for j in range(micro_batch_size): 48 | for k in range(world_size): 49 | if j < len(prediction_gathered[k][i]): 50 | prediction.append(prediction_gathered[k][i][j]) 51 | prediction = prediction[:total_length] 52 | return prediction 53 | 54 | 55 | def get_tokenized_input(item, key): 56 | if key in item: 57 | return item[key] 58 | tokenizer = get_tokenizer() 59 | pretokenized_key = key + "_pretokenized" 60 | assert pretokenized_key in item 61 | if isinstance(item[pretokenized_key], list): 62 | result = [] 63 | for raw in item[pretokenized_key]: 64 | result.append(tokenizer.tokenize(raw)) 65 | return result 66 | else: 67 | return tokenizer.tokenize(item[pretokenized_key]) 68 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import stat 4 | import re 5 | 6 | from functools import partial 7 | from typing import List, Tuple 8 | 9 | from SwissArmyTransformer import mpu 10 | from evaluation.model import batch_filling_sequence 11 | from generation import BeamSearchStrategy, BaseStrategy 12 | from SwissArmyTransformer.generation.utils import timed_name, generate_continually 13 | from initialize import initialize, initialize_model_and_tokenizer 14 | 15 | 16 | def add_generation_specific_args(parser): 17 | parser.add_argument("--sampling-strategy", type=str, default="BaseStrategy", help="Type of sampling strategy.") 18 | parser.add_argument("--min-gen-length", type=int, default=0, help="The minimum length each blank should generate.") 19 | parser.add_argument( 20 | "--print-all-beams", action="store_true", help="Print all output generated by beam search strategy." 
21 | ) 22 | 23 | 24 | def isEnglish(s): 25 | try: 26 | s.encode(encoding="utf-8").decode("ascii") 27 | except UnicodeDecodeError: 28 | return False 29 | else: 30 | return True 31 | 32 | 33 | def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False): 34 | context_length = seq.shape[1] 35 | tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode="constant", value=-1) 36 | attention_mask = torch.ones((1, tokens.shape[-1], tokens.shape[-1]), device=tokens.device) 37 | attention_mask.tril_() 38 | attention_mask[..., : context_length - 1] = 1 39 | attention_mask.unsqueeze_(1) 40 | attention_mask = (attention_mask < 0.5).bool() 41 | 42 | position_ids = torch.arange(tokens.shape[-1], dtype=torch.long, device=tokens.device) 43 | if not gmask: 44 | position_ids[context_length - 1 :] = mask_position 45 | 46 | position_ids = position_ids.unsqueeze(0) 47 | 48 | return tokens, attention_mask, position_ids 49 | 50 | 51 | def fill_blanks(raw_text: str, model, tokenizer, strategy) -> Tuple[List[str], List[str], List[List[str]]]: 52 | # add MASK 53 | generation_mask = "[gMASK]" 54 | if "[MASK]" in raw_text: 55 | generation_mask = "[MASK]" 56 | elif "[sMASK]" in raw_text: 57 | generation_mask = "[sMASK]" 58 | use_gmask = "[MASK]" not in raw_text and "[sMASK]" not in raw_text 59 | 60 | mask_pattern = r"\[[sg]?MASK\]" 61 | text_list = re.split(mask_pattern, raw_text) 62 | pattern_list = re.compile(mask_pattern).findall(raw_text) 63 | seq = [] 64 | for i in range(len(pattern_list)): 65 | pattern = pattern_list[i] 66 | sub_text = text_list[i] 67 | seq.extend(tokenizer.tokenize(sub_text)) 68 | seq.append(tokenizer.get_command(pattern)) 69 | 70 | seq.extend(tokenizer.tokenize(text_list[-1])) 71 | 72 | if "MASK]" not in raw_text: 73 | seq += [tokenizer.get_command(generation_mask)] 74 | raw_text += " " + generation_mask 75 | if not raw_text.endswith("MASK]"): 76 | seq = seq + [tokenizer.get_command("eos")] 77 | if mpu.get_model_parallel_rank() == 0: 78 | print("\nInput: {}\n".format(raw_text)) 79 | if len(seq) > args.max_sequence_length: 80 | raise ValueError("text too long.") 81 | 82 | # generation 83 | is_english = isEnglish(raw_text) 84 | output_list = [seq] 85 | num_output = args.num_beams if args.sampling_strategy == "BeamSearchStrategy" else 1 86 | last_pos, answers, answers_with_style, blanks = ( 87 | [0] * num_output, 88 | ["" for _ in range(num_output)], 89 | ["" for _ in range(num_output)], 90 | [[] for _ in range(num_output)], 91 | ) 92 | 93 | # continually detect the first mark position 94 | while True: 95 | seq = output_list[0] 96 | # detect mask position 97 | mask_token = tokenizer.get_command(generation_mask) 98 | if mask_token not in seq: 99 | break 100 | mask_position = seq.index(mask_token) 101 | 102 | output_list = [] 103 | 104 | input_seq = torch.cuda.LongTensor( 105 | [seq + [tokenizer.get_command("sop")]], 106 | device=args.device, 107 | ) 108 | output, _ = batch_filling_sequence( 109 | model, 110 | input_seq, 111 | torch.cuda.LongTensor([input_seq.shape[-1]], device=args.device), 112 | strategy=strategy, 113 | get_masks_and_position_ids=partial( 114 | get_masks_and_position_ids, 115 | mask_position=mask_position, 116 | max_gen_length=args.out_seq_length - input_seq.shape[-1], 117 | gmask=use_gmask, 118 | ), 119 | ) 120 | if isinstance(output, torch.Tensor): # different strategies 121 | output = output.tolist() 122 | output = output[0] # batch_size = 1 123 | output_list.extend(output) 124 | 125 | # clip -1s and fill back generated things into seq 126 | for i in 
range(len(output_list)): 127 | output = output_list[i].tolist() if isinstance(output_list[i], torch.Tensor) else output_list[i] 128 | try: 129 | unfinished = output.index(-1) 130 | except ValueError: 131 | unfinished = len(output) 132 | if output[unfinished - 1] in strategy.end_tokens: 133 | unfinished -= 1 134 | bog = output.index(tokenizer.get_command("sop")) 135 | 136 | prefix = tokenizer.detokenize(output[last_pos[i] : mask_position]) 137 | blank = tokenizer.detokenize(output[bog + 1 : unfinished]) 138 | answers_with_style[i] += ( 139 | prefix 140 | + (" " if is_english else "") 141 | + ("\033[4m" if use_gmask else "\x1b[0;32m\033[4m") 142 | + blank 143 | + ("\033[0m" if use_gmask else "\033[0m\x1b[0m") 144 | + (" " if is_english else "") 145 | ) 146 | blanks[i].append(blank) 147 | last_pos[i] = mask_position + unfinished - (bog + 1) 148 | output_list[i] = output[:mask_position] + output[bog + 1 : unfinished] + output[mask_position + 1 : bog] 149 | 150 | for i, output in enumerate(output_list): 151 | if output[-1] == tokenizer.get_command("eos"): 152 | output = output[:-1] 153 | answers_with_style[i] += tokenizer.detokenize(output[last_pos[i] :]) 154 | answers[i] = tokenizer.detokenize(output) 155 | 156 | return answers, answers_with_style, blanks 157 | 158 | 159 | def main(args): 160 | model, tokenizer = initialize_model_and_tokenizer(args) 161 | 162 | end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")] 163 | 164 | if args.sampling_strategy == "BaseStrategy": 165 | strategy = BaseStrategy( 166 | batch_size=1, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, end_tokens=end_tokens 167 | ) 168 | elif args.sampling_strategy == "BeamSearchStrategy": 169 | strategy = BeamSearchStrategy( 170 | 1, 171 | args.num_beams, 172 | length_penalty=args.length_penalty, 173 | consider_end=True, 174 | end_tokens=end_tokens, 175 | no_repeat_ngram_size=args.no_repeat_ngram_size, 176 | min_gen_length=args.min_gen_length, 177 | ) 178 | else: 179 | raise ValueError(f"unknown strategy {args.sampling_strategy}") 180 | 181 | def process(raw_text): 182 | if args.with_id: 183 | query_id, raw_text = raw_text.split("\t") 184 | 185 | answers, answers_with_style, blanks = fill_blanks(raw_text, model, tokenizer, strategy) 186 | 187 | # save 188 | if args.with_id: 189 | full_path = os.path.join(args.output_path, query_id + ".txt") 190 | else: 191 | prefix = raw_text.replace("/", "")[:20] 192 | full_path = timed_name(prefix, ".txt", args.output_path) 193 | if mpu.get_model_parallel_rank() == 0: 194 | if args.print_all_beams and len(answers) > 1: 195 | for idx, answer_with_style in enumerate(answers_with_style): 196 | print(f"Output beam {idx}:", answer_with_style) # print the first. 197 | if len(answer_with_style) > 120: 198 | print("") 199 | else: 200 | print(f"Output:", answers_with_style[0]) # print the first. 
201 | with open(full_path, "w", encoding="utf-8") as fout: 202 | for answer in answers: 203 | fout.write(answer + "\n") 204 | 205 | os.chmod(full_path, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU) 206 | 207 | os.makedirs(args.output_path, exist_ok=True) 208 | generate_continually(process, args.input_source) 209 | 210 | 211 | if __name__ == "__main__": 212 | args = initialize(extra_args_provider=add_generation_specific_args) 213 | 214 | with torch.no_grad(): 215 | main(args) 216 | -------------------------------------------------------------------------------- /generation/__init__.py: -------------------------------------------------------------------------------- 1 | from .strategies import BaseStrategy, BeamSearchStrategy 2 | -------------------------------------------------------------------------------- /generation/strategies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from SwissArmyTransformer.generation.sampling_strategies.base_strategy import top_k_logits 5 | 6 | class BaseStrategy: 7 | def __init__(self, batch_size, invalid_slices=[], temperature=1., top_k=200, eps=1e-4, top_p=0.0, end_tokens=None): 8 | self.batch_size = batch_size 9 | self.invalid_slices = invalid_slices 10 | self.temperature = temperature 11 | self.topk = top_k 12 | self.top_p = top_p 13 | self.eps = eps 14 | if end_tokens is None: 15 | end_tokens = [] 16 | self.end_tokens = end_tokens 17 | self._is_done = np.zeros(self.batch_size, dtype=np.bool) 18 | 19 | @property 20 | def is_done(self) -> bool: 21 | return self._is_done.all() 22 | 23 | def forward(self, logits, tokens, mems, temperature=None): 24 | logits = logits.view(-1, logits.size(-1)) 25 | batch_size = tokens.shape[0] 26 | if temperature is None: 27 | temperature = self.temperature 28 | logits = logits / temperature 29 | for invalid_slice in self.invalid_slices: 30 | logits[..., invalid_slice] = -65504 31 | 32 | logits = top_k_logits(logits, self.topk, self.top_p) 33 | probs = F.softmax(logits.float(), dim=-1) # float is essetial, due to a bug in Pytorch 34 | pred = torch.multinomial(probs, num_samples=1) 35 | for i in range(self.batch_size): 36 | if i >= batch_size: 37 | self._is_done[i] = True 38 | elif self._is_done[i]: 39 | pred[i] = -1 40 | elif pred[i].item() in self.end_tokens: 41 | self._is_done[i] = True 42 | tokens = torch.cat((tokens, pred.view(tokens.shape[:-1] + (1,))), dim=-1) 43 | return tokens, mems 44 | 45 | def finalize(self, tokens, mems): 46 | self._is_done = np.zeros(self.batch_size, dtype=np.bool) 47 | return tokens, mems 48 | 49 | 50 | class BeamSearchStrategy: 51 | def __init__( 52 | self, 53 | batch_size, 54 | num_beams, 55 | length_penalty=1.0, 56 | consider_end=False, 57 | end_tokens=[], 58 | invalid_slices=[], 59 | no_repeat_ngram_size=0, 60 | min_gen_length=0, 61 | deterministic=False, 62 | ): 63 | self.batch_size = batch_size 64 | self.num_beams = num_beams 65 | self.length_penalty = length_penalty 66 | self.end_tokens = end_tokens 67 | self.ngram = no_repeat_ngram_size 68 | self.min_gen_length = min_gen_length 69 | self.invalid_slices = invalid_slices 70 | self.consider_end = consider_end 71 | self.deterministic = deterministic 72 | self._init_cache() 73 | 74 | def _init_cache(self): 75 | self.end_beams = [[] for _ in range(self.batch_size)] # list of LongTensors 76 | self.end_beams_penalized_scores = [[] for _ in range(self.batch_size)] # list of LongTensors 77 | self.cached_beam_scores = 0 # [batch_size] 
78 | self.cached_beam_ngram_bans = [[{} for _ in range(self.num_beams)] for _ in range(self.batch_size)] 79 | self.length_generated = 0 80 | self._is_done = np.zeros(self.batch_size, dtype=np.bool) 81 | 82 | def _add_end_beams(self, score, beam, batch_idx): 83 | score = score / ((5.0 + len(beam)) / 6) ** self.length_penalty # Magic number for OpenNMT 84 | for i in range(len(self.end_beams[batch_idx]), -1, -1): 85 | if i == 0 or score < self.end_beams_penalized_scores[batch_idx][i - 1]: 86 | break 87 | self.end_beams[batch_idx].insert(i, beam) 88 | self.end_beams_penalized_scores[batch_idx].insert(i, score) 89 | 90 | self.end_beams[batch_idx] = self.end_beams[batch_idx][: self.num_beams] 91 | self.end_beams_penalized_scores[batch_idx] = self.end_beams_penalized_scores[batch_idx][: self.num_beams] 92 | 93 | @property 94 | def is_done(self) -> bool: 95 | return self._is_done.all() 96 | 97 | def forward(self, logits, tokens, mems): 98 | batch_size, num_beams, vocab_size = logits.shape 99 | seq_len = tokens.shape[-1] 100 | logits = logits.float() 101 | for invalid_slice in self.invalid_slices: 102 | logits[..., invalid_slice] = -65504 103 | if self.min_gen_length > self.length_generated: 104 | for end_token in self.end_tokens: 105 | logits[..., end_token] = -65504 106 | if self.ngram > 0 and seq_len > self.ngram: 107 | for batch_idx in range(batch_size): 108 | for i in range(num_beams): 109 | ngram_prefix = tokens[batch_idx, i, -(self.ngram - 1) :].tolist() # TODO ngram=1 110 | for banned_index in self.cached_beam_ngram_bans[batch_idx][i].get(tuple(ngram_prefix), []): 111 | logits[batch_idx, i, banned_index] = -65504 112 | 113 | next_token_scores = F.log_softmax(logits, dim=-1) # [batch_size, vocab_size] 114 | prev_scores = self.cached_beam_scores 115 | if isinstance(prev_scores, torch.Tensor): 116 | prev_scores = prev_scores[..., None].expand_as(next_token_scores) 117 | next_token_scores = next_token_scores + prev_scores 118 | 119 | next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) 120 | 121 | probs = F.softmax(next_token_scores, dim=-1) 122 | if num_beams < self.num_beams: # First token 123 | probs = probs[..., :vocab_size] 124 | if self.deterministic: 125 | next_tokens = torch.topk(probs, k=(max(1, len(self.end_tokens)) + 1) * self.num_beams).indices # [2*nb] 126 | else: 127 | next_tokens = torch.multinomial( 128 | probs, num_samples=(max(1, len(self.end_tokens)) + 1) * self.num_beams 129 | ) # [2*nb] 130 | next_token_scores = next_token_scores[torch.arange(batch_size).unsqueeze(1), next_tokens] 131 | next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1) 132 | next_tokens = next_tokens[torch.arange(batch_size).unsqueeze(1), _indices] 133 | 134 | next_indices = torch.div(next_tokens, vocab_size, rounding_mode="trunc") 135 | next_tokens = next_tokens % vocab_size 136 | 137 | # select out end beams or continue beams 138 | beam_continue_batch, score_continue_batch, mems_continue_batch = [], [], [] 139 | for batch_idx in range(batch_size): 140 | beam_continue = [] 141 | scores_continue = [] 142 | bans_continue = [] 143 | mems_contiue = [] 144 | for i in range(len(next_tokens[batch_idx])): 145 | beam = torch.cat((tokens[batch_idx, next_indices[batch_idx, i]], next_tokens[batch_idx, i : i + 1])) 146 | if not self._is_done[batch_idx] and int(next_tokens[batch_idx, i]) in self.end_tokens: 147 | self._add_end_beams(next_token_scores[batch_idx, i], beam, batch_idx) 148 | elif len(beam_continue) < self.num_beams: 149 | beam_continue.append(beam) 
150 | mems_continue.append(mems[:, batch_idx, next_indices[batch_idx, i]]) 151 | # update caches 152 | scores_continue.append(next_token_scores[batch_idx, i]) 153 | if self.ngram > 0: 154 | bans = self.cached_beam_ngram_bans[batch_idx][next_indices[batch_idx, i]].copy() 155 | # TODO ngram=1 156 | ngram_prefix = tuple(tokens[batch_idx, next_indices[batch_idx, i], -(self.ngram - 1):].tolist()) 157 | bans[ngram_prefix] = bans.get(ngram_prefix, tuple()) + (next_tokens[batch_idx, i],) 158 | bans_continue.append(bans) 159 | else: 160 | break 161 | beam_continue_batch.append(torch.stack(beam_continue)) 162 | mems_continue_batch.append(torch.stack(mems_continue, dim=1)) 163 | score_continue_batch.append(scores_continue) 164 | self.cached_beam_ngram_bans[batch_idx] = bans_continue 165 | tokens = torch.stack(beam_continue_batch) 166 | mems = torch.stack(mems_continue_batch, dim=1) 167 | self.cached_beam_scores = torch.tensor(score_continue_batch, device=logits.device) 168 | self.length_generated += 1 169 | for batch_idx in range(self.batch_size): 170 | if batch_idx >= batch_size: 171 | self._is_done[batch_idx] = True 172 | elif ( 173 | len(self.end_beams[batch_idx]) == self.num_beams 174 | and self.end_beams_penalized_scores[batch_idx][-1] 175 | >= self.cached_beam_scores[batch_idx].max() / ((5.0 + (seq_len + 1)) / 6) ** self.length_penalty 176 | ): # We're done if none of the current beams can become better than the worst beam in end_beams 177 | self._is_done[batch_idx] = True 178 | 179 | return tokens, mems 180 | 181 | def finalize(self, tokens, mems): 182 | if self.consider_end: 183 | batch_size, num_beams = tokens.shape[:2] 184 | for batch_idx in range(batch_size): 185 | if not self._is_done[batch_idx]: 186 | for i in range(num_beams): 187 | self._add_end_beams(self.cached_beam_scores[batch_idx, i], tokens[batch_idx, i], batch_idx) 188 | mems = None 189 | ret = self.end_beams[:batch_size] 190 | else: 191 | ret = tokens 192 | self._init_cache() 193 | return ret, mems 194 | -------------------------------------------------------------------------------- /initialize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import time 4 | 5 | from quantization import quantize 6 | 7 | from SwissArmyTransformer import get_args, get_tokenizer 8 | from SwissArmyTransformer.arguments import initialize_distributed 9 | from SwissArmyTransformer.training import load_checkpoint 10 | from SwissArmyTransformer.model import GLM130B 11 | from SwissArmyTransformer.mpu import get_model_parallel_world_size, get_model_parallel_rank, get_model_parallel_group 12 | 13 | 14 | def add_bminf_args(parser): 15 | """Arguments for BMInf""" 16 | group = parser.add_argument_group("BMInf") 17 | 18 | group.add_argument("--bminf", action="store_true", help="Use BMInf to support low resource evaluation") 19 | group.add_argument("--bminf-memory-limit", type=int, default=20, help="Max memory for model per GPU (in GB)") 20 | return parser 21 | 22 | 23 | def add_quantization_args(parser): 24 | group = parser.add_argument_group("Quantization") 25 | 26 | group.add_argument("--quantization-bit-width", type=int, default=None) 27 | group.add_argument("--from-quantized-checkpoint", action="store_true", help="Loading from a quantized checkpoint") 28 | 29 | 30 | def add_initialization_args(parser): 31 | group = parser.add_argument_group("Initialization") 32 | 33 | group.add_argument( 34 | "--sequential-initialization", 35 | action="store_true", 36 | help="Initialize sequentially in tensor
parallel group (reduce CPU RAM for initialization)", 37 | ) 38 | 39 | 40 | def initialize(extra_args_provider): 41 | parser = argparse.ArgumentParser(add_help=False) 42 | add_bminf_args(parser) 43 | add_quantization_args(parser) 44 | add_initialization_args(parser) 45 | GLM130B.add_model_specific_args(parser) 46 | extra_args_provider(parser) 47 | known, args_list = parser.parse_known_args() 48 | args = get_args(args_list) 49 | args = argparse.Namespace(**vars(args), **vars(known)) 50 | args.do_train = False 51 | initialize_distributed(args) 52 | return args 53 | 54 | 55 | def initialize_model_and_tokenizer(args): 56 | tokenizer = get_tokenizer(args) 57 | 58 | torch.distributed.barrier() 59 | start = time.time() 60 | 61 | for i in range(get_model_parallel_world_size()): 62 | if get_model_parallel_rank() == i: 63 | # Initialize model 64 | model = GLM130B(args).half() 65 | 66 | if args.from_quantized_checkpoint: 67 | assert args.quantization_bit_width is not None 68 | # Quantize model before moving to GPU 69 | model = quantize(model, args.quantization_bit_width) 70 | 71 | # Load checkpoint 72 | load_checkpoint(model, args) 73 | 74 | if args.quantization_bit_width is not None and not args.from_quantized_checkpoint: 75 | # Quantize model before moving to GPU 76 | model = quantize(model, args.quantization_bit_width) 77 | 78 | if args.bminf: 79 | import bminf 80 | 81 | if torch.distributed.get_rank() == 0: 82 | print(f"> BMInf activated, memory limit: {args.bminf_memory_limit} GB") 83 | with torch.cuda.device(args.device): 84 | model = bminf.wrapper(model, quantization=False, memory_limit=args.bminf_memory_limit << 30) 85 | else: 86 | model = model.to(args.device) 87 | if args.sequential_initialization: 88 | torch.distributed.barrier(group=get_model_parallel_group()) 89 | 90 | torch.distributed.barrier() 91 | if torch.distributed.get_rank() == 0: 92 | print(f"> Model initialized in {time.time() - start:.1f}s") 93 | 94 | torch.cuda.empty_cache() 95 | model.eval() 96 | 97 | # generate rotary embedding cache 98 | original_parallel_output = model.transformer.parallel_output 99 | model.transformer.parallel_output = True 100 | with torch.no_grad(): 101 | _, *_ = model( 102 | torch.ones(1, args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64), 103 | torch.arange(args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1), 104 | torch.randn( 105 | 1, 106 | 1, 107 | args.max_sequence_length, 108 | args.max_sequence_length, 109 | device=torch.cuda.current_device(), 110 | ) 111 | < 0.5, 112 | ) 113 | model.transformer.parallel_output = original_parallel_output 114 | torch.distributed.barrier() 115 | 116 | return model, tokenizer 117 | -------------------------------------------------------------------------------- /kernels/__init__.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | import torch 3 | import ctypes 4 | 5 | from typing import List 6 | from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up 7 | 8 | RESOURCE_PACKAGE_NAME = __name__ 9 | 10 | 11 | class Kernel: 12 | def __init__(self, filename: str, function_names: List[str]): 13 | filename = filename + ".fatbin" 14 | if not pkg_resources.resource_exists(RESOURCE_PACKAGE_NAME, filename): 15 | raise RuntimeError("File `%s` not found in `%s`" % (filename, RESOURCE_PACKAGE_NAME)) 16 | self.filename = filename 17 | self.code = pkg_resources.resource_string(RESOURCE_PACKAGE_NAME, filename) 18 | 
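# Compile the embedded fatbin lazily and expose each named CUDA function
# (e.g. kernels.int4WeightCompression) as a callable attribute of this Kernel instance.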
self._function_names = function_names 19 | self._cmodule = LazyKernelCModule(self.code) 20 | 21 | for name in self._function_names: 22 | setattr(self, name, KernelFunction(self._cmodule, name)) 23 | 24 | 25 | kernels = Kernel( 26 | "quantization", 27 | [ 28 | "int4WeightCompression", 29 | "int4WeightExtractionFloat", 30 | "int4WeightExtractionHalf", 31 | "int8WeightExtractionFloat", 32 | "int8WeightExtractionHalf", 33 | ], 34 | ) 35 | 36 | 37 | def compress_int4_weight(weight: torch.Tensor): # (n, m) 38 | with torch.cuda.device(weight.device): 39 | n, m = weight.size(0), weight.size(1) 40 | assert m % 2 == 0 41 | m = m // 2 42 | out = torch.empty(n, m, dtype=torch.int8, device="cuda") 43 | stream = torch.cuda.current_stream() 44 | 45 | gridDim = (n, 1, 1) 46 | blockDim = (min(round_up(m, 32), 1024), 1, 1) 47 | 48 | kernels.int4WeightCompression( 49 | gridDim, 50 | blockDim, 51 | 0, 52 | stream, 53 | [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], 54 | ) 55 | return out 56 | 57 | 58 | def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): 59 | if source_bit_width == 8: 60 | func = kernels.int8WeightExtractionHalf 61 | elif source_bit_width == 4: 62 | func = kernels.int4WeightExtractionHalf 63 | else: 64 | assert False, "Unsupported bit-width" 65 | 66 | with torch.cuda.device(weight.device): 67 | n, m = weight.size(0), weight.size(1) 68 | out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda") 69 | stream = torch.cuda.current_stream() 70 | 71 | gridDim = (n, 1, 1) 72 | blockDim = (min(round_up(m, 32), 1024), 1, 1) 73 | 74 | func( 75 | gridDim, 76 | blockDim, 77 | 0, 78 | stream, 79 | [ 80 | ctypes.c_void_p(weight.data_ptr()), 81 | ctypes.c_void_p(scale_list.data_ptr()), 82 | ctypes.c_void_p(out.data_ptr()), 83 | ctypes.c_int32(n), 84 | ctypes.c_int32(m), 85 | ], 86 | ) 87 | return out 88 | 89 | 90 | if __name__ == "__main__": 91 | weight = torch.randn(4, 32).to(torch.int8).cuda() 92 | scale = torch.ones(weight.size(0)).to(torch.half).cuda() 93 | 94 | print(weight) 95 | b = compress_int4_weight(weight) 96 | print(b) 97 | 98 | a = extract_weight_to_half(b, scale, source_bit_width=4) 99 | print(a) 100 | -------------------------------------------------------------------------------- /kernels/quantization.fatbin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/kernels/quantization.fatbin -------------------------------------------------------------------------------- /logs/README.md: -------------------------------------------------------------------------------- 1 | # Training Logs 2 | 3 | `main-log.md` contains detailed information about each restart of training during GLM-130B training. 4 | 5 | Tensorboard logs is available at [here](https://cloud.tsinghua.edu.cn/f/503ef9fa85b84fbba9ef/). 
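A note on the quantization kernels in `kernels/__init__.py` above: `compress_int4_weight` and `extract_weight_to_half` only pack and unpack integer weight codes on the GPU; the codes themselves come from the per-row symmetric scheme implemented in `quantization/layers.py` further down (row scale = max |w| / (2^(bit_width-1) - 1), code = round(w / scale)). Below is a minimal CPU sketch of that scheme for the int8 case; the function names are illustrative and not part of the repo, and the int4 nibble packing done by the CUDA kernel is not reproduced here.

```python
import torch


def quantize_rowwise_int8(weight: torch.Tensor):
    """Per-row symmetric int8 quantization (mirrors QuantizedColumnParallelLinear.__init__)."""
    scale = weight.abs().max(dim=-1).values / 127.0  # 127 == 2 ** (8 - 1) - 1
    codes = torch.round(weight / scale[:, None]).to(torch.int8)
    return codes, scale.half()


def dequantize_rowwise(codes: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """CPU stand-in for extract_weight_to_half with source_bit_width=8."""
    return codes.half() * scale[:, None]


if __name__ == "__main__":
    w = torch.randn(4, 8)
    codes, scale = quantize_rowwise_int8(w)
    w_hat = dequantize_rowwise(codes, scale)
    # Per-element error is bounded by roughly half a quantization step (scale / 2).
    print((w - w_hat.float()).abs().max())
```

The CUDA path performs the same dequantization on the fly inside `W8A16Linear`, so only the int8/int4 codes and the per-row fp16 scales are kept in GPU memory.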
6 | -------------------------------------------------------------------------------- /logs/main-log-en.md: -------------------------------------------------------------------------------- 1 | # The training notes of GLM-130B 2 | 3 | ## Basic Information about GLM-130B 4 | 5 | - 130B:70 layers,12288 hidden size,32768 ffn hidden size, 150000 vocab size 6 | - MP = 4, PP = 8 7 | - GLM + Rotary Positional Embedding + GeGLU + DeepNorm 8 | - FP32 softmax with QKV scaling(no PB-Relax) 9 | - Shrink embedding gradient with $\alpha=0.1$ 10 | - Global batch size: 4224 11 | 12 | ## Environment 13 | 14 | - PyTorch 1.11 / CUDA 11.3 15 | - LargeScale@400893da37bb5cbe22c29e41c02a052369cc72ce 16 | - DeepSpeed 0.6.1 17 | - apex@master 18 | 19 | ## Speed Testing (with Different Batch Sizes) 20 | 21 | - 96 nodes, BSZ=176 * 24=4224 22 | - glm-130B-2022.05.05-19:34:16:134TFLOPS, 88.5s/iter, 48samples/s, 23 | - 96 nodes, BSZ=256 * 24=6144 24 | - glm-130B-2022.05.05-19:43:13:141TFLOPS, 122.5s/iter, 50samples/s 25 | 26 | ## 2022-05-06 04:00 Training starts 27 | 28 | - glm-130B-2022.05.05-19:53:15 29 | 30 | ## 2022-05-07 20:14 Node failure 31 | 32 | n30041, n30157 break down, changing saving interval to 100 steps (originally 500 steps, too long), restart from 4000 step 33 | 34 | - glm-130B-2022.05.07-13:44:59 35 | 36 | ## 2022-05-10 00:00 Increase alpha for embedding shrink, as we think the original alpha is too small (originally 0.1) 37 | 38 | add `--shrink-embedding-gradient-steps 6000 500` to warmup alpha to 1 from 6000 step within 500 steps 39 | 40 | - glm-130B-2022.05.09-16:02:04 41 | 42 | ## 2022-05-11 12:13 Node failure 43 | 44 | n30115 breaks down, restart from 7300 step 45 | 46 | - glm-130B-2022.05.11-05:55:32 47 | 48 | ## 2022-05-20 00:03 Node failure 49 | 50 | n30066 breaks down, restart from 15400 step 51 | 52 | - glm-130B-2022.05.19-19:56:19 53 | 54 | Switch to another node pool, and restart from 15600 step 55 | 56 | - glm-130B-2022.05.20-01:58:57 57 | 58 | ## 2022-05-21 12:40 Replace node 59 | 60 | Finding that the training flop is only 127T, smaller than before; suspecting that the n30076 we have replaced in has some unknown errors and kicking it out from 16600 step; nothing changes 61 | 62 | ## 2022-05-22 19:27 Node failure 63 | 64 | n30126 loses connection 65 | 66 | - glm-130B-2022.05.22-14:15:41 67 | 68 | ## 2022-05-26 04:30 Node failure 69 | 70 | n30039 reports missing GPUs 71 | 72 | - glm-130B-2022.05.25-22:23:12 73 | 74 | 75 | ## 2022-05-28 11:50 Change Multi-task Instruction Pre-training (MIP) data (abolished) 76 | 77 | Restarts from 22800 step, change MIP data to the correct one (English & Chinese) 78 | 79 | - glm-130B-2022.05.28-03:52:26 80 | - events.out.tfevents.1653709957.9droa42ltcad5-0.1858.0 (abolished) 81 | 82 | ## 2022-05-28 16:50 Change MIP data 83 | 84 | New MIP data (English & Chinese) leads to NaN loss at 22900 step; finding too much noises in Chinese multi-task data; switch to vanilla T0 training datasets 85 | 86 | - glm-130B-2022.05.28-09:18:12 87 | - events.out.tfevents.1653729502.9droa42ltcad5-0.5648.0(移除) 88 | 89 | ## 2022-05-28 20:50 Add warmup (abolished) 90 | 91 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C850748B-92A4-4F9F-932F-AD22330895D6_2/E8MboG8vrTTb2N51FRhkb6wsB4eyrD77USmM992obQgz/Image.png) 92 | 93 | Vanilla T0 datasets still lead to disconvergence; suspecting a changed task ratio leads to the instability; add argument `--warmup-samples-after-loading 2112000` to warmup 500 steps from 
22800 step 94 | 95 | - glm-130B-2022.05.28-12:57:24 96 | - events.out.tfevents.1653742654.9droa42ltcad5-0.7942.0(移除) 97 | 98 | ## 2022-05-29 01:30 Disconverges again, switch to self-supervised pre-training only (abolished) 99 | 100 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/028DE014-00FE-4521-BEEB-EF3F61BB8DA1_2/mgYybTR1OLgPkBysqMiUgGYNyIg8OQnf1yXI66grBeMz/Image.png) 101 | 102 | - Disconverges after warmup; suspecting that the distribution change is still too large; trying to restart using self-supervised pre-training only with data reshuffle, loading from 22800 step 103 | - glm-130B-2022.05.28-18:05:33 104 | - events.out.tfevents.1653761143.9droa42ltcad5-0.9744.0 (abolished) 105 | - global_step23200_text 106 | + Configuration file 107 | 108 | ## 2022-05-29 Smoothing distribution shift (abolished) 109 | 110 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E2BC463F-E519-461E-B1B0-99551DA940BE_2/0ZqN22TLyqRTvqOy6JNLeixEy4TarDJEF7DOvdh3saIz/Image.png) 111 | 112 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/9C7AC4B3-59AB-471A-872E-41CCBAE7E90D_2/0rpEmyAOcIkLyDGR2R4RQiBeUwbWIWiaHbHcwosx6yAz/Image.png) 113 | 114 | Self-supervised pre-training only seems to be stable; trying to smooth the distribution shift via a warmed-up ratio of correct T0 data from 22800 step 115 | 116 | - glm-130B-2022.05.29-05:17:06 117 | - events.out.tfevents.1653801436.9droa42ltcad5-0.13868.0 (abolished) 118 | 119 | ## 2022-05-29 22:40 Smoothing data distribution shift & warmup learning rate 120 | 121 | - Disconverges; suspecting that learning rate requires warmup in this process, too 122 | 123 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/F5532A86-3AAC-4CCE-AC9B-A976B7736D7F_2/M4JZx5GYzNPuysPHXrn0R5Oo54rBhDwQxdErkOpFOhEz/Image.png) 124 | 125 | - Restart from 22800, warmup correct MIP data ratio and learning rate for 2000 steps; warmup embedding gradient shrink alpha from 0.2 to 1 by 6000 steps 126 | - glm-130B-2022.05.29-17:35:45 127 | 128 | ## 2022-05-30 14:00 Node and file system failure 129 | 130 | Finding the warmup steps for embedding gradient shrink to be wrong (26850 steps instead of 6000 steps); changing the warmup steps implementation (according to the absolute number of samples); restarting from global_step23200 131 | 132 | We discover that the restart is stacked in the data loading, which turns out to be an error of the Lustre file system. 
The result is that we cannot read the 2.3T text corpora and the engineers cannot help recover the data, so we have to copy the data from a backup disk to the file system again (which takes a few days) 133 | 134 | - glm-130B-2022.05.31-02:18:24 135 | 136 | ## 2022.05.03 20:00 Add DeepStruct data to MIP 137 | 138 | - Keeping the original warmup process; adding DeepStruct data to the MIP portion; restart from 23500 step 139 | 140 | ## 2022-06-01 22:22 Replace MIP data with a cleaner version 141 | 142 | Finding one noisy prompt in the task data for T0 (qqp) and DeepStruct respectively; removing them and restarting from 24500 step 143 | 144 | - glm-130B-2022.06.01-14:24:33 145 | 146 | ## 2022-06-02 12:00 Node failure 147 | 148 | - n30145 CPU error, restarting from 25000 step; removing the warmup process as it has ended 149 | - glm-130B-2022.06.02-04:35:05 150 | 151 | ## 2022-06-02 09:30 Start to print multitask loss 152 | 153 | From 25800 step, we print the multitask loss 154 | 155 | - glm-130B-2022.06.03-01:40:12 156 | 157 | ## 2022-06-02 15:00 Reduce learning rate and print gpt/bert loss 158 | 159 | The loss decreases slowly, and we think it might be attributed to an overly large learning rate; from 26000 step, we halve the learning rate 160 | 161 | - glm-130B-2022.06.03-07:26:16 162 | 163 | ## 2022-06-06 17:00 Node cluster maintenance 164 | 165 | The node cluster needs an upgrade from 9 am to 5 am 166 | 167 | - glm-130B-2022.06.06-10:00:39 168 | 169 | PS: we observe a significant improvement in the file system's read speed; loading a checkpoint now takes only about 1 minute 170 | 171 | ## 2022-06-08 08:00 Node failure 172 | 173 | - glm-130B-2022.06.08-00:00:37 174 | 175 | ## 2022-06-09 13:30 Unexpected termination of the training 176 | 177 | Restarting from 23100 step; suspecting a network communication problem 178 | 179 | - glm-130B-2022.06.09-05:27:54 180 | 181 | ## 2022-06-12 10:00 Loss explodes 182 | 183 | From 33700 step, the training loss explodes.
The loss-scale reduces drastically around 33710 step, and the loss explodes at 33740 step 184 | 185 | - tensorboard record:glm-130B-33700 186 | 187 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C46C7CFE-1B79-491C-90FC-5A88AE90E9DF_2/7ICMyH8v6GhAgngz5bVaDKwzYjFPyk99Ax27R5w56wMz/Image.png) 188 | 189 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E56BCDE0-C798-429F-81E0-1A07CCB9BC0E_2/Ig2rfKnPmLadg39Jc38UEdK90LDxlAxoH0AxmAygxzAz/Image.png) 190 | 191 | - Restaring from 33600 step, reduce shrink embedding gradient from 1.0 to 0.5 192 | - glm-130B-2022.06.12-02:20:49 193 | 194 | ## 2022-06-14 03:00 Loss explodes 195 | 196 | At 35250 step, the loss explodes again; almost the same behavior as it is in 33700 step; breaking down without any signs 197 | 198 | tensorboard record:glm-130B-35250 199 | 200 | - Restarting from 35200 step, and shrinking embedding gradient from 0.5 to 0.1 201 | - glm-130B-2022.06.14-02:28:21 202 | 203 | ## 2022-06-19 00:10 Node failure 204 | 205 | n30085 breaks down, restarting from 39600 step 206 | 207 | - glm-130B-2022.06.18-17:49:53 208 | 209 | ## 2022-06-20 09:10 Loss explodes 210 | 211 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/CA344108-3B01-469C-9ABE-C41002F76484_2/oEvBST5MP0I7S4qHmQUeE7DoPCsGFSrveAOOSyitSUwz/Image.png) 212 | 213 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/FED0DE40-A710-4259-AE98-26BCB9568C7A_2/kH4FijsPDVJFzkbaxz7BiX0RZrul1Wrye6cE5EV8ZG0z/Image.png) 214 | 215 | - tensorboard record:glm-130B-40800 216 | - `--skip-train-iteration-range 40701-40900` 217 | - Restarting from 40700 step and skipping the noisy data in 40701-40900 steps 218 | - glm-130B-2022.06.20-03:36:13 219 | 220 | ## 2022-06-22 10:40 Gradient spikes 221 | 222 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/0B7E0A0C-4B11-4F52-BF10-E6B11A533BEF_2/yb1zC07di9zux8jbAi15gpqlstGHXZyjyMBEjO0gNKUz/Image.png) 223 | 224 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/1C60424A-0290-4070-9327-DF9DFD135020_2/XyVoPs1yMLIuzUyrDixSYfgjc2Y2Nuor20GCz0nSPkAz/Image.png) 225 | 226 | - The gradient norm experiences a spike, which seems to recover automatically; but the training loss experiences a drastic change 227 | - `--skip-train-iteration-range 40701-40900` 228 | - Restarting from 42400 and skipping data in 42401-42600 steps 229 | - glm-130B-2022.06.22-02:38:20 230 | 231 | ## 2022-06-22 21:00 Gradient spikes 232 | 233 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/1D7D801C-3226-4CB0-978C-F19B4DA46721_2/nmg9r87OFrdErZvY9xjiDIHvgPVLv39vy8ZVtGkj2H0z/Image.png) 234 | 235 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/5F5CA3D6-AF58-4087-9806-1529D3A2EF6C_2/WSQqyBdv1rvzvNloXE6Ssql7GxMDoULU38FAQCv3778z/Image.png) 236 | 237 | - The gradient norm experiences a spike again, but the loss-scale seems stable. We think it might recover automatically. 
238 | - Rethinking on the repeating gradient spikes in recent days, we speculate it might be attributed to a too-slow learning rate decay in the late stage of pre-training; reducing minimum lr from 8e-6 to 4e-6 239 | - `--min-lr 4e-6` 240 | - Restarting from 42700 step 241 | - glm-130B-2022.06.22-13:03:53 242 | 243 | ## 2022.06.26 16:00 Node failure 244 | 245 | - Unexpected NVLink Error; restarting training 246 | - glm-130B-2022.06.26-13:13:51 247 | 248 | ## 2022.06.29 00:00 Recover position_id 249 | 250 | - Restarting training from 48100 step; using another more consistent positional encoding (the original one has a different implementation for \[MASK\] and \[gMASK\]) 251 | - glm-130B-2022.06.29-13:53:21 252 | -------------------------------------------------------------------------------- /logs/main-log.md: -------------------------------------------------------------------------------- 1 | # GLM-130B 训练日志 2 | 3 | ## 模型信息 4 | 5 | - 130B:70 layers,12288 hidden size,32768 ffn hidden size, 150000 vocab size 6 | - MP = 4, PP = 8 7 | - GLM + Rotary Positional Embedding + GeGLU + DeepNorm 8 | - FP32 softmax with QKV scaling(no PB-Relax) 9 | - Shrink embedding gradient with $\alpha=0.1$ 10 | - Global batch size: 4224 11 | 12 | ## 环境版本 13 | 14 | - PyTorch 1.11 / CUDA 11.3 15 | - LargeScale@400893da37bb5cbe22c29e41c02a052369cc72ce 16 | - DeepSpeed 0.6.1 17 | - apex@master 18 | 19 | ## 测速 20 | 21 | - 96 nodes, BSZ=176 * 24=4224 22 | - glm-130B-2022.05.05-19:34:16:134TFLOPS, 88.5s/iter, 48samples/s, 23 | - 96 nodes, BSZ=256 * 24=6144 24 | - glm-130B-2022.05.05-19:43:13:141TFLOPS, 122.5s/iter, 50samples/s 25 | 26 | ## 2022-05-06 04:00 开始训练 27 | 28 | - glm-130B-2022.05.05-19:53:15 29 | 30 | ## 2022-05-07 20:14 节点故障 31 | 32 | 坏掉 n30041, n30157 两个点,更改保存间隔为 100step,从 4000 step 开始训练 33 | 34 | - glm-130B-2022.05.07-13:44:59 35 | 36 | ## 2022-05-10 00:00 提升 alpha 37 | 38 | 加入 `--shrink-embedding-gradient-steps 6000 500` 从 6000 step 开始训练 39 | 40 | - glm-130B-2022.05.09-16:02:04 41 | 42 | ## 2022-05-11 12:13 节点故障 43 | 44 | 坏掉 n30115 节点,从 7300 step 开始训练 45 | 46 | - glm-130B-2022.05.11-05:55:32 47 | 48 | ## 2022-05-20 00:03 节点故障 49 | 50 | 坏掉 n30066 节点,从 15400 step 开始训练 51 | 52 | - glm-130B-2022.05.19-19:56:19 53 | 54 | 再换一批节点,从 15600 step 开始训练 55 | 56 | - glm-130B-2022.05.20-01:58:57 57 | 58 | ## 2022-05-21 12:40 换节点 59 | 60 | 训练效率一直只有 127T 左右,怀疑之前加入的 n30076 存在问题,踢出后从 16600 step 开始训练,似乎不解决问题。 61 | 62 | ## 2022-05-22 19:27 节点故障 63 | 64 | n30126 失联 65 | 66 | - glm-130B-2022.05.22-14:15:41 67 | 68 | ## 2022-05-26 04:30 节点故障 69 | 70 | n30039 掉卡 71 | 72 | - glm-130B-2022.05.25-22:23:12 73 | 74 | 75 | ## 2022-05-28 11:50 更换中英多任务数据(废除) 76 | 77 | 从 22800 开始训练,换中英多任务数据 78 | 79 | - glm-130B-2022.05.28-03:52:26 80 | - events.out.tfevents.1653709957.9droa42ltcad5-0.1858.0(移除) 81 | 82 | ## 2022-05-28 16:50 更换英文多任务数据(废除) 83 | 84 | 换新的多任务数据 22900 左右出现 nan,挂掉训练,检查发现中文多任务数据噪声极大,从 22800 换成平衡后的 t0 原始数据开始训练 85 | 86 | - glm-130B-2022.05.28-09:18:12 87 | - events.out.tfevents.1653729502.9droa42ltcad5-0.5648.0(移除) 88 | 89 | ## 2022-05-28 20:50 加入 warmup(废除) 90 | 91 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C850748B-92A4-4F9F-932F-AD22330895D6_2/E8MboG8vrTTb2N51FRhkb6wsB4eyrD77USmM992obQgz/Image.png) 92 | 93 | 换上平衡后且不泄漏的 t0 原始数据开始训练仍然有问题,推测是平衡后一些任务占比变大,其实等价于加入新任务的情况,加入参数 `--warmup-samples-after-loading 2112000` warmup 500 步从 22800 开始训练 94 | 95 | - glm-130B-2022.05.28-12:57:24 96 | - events.out.tfevents.1653742654.9droa42ltcad5-0.7942.0(移除) 97 | 98 | ## 
2022-05-29 01:30 再次爆炸,换纯文本(废除) 99 | 100 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/028DE014-00FE-4521-BEEB-EF3F61BB8DA1_2/mgYybTR1OLgPkBysqMiUgGYNyIg8OQnf1yXI66grBeMz/Image.png) 101 | 102 | - warmup 以后还是炸了,分析可能是 distribution 变动仍然太过剧烈,先换纯文本 + reshuffle 尝试训练,从 22800 加载 103 | - glm-130B-2022.05.28-18:05:33 104 | - events.out.tfevents.1653761143.9droa42ltcad5-0.9744.0(废除) 105 | - global_step23200_text 106 | + 配置文件 107 | 108 | ## 2022-05-29 逐渐修改数据分布(废除) 109 | 110 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E2BC463F-E519-461E-B1B0-99551DA940BE_2/0ZqN22TLyqRTvqOy6JNLeixEy4TarDJEF7DOvdh3saIz/Image.png) 111 | 112 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/9C7AC4B3-59AB-471A-872E-41CCBAE7E90D_2/0rpEmyAOcIkLyDGR2R4RQiBeUwbWIWiaHbHcwosx6yAz/Image.png) 113 | 114 | 文本似乎能稳定,那么尝试逐渐平滑修改数据分布, 从 22800 开始,逐渐修改数据分布到 t0 平衡数据 115 | 116 | - glm-130B-2022.05.29-05:17:06 117 | - events.out.tfevents.1653801436.9droa42ltcad5-0.13868.0(废除) 118 | 119 | ## 2022-05-29 22:40 逐渐修改数据分布并全面 warmup 120 | 121 | - 又挂了,分析可能是换新分布学习率也需要 warmup 122 | 123 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/F5532A86-3AAC-4CCE-AC9B-A976B7736D7F_2/M4JZx5GYzNPuysPHXrn0R5Oo54rBhDwQxdErkOpFOhEz/Image.png) 124 | 125 | - 从 22800 开始训练,数据和 lr 都 warmup 2000 步,shrink embbeding graident 从 0.2 warmup 6000 步到 1 126 | - glm-130B-2022.05.29-17:35:45 127 | 128 | ## 2022-05-30 14:00 挂节点 129 | 130 | 更改了一下参数配置,发现之前 shrink embedding 的步数写错了(26850 步),现在改成 6000 步。升级了一下 lr auto warmup 的逻辑,写成绝对 samples 数量。从 global_step23200 开始 131 | 132 | 我们发现这次训练卡在了数据加载,排查后发现是 Lustre 文件系统的故障,导致 2.3T 文本数据读不出来,且工程师无法修复;最终重新从移动硬盘拷贝了一次数据 133 | 134 | - glm-130B-2022.05.31-02:18:24 135 | 136 | ## 2022.05.03 20:00 加 DeepStruct 数据 137 | 138 | - 维持原有 transform 过程不变,但直接加入 DeepStruct 数据,从 23500 开始 139 | 140 | ## 2022-06-01 22:22 换清洗数据 141 | 142 | 之前的多任务数据 t0 和 deepsturct 各有一个任务的 target 异常,重新清洗后更换,从 24500 开始 143 | 144 | - glm-130B-2022.06.01-14:24:33 145 | 146 | ## 2022-06-02 12:00 节点故障 147 | 148 | - n30145 CPU 故障,从 25000 重启训练,lr 和 数据集已经 transfromer 完毕,所以配置直接去掉 warmup 149 | - glm-130B-2022.06.02-04:35:05 150 | 151 | ## 2022-06-02 09:30 加入 multitask loss 打印 152 | 153 | 25800steps 开始,加入 multitask loss 打印 154 | 155 | - glm-130B-2022.06.03-01:40:12 156 | 157 | ## 2022-06-02 15:00 降低学习率,加入 gpt/bert loss 打印 158 | 159 | loss 降低比较慢,讨论可能是学习率太大了,26000steps 开始,学习率砍半 160 | 161 | - glm-130B-2022.06.03-07:26:16 162 | 163 | ## 2022-06-06 17:00 集群维护 164 | 165 | 集群从 9 点到 5 点升级驱动,从 开始训练 166 | 167 | - glm-130B-2022.06.06-10:00:39 168 | 169 | PS:观察到共享文件系统读取速度显著改善,现在加载 ckpt 几乎只需要 1 分钟 170 | 171 | ## 2022-06-08 08:00 坏点 172 | 173 | - glm-130B-2022.06.08-00:00:37 174 | 175 | ## 2022-06-09 13:30 训练卡住 176 | 177 | 23100 开始恢复 178 | 179 | - glm-130B-2022.06.09-05:27:54 180 | 181 | ## 2022-06-12 10:00 loss 爆炸 182 | 183 | 33700 开始 loss 炸了,loss-scale 在 33710 左右突然下跌然后 loss 在 33740 左右爆炸 184 | 185 | - tensorboard 记录:glm-130B-33700 186 | 187 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C46C7CFE-1B79-491C-90FC-5A88AE90E9DF_2/7ICMyH8v6GhAgngz5bVaDKwzYjFPyk99Ax27R5w56wMz/Image.png) 188 | 189 | 
![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E56BCDE0-C798-429F-81E0-1A07CCB9BC0E_2/Ig2rfKnPmLadg39Jc38UEdK90LDxlAxoH0AxmAygxzAz/Image.png) 190 | 191 | - 从 33600 开始加载,shrink embedding gradient 1 → 0.5 192 | - glm-130B-2022.06.12-02:20:49 193 | 194 | ## 2022-06-14 03:00 loss 爆炸 195 | 196 | 35250 loss 又炸了,和 33700 的表现几乎一样,都是完全没有征兆突然爆炸 197 | 198 | tensorboard 记录:glm-130B-35250 199 | 200 | - 从 35200 开始加载,shrink embedding gradient 0.5 → 0.1 201 | - glm-130B-2022.06.14-02:28:21 202 | 203 | ## 2022-06-19 00:10 节点故障 204 | 205 | n30085 挂了,从 39600 恢复 206 | 207 | - glm-130B-2022.06.18-17:49:53 208 | 209 | ## 2022-06-20 09:10 loss 爆炸 210 | 211 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/CA344108-3B01-469C-9ABE-C41002F76484_2/oEvBST5MP0I7S4qHmQUeE7DoPCsGFSrveAOOSyitSUwz/Image.png) 212 | 213 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/FED0DE40-A710-4259-AE98-26BCB9568C7A_2/kH4FijsPDVJFzkbaxz7BiX0RZrul1Wrye6cE5EV8ZG0z/Image.png) 214 | 215 | - tensorboard 记录:glm-130B-40800 216 | - `--skip-train-iteration-range 40701-40900` 217 | - 从 40700 开始重新加载并跳过 40701-40900 数据 218 | - glm-130B-2022.06.20-03:36:13 219 | 220 | ## 2022-06-22 10:40 梯度 spike 221 | 222 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/0B7E0A0C-4B11-4F52-BF10-E6B11A533BEF_2/yb1zC07di9zux8jbAi15gpqlstGHXZyjyMBEjO0gNKUz/Image.png) 223 | 224 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/1C60424A-0290-4070-9327-DF9DFD135020_2/XyVoPs1yMLIuzUyrDixSYfgjc2Y2Nuor20GCz0nSPkAz/Image.png) 225 | 226 | - grad 有点小 spike,看起来后续恢复了,但 loss 似乎遇到了比较大的波动 227 | - `--skip-train-iteration-range 40701-40900` 228 | - 从 42400 开始重新加载并跳过 42401-42600 数据 229 | - glm-130B-2022.06.22-02:38:20 230 | 231 | ## 2022-06-22 21:00 梯度 spike 232 | 233 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/1D7D801C-3226-4CB0-978C-F19B4DA46721_2/nmg9r87OFrdErZvY9xjiDIHvgPVLv39vy8ZVtGkj2H0z/Image.png) 234 | 235 | ![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/5F5CA3D6-AF58-4087-9806-1529D3A2EF6C_2/WSQqyBdv1rvzvNloXE6Ssql7GxMDoULU38FAQCv3778z/Image.png) 236 | 237 | - grad 又有 spike,但是 loss-scale 没有一降到底,推测应该可以恢复 238 | - 这几天的反复 spike,我们分析可能是后期 learning rate 降低太慢,将 min-lr 从 8e-6 调整到 4e-6 239 | - `--min-lr 4e-6` 240 | - 从 42700 加载开始训练 241 | - glm-130B-2022.06.22-13:03:53 242 | 243 | ## 2022.06.26 16:00 节点故障 244 | 245 | - 节点 NVLink Error,重启训练 246 | - glm-130B-2022.06.26-13:13:51 247 | 248 | ## 2022.06.29 00:00 恢复 position_id 249 | 250 | - 48100 从原先配置开始训练 251 | - glm-130B-2022.06.29-13:53:21 252 | -------------------------------------------------------------------------------- /quantization/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .layers import QuantizedColumnParallelLinear 4 | from .layers import QuantizedRowParallelLinear 5 | 6 | 7 | def quantize(model, weight_bit_width): 8 | """Replace fp16 linear with quantized linear""" 9 | 10 | if torch.distributed.get_rank() == 0: 11 | print(f"> Quantizing model weight to {weight_bit_width} bits") 12 | 13 | for layer in model.transformer.layers: 14 | 
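# Four parallel linears per transformer layer are rebuilt below as quantized drop-ins: the fused
# query_key_value projection, the attention output projection (dense), and the two MLP projections
# (dense_h_to_4h and dense_4h_to_h). Each reuses the existing fp16 weight, moved to the current GPU
# so the quantization kernels can operate on it.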
layer.attention.query_key_value = QuantizedColumnParallelLinear( 15 | weight_bit_width=weight_bit_width, 16 | weight=layer.attention.query_key_value.weight.to(torch.cuda.current_device()), 17 | input_size=layer.attention.query_key_value.input_size, 18 | output_size=layer.attention.query_key_value.output_size, 19 | bias=True, 20 | gather_output=False, 21 | params_dtype=torch.half, 22 | name="query_key_value", 23 | skip_init=True, 24 | device=layer.attention.query_key_value.weight.device, 25 | ) 26 | layer.attention.dense = QuantizedRowParallelLinear( 27 | weight_bit_width=weight_bit_width, 28 | weight=layer.attention.dense.weight.to(torch.cuda.current_device()), 29 | input_size=layer.attention.dense.input_size, 30 | output_size=layer.attention.dense.output_size, 31 | bias=True, 32 | input_is_parallel=True, 33 | params_dtype=torch.half, 34 | name="dense", 35 | skip_init=True, 36 | device=layer.attention.dense.weight.device, 37 | ) 38 | layer.mlp.dense_h_to_4h = QuantizedColumnParallelLinear( 39 | weight_bit_width=weight_bit_width, 40 | weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()), 41 | input_size=layer.mlp.dense_h_to_4h.input_size, 42 | output_size=layer.mlp.dense_h_to_4h.output_size, 43 | bias=True, 44 | gather_output=False, 45 | params_dtype=torch.half, 46 | name="dense_h_to_4h", 47 | skip_init=True, 48 | device=layer.mlp.dense_h_to_4h.weight.device, 49 | ) 50 | layer.mlp.dense_4h_to_h = QuantizedRowParallelLinear( 51 | weight_bit_width=weight_bit_width, 52 | weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()), 53 | input_size=layer.mlp.dense_4h_to_h.input_size, 54 | output_size=layer.mlp.dense_4h_to_h.output_size, 55 | bias=True, 56 | input_is_parallel=True, 57 | params_dtype=torch.half, 58 | name="dense_h_to_4h", 59 | skip_init=True, 60 | device=layer.mlp.dense_4h_to_h.weight.device, 61 | ) 62 | 63 | return model 64 | -------------------------------------------------------------------------------- /quantization/functional.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from kernels import extract_weight_to_half 4 | 5 | 6 | class W8A16Linear(torch.autograd.Function): 7 | @staticmethod 8 | def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): 9 | ctx.inp_shape = inp.size() 10 | ctx.weight_shape = quant_w.size() 11 | ctx.weight_bit_width = weight_bit_width 12 | out_features = quant_w.size(0) 13 | inp = inp.contiguous().view(-1, inp.size(-1)) 14 | weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width) 15 | output = inp.mm(weight.t()) 16 | ctx.save_for_backward(inp, quant_w, scale_w) 17 | return output.view(*(ctx.inp_shape[:-1] + (out_features,))) 18 | 19 | @staticmethod 20 | def backward(ctx, grad_output: torch.Tensor): 21 | inp, quant_w, scale_w = ctx.saved_tensors 22 | weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width) 23 | grad_output = grad_output.contiguous().view(-1, weight.size(0)) 24 | grad_input = grad_output.mm(weight) 25 | grad_weight = grad_output.t().mm(inp) 26 | return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None 27 | -------------------------------------------------------------------------------- /quantization/layers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.parameter import Parameter 3 | 4 | from SwissArmyTransformer.mpu import copy_to_model_parallel_region 5 | from 
SwissArmyTransformer.mpu import gather_from_model_parallel_region 6 | from SwissArmyTransformer.mpu import reduce_from_model_parallel_region 7 | from SwissArmyTransformer.mpu import scatter_to_model_parallel_region 8 | from SwissArmyTransformer.mpu import ColumnParallelLinear, RowParallelLinear 9 | 10 | from .functional import W8A16Linear 11 | from kernels import compress_int4_weight 12 | 13 | 14 | class QuantizedColumnParallelLinear(ColumnParallelLinear): 15 | def __init__(self, weight_bit_width: int, weight=None, *args, **kwargs): 16 | super(QuantizedColumnParallelLinear, self).__init__(*args, **kwargs) 17 | self.weight_bit_width = weight_bit_width 18 | 19 | shape = self.weight.shape 20 | del self.weight 21 | 22 | if weight is None: 23 | self.weight = torch.empty( 24 | shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"] 25 | ) 26 | self.weight_scale = torch.empty(shape[0], dtype=kwargs["params_dtype"], device=kwargs["device"]) 27 | else: 28 | self.weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half() 29 | self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) 30 | if weight_bit_width == 4: 31 | self.weight = compress_int4_weight(self.weight) 32 | 33 | self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False) 34 | self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False) 35 | 36 | def forward(self, input_): 37 | # Set up backprop all-reduce. 38 | input_parallel = copy_to_model_parallel_region(input_) 39 | # Matrix multiply. 40 | output_parallel = W8A16Linear.apply(input_parallel, self.weight, self.weight_scale, self.weight_bit_width) 41 | if self.bias is not None: 42 | output_parallel = output_parallel + self.bias 43 | if self.gather_output: 44 | # All-gather across the partitions. 45 | output = gather_from_model_parallel_region(output_parallel) 46 | else: 47 | output = output_parallel 48 | return output 49 | 50 | 51 | class QuantizedRowParallelLinear(RowParallelLinear): 52 | def __init__(self, weight_bit_width: int, weight=None, *args, **kwargs): 53 | super(QuantizedRowParallelLinear, self).__init__(*args, **kwargs) 54 | self.weight_bit_width = weight_bit_width 55 | 56 | shape = self.weight.shape 57 | del self.weight 58 | 59 | if weight is None: 60 | self.weight = torch.empty( 61 | shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"] 62 | ) 63 | self.weight_scale = torch.empty(shape[0], dtype=kwargs["params_dtype"], device=kwargs["device"]) 64 | else: 65 | self.weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half() 66 | self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) 67 | if weight_bit_width == 4: 68 | self.weight = compress_int4_weight(self.weight) 69 | 70 | self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False) 71 | self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False) 72 | 73 | def forward(self, input_): 74 | # Set up backprop all-reduce. 75 | if self.input_is_parallel: 76 | input_parallel = input_ 77 | else: 78 | input_parallel = scatter_to_model_parallel_region(input_) 79 | # Matrix multiply. 80 | output_parallel = W8A16Linear.apply(input_parallel, self.weight, self.weight_scale, self.weight_bit_width) 81 | # All-reduce across all the partitions. 
82 | output_ = reduce_from_model_parallel_region(output_parallel) 83 | if self.bias is not None: 84 | output = output_ + self.bias 85 | else: 86 | output = output_ 87 | return output 88 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | SwissArmyTransformer>=0.2.12,<0.3 2 | icetk 3 | apex 4 | scipy 5 | dataclass_wizard 6 | cpm_kernels 7 | -------------------------------------------------------------------------------- /resources/03DF31017FE184DB45D41DFFC6F80EF0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/03DF31017FE184DB45D41DFFC6F80EF0.png -------------------------------------------------------------------------------- /resources/33872E48D3539EA132B74BCF5EFF458F.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/33872E48D3539EA132B74BCF5EFF458F.png -------------------------------------------------------------------------------- /resources/49BF334CB352BAA19F7D55460B1DBCA9.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/49BF334CB352BAA19F7D55460B1DBCA9.gif -------------------------------------------------------------------------------- /resources/7CB441707D1035B2890AA2164C5B6EAC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/7CB441707D1035B2890AA2164C5B6EAC.png -------------------------------------------------------------------------------- /resources/7D6433A42D189E2E6FBC62BE066BCE91.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/7D6433A42D189E2E6FBC62BE066BCE91.png -------------------------------------------------------------------------------- /resources/849024E93FA85347F7F6443932911922.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/849024E93FA85347F7F6443932911922.png -------------------------------------------------------------------------------- /resources/AE18F14396E2D22BC0BC8DD77EFD3414.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/AE18F14396E2D22BC0BC8DD77EFD3414.png -------------------------------------------------------------------------------- /resources/E42321373D22DE198231279B5856BB42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/E42321373D22DE198231279B5856BB42.png -------------------------------------------------------------------------------- /resources/F48B69263360688CCA21E915F4B1A98B.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/F48B69263360688CCA21E915F4B1A98B.png -------------------------------------------------------------------------------- /resources/WECHAT.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 |

扫码关注公众号,加入「GLM-130B 交流群」

5 |

Scan the QR code to follow the official account and join the "GLM-130B Discussion Group"

6 |
7 | 8 | -------------------------------------------------------------------------------- /resources/multitask_list.txt: -------------------------------------------------------------------------------- 1 | super_glue/wsc.fixed 2 | winogrande/winogrande_xl 3 | super_glue/rte 4 | glue/mrpc 5 | glue/qqp 6 | paws/labeled_final 7 | ai2_arc/ARC_Challenge 8 | ai2_arc/ARC_Easy 9 | kilt_tasks/hotpot_qa 10 | trivia_qa/unfiltered 11 | web_questions 12 | wiki_qa 13 | adversarial_qa/dbidaf 14 | adversarial_qa/dbert 15 | adversarial_qa/droberta 16 | duorc/SelfRC 17 | duorc/ParaphraseRC 18 | ropes 19 | squad_v2 20 | super_glue/record 21 | quoref 22 | tydiqa 23 | cos_e/v1.11 24 | cosmos_qa 25 | dream 26 | openbookqa/main 27 | qasc 28 | quail 29 | quarel 30 | quartz 31 | race/high 32 | race/middle 33 | sciq 34 | social_i_qa 35 | super_glue/boolq 36 | super_glue/multirc 37 | wiki_hop/original 38 | wiqa 39 | piqa 40 | amazon_polarity 41 | app_reviews 42 | imdb 43 | rotten_tomatoes 44 | yelp_review_full 45 | super_glue/copa 46 | hellaswag 47 | common_gen 48 | wiki_bio 49 | cnndailymail/3.0.0 50 | gigaword 51 | multi_news 52 | samsum 53 | xsum 54 | ag_news 55 | dbpedia_14 56 | trec 57 | super_glue/wic 58 | tacred 59 | conll04 (joint entity relation extraction) 60 | nyt29 (joint entity relation extraction) 61 | ace2005 (joint entity relation extraction) 62 | ade (joint entity relation extraction) 63 | conll03 (named entity recognition) 64 | ontonotes (named entity recognition) 65 | genia (named entity recognition) 66 | conll05 (semantic role labeling) 67 | conll12 (semantic role labeling) 68 | propbank (semantic role labeling) 69 | ace05 (event extraction) 70 | multi_woz_2.1 (dialogue state tracking) 71 | -------------------------------------------------------------------------------- /resources/wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/resources/wechat.jpg -------------------------------------------------------------------------------- /scripts/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $0) 4 | script_dir=$(dirname $script_path) 5 | main_dir=$(dirname $script_dir) 6 | 7 | source "${main_dir}/configs/model_glm_130b.sh" 8 | 9 | ARGS="${main_dir}/benchmark.py \ 10 | --mode inference \ 11 | $MODEL_ARGS" 12 | 13 | TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') 14 | EXP_NAME=${TIMESTAMP} 15 | 16 | mkdir -p logs 17 | 18 | run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" 19 | echo $run_cmd 20 | eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log 21 | -------------------------------------------------------------------------------- /scripts/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $0) 4 | script_dir=$(dirname $script_path) 5 | main_dir=$(dirname $script_dir) 6 | 7 | source "${main_dir}/configs/model_glm_130b.sh" 8 | 9 | DATA_PATH="" 10 | 11 | ARGS="${main_dir}/evaluate.py \ 12 | --mode inference \ 13 | --data-path $DATA_PATH \ 14 | --task $* \ 15 | $MODEL_ARGS" 16 | 17 | TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') 18 | EXP_NAME=${TIMESTAMP} 19 | 20 | mkdir -p logs 21 | 22 | run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" 23 | eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log 24 | -------------------------------------------------------------------------------- /scripts/evaluate_multiple_node.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_WORKERS=16 4 | NUM_GPUS_PER_WORKER=8 5 | HOST_FILE_PATH="" 6 | OPTIONS_NCCL="NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2 CUDA_LAUNCH_BLOCKING=0" 7 | 8 | script_path=$(realpath $0) 9 | script_dir=$(dirname $script_path) 10 | main_dir=$(dirname $script_dir) 11 | 12 | source "${main_dir}/configs/model_glm_130b.sh" 13 | 14 | DATA_PATH="" 15 | 16 | ARGS="${main_dir}/evaluate.py \ 17 | --mode inference \ 18 | --data-path $DATA_PATH \ 19 | --task $* \ 20 | $MODEL_ARGS" 21 | 22 | TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') 23 | EXP_NAME=${TIMESTAMP} 24 | 25 | mkdir -p logs 26 | 27 | run_cmd="${OPTIONS_NCCL} deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} ${ARGS}" 28 | eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log 29 | -------------------------------------------------------------------------------- /scripts/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | script_path=$(realpath $0) 4 | script_dir=$(dirname $script_path) 5 | main_dir=$(dirname $script_dir) 6 | 7 | source "${main_dir}/configs/model_glm_130b.sh" 8 | 9 | SEED=1234 10 | MAX_OUTPUT_LENGTH=256 11 | MIN_GEN_LENGTH=0 12 | # BeamSearchStrategy args 13 | NUM_BEAMS=4 14 | LENGTH_PENALTY=1.0 15 | NO_REPEAT_NGRAM=3 16 | # BaseStrategy args 17 | TEMP=1.0 18 | TOPK=0 19 | TOPP=0.7 20 | 21 | ARGS="${main_dir}/generate.py \ 22 | --seed $SEED \ 23 | --mode inference \ 24 | --sampling-strategy BaseStrategy \ 25 | --out-seq-length $MAX_OUTPUT_LENGTH \ 26 | --min-gen-length $MIN_GEN_LENGTH \ 27 | --num-beams $NUM_BEAMS \ 28 | --length-penalty $LENGTH_PENALTY \ 29 | --no-repeat-ngram-size $NO_REPEAT_NGRAM \ 30 | --temperature $TEMP \ 31 | --top_k $TOPK \ 32 | --top_p $TOPP \ 33 | --output-path samples \ 34 | $MODEL_ARGS \ 35 | $*" 36 | 37 | run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" 38 | eval ${run_cmd} 39 | -------------------------------------------------------------------------------- /tasks/bloom/glue_cola.yaml: -------------------------------------------------------------------------------- 1 | name: 'glue_cola' 2 | type: 'mul' 3 | path: 'bloom/glue_cola' 4 | file-pattern: 5 | validation: "**/validation.jsonl" 6 | micro-batch-size: 30 -------------------------------------------------------------------------------- /tasks/bloom/glue_mnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'glue_mnli' 2 | type: 'mul' 3 | path: 'bloom/glue_mnli' 4 | file-pattern: 5 | validation-matched: "**/validation_matched.jsonl" 6 | validation-mismatched: "**/validation_mismatched.jsonl" 7 | micro_batch_size: 8 -------------------------------------------------------------------------------- /tasks/bloom/glue_qnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'glue_qnli' 2 | type: 'mul' 3 | path: 'bloom/glue_qnli' 4 | file-pattern: 5 | validation: "**/validation.jsonl" 6 | micro_batch_size: 6 -------------------------------------------------------------------------------- /tasks/bloom/glue_wnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'glue_wnli' 2 | type: 'mul' 3 | path: 'bloom/glue_wnli' 4 | file-pattern: 5 | validation: "**/validation.jsonl" 6 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/bloom/math_qa.yaml: 
-------------------------------------------------------------------------------- 1 | name: 'math_qa' 2 | type: 'mul' 3 | path: 'bloom/math_qa' 4 | file-pattern: 5 | validation: "**/validation.jsonl" 6 | test: "**/test.jsonl" 7 | micro_batch_size: 6 -------------------------------------------------------------------------------- /tasks/bloom/mc_taco.yaml: -------------------------------------------------------------------------------- 1 | name: 'mc_taco' 2 | type: 'gen' 3 | path: 'bloom/mc_taco' 4 | file-pattern: 5 | validation: "**/validation_pp.jsonl" 6 | test: "**/test_pp.jsonl" -------------------------------------------------------------------------------- /tasks/bloom/openbook_qa.yaml: -------------------------------------------------------------------------------- 1 | name: 'openbook_qa' 2 | type: 'mul' 3 | path: 'bloom/openbookqa_main' 4 | file-pattern: 5 | test: "**/test.jsonl" 6 | validation: "**/validation.jsonl" 7 | micro_batch_size: 18 -------------------------------------------------------------------------------- /tasks/bloom/pubmed_qa.yaml: -------------------------------------------------------------------------------- 1 | name: 'pubmed_qa' 2 | type: 'mul' 3 | path: 'bloom/pubmed_qa_pqa_labeled' 4 | file-pattern: 5 | train: "**/train.jsonl" 6 | micro_batch_size: 2 -------------------------------------------------------------------------------- /tasks/bloom/superglue_axb.yaml: -------------------------------------------------------------------------------- 1 | name: 'superglue_axb' 2 | type: 'mul' 3 | path: 'bloom/super_glue_axb' 4 | file-pattern: 5 | test: "**/test.jsonl" 6 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/bloom/superglue_axg.yaml: -------------------------------------------------------------------------------- 1 | name: 'superglue_axg' 2 | type: 'mul' 3 | path: 'bloom/super_glue_axg' 4 | file-pattern: 5 | test: "**/test.jsonl" 6 | micro_batch_size: 34 -------------------------------------------------------------------------------- /tasks/chinese/clue/afqmc.yaml: -------------------------------------------------------------------------------- 1 | name: 'AFQMC' 2 | type: 'mul' 3 | path: 'CLUE/afqmc' 4 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/chinese/clue/c3.yaml: -------------------------------------------------------------------------------- 1 | name: 'C3' 2 | type: 'mul' 3 | path: 'CLUE/c3' 4 | micro_batch_size: 2 -------------------------------------------------------------------------------- /tasks/chinese/clue/cluewsc.yaml: -------------------------------------------------------------------------------- 1 | name: 'CLUEWSC2020' 2 | type: 'mul' 3 | path: 'CLUE/cluewsc' 4 | micro_batch_size: 18 -------------------------------------------------------------------------------- /tasks/chinese/clue/cmnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'CMNLI' 2 | type: 'mul' 3 | path: 'CLUE/cmnli' 4 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/chinese/clue/cmrc2018.yaml: -------------------------------------------------------------------------------- 1 | name: "CMRC2018" 2 | type: "gen" 3 | path: "CLUE/cmrc2018" 4 | -------------------------------------------------------------------------------- /tasks/chinese/clue/csl.yaml: -------------------------------------------------------------------------------- 1 | name: 'CSL' 
2 | type: 'mul' 3 | path: 'CLUE/csl' 4 | micro_batch_size: 3 -------------------------------------------------------------------------------- /tasks/chinese/clue/drcd.yaml: -------------------------------------------------------------------------------- 1 | name: "DRCD" 2 | type: "gen" 3 | path: "CLUE/drcd" 4 | -------------------------------------------------------------------------------- /tasks/chinese/clue/ocnli.yaml: -------------------------------------------------------------------------------- 1 | name: 'OCNLI_50K' 2 | type: 'mul' 3 | path: 'CLUE/ocnli' 4 | micro_batch_size: 24 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/bustm.yaml: -------------------------------------------------------------------------------- 1 | name: 'BUSTM' 2 | type: 'mul' 3 | path: 'CLUE/bustm' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 56 8 | -------------------------------------------------------------------------------- /tasks/chinese/fewclue/chidf.yaml: -------------------------------------------------------------------------------- 1 | name: 'CHIDF' 2 | type: 'mul' 3 | path: 'CLUE/chid-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/cluewscf.yaml: -------------------------------------------------------------------------------- 1 | name: 'CLUEWSCF' 2 | type: 'mul' 3 | path: 'CLUE/cluewsc-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 16 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/cslf.yaml: -------------------------------------------------------------------------------- 1 | name: 'CSLF' 2 | type: 'mul' 3 | path: 'CLUE/csl-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 2 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/eprstmt.yaml: -------------------------------------------------------------------------------- 1 | name: 'EPRSTMT' 2 | type: 'mul' 3 | path: 'CLUE/eprstmt-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 6 -------------------------------------------------------------------------------- /tasks/chinese/fewclue/ocnlif.yaml: -------------------------------------------------------------------------------- 1 | name: 'OCNLIF' 2 | type: 'mul' 3 | path: 'CLUE/ocnli-fc' 4 | file-pattern: 5 | dev: "**/dev_few_all.jsonl" 6 | test: "**/test_public.jsonl" 7 | micro_batch_size: 24 -------------------------------------------------------------------------------- /tasks/ethnic/crows-pair/crows-pair.yaml: -------------------------------------------------------------------------------- 1 | name: "CROWS" 2 | type: "mul" 3 | path: "data" 4 | module: "tasks.ethnic.crows-pair.tasks.CrowsPairTask" 5 | file-pattern: 6 | test: "**/crows-pair-dataset.jsonl" 7 | 8 | micro-batch-size: 1 -------------------------------------------------------------------------------- /tasks/ethnic/crows-pair/tasks.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from typing import Dict, Tuple, List 3 | from abc import ABC 4 | from collections import defaultdict 5 | from evaluation import ( 6 
| MultiChoiceTask, 7 | MultiChoiceTaskConfig, 8 | ) 9 | from evaluation.dataset import ( 10 | MultiChoiceTaskDataset, 11 | ) 12 | from evaluation.utils import ( 13 | print_rank_0, 14 | get_tokenized_input, 15 | ) 16 | 17 | 18 | class CrowsPairTask(MultiChoiceTask, ABC): 19 | config: MultiChoiceTaskConfig 20 | 21 | def build_dataset(self, relative_path): 22 | return CrowsPairDataset(join(self.config.path, relative_path), self.config) 23 | 24 | def predict_single_batch(self, batch) -> List[List[float]]: 25 | log_probs = self.model.cond_log_prob(batch) 26 | return log_probs 27 | 28 | def CrowsPairMetric(self, predictions, examples): 29 | print_rank_0("Special metric for CrowsPair") 30 | results = defaultdict(float) 31 | labels = defaultdict() 32 | for prediction, example in zip(predictions, examples): 33 | prediction = prediction[0] 34 | if example["sent_ID"] == 1: 35 | results[example["pair_ID"]] = results[example["pair_ID"]] + prediction 36 | else: 37 | results[example["pair_ID"]] = results[example["pair_ID"]] - prediction 38 | labels[example["pair_ID"]] = example["bias_type"] 39 | cat_positive = defaultdict(int) # pairs where sentence 1 scores at least as high, per bias category 40 | cat_tt = defaultdict(int) 41 | final = defaultdict(int) 42 | for val1, val2 in zip(results.values(), labels.values()): 43 | if val1 >= 0: 44 | cat_positive[val2] = cat_positive[val2] + 1 45 | else: 46 | cat_positive[val2] = cat_positive[val2] 47 | cat_tt[val2] = cat_tt[val2] + 1 48 | for key, val in cat_positive.items(): 49 | final[key] = val / cat_tt[key] 50 | return final 51 | 52 | def report_single_metrics(self, file: str, result_dict: Dict[str, float]): 53 | pass 54 | 55 | @property 56 | def metrics(self): 57 | return {"CP": self.CrowsPairMetric} 58 | 59 | def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): 60 | for result in result_dict_group.values(): 61 | result = result[0] 62 | for value1 in result.items(): 63 | value1 = value1[1] 64 | for key, value in value1.items(): 65 | print_rank_0("category:{cat} score:{score}".format(cat=key, score=round(value * 100, 2))) 66 | 67 | 68 | class CrowsPairDataset(MultiChoiceTaskDataset): 69 | 70 | config: MultiChoiceTaskConfig 71 | 72 | def __init__(self, path, config: MultiChoiceTaskConfig): 73 | self.is_single_token = True # set to False later in process_single_item func 74 | self.eval_data = [] 75 | super().__init__(path, config) 76 | 77 | def process_single_item(self, item): 78 | text, choices, label = ( 79 | get_tokenized_input(item, "inputs"), 80 | get_tokenized_input(item, "choices"), 81 | item["label"], 82 | ) 83 | pair_ID, sent_ID, bias_type = ( 84 | item["pair_ID"], 85 | item["sent_ID"], 86 | item["bias_type"], 87 | ) 88 | tgt_seq_length = sum([len(choice) for choice in choices]) 89 | if tgt_seq_length == len(choices): 90 | # For single token, we only insert one [sop] 91 | tgt_seq_length = 1 92 | 93 | assert tgt_seq_length < self.config.max_seq_length 94 | if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: 95 | text_length = self.config.max_seq_length - tgt_seq_length - 2 96 | text = text[len(text) - text_length : len(text)] 97 | 98 | assert not ( 99 | self.mask_id in text and self.config.use_multitask_encoding 100 | ), "Unified multitask encoding doesn't support blank filling" 101 | 102 | if tgt_seq_length != 1: 103 | self.is_single_token = False 104 | 105 | dataset = { 106 | "text": text, 107 | "choices": choices, 108 | "label": label, 109 | "pair_ID": pair_ID, 110 | "sent_ID": sent_ID, 111 | "bias_type": bias_type, 112 | } 113 | 114 | return dataset 115 |
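The CrowsPairMetric above reduces to a small pairwise computation: each CrowS-Pairs item arrives as two sentences sharing a pair_ID, and the score reported per bias category is the fraction of pairs whose first sentence (sent_ID == 1) receives the higher log-probability. Below is a minimal standalone sketch of that computation under simplifying assumptions: the function name crows_pair_score and the scalar toy log-probabilities are illustrative only and are not part of the repository (the task itself takes the first element of each cond_log_prob result).

from collections import defaultdict


def crows_pair_score(predictions, examples):
    # Accumulate the (sentence 1 minus sentence 2) log-probability gap per pair_ID.
    gap = defaultdict(float)
    bias_type = {}
    for pred, example in zip(predictions, examples):
        sign = 1 if example["sent_ID"] == 1 else -1
        gap[example["pair_ID"]] += sign * pred
        bias_type[example["pair_ID"]] = example["bias_type"]
    # Per bias category: fraction of pairs where sentence 1 is preferred (gap >= 0).
    preferred, total = defaultdict(int), defaultdict(int)
    for pair_id, g in gap.items():
        category = bias_type[pair_id]
        total[category] += 1
        if g >= 0:
            preferred[category] += 1
    return {category: preferred[category] / total[category] for category in total}


# Toy usage: one "race" pair where sentence 1 is scored higher than sentence 2.
toy_examples = [
    {"pair_ID": 0, "sent_ID": 1, "bias_type": "race"},
    {"pair_ID": 0, "sent_ID": 2, "bias_type": "race"},
]
print(crows_pair_score([-1.2, -1.5], toy_examples))  # {'race': 1.0}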
-------------------------------------------------------------------------------- /tasks/ethnic/ethos/ethos-fewshot-multi.yaml: -------------------------------------------------------------------------------- 1 | name: "ETHOS_fewshot_multi" 2 | type: "mul" 3 | path: "data" 4 | file-pattern: 5 | test: "**/ethos-few-shot-multi.jsonl" 6 | 7 | micro-batch-size: 32 -------------------------------------------------------------------------------- /tasks/ethnic/ethos/ethos-fewshot-single.yaml: -------------------------------------------------------------------------------- 1 | name: "ETHOS_fewshot_single" 2 | type: "mul" 3 | path: "data" 4 | file-pattern: 5 | test: "**/ethos-few-shot-single.jsonl" 6 | 7 | micro-batch-size: 32 -------------------------------------------------------------------------------- /tasks/ethnic/ethos/ethos-oneshot.yaml: -------------------------------------------------------------------------------- 1 | name: "ETHOS_oneshot" 2 | type: "mul" 3 | path: "data" 4 | file-pattern: 5 | test: "**/ethos-one-shot.jsonl" 6 | 7 | micro-batch-size: 64 -------------------------------------------------------------------------------- /tasks/ethnic/ethos/ethos-zeroshot.yaml: -------------------------------------------------------------------------------- 1 | name: "ETHOS_zeroshot" 2 | type: "mul" 3 | path: "data" 4 | file-pattern: 5 | test: "**/ethos-zero-shot.jsonl" 6 | 7 | micro-batch-size: 128 -------------------------------------------------------------------------------- /tasks/ethnic/stereoset/stereoset.yaml: -------------------------------------------------------------------------------- 1 | name: "StereoSet" 2 | type: "mul" 3 | path: "data" 4 | module: "tasks.ethnic.stereoset.tasks.StereoSetTask" 5 | use_task_mask: True 6 | file-pattern: 7 | test: "**/stereoset-dataset.jsonl" 8 | 9 | micro-batch-size: 64 10 | -------------------------------------------------------------------------------- /tasks/ethnic/stereoset/tasks.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | from collections import defaultdict 3 | from abc import ABC 4 | import numpy as np 5 | from typing import Dict, Tuple, List 6 | from evaluation import ( 7 | MultiChoiceTask, 8 | MultiChoiceTaskConfig, 9 | ) 10 | from evaluation.dataset import ( 11 | MultiChoiceTaskDataset, 12 | ) 13 | from evaluation.utils import ( 14 | print_rank_0, 15 | get_tokenized_input, 16 | ) 17 | 18 | 19 | class StereoSetTask(MultiChoiceTask, ABC): 20 | config: MultiChoiceTaskConfig 21 | 22 | def build_dataset(self, relative_path): 23 | return StereoSetDataset(join(self.config.path, relative_path), self.config) 24 | 25 | def predict_single_batch(self, batch) -> List[int]: 26 | log_probs = self.model.cond_log_prob(batch) 27 | normalize_log_probs = [] 28 | for origin_datas, predicts in zip(batch.get("choices"), log_probs): 29 | normalize_log_probs_single = [] 30 | for origin_data, predict in zip(origin_datas, predicts): 31 | normalize_log_probs_single.append(predict / len(origin_data)) 32 | normalize_log_probs.append(normalize_log_probs_single) 33 | return [np.argmax(log_probs_single).item() for log_probs_single in normalize_log_probs] 34 | 35 | def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): 36 | for tmp1 in result_dict_group.values(): 37 | tmp1 = tmp1[0] 38 | for result in tmp1.values(): 39 | print("LMS") 40 | for key, val in result[0].items(): 41 | print_rank_0("cat:{key} score:{score}".format(key=key, 
score=round(val, 2))) 42 | print("SS") 43 | for key, val in result[1].items(): 44 | print_rank_0("cat:{key} score:{score}".format(key=key, score=round(val, 2))) 45 | print("ICAT") 46 | for key, val in result[2].items(): 47 | print_rank_0("cat:{key} score:{score}".format(key=key, score=round(val, 2))) 48 | 49 | def StereoSetMetric(self, predictions, examples): 50 | print_rank_0("Special metric for StereoSet") 51 | assert len(predictions) == len(examples) 52 | results = defaultdict(list) 53 | for prediction, example in zip(predictions, examples): 54 | # print(prediction,example["goal_label"],example["goal_label"][prediction]) 55 | predict_label = example["goal_label"][prediction] 56 | example_type = example["bias_type"] 57 | results[example_type].append(predict_label) 58 | results["overall"].append(predict_label) 59 | ss = {} # stereotype score (SS) per bias category 60 | lms = {} # language modeling score (LMS) per bias category 61 | icat = {} # idealized CAT score (ICAT) per bias category 62 | for key, values in results.items(): 63 | label_dict = {"unrelated": 0, "stereotype": 0, "anti-stereotype": 0} 64 | for label_keys in values: 65 | label_dict[label_keys] = label_dict.get(label_keys, 0) + 1 66 | tt = len(values) 67 | lms[key] = (label_dict["anti-stereotype"] + label_dict["stereotype"]) / tt * 100 68 | if label_dict["stereotype"] + label_dict["anti-stereotype"] == 0: 69 | ss[key] = 0 70 | else: 71 | ss[key] = label_dict["stereotype"] / (label_dict["anti-stereotype"] + label_dict["stereotype"]) * 100 72 | 73 | icat[key] = lms[key] * (min(ss[key], 100.0 - ss[key]) / 50.0) 74 | return [lms, ss, icat] 75 | 76 | def report_single_metrics(self, file: str, result_dict: Dict[str, float]): 77 | pass 78 | 79 | @property 80 | def metrics(self): 81 | return {"SS_ICAT": self.StereoSetMetric} 82 | 83 | 84 | class StereoSetDataset(MultiChoiceTaskDataset): 85 | config: MultiChoiceTaskConfig 86 | 87 | def __init__(self, path, config: MultiChoiceTaskConfig): 88 | self.is_single_token = True # set to False later in process_single_item func 89 | self.eval_data = [] 90 | super().__init__(path, config) 91 | 92 | def process_single_item(self, item): 93 | text, choices, label = ( 94 | get_tokenized_input(item, "inputs"), 95 | get_tokenized_input(item, "choices"), 96 | item["label"], 97 | ) 98 | # "ID":example.ID,"bias_type":example.bias_type,"goal_label":goal_label 99 | ID, bias_type, goal_label = item["ID"], item["bias_type"], item["goal_label"] 100 | tgt_seq_length = sum([len(choice) for choice in choices]) 101 | if tgt_seq_length == len(choices): 102 | # For single token, we only insert one [sop] 103 | tgt_seq_length = 1 104 | 105 | assert tgt_seq_length < self.config.max_seq_length 106 | if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: 107 | text_length = self.config.max_seq_length - tgt_seq_length - 2 108 | text = text[len(text) - text_length : len(text)] 109 | 110 | assert not ( 111 | self.mask_id in text and self.config.use_multitask_encoding 112 | ), "Unified multitask encoding doesn't support blank filling" 113 | 114 | if tgt_seq_length != 1: 115 | self.is_single_token = False 116 | 117 | dataset = { 118 | "text": text, 119 | "choices": choices, 120 | "label": label, 121 | "ID": ID, 122 | "bias_type": bias_type, 123 | "goal_label": goal_label, 124 | } 125 | 126 | return dataset 127 | -------------------------------------------------------------------------------- /tasks/lambada/lambada-unidirectional.yaml: -------------------------------------------------------------------------------- 1 | name: "LAMBADA-unidirectional" 2 | type: "gen" 3 | module: "tasks.lambada.task.LAMBADA" 4 | path: "lambada/lambada" 5 | file-pattern: 6 |
test: "**/test.jsonl" 7 | validation: "**/validation.jsonl" 8 | 9 | sampling_strategy: "BeamSearchStrategy" 10 | num_beams: 16 11 | max_gen_length: 5 12 | use_task_mask: true 13 | unidirectional: true 14 | -------------------------------------------------------------------------------- /tasks/lambada/lambada.yaml: -------------------------------------------------------------------------------- 1 | name: "LAMBADA" 2 | type: "gen" 3 | module: "tasks.lambada.task.LAMBADA" 4 | path: "lambada/lambada" 5 | file-pattern: 6 | test: "**/test.jsonl" 7 | validation: "**/validation.jsonl" 8 | 9 | sampling_strategy: "BeamSearchStrategy" 10 | num_beams: 16 11 | max_gen_length: 5 12 | use_task_mask: true 13 | -------------------------------------------------------------------------------- /tasks/lambada/strategy.py: -------------------------------------------------------------------------------- 1 | from generation import BeamSearchStrategy 2 | 3 | 4 | class BeamSearchStrategyForLAMBADA(BeamSearchStrategy): 5 | def __init__(self, *args, banned_prefix=[], **kwargs): 6 | super().__init__(*args, **kwargs) 7 | self.banned_prefix = banned_prefix 8 | 9 | def forward(self, logits, tokens, mems): 10 | batch_size, num_beams, vocab_size = logits.shape 11 | logits = logits.float() 12 | for prefix in self.banned_prefix: 13 | if self.length_generated == len(prefix) - 1: 14 | if len(prefix) == 1: 15 | logits[..., prefix[0]] = -65504 16 | else: 17 | for i in range(batch_size): 18 | for j in range(num_beams): 19 | if tokens[i, j, -(len(prefix) - 1) :].tolist() == prefix[:-1]: 20 | logits[i, j, prefix[-1]] = -65504 21 | return super().forward(logits, tokens, mems) 22 | -------------------------------------------------------------------------------- /tasks/lambada/task.py: -------------------------------------------------------------------------------- 1 | from string import punctuation 2 | from functools import partial 3 | from typing import List 4 | 5 | from evaluation import qa_evaluate, GenerationTask 6 | 7 | from .strategy import BeamSearchStrategyForLAMBADA 8 | 9 | 10 | def exact_match_score(prediction, ground_truth): 11 | return prediction.strip() == ground_truth.strip() 12 | 13 | 14 | class LAMBADA(GenerationTask): 15 | @property 16 | def metrics(self): 17 | return {"Accuracy": partial(qa_evaluate, metric=exact_match_score)} 18 | 19 | def __init__(self, model, tokenizer, config_path): 20 | super(LAMBADA, self).__init__(model, tokenizer, config_path) 21 | 22 | if self.config.sampling_strategy == "BeamSearchStrategy": 23 | banned_prefix = [[46010], [146337]] # "'" and "``" 24 | invalid_slices = [20068, 146010, 146337] 25 | for p in punctuation: 26 | pp = tokenizer.tokenize(p) 27 | if len(pp) == 1: 28 | invalid_slices.append(pp[0]) 29 | banned_prefix.append(pp) 30 | self.strategy = BeamSearchStrategyForLAMBADA( 31 | batch_size=self.config.micro_batch_size, 32 | num_beams=self.config.num_beams, 33 | length_penalty=self.config.length_penalty, 34 | consider_end=True, 35 | end_tokens=self.strategy.end_tokens, 36 | invalid_slices=invalid_slices, 37 | banned_prefix=banned_prefix, 38 | no_repeat_ngram_size=self.config.no_repeat_ngram_size, 39 | min_gen_length=self.config.min_gen_length, 40 | deterministic=True, 41 | ) 42 | 43 | def get_first_word_tokens(self, tokens): 44 | text = self.tokenizer.tokenizer.decode(tokens).strip() 45 | return self.tokenizer.tokenize(text.split(" ")[0]) 46 | 47 | def predict_single_batch(self, batch): 48 | outputs_batch: List[List[List[int]]] = self.model.generate_text(batch, self.strategy, 
return_all_beams=True) 49 | predictions = [] 50 | for outputs in outputs_batch: 51 | found = False 52 | for output in outputs: 53 | text = self.tokenizer.tokenizer.decode(output).strip() 54 | spl = text.split(" ") 55 | if len(spl) >= 2 and spl[1] in punctuation: 56 | predictions.append(self.get_first_word_tokens(output)) 57 | found = True 58 | break 59 | if not found: 60 | predictions.append(self.get_first_word_tokens(outputs[0])) 61 | return predictions 62 | -------------------------------------------------------------------------------- /tasks/language-modeling/pile.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import json 4 | 5 | from typing import * 6 | from os.path import join 7 | from bisect import bisect_right 8 | from itertools import accumulate 9 | from collections import defaultdict 10 | 11 | from evaluation import LanguageModelTask, LanguageModelTaskDataset, print_rank_0 12 | 13 | 14 | def calculate_bpb_score(loss: List[float], data: List[Dict]): 15 | loss_per_category = defaultdict(lambda: 0.0) 16 | utf8_length_per_category = defaultdict(lambda: 0.0) 17 | weights = [] 18 | for item in data: 19 | weights.append(item["num_sequences"]) 20 | utf8_length_per_category[item["meta"]["pile_set_name"]] += item["utf8_length"] 21 | weights = list(accumulate(weights)) 22 | for idx in range(len(loss)): 23 | document_idx = bisect_right(weights, idx) 24 | loss_per_category[data[document_idx]["meta"]["pile_set_name"]] += loss[idx] 25 | return { 26 | name: (loss_per_category[name] / utf8_length_per_category[name] / math.log(2)) for name in loss_per_category 27 | } 28 | 29 | 30 | class Pile(LanguageModelTask): 31 | @property 32 | def metrics(self) -> Dict[str, Callable]: 33 | return {"BPB": calculate_bpb_score} 34 | 35 | def build_dataset(self, relative_path): 36 | return PileDataset(join(self.config.path, relative_path), self.config) 37 | 38 | def report_single_metrics(self, file: str, result_dict: Dict[str, float]): 39 | pass 40 | 41 | def report_group_metrics( 42 | self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, Dict[str, float]], int]], level=1 43 | ): 44 | output_str = f" Finish group {group_name}:\n" 45 | result = list(result_dict_group.values())[0][0]["BPB"] 46 | for key, value in result.items(): 47 | output_str += f" {key} = {value:.3f}\n" 48 | print_rank_0(output_str) 49 | pass 50 | 51 | def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): 52 | pass 53 | 54 | 55 | class PileDataset(LanguageModelTaskDataset): 56 | def __len__(self): 57 | return self.weights[-1] 58 | 59 | def process_single_file(self, path): 60 | num_sequences = [] 61 | with open(os.path.join(path), "r", encoding="utf-8") as file: 62 | for line in file: 63 | item = json.loads(line) 64 | if len(item["text"]) == 0: 65 | continue 66 | self.data.append( 67 | { 68 | "raw_text": item["text"], 69 | "utf8_length": len(item["text_pretokenized"].encode("utf-8")), 70 | "num_sequences": max( 71 | math.ceil( 72 | max(len(item["text"]) - (self.config.max_seq_length - 1), 0) 73 | / self.config.generation_length 74 | ) 75 | + 1, 76 | 1, 77 | ), 78 | "meta": item["meta"], 79 | } 80 | ) 81 | num_sequences.append(self.data[-1]["num_sequences"]) 82 | self.weights = list(accumulate(num_sequences)) 83 | self.left_weights = [0] + self.weights[:-1] 84 | -------------------------------------------------------------------------------- /tasks/language-modeling/pile.yaml: 
-------------------------------------------------------------------------------- 1 | name: "Pile" 2 | type: "lm" 3 | module: "tasks.language-modeling.pile.Pile" 4 | path: "pile" 5 | file-pattern: 6 | test: "**/test_tokenized.jsonl" 7 | # validation: "**/val_tokenized.jsonl" 8 | 9 | generation-length: 1024 10 | use_task_mask: true 11 | -------------------------------------------------------------------------------- /tasks/language-modeling/ptb.yaml: -------------------------------------------------------------------------------- 1 | name: "Penn Treebank" 2 | type: "lm" 3 | path: "ptbdataset" 4 | file-pattern: 5 | test: "**/ptb.test.txt" 6 | 7 | generation-length: 256 8 | use_task_mask: true 9 | -------------------------------------------------------------------------------- /tasks/language-modeling/wikitext-103.yaml: -------------------------------------------------------------------------------- 1 | name: "WikiText-103" 2 | type: "lm" 3 | path: "wikitext-103" 4 | file-pattern: 5 | test: "**/wiki.test.tokens" 6 | 7 | generation-length: 256 8 | use_task_mask: true 9 | -------------------------------------------------------------------------------- /tasks/language-modeling/wikitext-2.yaml: -------------------------------------------------------------------------------- 1 | name: "WikiText-2" 2 | type: "lm" 3 | path: "wikitext-2" 4 | file-pattern: 5 | test: "**/wiki.test.tokens" 6 | 7 | generation-length: 256 8 | use_task_mask: true 9 | -------------------------------------------------------------------------------- /tasks/mmlu/mmlu.yaml: -------------------------------------------------------------------------------- 1 | name: "MMLU" 2 | type: "mul" 3 | module: "tasks.mmlu.task.MMLU" 4 | path: "MMLU" 5 | file-pattern: 6 | stem: "stem/*.json" 7 | social_sciences: "social_sciences/*.json" 8 | humanities: "humanities/*.json" 9 | other: "other/*.json" 10 | micro-batch-size: 1 -------------------------------------------------------------------------------- /tasks/mmlu/task.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from typing import Dict, Tuple 4 | 5 | from evaluation import MultiChoiceTask 6 | 7 | categories = { 8 | "STEM": [ 9 | "Abstract Algebra", 10 | "Anatomy", 11 | "Astronomy", 12 | "College Biology", 13 | "College Chemistry", 14 | "College Computer Science", 15 | "College Mathematics", 16 | "College Physics", 17 | "Computer Security", 18 | "Conceptual Physics", 19 | "Electrical Engineering", 20 | "Elementary Mathematics", 21 | "High School Biology", 22 | "High School Chemistry", 23 | "High School Computer Science", 24 | "High School Mathematics", 25 | "High School Physics", 26 | "High School Statistics", 27 | "Machine Learning", 28 | ], 29 | "Other": [ 30 | "Business Ethics", 31 | "Clinical Knowledge", 32 | "College Medicine", 33 | "Global Facts", 34 | "Human Aging", 35 | "Management", 36 | "Marketing", 37 | "Medical Genetics", 38 | "Miscellaneous", 39 | "Nutrition", 40 | "Professional Accounting", 41 | "Professional Medicine", 42 | "Virology", 43 | ], 44 | "Social Sciences": [ 45 | "Econometrics", 46 | "High School Geography", 47 | "High School Government and Politics", 48 | "High School Macroeconomics", 49 | "High School Microeconomics", 50 | "High School Psychology", 51 | "Human Sexuality", 52 | "Professional Psychology", 53 | "Public Relations", 54 | "Security Studies", 55 | "Sociology", 56 | "US Foreign Policy", 57 | ], 58 | "Humanities": [ 59 | "Formal Logic", 60 | "High School European History", 61 | "High 
School US History", 62 | "High School World History", 63 | "International Law", 64 | "Jurisprudence", 65 | "Logical Fallacies", 66 | "Moral Disputes", 67 | "Moral Scenarios", 68 | "Philosophy", 69 | "Prehistory", 70 | "Professional Law", 71 | "World Religions", 72 | ], 73 | } 74 | 75 | 76 | class MMLU(MultiChoiceTask): 77 | def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): 78 | self.report_group_metrics("Overall", result_dict_all, level=0) 79 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/GLM-130B/212215c54f8a9da2deed51455868305235664370/tools/__init__.py -------------------------------------------------------------------------------- /tools/convert_tp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import torch 4 | import argparse 5 | import glob 6 | 7 | from typing import * 8 | 9 | sys.path.append(".") 10 | 11 | SEQUENTIAL_LAYERS = [ 12 | "input_layernorm.weight", 13 | "input_layernorm.bias", 14 | "attention.dense.bias", 15 | "post_attention_layernorm.weight", 16 | "post_attention_layernorm.bias", 17 | "mlp.dense_4h_to_h.bias", 18 | "attention.rotary_emb.inv_freq", 19 | "final_layernorm.weight", 20 | "final_layernorm.bias", 21 | ] 22 | 23 | GLU_LAYERS = [ 24 | "mlp.dense_h_to_4h.weight", 25 | "mlp.dense_h_to_4h.bias", 26 | ] 27 | 28 | QUANTIZED_LAYERS = [ 29 | "attention.dense.weight", 30 | "attention.query_key_value.weight", 31 | "mlp.dense_h_to_4h.weight", 32 | "mlp.dense_4h_to_h.weight", 33 | ] 34 | 35 | LAYER_CONCAT_DIM = {"attention.dense.weight": 1, "mlp.dense_4h_to_h.weight": 1} 36 | 37 | 38 | def parse_arguments(): 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--input-folder", default=None, type=str, help="Input SAT checkpoint folder") 41 | parser.add_argument("--output-folder", default=None, type=str, help="Output SAT checkpoint folder") 42 | parser.add_argument("--target-tp", default=4, type=int, help="Target TP degree") 43 | parser.add_argument("--quantization-bit-width", default=None, type=int, help="Quantization bit width") 44 | 45 | args = parser.parse_args() 46 | if args.quantization_bit_width is not None: 47 | assert args.quantization_bit_width in [4, 8] 48 | 49 | return args 50 | 51 | 52 | def merge_weights( 53 | key: str, 54 | sd_list: List[Dict], 55 | tp_index: int, 56 | original_tp: int, 57 | target_tp: int, 58 | cat_dim: int, 59 | is_glu: bool, 60 | quantization_bit_width: Optional[int], 61 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 62 | if original_tp >= target_tp: 63 | if is_glu: 64 | if original_tp > target_tp: 65 | num_part = original_tp // target_tp 66 | assert len(sd_list) == num_part 67 | part1, part2 = [], [] 68 | for i in range(len(sd_list)): 69 | chunks = torch.chunk(sd_list[i][key], 2, dim=cat_dim) 70 | part1.append(chunks[0]) 71 | part2.append(chunks[1]) 72 | merged_sd = torch.cat(part1 + part2, dim=cat_dim) 73 | else: 74 | merged_sd = sd_list[0][key] 75 | else: 76 | merged_sd = torch.cat([sd[key] for sd in sd_list], dim=cat_dim) 77 | else: 78 | assert len(sd_list) == 1 79 | num_part = target_tp // original_tp 80 | if is_glu: 81 | offset = tp_index % num_part 82 | chunks = torch.chunk(sd_list[0][key], num_part * 2, dim=cat_dim) 83 | merged_sd = torch.cat([chunks[offset], chunks[num_part + offset]], dim=cat_dim) 84 | else: 85 | # 
without clone, torch will save entire tensor 86 | merged_sd = torch.chunk(sd_list[0][key], num_part, dim=cat_dim)[tp_index % num_part].clone() 87 | 88 | if quantization_bit_width is not None: 89 | from kernels import compress_int4_weight 90 | 91 | weight = merged_sd.cuda() 92 | weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (quantization_bit_width - 1)) - 1)).half() 93 | weight = torch.round(weight / weight_scale[:, None]).to(torch.int8) 94 | if quantization_bit_width == 4: 95 | weight = compress_int4_weight(weight) 96 | return weight.cpu(), weight_scale.cpu() 97 | 98 | return merged_sd 99 | 100 | 101 | def create_checkpoint( 102 | sd_list: List[Dict], tp_index: int, original_tp: int, target_tp: int, quantization_bit_width: Optional[int] 103 | ) -> Dict: 104 | new_sd = {} 105 | for key in sd_list[0].keys(): 106 | name = ".".join(key.split(".")[3 if key.startswith("transformer.layers") else 1 :]) 107 | if name in SEQUENTIAL_LAYERS: 108 | new_sd[key] = sd_list[0][key] 109 | else: 110 | new_sd[key] = merge_weights( 111 | key, 112 | sd_list, 113 | tp_index=tp_index, 114 | original_tp=original_tp, 115 | target_tp=target_tp, 116 | cat_dim=LAYER_CONCAT_DIM.get(name, 0), 117 | is_glu=name in GLU_LAYERS, 118 | quantization_bit_width=quantization_bit_width if name in QUANTIZED_LAYERS else None, 119 | ) 120 | if quantization_bit_width is not None and name in QUANTIZED_LAYERS: 121 | new_sd[key], new_sd[f"{key}_scale"] = new_sd[key] 122 | new_sd = {"module": new_sd} 123 | return new_sd 124 | 125 | 126 | def main(args): 127 | iteration = open(os.path.join(args.input_folder, "latest"), "r").read() 128 | original_tp = len(glob.glob(os.path.join(args.input_folder, iteration, "mp_rank_*_model_states.pt"))) 129 | print(f"Iteration {iteration} from {args.input_folder} to {args.output_folder}") 130 | os.makedirs(args.output_folder, exist_ok=True) 131 | with open(os.path.join(args.output_folder, "latest"), "w") as file: 132 | file.write(str(iteration)) 133 | os.makedirs(os.path.join(args.output_folder, iteration), exist_ok=True) 134 | 135 | for i in range(0, args.target_tp): 136 | save_path = os.path.join(args.output_folder, iteration, f"mp_rank_{i:02}_model_states.pt") 137 | print(f"Processing {save_path}") 138 | num_parts = original_tp // args.target_tp 139 | sd_list = [ 140 | torch.load( 141 | os.path.join(args.input_folder, iteration, f"mp_rank_{j:02}_model_states.pt"), map_location="cpu" 142 | )["module"] 143 | for j in ( 144 | range(i * num_parts, (i + 1) * num_parts) 145 | if args.target_tp <= original_tp 146 | else [i // (args.target_tp // original_tp)] 147 | ) 148 | ] 149 | torch.save(create_checkpoint(sd_list, i, original_tp, args.target_tp, args.quantization_bit_width), save_path) 150 | 151 | 152 | if __name__ == "__main__": 153 | args = parse_arguments() 154 | main(args) 155 | -------------------------------------------------------------------------------- /tools/tokenize_pile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tqdm 3 | from icetk import icetk 4 | from multiprocessing import Pool 5 | 6 | DATA_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val.jsonl" 7 | OUTPUT_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val_tokenized.jsonl" 8 | 9 | 10 | def get_data(line): 11 | item = json.loads(line) 12 | item["text_pretokenized"] = item["text"] 13 | item["text"] = icetk.encode(item["text_pretokenized"]) 14 | return json.dumps(item) + "\n" 15 | 16 | 17 | with open(DATA_PATH, "r") as file: 18 | data = file.readlines() 19 | 20 | 
with Pool(16) as p: 21 | result = list(tqdm.tqdm(p.imap(get_data, data), total=len(data))) 22 | 23 | with open(OUTPUT_PATH, "w") as file: 24 | file.writelines(result) 25 | --------------------------------------------------------------------------------
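When --quantization-bit-width is passed to tools/convert_tp.py above, merge_weights stores each quantized weight matrix together with one scale per output row, computed by absmax scaling. The sketch below is an illustrative round-trip of that per-row int8 scheme only, under simplifying assumptions: it keeps the scales in float32 rather than casting them to half precision, runs on CPU, and omits the int4 packing done by kernels.compress_int4_weight; the function names quantize_rows and dequantize_rows are not part of the repository.

import torch


def quantize_rows(weight: torch.Tensor, bit_width: int = 8):
    # One scale per output row, chosen so the largest |value| maps to the integer limit.
    scale = weight.abs().max(dim=-1).values / (2 ** (bit_width - 1) - 1)
    quantized = torch.round(weight / scale[:, None]).to(torch.int8)
    return quantized, scale


def dequantize_rows(quantized: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Reconstruct an approximation of the original weights.
    return quantized.to(scale.dtype) * scale[:, None]


weight = torch.randn(4, 8)
quantized, scale = quantize_rows(weight, bit_width=8)
error = (weight - dequantize_rows(quantized, scale)).abs().max().item()
print(f"max reconstruction error: {error:.4f}")  # bounded by half a quantization step per row

In the repository this code path is reached through convert_tp.py's --quantization-bit-width flag (4 or 8), alongside --input-folder, --output-folder and --target-tp.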