├── .gitignore ├── LICENSE ├── README.md ├── chp10 ├── README.md ├── autogpt.ipynb ├── crawl_prompt.py ├── gradio_demo.ipynb ├── langchain_demo.ipynb └── llamacpp.ipynb ├── chp11 └── elo.py ├── chp2 ├── fmm_word_seg.py ├── lexicon.txt └── svd.py ├── chp3 ├── convert_t2s.py ├── t2s.json └── wikidata_cleaning.py ├── chp4 ├── cnn_sent_polarity.py ├── lstm_postag.py ├── lstm_sent_polarity.py ├── mlp.py ├── mlp_embedding.py ├── mlp_sent_polarity.py ├── mlp_train.py ├── transformer │ └── model.py ├── transformer_postag.py ├── transformer_sent_polarity.py ├── utils.py └── vocab.py ├── chp5 ├── ffnnlm.py ├── ngram-lm.py ├── rnnlm.py ├── tflm │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── dataset.cpython-38.pyc │ │ ├── model.cpython-38.pyc │ │ ├── sample.cpython-38.pyc │ │ └── train.cpython-38.pyc │ ├── dataset.py │ ├── model.py │ ├── sample.py │ └── train.py ├── utils.py └── vocab.py ├── chp6 ├── cbow.py ├── evaluate.py ├── glove.py ├── sgns.py ├── skipgram.py ├── train_elmo.py ├── utils.py └── vocab.py ├── chp7 ├── README.md ├── finetune_bert_mrc.py ├── finetune_bert_ner.py ├── finetune_bert_spc.py ├── finetune_bert_ssc.py ├── finetune_gpt2_tg.py └── finetune_t5_mt.py ├── chp9 ├── README.md ├── chinese_sp.model ├── merge_tokenizers.py ├── t4tiny.json ├── textbrewer_example.py └── textpruner_example.py └── slides ├── 01-绪论.pptx ├── 02-自然语言处理基础.pptx ├── 03-基础工具集与常用数据集.pptx ├── 04-神经网络基础.pptx ├── 05-语言模型.pptx ├── 06-预训练词向量.pptx ├── 07-预训练语言模型.pptx ├── 08-大语言模型的预训练.pptx ├── 09-大语言模型的适配.pptx ├── 10-大语言模型的应用.pptx ├── 11-大语言模型的能力评估.pptx ├── 12-预训练语言模型的延伸.pptx └── 13-DeepSeek.pptx /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llm-nlp-book 2 | 3 | 本仓库用于存放《[自然语言处理:基于大语言模型的方法](https://item.jd.com/14395393.html)》(作者:车万翔、郭江、崔一鸣)一书各章节的示例代码。 4 | -------------------------------------------------------------------------------- /chp10/README.md: -------------------------------------------------------------------------------- 1 | # 第10章:大语言模型的应用 2 | 3 | 本章节涉及的工具包相比撰写书籍时,均发生了较大更新。请参考以下notebook使用相关工具。 4 | 5 | ### 10.2 生成指令数据 6 | 7 | ``` 8 | python crawl_prompt.py output_file.json 9 | ``` 10 | 11 | ### 10.3.1 llama.cpp 12 | 13 | 参考`llamacpp.ipynb`。 14 | 15 | ### 10.3.2 transformers搭建Gradio demo 16 | 17 | 参考`gradio_demo.ipynb`。 18 | 19 | ### 10.4.1 LangChain 20 | 21 | 参考`langchain_demo.ipynb`。 22 | 23 | ### 10.5.1 AutoGPT 24 | 25 | 参考`autogpt.ipynb`。 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /chp10/crawl_prompt.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import sys 3 | import random 4 | 5 | openai.api_key = "" # you must provide your OpenAI API key before crawling 6 | if not openai.api_key: 7 | raise ValueError("OpenAI API key not provided. Please set the 'openai.api_key' variable.") 8 | 9 | def return_random_prompt(): 10 | system_prompt = "你需要尽可能给出多样化的任务指令和对应的回答。我们将用于人工评估ChatGPT模型对指令的完成情况。要求:\n" 11 | 12 | # generate random topics 13 | topic_list = ["科技", "娱乐", "体育", "金融", "时政", "教育", "医疗", "旅游", "美食", "汽车", "房产", "文化", "历史", "地理", "自然", "人文", "社会", "法律", "军事", "政治", "经济", "文学", "艺术", "宗教", "哲学", "语言", "数学", "物理", "化学", "生物", "地球科学", "天文学", "计算机科学", "工程", "建筑", "设计", "音乐", "舞蹈", "电影", "电视", "动漫", "游戏", "健康", "美容", "时尚", "家居", "家电", "家具", "家装", "母婴", "育儿", "职场", "工作", "生活", "养生", "心理", "情感", "人际", "社交", "交友", "恋爱", "婚姻", "家庭", "亲子", "宠物", "动物", "植物", "食品", "饮料", "餐饮", "酒店", "购物", "消费", "理财", "税务", "法规", "法院", "司法", "刑事", "民事", "行政", "战争"] 14 | system_prompt += "1. 主题多样化,涵盖各个领域,例如:" + "、".join(random.sample(topic_list, 10)) + "等。\n" 15 | 16 | # generate random tasks 17 | task_list = ["开放式生成", "分类", "问答", "编辑", "摘要", "写作", "翻译", "写代码", "分析", "代码解析", "常识推理", "写信", "抽取", "推荐"] 18 | system_prompt += "2. 表述多样化,结合真实问题;指令类型多样化,例如:" + "、".join(random.sample(task_list, 10)) + "等。\n" 19 | 20 | # other requirements 21 | system_prompt += "3. 如果遇到无法处理的指令(只靠文本无法回答),给出无法处理的回复。\n" 22 | system_prompt += "4. 除非特别要求,请使用中文,指令可以是命令句、疑问句、或其他合适的类型。\n" 23 | system_prompt += "5. 为指令生成一个适当且涉及真实情况的<input>,不应该只包含简单的占位符。<input>应提供实质性的内容,具有挑战性。字数不超过" + str(random.randint(80, 120)) + "字。\n" 24 | system_prompt += "6. <output>
应该是对指令的适当且真实的回应,不能只回复答应或拒绝请求。如果需要额外信息才能回复时,请努力预测用户意图并尝试回复。<output>的内容应少于" + str(random.randint(128, 512)) + "字。\n\n" 25 | 26 | system_prompt += "请给出满足条件的20条JSON格式数据:\n" 27 | 28 | return system_prompt 29 | 30 | 31 | if __name__ == "__main__": 32 | if len(sys.argv) != 2: 33 | print("Usage: python crawl_prompt.py <output_file>") 34 | exit(1) 35 | 36 | output_file = open(sys.argv[1], 'w') 37 | 38 | MAX_EPOCHS = 1 # number of data to generate (each prompt contains 20 JSON-formatted data) 39 | for k in range(MAX_EPOCHS): 40 | response = openai.ChatCompletion.create( 41 | model="gpt-3.5-turbo", # here we use `gpt-3.5-turbo` model, while Stanford-Alpaca uses `text-davinci-003` 42 | messages=[ 43 | {"role": "user", "content": return_random_prompt()}, 44 | ] 45 | ) 46 | output_file.write(response["choices"][0]["message"]["content"] + '\n') 47 | output_file.close() 48 | -------------------------------------------------------------------------------- /chp10/gradio_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "colab": { 8 | "base_uri": "https://localhost:8080/" 9 | }, 10 | "id": "CeN_Rw_kYubO", 11 | "outputId": "c0cef1a9-34db-4722-ee49-26ec119b296e" 12 | }, 13 | "outputs": [ 14 | { 15 | "name": "stdout", 16 | "output_type": "stream", 17 | "text": [ 18 | "Collecting transformers\n", 19 | " Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)\n", 20 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m63.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 21 | "\u001b[?25hCollecting gradio\n", 22 | " Downloading gradio-3.44.3-py3-none-any.whl (20.2 MB)\n", 23 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.2/20.2 MB\u001b[0m \u001b[31m85.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 24 | "\u001b[?25hCollecting bitsandbytes\n", 25 | " Downloading bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)\n", 26 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.6/92.6 MB\u001b[0m \u001b[31m19.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 27 | "\u001b[?25hCollecting sentencepiece\n", 28 | " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", 29 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m74.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 30 | "\u001b[?25hCollecting accelerate\n", 31 | " Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)\n", 32 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m258.1/258.1 kB\u001b[0m \u001b[31m28.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 33 | "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", 34 | "Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)\n", 35 | " Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)\n", 36 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.8/294.8 kB\u001b[0m \u001b[31m29.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 37 | "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", 38 | "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", 39 |
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", 40 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", 41 | "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", 42 | "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n", 43 | " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", 44 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m110.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 45 | "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n", 46 | " Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", 47 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m75.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 48 | "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", 49 | "Collecting aiofiles<24.0,>=22.0 (from gradio)\n", 50 | " Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n", 51 | "Requirement already satisfied: altair<6.0,>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (4.2.2)\n", 52 | "Collecting fastapi (from gradio)\n", 53 | " Downloading fastapi-0.103.1-py3-none-any.whl (66 kB)\n", 54 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.2/66.2 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 55 | "\u001b[?25hCollecting ffmpy (from gradio)\n", 56 | " Downloading ffmpy-0.3.1.tar.gz (5.5 kB)\n", 57 | " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 58 | "Collecting gradio-client==0.5.0 (from gradio)\n", 59 | " Downloading gradio_client-0.5.0-py3-none-any.whl (298 kB)\n", 60 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.2/298.2 kB\u001b[0m \u001b[31m32.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 61 | "\u001b[?25hCollecting httpx (from gradio)\n", 62 | " Downloading httpx-0.25.0-py3-none-any.whl (75 kB)\n", 63 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.7/75.7 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 64 | "\u001b[?25hRequirement already satisfied: importlib-resources<7.0,>=1.3 in /usr/local/lib/python3.10/dist-packages (from gradio) (6.0.1)\n", 65 | "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.1.2)\n", 66 | "Requirement already satisfied: markupsafe~=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (2.1.3)\n", 67 | "Requirement already satisfied: matplotlib~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (3.7.1)\n", 68 | "Collecting orjson~=3.0 (from gradio)\n", 69 | " Downloading orjson-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)\n", 70 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.7/138.7 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 71 | "\u001b[?25hRequirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (1.5.3)\n", 72 | "Requirement already satisfied: pillow<11.0,>=8.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (9.4.0)\n", 73 | "Requirement already satisfied: pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.10/dist-packages (from gradio) (1.10.12)\n", 74 | "Collecting pydub (from gradio)\n", 75 | " Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n", 76 | "Collecting python-multipart (from gradio)\n", 77 | " Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n", 78 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 79 | "\u001b[?25hCollecting semantic-version~=2.0 (from gradio)\n", 80 | " Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n", 81 | "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.10/dist-packages (from gradio) (4.5.0)\n", 82 | "Collecting uvicorn>=0.14.0 (from gradio)\n", 83 | " Downloading uvicorn-0.23.2-py3-none-any.whl (59 kB)\n", 84 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.5/59.5 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 85 | "\u001b[?25hCollecting websockets<12.0,>=10.0 (from gradio)\n", 86 | " Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n", 87 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m15.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 88 | "\u001b[?25hRequirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from gradio-client==0.5.0->gradio) (2023.6.0)\n", 89 | "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.5)\n", 90 | "Requirement already satisfied: torch>=1.10.0 in 
/usr/local/lib/python3.10/dist-packages (from accelerate) (2.0.1+cu118)\n", 91 | "Requirement already satisfied: entrypoints in /usr/local/lib/python3.10/dist-packages (from altair<6.0,>=4.2.0->gradio) (0.4)\n", 92 | "Requirement already satisfied: jsonschema>=3.0 in /usr/local/lib/python3.10/dist-packages (from altair<6.0,>=4.2.0->gradio) (4.19.0)\n", 93 | "Requirement already satisfied: toolz in /usr/local/lib/python3.10/dist-packages (from altair<6.0,>=4.2.0->gradio) (0.12.0)\n", 94 | "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (1.1.0)\n", 95 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (0.11.0)\n", 96 | "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (4.42.1)\n", 97 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (1.4.5)\n", 98 | "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (3.1.1)\n", 99 | "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib~=3.0->gradio) (2.8.2)\n", 100 | "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio) (2023.3.post1)\n", 101 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.2.0)\n", 102 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", 103 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.4)\n", 104 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n", 105 | "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.12)\n", 106 | "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1)\n", 107 | "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.0.0)\n", 108 | "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.10.0->accelerate) (3.27.4.1)\n", 109 | "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.10.0->accelerate) (16.0.6)\n", 110 | "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.10/dist-packages (from uvicorn>=0.14.0->gradio) (8.1.7)\n", 111 | "Collecting h11>=0.8 (from uvicorn>=0.14.0->gradio)\n", 112 | " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", 113 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 114 | "\u001b[?25hRequirement already satisfied: anyio<4.0.0,>=3.7.1 in /usr/local/lib/python3.10/dist-packages (from fastapi->gradio) (3.7.1)\n", 115 | "Collecting starlette<0.28.0,>=0.27.0 (from fastapi->gradio)\n", 116 | " Downloading starlette-0.27.0-py3-none-any.whl (66 kB)\n", 117 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m 
\u001b[32m67.0/67.0 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 118 | "\u001b[?25hCollecting httpcore<0.19.0,>=0.18.0 (from httpx->gradio)\n", 119 | " Downloading httpcore-0.18.0-py3-none-any.whl (76 kB)\n", 120 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.0/76.0 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 121 | "\u001b[?25hRequirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx->gradio) (1.3.0)\n", 122 | "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4.0.0,>=3.7.1->fastapi->gradio) (1.1.3)\n", 123 | "Requirement already satisfied: attrs>=22.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (23.1.0)\n", 124 | "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (2023.7.1)\n", 125 | "Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (0.30.2)\n", 126 | "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.10/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (0.10.2)\n", 127 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib~=3.0->gradio) (1.16.0)\n", 128 | "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n", 129 | "Building wheels for collected packages: ffmpy\n", 130 | " Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 131 | " Created wheel for ffmpy: filename=ffmpy-0.3.1-py3-none-any.whl size=5579 sha256=9e3a9c55e1c5d4d3be49a7fcbdb38b833b3855c9f11b00db7c3c4723f6635c99\n", 132 | " Stored in directory: /root/.cache/pip/wheels/01/a6/d1/1c0828c304a4283b2c1639a09ad86f83d7c487ef34c6b4a1bf\n", 133 | "Successfully built ffmpy\n", 134 | "Installing collected packages: tokenizers, sentencepiece, safetensors, pydub, ffmpy, bitsandbytes, websockets, semantic-version, python-multipart, orjson, h11, aiofiles, uvicorn, starlette, huggingface-hub, httpcore, transformers, httpx, fastapi, gradio-client, gradio, accelerate\n", 135 | "Successfully installed accelerate-0.23.0 aiofiles-23.2.1 bitsandbytes-0.41.1 fastapi-0.103.1 ffmpy-0.3.1 gradio-3.44.3 gradio-client-0.5.0 h11-0.14.0 httpcore-0.18.0 httpx-0.25.0 huggingface-hub-0.17.1 orjson-3.9.7 pydub-0.25.1 python-multipart-0.0.6 safetensors-0.3.3 semantic-version-2.10.0 sentencepiece-0.1.99 starlette-0.27.0 tokenizers-0.13.3 transformers-4.33.1 uvicorn-0.23.2 websockets-11.0.3\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "!pip install transformers gradio bitsandbytes sentencepiece accelerate" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "id": "OOn8sD-nZB1d" 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "!pip install hf_transfer\n", 152 | "!HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir-use-symlinks False \\\n", 153 | "--local-dir chinese-alpaca-2-7b hfl/chinese-alpaca-2-7b --exclude *.pth" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "Ia-cIw5aZGrm" 160 | }, 161 | "source": [ 162 | "### import" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | 
"execution_count": null, 168 | "metadata": { 169 | "id": "6jEts7uVY5st" 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "import gradio as gr\n", 174 | "import torch\n", 175 | "from transformers import LlamaForCausalLM, LlamaTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer\n", 176 | "from threading import Thread\n", 177 | "import os\n", 178 | "\n", 179 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = '0'" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "id": "FeiPN2DqZIyZ" 186 | }, 187 | "source": [ 188 | "### load model" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "colab": { 196 | "base_uri": "https://localhost:8080/", 197 | "height": 49, 198 | "referenced_widgets": [ 199 | "801997e1120c404bb02ce2bb4e392af6", 200 | "7508a218d0a14c339204eef1fef4abfb", 201 | "55314e86804848cf95be4ea833396837", 202 | "793d13e7ad2143d3aa1e2c88ed805724", 203 | "1d14c06992044818b25a4ac3c0b2d5e2", 204 | "7f685d3b3e5b4dd792b2ddcba0f7b4dc", 205 | "3e71c4fe82434351a3a862f67573a885", 206 | "ea79f68a7b4043f6a009f5a7b9724046", 207 | "530748bd1cf5473583fcaacb3a5eaa23", 208 | "92b37b92101544339b072b09321df02e", 209 | "8ea6194612a041c5abd10815c27d5eeb" 210 | ] 211 | }, 212 | "id": "cDDJOwgdZYYF", 213 | "outputId": "db7ef108-7bc9-40d3-d77b-35449f9cc081" 214 | }, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "application/vnd.jupyter.widget-view+json": { 219 | "model_id": "801997e1120c404bb02ce2bb4e392af6", 220 | "version_major": 2, 221 | "version_minor": 0 222 | }, 223 | "text/plain": [ 224 | "Loading checkpoint shards: 0%| | 0/2 [00:00>\\n\"\n", 253 | " \"{system_prompt}\\n\"\n", 254 | " \"<>\\n\\n\"\n", 255 | " \"{instruction} [/INST]\"\n", 256 | ")\n", 257 | "TEMPLATE_WITHOUT_SYSTEM_PROMPT = \"[INST] {instruction} [/INST]\"\n", 258 | "\n", 259 | "def generate_prompt(instruction, response=\"\", with_system_prompt=True, system_prompt=DEFAULT_SYSTEM_PROMPT):\n", 260 | " if with_system_prompt is True:\n", 261 | " prompt = TEMPLATE_WITH_SYSTEM_PROMPT.format_map({'instruction': instruction,'system_prompt': system_prompt})\n", 262 | " else:\n", 263 | " prompt = TEMPLATE_WITHOUT_SYSTEM_PROMPT.format_map({'instruction': instruction})\n", 264 | " if len(response)>0:\n", 265 | " prompt += \" \" + response\n", 266 | " return prompt\n", 267 | "\n", 268 | "class StopOnTokens(StoppingCriteria):\n", 269 | " def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:\n", 270 | " stop_ids = [29, 0]\n", 271 | " for stop_id in stop_ids:\n", 272 | " if input_ids[0][-1] == stop_id:\n", 273 | " return True\n", 274 | " return False\n", 275 | "\n", 276 | "class Stream(StoppingCriteria):\n", 277 | " def __init__(self, callback_func=None):\n", 278 | " self.callback_func = callback_func\n", 279 | "\n", 280 | " def __call__(self, input_ids, scores) -> bool:\n", 281 | " if self.callback_func is not None:\n", 282 | " self.callback_func(input_ids[0])\n", 283 | " return False" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": { 289 | "id": "UXY1l2HzZx6A" 290 | }, 291 | "source": [ 292 | "### predict" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": { 299 | "id": "PGd0qievaTUv" 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "# message: current user's input\n", 304 | "# history: a 2D-array with [[user1, sys1], [user2, sys2], ...]\n", 305 | "def predict(message, history):\n", 306 | " history_transformer_format = history 
+ [[message, \"\"]]\n", 307 | " stop = StopOnTokens()\n", 308 | "\n", 309 | " # first round conversation, we paste full system + input template\n", 310 | " if len(history) == 0:\n", 311 | " messages = generate_prompt(message, response=\"\", with_system_prompt=True, system_prompt=DEFAULT_SYSTEM_PROMPT)\n", 312 | " else:\n", 313 | " # handle the first input/response\n", 314 | " first_input = history[0][0]\n", 315 | " first_response = history[0][1]\n", 316 | " messages = generate_prompt(first_input, response=first_response, with_system_prompt=True, system_prompt=DEFAULT_SYSTEM_PROMPT)\n", 317 | "\n", 318 | " # handle the rest\n", 319 | " for hist in history[1:]:\n", 320 | " cur_input = hist[0]\n", 321 | " cur_response = hist[1]\n", 322 | " cur_prompt = generate_prompt(cur_input, response=cur_response, with_system_prompt=False)\n", 323 | " messages = messages + cur_prompt\n", 324 | "\n", 325 | " # handle the current\n", 326 | " messages = messages + generate_prompt(message, response=\"\", with_system_prompt=False)\n", 327 | "\n", 328 | " #messages = \"\".join([\"\".join([\"\\n:\"+item[0], \"\\n:\"+item[1]]) #curr_system_message +\n", 329 | " # for item in history_transformer_format])\n", 330 | "\n", 331 | " print(message)\n", 332 | " print(history)\n", 333 | " print(messages)\n", 334 | " print('----')\n", 335 | "\n", 336 | " model_inputs = tokenizer([messages], return_tensors=\"pt\").to(\"cuda\")\n", 337 | " streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)\n", 338 | " generate_kwargs = dict(\n", 339 | " model_inputs,\n", 340 | " streamer=streamer,\n", 341 | " max_new_tokens=512,\n", 342 | " do_sample=True,\n", 343 | " top_p=0.9,\n", 344 | " top_k=40,\n", 345 | " temperature=0.2,\n", 346 | " num_beams=1,\n", 347 | " stopping_criteria=StoppingCriteriaList([Stream(callback_func=None)])\n", 348 | " )\n", 349 | " # StoppingCriteriaList([stop]) #\n", 350 | " t = Thread(target=model.generate, kwargs=generate_kwargs)\n", 351 | " t.start()\n", 352 | "\n", 353 | " partial_message = \"\"\n", 354 | " for new_token in streamer:\n", 355 | " if new_token != '<':\n", 356 | " partial_message += new_token\n", 357 | " yield partial_message\n" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "colab": { 365 | "base_uri": "https://localhost:8080/" 366 | }, 367 | "id": "mc3KUa1IvwHB", 368 | "outputId": "97dd3b0b-8975-434e-ee84-d096111fbf0b" 369 | }, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/plain": [ 374 | "2" 375 | ] 376 | }, 377 | "execution_count": 6, 378 | "metadata": {}, 379 | "output_type": "execute_result" 380 | } 381 | ], 382 | "source": [ 383 | "tokenizer.eos_token_id" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": { 389 | "id": "G72Y2Ua9aRre" 390 | }, 391 | "source": [ 392 | "### launch" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "colab": { 400 | "base_uri": "https://localhost:8080/", 401 | "height": 1000 402 | }, 403 | "id": "CNIC8qPdaXLE", 404 | "outputId": "ee23402d-4a12-403e-d8a0-0573fd9957c1" 405 | }, 406 | "outputs": [ 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", 412 | "Running on public URL: https://6b66f533e663af200f.gradio.live\n", 413 | "\n", 414 | "This share link expires in 72 hours. 
For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" 415 | ] 416 | }, 417 | { 418 | "data": { 419 | "text/html": [ 420 | "
" 421 | ], 422 | "text/plain": [ 423 | "" 424 | ] 425 | }, 426 | "metadata": {}, 427 | "output_type": "display_data" 428 | }, 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "你好\n", 434 | "[]\n", 435 | "[INST] <>\n", 436 | "You are a helpful assistant. 你是一个乐于助人的助手。\n", 437 | "<>\n", 438 | "\n", 439 | "你好 [/INST]\n", 440 | "----\n", 441 | "请你帮我购物\n", 442 | "[['你好', '你好!很高兴见到你。我是 Assistant,一个大型语言模型,由 OpenAI 训练。有什么我可以帮助你的吗?']]\n", 443 | "[INST] <>\n", 444 | "You are a helpful assistant. 你是一个乐于助人的助手。\n", 445 | "<>\n", 446 | "\n", 447 | "你好 [/INST] 你好!很高兴见到你。我是 Assistant,一个大型语言模型,由 OpenAI 训练。有什么我可以帮助你的吗?[INST] 请你帮我购物 [/INST]\n", 448 | "----\n", 449 | "我要买最新款iphone\n", 450 | "[['你好', '你好!很高兴见到你。我是 Assistant,一个大型语言模型,由 OpenAI 训练。有什么我可以帮助你的吗?'], ['请你帮我购物', '当然可以!请告诉我你需要购买什么,我可以帮你搜索并提供购买选项。']]\n", 451 | "[INST] <>\n", 452 | "You are a helpful assistant. 你是一个乐于助人的助手。\n", 453 | "<>\n", 454 | "\n", 455 | "你好 [/INST] 你好!很高兴见到你。我是 Assistant,一个大型语言模型,由 OpenAI 训练。有什么我可以帮助你的吗?[INST] 请你帮我购物 [/INST] 当然可以!请告诉我你需要购买什么,我可以帮你搜索并提供购买选项。[INST] 我要买最新款iphone [/INST]\n", 456 | "----\n", 457 | "我需要在官网买iphone 15 pro max\n", 458 | "[['你好', '你好!很高兴见到你。我是 Assistant,一个大型语言模型,由 OpenAI 训练。有什么我可以帮助你的吗?'], ['请你帮我购物', '当然可以!请告诉我你需要购买什么,我可以帮你搜索并提供购买选项。'], ['我要买最新款iphone', '好的,最新款的 iPhone 是 iPhone 13。以下是购买 iPhone 13 的选项:\\n\\n1. 在苹果官网上购买:您可以在苹果官网上购买 iPhone 13,选择您喜欢的颜色和存储容量。\\n\\n2. 在运营商处购买:您可以在运营商处购买 iPhone 13,例如 AT&T、Verizon、T-Mobile 或 Sprint。这些运营商通常会提供一些优惠和折扣。\\n\\n3. 在第三方零售商处购买:您可以在第三方零售商处购买 iPhone 13,例如 Best Buy、Amazon 或 Walmart。这些零售商通常会提供一些优惠和折扣。\\n\\n请告诉我您更倾向于哪种购买方式,我可以为您提供更多信息。']]\n", 459 | "[INST] <>\n", 460 | "You are a helpful assistant. 你是一个乐于助人的助手。\n", 461 | "<>\n", 462 | "\n", 463 | "你好 [/INST] 你好!很高兴见到你。我是 Assistant,一个大型语言模型,由 OpenAI 训练。有什么我可以帮助你的吗?[INST] 请你帮我购物 [/INST] 当然可以!请告诉我你需要购买什么,我可以帮你搜索并提供购买选项。[INST] 我要买最新款iphone [/INST] 好的,最新款的 iPhone 是 iPhone 13。以下是购买 iPhone 13 的选项:\n", 464 | "\n", 465 | "1. 在苹果官网上购买:您可以在苹果官网上购买 iPhone 13,选择您喜欢的颜色和存储容量。\n", 466 | "\n", 467 | "2. 在运营商处购买:您可以在运营商处购买 iPhone 13,例如 AT&T、Verizon、T-Mobile 或 Sprint。这些运营商通常会提供一些优惠和折扣。\n", 468 | "\n", 469 | "3. 在第三方零售商处购买:您可以在第三方零售商处购买 iPhone 13,例如 Best Buy、Amazon 或 Walmart。这些零售商通常会提供一些优惠和折扣。\n", 470 | "\n", 471 | "请告诉我您更倾向于哪种购买方式,我可以为您提供更多信息。[INST] 我需要在官网买iphone 15 pro max [/INST]\n", 472 | "----\n", 473 | "官网\n", 474 | "[['你好', '你好!很高兴见到你。我是 Assistant,一个大型语言模型,由 OpenAI 训练。有什么我可以帮助你的吗?'], ['请你帮我购物', '当然可以!请告诉我你需要购买什么,我可以帮你搜索并提供购买选项。'], ['我要买最新款iphone', '好的,最新款的 iPhone 是 iPhone 13。以下是购买 iPhone 13 的选项:\\n\\n1. 在苹果官网上购买:您可以在苹果官网上购买 iPhone 13,选择您喜欢的颜色和存储容量。\\n\\n2. 在运营商处购买:您可以在运营商处购买 iPhone 13,例如 AT&T、Verizon、T-Mobile 或 Sprint。这些运营商通常会提供一些优惠和折扣。\\n\\n3. 在第三方零售商处购买:您可以在第三方零售商处购买 iPhone 13,例如 Best Buy、Amazon 或 Walmart。这些零售商通常会提供一些优惠和折扣。\\n\\n请告诉我您更倾向于哪种购买方式,我可以为您提供更多信息。'], ['我需要在官网买iphone 15 pro max', '好的,您可以在苹果官网上购买 iPhone 15 Pro Max。以下是购买 iPhone 15 Pro Max 的选项:\\n\\n1. 在苹果官网上购买:您可以在苹果官网上购买 iPhone 15 Pro Max,选择您喜欢的颜色和存储容量。\\n\\n2. 在运营商处购买:您可以在运营商处购买 iPhone 15 Pro Max,例如 AT&T、Verizon、T-Mobile 或 Sprint。这些运营商通常会提供一些优惠和折扣。\\n\\n3. 在第三方零售商处购买:您可以在第三方零售商处购买 iPhone 15 Pro Max,例如 Best Buy、Amazon 或 Walmart。这些零售商通常会提供一些优惠和折扣。\\n\\n请告诉我您更倾向于哪种购买方式,我可以为您提供更多信息。']]\n", 475 | "[INST] <>\n", 476 | "You are a helpful assistant. 你是一个乐于助人的助手。\n", 477 | "<>\n", 478 | "\n", 479 | "你好 [/INST] 你好!很高兴见到你。我是 Assistant,一个大型语言模型,由 OpenAI 训练。有什么我可以帮助你的吗?[INST] 请你帮我购物 [/INST] 当然可以!请告诉我你需要购买什么,我可以帮你搜索并提供购买选项。[INST] 我要买最新款iphone [/INST] 好的,最新款的 iPhone 是 iPhone 13。以下是购买 iPhone 13 的选项:\n", 480 | "\n", 481 | "1. 
在苹果官网上购买:您可以在苹果官网上购买 iPhone 13,选择您喜欢的颜色和存储容量。\n", 482 | "\n", 483 | "2. 在运营商处购买:您可以在运营商处购买 iPhone 13,例如 AT&T、Verizon、T-Mobile 或 Sprint。这些运营商通常会提供一些优惠和折扣。\n", 484 | "\n", 485 | "3. 在第三方零售商处购买:您可以在第三方零售商处购买 iPhone 13,例如 Best Buy、Amazon 或 Walmart。这些零售商通常会提供一些优惠和折扣。\n", 486 | "\n", 487 | "请告诉我您更倾向于哪种购买方式,我可以为您提供更多信息。[INST] 我需要在官网买iphone 15 pro max [/INST] 好的,您可以在苹果官网上购买 iPhone 15 Pro Max。以下是购买 iPhone 15 Pro Max 的选项:\n", 488 | "\n", 489 | "1. 在苹果官网上购买:您可以在苹果官网上购买 iPhone 15 Pro Max,选择您喜欢的颜色和存储容量。\n", 490 | "\n", 491 | "2. 在运营商处购买:您可以在运营商处购买 iPhone 15 Pro Max,例如 AT&T、Verizon、T-Mobile 或 Sprint。这些运营商通常会提供一些优惠和折扣。\n", 492 | "\n", 493 | "3. 在第三方零售商处购买:您可以在第三方零售商处购买 iPhone 15 Pro Max,例如 Best Buy、Amazon 或 Walmart。这些零售商通常会提供一些优惠和折扣。\n", 494 | "\n", 495 | "请告诉我您更倾向于哪种购买方式,我可以为您提供更多信息。[INST] 官网 [/INST]\n", 496 | "----\n", 497 | "Keyboard interruption in main thread... closing server.\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "gr.ChatInterface(predict).queue().launch(share=True, debug=True)\n", 503 | "#gr.ChatInterface(predict).queue().launch(share=False, inbrowser=True, server_name='0.0.0.0', server_port=8765)" 504 | ] 505 | } 506 | ], 507 | "metadata": { 508 | "accelerator": "GPU", 509 | "colab": { 510 | "gpuType": "A100", 511 | "machine_shape": "hm", 512 | "provenance": [] 513 | }, 514 | "kernelspec": { 515 | "display_name": "Python 3", 516 | "name": "python3" 517 | }, 518 | "language_info": { 519 | "name": "python" 520 | }, 521 | "widgets": { 522 | "application/vnd.jupyter.widget-state+json": { 523 | "1d14c06992044818b25a4ac3c0b2d5e2": { 524 | "model_module": "@jupyter-widgets/base", 525 | "model_module_version": "1.2.0", 526 | "model_name": "LayoutModel", 527 | "state": { 528 | "_model_module": "@jupyter-widgets/base", 529 | "_model_module_version": "1.2.0", 530 | "_model_name": "LayoutModel", 531 | "_view_count": null, 532 | "_view_module": "@jupyter-widgets/base", 533 | "_view_module_version": "1.2.0", 534 | "_view_name": "LayoutView", 535 | "align_content": null, 536 | "align_items": null, 537 | "align_self": null, 538 | "border": null, 539 | "bottom": null, 540 | "display": null, 541 | "flex": null, 542 | "flex_flow": null, 543 | "grid_area": null, 544 | "grid_auto_columns": null, 545 | "grid_auto_flow": null, 546 | "grid_auto_rows": null, 547 | "grid_column": null, 548 | "grid_gap": null, 549 | "grid_row": null, 550 | "grid_template_areas": null, 551 | "grid_template_columns": null, 552 | "grid_template_rows": null, 553 | "height": null, 554 | "justify_content": null, 555 | "justify_items": null, 556 | "left": null, 557 | "margin": null, 558 | "max_height": null, 559 | "max_width": null, 560 | "min_height": null, 561 | "min_width": null, 562 | "object_fit": null, 563 | "object_position": null, 564 | "order": null, 565 | "overflow": null, 566 | "overflow_x": null, 567 | "overflow_y": null, 568 | "padding": null, 569 | "right": null, 570 | "top": null, 571 | "visibility": null, 572 | "width": null 573 | } 574 | }, 575 | "3e71c4fe82434351a3a862f67573a885": { 576 | "model_module": "@jupyter-widgets/controls", 577 | "model_module_version": "1.5.0", 578 | "model_name": "DescriptionStyleModel", 579 | "state": { 580 | "_model_module": "@jupyter-widgets/controls", 581 | "_model_module_version": "1.5.0", 582 | "_model_name": "DescriptionStyleModel", 583 | "_view_count": null, 584 | "_view_module": "@jupyter-widgets/base", 585 | "_view_module_version": "1.2.0", 586 | "_view_name": "StyleView", 587 | "description_width": "" 588 | } 589 | }, 590 | "530748bd1cf5473583fcaacb3a5eaa23": { 591 | 
"model_module": "@jupyter-widgets/controls", 592 | "model_module_version": "1.5.0", 593 | "model_name": "ProgressStyleModel", 594 | "state": { 595 | "_model_module": "@jupyter-widgets/controls", 596 | "_model_module_version": "1.5.0", 597 | "_model_name": "ProgressStyleModel", 598 | "_view_count": null, 599 | "_view_module": "@jupyter-widgets/base", 600 | "_view_module_version": "1.2.0", 601 | "_view_name": "StyleView", 602 | "bar_color": null, 603 | "description_width": "" 604 | } 605 | }, 606 | "55314e86804848cf95be4ea833396837": { 607 | "model_module": "@jupyter-widgets/controls", 608 | "model_module_version": "1.5.0", 609 | "model_name": "FloatProgressModel", 610 | "state": { 611 | "_dom_classes": [], 612 | "_model_module": "@jupyter-widgets/controls", 613 | "_model_module_version": "1.5.0", 614 | "_model_name": "FloatProgressModel", 615 | "_view_count": null, 616 | "_view_module": "@jupyter-widgets/controls", 617 | "_view_module_version": "1.5.0", 618 | "_view_name": "ProgressView", 619 | "bar_style": "success", 620 | "description": "", 621 | "description_tooltip": null, 622 | "layout": "IPY_MODEL_ea79f68a7b4043f6a009f5a7b9724046", 623 | "max": 2, 624 | "min": 0, 625 | "orientation": "horizontal", 626 | "style": "IPY_MODEL_530748bd1cf5473583fcaacb3a5eaa23", 627 | "value": 2 628 | } 629 | }, 630 | "7508a218d0a14c339204eef1fef4abfb": { 631 | "model_module": "@jupyter-widgets/controls", 632 | "model_module_version": "1.5.0", 633 | "model_name": "HTMLModel", 634 | "state": { 635 | "_dom_classes": [], 636 | "_model_module": "@jupyter-widgets/controls", 637 | "_model_module_version": "1.5.0", 638 | "_model_name": "HTMLModel", 639 | "_view_count": null, 640 | "_view_module": "@jupyter-widgets/controls", 641 | "_view_module_version": "1.5.0", 642 | "_view_name": "HTMLView", 643 | "description": "", 644 | "description_tooltip": null, 645 | "layout": "IPY_MODEL_7f685d3b3e5b4dd792b2ddcba0f7b4dc", 646 | "placeholder": "​", 647 | "style": "IPY_MODEL_3e71c4fe82434351a3a862f67573a885", 648 | "value": "Loading checkpoint shards: 100%" 649 | } 650 | }, 651 | "793d13e7ad2143d3aa1e2c88ed805724": { 652 | "model_module": "@jupyter-widgets/controls", 653 | "model_module_version": "1.5.0", 654 | "model_name": "HTMLModel", 655 | "state": { 656 | "_dom_classes": [], 657 | "_model_module": "@jupyter-widgets/controls", 658 | "_model_module_version": "1.5.0", 659 | "_model_name": "HTMLModel", 660 | "_view_count": null, 661 | "_view_module": "@jupyter-widgets/controls", 662 | "_view_module_version": "1.5.0", 663 | "_view_name": "HTMLView", 664 | "description": "", 665 | "description_tooltip": null, 666 | "layout": "IPY_MODEL_92b37b92101544339b072b09321df02e", 667 | "placeholder": "​", 668 | "style": "IPY_MODEL_8ea6194612a041c5abd10815c27d5eeb", 669 | "value": " 2/2 [00:12<00:00, 5.67s/it]" 670 | } 671 | }, 672 | "7f685d3b3e5b4dd792b2ddcba0f7b4dc": { 673 | "model_module": "@jupyter-widgets/base", 674 | "model_module_version": "1.2.0", 675 | "model_name": "LayoutModel", 676 | "state": { 677 | "_model_module": "@jupyter-widgets/base", 678 | "_model_module_version": "1.2.0", 679 | "_model_name": "LayoutModel", 680 | "_view_count": null, 681 | "_view_module": "@jupyter-widgets/base", 682 | "_view_module_version": "1.2.0", 683 | "_view_name": "LayoutView", 684 | "align_content": null, 685 | "align_items": null, 686 | "align_self": null, 687 | "border": null, 688 | "bottom": null, 689 | "display": null, 690 | "flex": null, 691 | "flex_flow": null, 692 | "grid_area": null, 693 | "grid_auto_columns": null, 694 | 
"grid_auto_flow": null, 695 | "grid_auto_rows": null, 696 | "grid_column": null, 697 | "grid_gap": null, 698 | "grid_row": null, 699 | "grid_template_areas": null, 700 | "grid_template_columns": null, 701 | "grid_template_rows": null, 702 | "height": null, 703 | "justify_content": null, 704 | "justify_items": null, 705 | "left": null, 706 | "margin": null, 707 | "max_height": null, 708 | "max_width": null, 709 | "min_height": null, 710 | "min_width": null, 711 | "object_fit": null, 712 | "object_position": null, 713 | "order": null, 714 | "overflow": null, 715 | "overflow_x": null, 716 | "overflow_y": null, 717 | "padding": null, 718 | "right": null, 719 | "top": null, 720 | "visibility": null, 721 | "width": null 722 | } 723 | }, 724 | "801997e1120c404bb02ce2bb4e392af6": { 725 | "model_module": "@jupyter-widgets/controls", 726 | "model_module_version": "1.5.0", 727 | "model_name": "HBoxModel", 728 | "state": { 729 | "_dom_classes": [], 730 | "_model_module": "@jupyter-widgets/controls", 731 | "_model_module_version": "1.5.0", 732 | "_model_name": "HBoxModel", 733 | "_view_count": null, 734 | "_view_module": "@jupyter-widgets/controls", 735 | "_view_module_version": "1.5.0", 736 | "_view_name": "HBoxView", 737 | "box_style": "", 738 | "children": [ 739 | "IPY_MODEL_7508a218d0a14c339204eef1fef4abfb", 740 | "IPY_MODEL_55314e86804848cf95be4ea833396837", 741 | "IPY_MODEL_793d13e7ad2143d3aa1e2c88ed805724" 742 | ], 743 | "layout": "IPY_MODEL_1d14c06992044818b25a4ac3c0b2d5e2" 744 | } 745 | }, 746 | "8ea6194612a041c5abd10815c27d5eeb": { 747 | "model_module": "@jupyter-widgets/controls", 748 | "model_module_version": "1.5.0", 749 | "model_name": "DescriptionStyleModel", 750 | "state": { 751 | "_model_module": "@jupyter-widgets/controls", 752 | "_model_module_version": "1.5.0", 753 | "_model_name": "DescriptionStyleModel", 754 | "_view_count": null, 755 | "_view_module": "@jupyter-widgets/base", 756 | "_view_module_version": "1.2.0", 757 | "_view_name": "StyleView", 758 | "description_width": "" 759 | } 760 | }, 761 | "92b37b92101544339b072b09321df02e": { 762 | "model_module": "@jupyter-widgets/base", 763 | "model_module_version": "1.2.0", 764 | "model_name": "LayoutModel", 765 | "state": { 766 | "_model_module": "@jupyter-widgets/base", 767 | "_model_module_version": "1.2.0", 768 | "_model_name": "LayoutModel", 769 | "_view_count": null, 770 | "_view_module": "@jupyter-widgets/base", 771 | "_view_module_version": "1.2.0", 772 | "_view_name": "LayoutView", 773 | "align_content": null, 774 | "align_items": null, 775 | "align_self": null, 776 | "border": null, 777 | "bottom": null, 778 | "display": null, 779 | "flex": null, 780 | "flex_flow": null, 781 | "grid_area": null, 782 | "grid_auto_columns": null, 783 | "grid_auto_flow": null, 784 | "grid_auto_rows": null, 785 | "grid_column": null, 786 | "grid_gap": null, 787 | "grid_row": null, 788 | "grid_template_areas": null, 789 | "grid_template_columns": null, 790 | "grid_template_rows": null, 791 | "height": null, 792 | "justify_content": null, 793 | "justify_items": null, 794 | "left": null, 795 | "margin": null, 796 | "max_height": null, 797 | "max_width": null, 798 | "min_height": null, 799 | "min_width": null, 800 | "object_fit": null, 801 | "object_position": null, 802 | "order": null, 803 | "overflow": null, 804 | "overflow_x": null, 805 | "overflow_y": null, 806 | "padding": null, 807 | "right": null, 808 | "top": null, 809 | "visibility": null, 810 | "width": null 811 | } 812 | }, 813 | "ea79f68a7b4043f6a009f5a7b9724046": { 814 | 
"model_module": "@jupyter-widgets/base", 815 | "model_module_version": "1.2.0", 816 | "model_name": "LayoutModel", 817 | "state": { 818 | "_model_module": "@jupyter-widgets/base", 819 | "_model_module_version": "1.2.0", 820 | "_model_name": "LayoutModel", 821 | "_view_count": null, 822 | "_view_module": "@jupyter-widgets/base", 823 | "_view_module_version": "1.2.0", 824 | "_view_name": "LayoutView", 825 | "align_content": null, 826 | "align_items": null, 827 | "align_self": null, 828 | "border": null, 829 | "bottom": null, 830 | "display": null, 831 | "flex": null, 832 | "flex_flow": null, 833 | "grid_area": null, 834 | "grid_auto_columns": null, 835 | "grid_auto_flow": null, 836 | "grid_auto_rows": null, 837 | "grid_column": null, 838 | "grid_gap": null, 839 | "grid_row": null, 840 | "grid_template_areas": null, 841 | "grid_template_columns": null, 842 | "grid_template_rows": null, 843 | "height": null, 844 | "justify_content": null, 845 | "justify_items": null, 846 | "left": null, 847 | "margin": null, 848 | "max_height": null, 849 | "max_width": null, 850 | "min_height": null, 851 | "min_width": null, 852 | "object_fit": null, 853 | "object_position": null, 854 | "order": null, 855 | "overflow": null, 856 | "overflow_x": null, 857 | "overflow_y": null, 858 | "padding": null, 859 | "right": null, 860 | "top": null, 861 | "visibility": null, 862 | "width": null 863 | } 864 | } 865 | } 866 | } 867 | }, 868 | "nbformat": 4, 869 | "nbformat_minor": 0 870 | } 871 | -------------------------------------------------------------------------------- /chp10/langchain_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "id": "mHWrhvqONBWd" 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "!pip install langchain transformers sentencepiece accelerate bitsandbytes" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "colab": { 19 | "base_uri": "https://localhost:8080/" 20 | }, 21 | "id": "rwIace6sOPsZ", 22 | "outputId": "28fc3579-2dea-40ac-9c1e-75a950dec8b5" 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stdout", 27 | "output_type": "stream", 28 | "text": [ 29 | "Cloning into 'Chinese-LLaMA-Alpaca-2'...\n", 30 | "remote: Enumerating objects: 1089, done.\u001b[K\n", 31 | "remote: Counting objects: 100% (452/452), done.\u001b[K\n", 32 | "remote: Compressing objects: 100% (120/120), done.\u001b[K\n", 33 | "remote: Total 1089 (delta 372), reused 340 (delta 332), pack-reused 637\u001b[K\n", 34 | "Receiving objects: 100% (1089/1089), 8.18 MiB | 28.67 MiB/s, done.\n", 35 | "Resolving deltas: 100% (675/675), done.\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "!git clone https://github.com/ymcui/Chinese-LLaMA-Alpaca-2.git" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "id": "ClQ6kPegOYjv" 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "!pip install hf_transfer\n", 52 | "!HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download --local-dir-use-symlinks False \\\n", 53 | "--local-dir chinese-alpaca-2-7b hfl/chinese-alpaca-2-7b --exclude *.pth" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "colab": { 61 | "base_uri": "https://localhost:8080/" 62 | }, 63 | "id": "LblR5fS0OYqE", 64 | "outputId": "ba11840b-b1a6-43f6-b986-75f68a4004cd" 65 | }, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": 
"stream", 70 | "text": [ 71 | "loading LLM...\n", 72 | "2023-09-13 08:10:31.313810: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", 73 | "/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n", 74 | " warnings.warn(\n", 75 | "Loading checkpoint shards: 100% 2/2 [00:07<00:00, 3.89s/it]\n", 76 | "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1417: UserWarning: You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation )\n", 77 | " warnings.warn(\n", 78 | "/usr/local/lib/python3.10/dist-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n", 79 | " warnings.warn(\n", 80 | "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1260: UserWarning: Using the model-agnostic default `max_length` (=20) to control thegeneration length. We recommend setting `max_new_tokens` to control the maximum length of the generation.\n", 81 | " warnings.warn(\n", 82 | "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py:1268: UserWarning: Input length of input_ids is 1888, but `max_length` is set to 20. This can lead to unexpected behavior. 
You should consider increasing `max_new_tokens`.\n", 83 | " warnings.warn(\n", 84 | " \n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "!cd Chinese-LLaMA-Alpaca-2/scripts/langchain && CUDA_VISIBLE_DEVICES=0 python langchain_sum.py --model_path /content/chinese-alpaca-2-7b --file_path /content/doc.txt --chain_type stuff" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "colab": { 97 | "base_uri": "https://localhost:8080/" 98 | }, 99 | "id": "QhhUlATSbJOc", 100 | "outputId": "13f9a53d-f317-4c36-ec65-985ac9b86e70" 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "loading LLM...\n", 108 | "2023-09-13 07:23:48.501954: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", 109 | "Loading checkpoint shards: 100% 2/2 [00:07<00:00, 3.89s/it]\n", 110 | " 李白是中国唐代一位著名的诗人,被认为是中国诗歌史上的重要人物之一。他曾经担任过多次官职,但由于桀骜不驯的性格,很快就离开了政府工作岗位。他游历了中国的很多地方并写下了很多诗篇。他的诗歌充满了想象力并且经常使用生动形象的比喻来传达情感。尽管有许多文学作品和典故与他的经历有关,但他本人的具体死亡原因一直是一个谜题。然而,他的才华和诗歌影响了许多之后的诗人和文学家。\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "!cd Chinese-LLaMA-Alpaca-2/scripts/langchain && CUDA_VISIBLE_DEVICES=0 python langchain_sum.py --model_path /content/chinese-alpaca-2-7b --file_path /content/doc.txt --chain_type refine" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "accelerator": "GPU", 121 | "colab": { 122 | "gpuType": "A100", 123 | "machine_shape": "hm", 124 | "provenance": [] 125 | }, 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "name": "python" 132 | } 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 0 136 | } 137 | -------------------------------------------------------------------------------- /chp11/elo.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import pandas as pd 3 | import json 4 | 5 | def compute_elo(battles, K=4, SCALE=400, BASE=10, INIT_RATING=1000): 6 | # 初始化模型得分 7 | ratings = defaultdict(lambda: INIT_RATING) 8 | 9 | # 遍历每次两两比较 10 | for _, model_a, model_b, winner in battles[['model_a', 'model_b', 'winner']].itertuples(): 11 | ra = ratings[model_a] 12 | rb = ratings[model_b] 13 | 14 | # 计算期望胜率 15 | ea = 1 / (1 + BASE ** ((rb - ra) / SCALE)) 16 | eb = 1 / (1 + BASE ** ((ra - rb) / SCALE)) 17 | 18 | # 根据真实胜率更新等级分 19 | if winner == "model_a": 20 | sa = 1 21 | elif winner == "model_b": 22 | sa = 0 23 | elif winner in ["tie", "tie (bothbad)"]: 24 | sa = 0.5 25 | else: 26 | raise ValueError(f"unexpected winner value: {winner}") 27 | ratings[model_a] += K * (sa - ea) 28 | ratings[model_b] += K * (1 - sa - eb) 29 | 30 | return ratings 31 | 32 | # 示例数 33 | battles = pd.DataFrame({ 34 | 'model_a': ['A', 'A', 'B', 'C', 'C', 'D'], 35 | 'model_b': ['B', 'C', 'C', 'D', 'A', 'A'], 36 | 'winner': ['model_a', 'model_b', 'model_b', 'model_a', 'tie', 'model_b'] 37 | }) 38 | 39 | # 计算Elo评分 40 | elo_scores = compute_elo(battles) 41 | print(json.dumps(elo_scores, indent=2)) 42 | -------------------------------------------------------------------------------- /chp2/fmm_word_seg.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 2.2.2 2 | 3 | def load_dict(): 4 | f = open("lexicon.txt") 5 | lexicon = set() 6 | max_len = 0 7 | for line in f: 8 | word = line.strip() 9 | lexicon.add(word) 10 | if len(word) > max_len: 11 | max_len = len(word) 12 | f.close() 13 | 14 | return 
lexicon, max_len 15 | 16 | def fmm_word_seg(sentence, lexicon, max_len): 17 | begin = 0 18 | end = min(begin + max_len, len(sentence)) 19 | words = [] 20 | while begin < end: 21 | word = sentence[begin:end] 22 | if word in lexicon or end - begin == 1: 23 | words.append(word) 24 | begin = end 25 | end = min(begin + max_len, len(sentence)) 26 | else: 27 | end -= 1 28 | return words 29 | 30 | lexicon, max_len = load_dict() 31 | words = fmm_word_seg(input("请输入句子:"), lexicon, max_len) 32 | 33 | for word in words: 34 | print(word,) 35 | -------------------------------------------------------------------------------- /chp2/svd.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 2.1.2 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | M = np.array([[0, 2, 1, 1, 1, 1, 1, 2, 1, 3], 6 | [2, 0, 1, 1, 1, 0, 0, 1, 1, 2], 7 | [1, 1, 0, 1, 1, 0, 0, 0, 0, 1], 8 | [1, 1, 1, 0, 1, 0, 0, 0, 0, 1], 9 | [1, 1, 1, 1, 0, 0, 0, 0, 0, 1], 10 | [1, 0, 0, 0, 0, 0, 1, 1, 0, 1], 11 | [1, 0, 0, 0, 0, 1, 0, 1, 0, 1], 12 | [2, 1, 0, 0, 0, 1, 1, 0, 1, 2], 13 | [1, 1, 0, 0, 0, 0, 0, 1, 0, 1], 14 | [3, 2, 1, 1, 1, 1, 1, 2, 1, 0]]) 15 | 16 | def pmi(M, positive=True): 17 | col_totals = M.sum(axis=0) 18 | row_totals = M.sum(axis=1) 19 | total = col_totals.sum() 20 | expected = np.outer(row_totals, col_totals) / total 21 | M = M / expected 22 | # Silence distracting warnings about log(0): 23 | with np.errstate(divide='ignore'): 24 | M = np.log(M) 25 | M[np.isinf(M)] = 0.0 # log(0) = 0 26 | if positive: 27 | M[M < 0] = 0.0 28 | return M 29 | 30 | M_pmi = pmi(M) 31 | 32 | np.set_printoptions(precision=2) 33 | print(M_pmi) 34 | 35 | U, s, Vh = np.linalg.svd(M_pmi) 36 | 37 | plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] 38 | 39 | words = ["我", "喜欢", "自然", "语言", "处理", "爱", "深度", "学习", "机器", "。"] 40 | 41 | for i in range(len(words)): 42 | plt.text(U[i, 0], U[i, 1], words[i]) 43 | plt.scatter(U[i, 0], U[i, 1], c='red', s=50) 44 | 45 | plt.title('词向量分布图') 46 | plt.xlabel('第一维度') 47 | plt.ylabel('第二维度') 48 | plt.grid(True, linestyle='--', alpha=0.7) 49 | plt.margins(0.1) 50 | output_file = 'svd.pdf' 51 | plt.savefig(output_file, bbox_inches='tight', dpi=300) 52 | print(f"图形已保存至 {output_file}") 53 | plt.show() 54 | -------------------------------------------------------------------------------- /chp3/convert_t2s.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 3.5.3 2 | 3 | import sys 4 | import opencc 5 | 6 | converter = opencc.OpenCC("t2s.json") 7 | f_in = open(sys.argv[1], "r") 8 | 9 | for line in f_in.readlines(): 10 | line = line.strip() 11 | line_t2s = converter.convert(line) 12 | print(line_t2s) 13 | -------------------------------------------------------------------------------- /chp3/t2s.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Traditional Chinese to Simplified Chinese", 3 | "segmentation": { 4 | "type": "mmseg", 5 | "dict": { 6 | "type": "ocd2", 7 | "file": "TSPhrases.ocd2" 8 | } 9 | }, 10 | "conversion_chain": [{ 11 | "dict": { 12 | "type": "group", 13 | "dicts": [{ 14 | "type": "ocd2", 15 | "file": "TSPhrases.ocd2" 16 | }, { 17 | "type": "ocd2", 18 | "file": "TSCharacters.ocd2" 19 | }] 20 | } 21 | }] 22 | } -------------------------------------------------------------------------------- /chp3/wikidata_cleaning.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 
3.5.3 2 | 3 | import sys 4 | import re 5 | 6 | def remove_empty_paired_punc(in_str): 7 | return in_str.replace('()', '').replace('《》', '').replace('【】', '').replace('[]', '') 8 | 9 | def remove_html_tags(in_str): 10 | html_pattern = re.compile(r'<[^>]+>', re.S) 11 | return html_pattern.sub('', in_str) 12 | 13 | def remove_control_chars(in_str): 14 | control_chars = ''.join(map(chr, list(range(0, 32)) + list(range(127, 160)))) 15 | control_chars = re.compile('[%s]' % re.escape(control_chars)) 16 | return control_chars.sub('', in_str) 17 | 18 | f_in = open(sys.argv[1], 'r') 19 | for line in f_in.readlines(): 20 | line = line.strip() 21 | if re.search(r'^()', line): 22 | print(line) 23 | continue 24 | line = remove_empty_paired_punc(line) 25 | line = remove_html_tags(line) 26 | line = remove_control_chars(line) 27 | print(line) 28 | -------------------------------------------------------------------------------- /chp4/cnn_sent_polarity.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | import torch 4 | from torch import nn, optim 5 | from torch.nn import functional as F 6 | from torch.utils.data import Dataset, DataLoader 7 | from torch.nn.utils.rnn import pad_sequence 8 | from collections import defaultdict 9 | from vocab import Vocab 10 | from utils import load_sentence_polarity 11 | 12 | class CnnDataset(Dataset): 13 | def __init__(self, data): 14 | self.data = data 15 | def __len__(self): 16 | return len(self.data) 17 | def __getitem__(self, i): 18 | return self.data[i] 19 | 20 | def collate_fn(examples): 21 | inputs = [torch.tensor(ex[0]) for ex in examples] 22 | targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long) 23 | # 对batch内的样本进行padding,使其具有相同长度 24 | inputs = pad_sequence(inputs, batch_first=True) 25 | return inputs, targets 26 | 27 | class CNN(nn.Module): 28 | def __init__(self, vocab_size, embedding_dim, filter_size, num_filter, num_class): 29 | super(CNN, self).__init__() 30 | self.embedding = nn.Embedding(vocab_size, embedding_dim) 31 | self.conv1d = nn.Conv1d(embedding_dim, num_filter, filter_size, padding=1) 32 | self.activate = F.relu 33 | self.linear = nn.Linear(num_filter, num_class) 34 | def forward(self, inputs): 35 | embedding = self.embedding(inputs) 36 | convolution = self.activate(self.conv1d(embedding.permute(0, 2, 1))) 37 | pooling = F.max_pool1d(convolution, kernel_size=convolution.shape[2]) 38 | outputs = self.linear(pooling.squeeze(dim=2)) 39 | log_probs = F.log_softmax(outputs, dim=1) 40 | return log_probs 41 | 42 | #tqdm是一个Pyth模块,能以进度条的方式显示迭代的进度 43 | from tqdm.auto import tqdm 44 | 45 | #超参数设置 46 | embedding_dim = 128 47 | hidden_dim = 256 48 | num_class = 2 49 | batch_size = 32 50 | num_epoch = 5 51 | filter_size = 3 52 | num_filter = 100 53 | 54 | #加载数据 55 | train_data, test_data, vocab = load_sentence_polarity() 56 | train_dataset = CnnDataset(train_data) 57 | test_dataset = CnnDataset(test_data) 58 | train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True) 59 | test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) 60 | 61 | #加载模型 62 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 63 | model = CNN(len(vocab), embedding_dim, filter_size, num_filter, num_class) 64 | model.to(device) #将模型加载到CPU或GPU设备 65 | 66 | #训练过程 67 | nll_loss = nn.NLLLoss() 68 | optimizer = optim.Adam(model.parameters(), lr=0.001) #使用Adam优化器 69 | 70 | model.train() 71 | for epoch in 
range(num_epoch): 72 | total_loss = 0 73 | for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"): 74 | inputs, targets = [x.to(device) for x in batch] 75 | log_probs = model(inputs) 76 | loss = nll_loss(log_probs, targets) 77 | optimizer.zero_grad() 78 | loss.backward() 79 | optimizer.step() 80 | total_loss += loss.item() 81 | print(f"Loss: {total_loss:.2f}") 82 | 83 | #测试过程 84 | acc = 0 85 | for batch in tqdm(test_data_loader, desc=f"Testing"): 86 | inputs, targets = [x.to(device) for x in batch] 87 | with torch.no_grad(): 88 | output = model(inputs) 89 | acc += (output.argmax(dim=1) == targets).sum().item() 90 | 91 | #输出在测试集上的准确率 92 | print(f"Acc: {acc / len(test_data_loader):.2f}") 93 | -------------------------------------------------------------------------------- /chp4/lstm_postag.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.2 2 | 3 | import torch 4 | from torch import nn, optim 5 | from torch.nn import functional as F 6 | from torch.utils.data import Dataset, DataLoader 7 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence 8 | from collections import defaultdict 9 | from vocab import Vocab 10 | from utils import load_treebank 11 | 12 | #tqdm是一个Python模块,能以进度条的方式显式迭代的进度 13 | from tqdm.auto import tqdm 14 | 15 | WEIGHT_INIT_RANGE = 0.1 16 | 17 | class LstmDataset(Dataset): 18 | def __init__(self, data): 19 | self.data = data 20 | 21 | def __len__(self): 22 | return len(self.data) 23 | 24 | def __getitem__(self, i): 25 | return self.data[i] 26 | 27 | def collate_fn(examples): 28 | lengths = torch.tensor([len(ex[0]) for ex in examples]) 29 | inputs = [torch.tensor(ex[0]) for ex in examples] 30 | targets = [torch.tensor(ex[1]) for ex in examples] 31 | inputs = pad_sequence(inputs, batch_first=True, padding_value=vocab[""]) 32 | targets = pad_sequence(targets, batch_first=True, padding_value=vocab[""]) 33 | return inputs, lengths, targets, inputs != vocab[""] 34 | 35 | 36 | def init_weights(model): 37 | for param in model.parameters(): 38 | torch.nn.init.uniform_(param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE) 39 | 40 | class LSTM(nn.Module): 41 | def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class): 42 | super(LSTM, self).__init__() 43 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 44 | self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True) 45 | self.output = nn.Linear(hidden_dim, num_class) 46 | init_weights(self) 47 | 48 | def forward(self, inputs, lengths): 49 | embeddings = self.embeddings(inputs) 50 | x_pack = pack_padded_sequence(embeddings, lengths, batch_first=True, enforce_sorted=False) 51 | hidden, (hn, cn) = self.lstm(x_pack) 52 | hidden, _ = pad_packed_sequence(hidden, batch_first=True) 53 | outputs = self.output(hidden) 54 | log_probs = F.log_softmax(outputs, dim=-1) 55 | return log_probs 56 | 57 | embedding_dim = 128 58 | hidden_dim = 256 59 | batch_size = 32 60 | num_epoch = 5 61 | 62 | #加载数据 63 | train_data, test_data, vocab, pos_vocab = load_treebank() 64 | train_dataset = LstmDataset(train_data) 65 | test_dataset = LstmDataset(test_data) 66 | train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True) 67 | test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) 68 | 69 | num_class = len(pos_vocab) 70 | 71 | #加载模型 72 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 73 | model = LSTM(len(vocab), 
embedding_dim, hidden_dim, num_class) 74 | model.to(device) #将模型加载到GPU中(如果已经正确安装) 75 | 76 | #训练过程 77 | nll_loss = nn.NLLLoss() 78 | optimizer = optim.Adam(model.parameters(), lr=0.001) #使用Adam优化器 79 | 80 | model.train() 81 | for epoch in range(num_epoch): 82 | total_loss = 0 83 | for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"): 84 | inputs, lengths, targets, mask = [x for x in batch] 85 | inputs, targets, mask = inputs.to(device), targets.to(device), mask.to(device) 86 | log_probs = model(inputs, lengths) 87 | loss = nll_loss(log_probs[mask], targets[mask]) 88 | optimizer.zero_grad() 89 | loss.backward() 90 | optimizer.step() 91 | total_loss += loss.item() 92 | print(f"Loss: {total_loss:.2f}") 93 | 94 | #测试过程 95 | acc = 0 96 | total = 0 97 | for batch in tqdm(test_data_loader, desc=f"Testing"): 98 | inputs, lengths, targets, mask = [x for x in batch] 99 | inputs, targets, mask = inputs.to(device), targets.to(device), mask.to(device) 100 | with torch.no_grad(): 101 | output = model(inputs, lengths) 102 | acc += (output.argmax(dim=-1) == targets)[mask].sum().item() 103 | total += mask.sum().item() 104 | 105 | #输出在测试集上的准确率 106 | print(f"Acc: {acc / total:.2f}") 107 | -------------------------------------------------------------------------------- /chp4/lstm_sent_polarity.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | import torch 4 | from torch import nn, optim 5 | from torch.nn import functional as F 6 | from torch.utils.data import Dataset, DataLoader 7 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence 8 | from collections import defaultdict 9 | from vocab import Vocab 10 | from utils import load_sentence_polarity 11 | 12 | #tqdm是一个Python模块,能以进度条的方式显式迭代的进度 13 | from tqdm.auto import tqdm 14 | 15 | class LstmDataset(Dataset): 16 | def __init__(self, data): 17 | self.data = data 18 | def __len__(self): 19 | return len(self.data) 20 | def __getitem__(self, i): 21 | return self.data[i] 22 | 23 | def collate_fn(examples): 24 | lengths = torch.tensor([len(ex[0]) for ex in examples]) 25 | inputs = [torch.tensor(ex[0]) for ex in examples] 26 | targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long) 27 | # 对batch内的样本进行padding,使其具有相同长度 28 | inputs = pad_sequence(inputs, batch_first=True) 29 | return inputs, lengths, targets 30 | 31 | class LSTM(nn.Module): 32 | def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class): 33 | super(LSTM, self).__init__() 34 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 35 | self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True) 36 | self.output = nn.Linear(hidden_dim, num_class) 37 | 38 | def forward(self, inputs, lengths): 39 | embeddings = self.embeddings(inputs) 40 | x_pack = pack_padded_sequence(embeddings, lengths, batch_first=True, enforce_sorted=False) 41 | hidden, (hn, cn) = self.lstm(x_pack) 42 | outputs = self.output(hn[-1]) 43 | log_probs = F.log_softmax(outputs, dim=-1) 44 | return log_probs 45 | 46 | embedding_dim = 128 47 | hidden_dim = 256 48 | num_class = 2 49 | batch_size = 32 50 | num_epoch = 5 51 | 52 | #加载数据 53 | train_data, test_data, vocab = load_sentence_polarity() 54 | train_dataset = LstmDataset(train_data) 55 | test_dataset = LstmDataset(test_data) 56 | train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True) 57 | test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) 58 | 59 | #加载模型 60 | 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 61 | model = LSTM(len(vocab), embedding_dim, hidden_dim, num_class) 62 | model.to(device) #将模型加载到GPU中(如果已经正确安装) 63 | 64 | #训练过程 65 | nll_loss = nn.NLLLoss() 66 | optimizer = optim.Adam(model.parameters(), lr=0.001) #使用Adam优化器 67 | 68 | model.train() 69 | for epoch in range(num_epoch): 70 | total_loss = 0 71 | for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"): 72 | inputs, lengths, targets = [x for x in batch] 73 | inputs, targets = inputs.to(device), targets.to(device) 74 | log_probs = model(inputs, lengths) 75 | loss = nll_loss(log_probs, targets) 76 | optimizer.zero_grad() 77 | loss.backward() 78 | optimizer.step() 79 | total_loss += loss.item() 80 | print(f"Loss: {total_loss:.2f}") 81 | 82 | #测试过程 83 | acc = 0 84 | for batch in tqdm(test_data_loader, desc=f"Testing"): 85 | inputs, lengths, targets = [x for x in batch] 86 | inputs, targets = inputs.to(device), targets.to(device) 87 | with torch.no_grad(): 88 | output = model(inputs, lengths) 89 | acc += (output.argmax(dim=1) == targets).sum().item() 90 | 91 | #输出在测试集上的准确率 92 | print(f"Acc: {acc / len(test_data_loader):.2f}") 93 | -------------------------------------------------------------------------------- /chp4/mlp.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.1.6 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | class MLP(nn.Module): 8 | def __init__(self, input_dim, hidden_dim, num_class): 9 | super(MLP, self).__init__() 10 | # 线性变换:输入层->隐含层 11 | self.linear1 = nn.Linear(input_dim, hidden_dim) 12 | # 使用ReLU激活函数 13 | self.activate = F.relu 14 | # 线性变换:隐含层->输出层 15 | self.linear2 = nn.Linear(hidden_dim, num_class) 16 | 17 | def forward(self, inputs): 18 | hidden = self.linear1(inputs) 19 | activation = self.activate(hidden) 20 | outputs = self.linear2(activation) 21 | probs = F.softmax(outputs, dim=1) # 获得每个输入属于某一类别的概率 22 | return probs 23 | 24 | mlp = MLP(input_dim=4, hidden_dim=5, num_class=2) 25 | inputs = torch.rand(3, 4) # 输入形状为(3, 4)的张量,其中3表示有3个输入,4表示每个输入的维度 26 | probs = mlp(inputs) # 自动调用forward函数 27 | print(probs) # 输出3个输入对应输出的概率 28 | -------------------------------------------------------------------------------- /chp4/mlp_embedding.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | 7 | class MLP(nn.Module): 8 | def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class): 9 | super(MLP, self).__init__() 10 | # 词嵌入层 11 | self.embedding = nn.Embedding(vocab_size, embedding_dim) 12 | # 线性变换:词嵌入层->隐含层 13 | self.linear1 = nn.Linear(embedding_dim, hidden_dim) 14 | # 使用ReLU激活函数 15 | self.activate = F.relu 16 | # 线性变换:激活层->输出层 17 | self.linear2 = nn.Linear(hidden_dim, num_class) 18 | 19 | def forward(self, inputs): 20 | embeddings = self.embedding(inputs) 21 | # 将序列中多个embedding进行聚合(此处是求平均值) 22 | embedding = embeddings.mean(dim=1) 23 | hidden = self.activate(self.linear1(embedding)) 24 | outputs = self.linear2(hidden) 25 | # 获得每个序列属于某一类别概率的对数值 26 | probs = F.log_softmax(outputs, dim=1) 27 | return probs 28 | 29 | mlp = MLP(vocab_size=8, embedding_dim=3, hidden_dim=5, num_class=2) 30 | # 输入为两个长度为4的整数序列 31 | inputs = torch.tensor([[0, 1, 2, 1], [4, 6, 6, 7]], dtype=torch.long) 32 | outputs = mlp(inputs) 33 | print(outputs) 
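# --- Illustrative sketch (added; not part of the original mlp_embedding.py) ---
# The mean pooling above (embeddings.mean(dim=1)) can equivalently be expressed with
# nn.EmbeddingBag(mode='mean'), which the next script, mlp_sent_polarity.py, builds on.
# A minimal, self-contained check of that equivalence; the toy sizes and input ids
# mirror the example above and are otherwise arbitrary.
import torch
from torch import nn

emb = nn.Embedding(8, 3)
bag = nn.EmbeddingBag(8, 3, mode='mean')
bag.weight.data.copy_(emb.weight.data)          # use identical embedding weights

toy_inputs = torch.tensor([[0, 1, 2, 1], [4, 6, 6, 7]], dtype=torch.long)
mean_pooled = emb(toy_inputs).mean(dim=1)       # (2, 3): average over the sequence dimension
bag_pooled = bag(toy_inputs)                    # EmbeddingBag pools each row internally
print(torch.allclose(mean_pooled, bag_pooled))  # expected: True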
-------------------------------------------------------------------------------- /chp4/mlp_sent_polarity.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | import torch 4 | from torch import nn, optim 5 | from torch.nn import functional as F 6 | from torch.utils.data import Dataset, DataLoader 7 | from collections import defaultdict 8 | from vocab import Vocab 9 | from utils import load_sentence_polarity 10 | 11 | class BowDataset(Dataset): 12 | def __init__(self, data): 13 | self.data = data 14 | def __len__(self): 15 | return len(self.data) 16 | def __getitem__(self, i): 17 | return self.data[i] 18 | 19 | def collate_fn(examples): 20 | inputs = [torch.tensor(ex[0]) for ex in examples] 21 | targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long) 22 | offsets = [0] + [i.shape[0] for i in inputs] 23 | offsets = torch.tensor(offsets[:-1]).cumsum(dim=0) 24 | inputs = torch.cat(inputs) 25 | return inputs, offsets, targets 26 | 27 | class MLP(nn.Module): 28 | def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class): 29 | super(MLP, self).__init__() 30 | self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim) 31 | self.linear1 = nn.Linear(embedding_dim, hidden_dim) 32 | self.activate = F.relu 33 | self.linear2 = nn.Linear(hidden_dim, num_class) 34 | def forward(self, inputs, offsets): 35 | embedding = self.embedding(inputs, offsets) 36 | hidden = self.activate(self.linear1(embedding)) 37 | outputs = self.linear2(hidden) 38 | log_probs = F.log_softmax(outputs, dim=1) 39 | return log_probs 40 | 41 | # tqdm是一个Python模块,能以进度条的方式显示迭代的进度 42 | from tqdm.auto import tqdm 43 | 44 | # 超参数设置 45 | embedding_dim = 128 46 | hidden_dim = 256 47 | num_class = 2 48 | batch_size = 32 49 | num_epoch = 5 50 | 51 | # 加载数据 52 | train_data, test_data, vocab = load_sentence_polarity() 53 | train_dataset = BowDataset(train_data) 54 | test_dataset = BowDataset(test_data) 55 | train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True) 56 | test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) 57 | 58 | # 加载模型 59 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 60 | model = MLP(len(vocab), embedding_dim, hidden_dim, num_class) 61 | model.to(device) # 将模型加载到CPU或GPU设备 62 | 63 | #训练过程 64 | nll_loss = nn.NLLLoss() 65 | optimizer = optim.Adam(model.parameters(), lr=0.001) # 使用Adam优化器 66 | 67 | model.train() 68 | for epoch in range(num_epoch): 69 | total_loss = 0 70 | for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"): 71 | inputs, offsets, targets = [x.to(device) for x in batch] 72 | log_probs = model(inputs, offsets) 73 | loss = nll_loss(log_probs, targets) 74 | optimizer.zero_grad() 75 | loss.backward() 76 | optimizer.step() 77 | total_loss += loss.item() 78 | print(f"Loss: {total_loss:.2f}") 79 | 80 | # 测试过程 81 | acc = 0 82 | for batch in tqdm(test_data_loader, desc=f"Testing"): 83 | inputs, offsets, targets = [x.to(device) for x in batch] 84 | with torch.no_grad(): 85 | output = model(inputs, offsets) 86 | acc += (output.argmax(dim=1) == targets).sum().item() 87 | 88 | # 输出在测试集上的准确率 89 | print(f"Acc: {acc / len(test_data_loader):.2f}") 90 | -------------------------------------------------------------------------------- /chp4/mlp_train.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.5.2 2 | 3 | import torch 4 | from torch import nn, 
optim 5 | from torch.nn import functional as F 6 | 7 | class MLP(nn.Module): 8 | def __init__(self, input_dim, hidden_dim, num_class): 9 | super(MLP, self).__init__() 10 | self.linear1 = nn.Linear(input_dim, hidden_dim) 11 | self.activate = F.relu 12 | self.linear2 = nn.Linear(hidden_dim, num_class) 13 | 14 | def forward(self, inputs): 15 | hidden = self.linear1(inputs) 16 | activation = self.activate(hidden) 17 | outputs = self.linear2(activation) 18 | # 获得每个输入属于某一类别的概率(Softmax),然后再取对数 19 | # 取对数的目的是避免计算Softmax时可能产生的数值溢出问题 20 | log_probs = F.log_softmax(outputs, dim=1) 21 | return log_probs 22 | 23 | # 异或问题的4个输入 24 | x_train = torch.tensor([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]]) 25 | # 每个输入对应的输出类别 26 | y_train = torch.tensor([0, 1, 1, 0]) 27 | 28 | # 创建多层感知器模型,输入层大小为2,隐含层大小为5,输出层大小为2(即有两个类别) 29 | model = MLP(input_dim=2, hidden_dim=5, num_class=2) 30 | 31 | criterion = nn.NLLLoss() # 当使用log_softmax输出时,需要调用负对数似然损失(Negative Log Likelihood,NLL) 32 | optimizer = optim.SGD(model.parameters(), lr=0.05) # 使用梯度下降参数优化方法,学习率设置为0.05 33 | 34 | for epoch in range(100): 35 | y_pred = model(x_train) # 调用模型,预测输出结果 36 | loss = criterion(y_pred, y_train) # 通过对比预测结果与正确的结果,计算损失 37 | optimizer.zero_grad() # 在调用反向传播算法之前,将优化器的梯度值置为零,否则每次循环的梯度将进行累加 38 | loss.backward() # 通过反向传播计算参数的梯度 39 | optimizer.step() # 在优化器中更新参数,不同优化器更新的方法不同,但是调用方式相同 40 | 41 | print("Parameters:") 42 | for name, param in model.named_parameters(): 43 | print (name, param.data) 44 | 45 | y_pred = model(x_train) 46 | print("Predicted results:", y_pred.argmax(axis=1)) 47 | -------------------------------------------------------------------------------- /chp4/transformer/model.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | @dataclass 8 | class Config: 9 | batch_size: int = 2 10 | seq_len: int = 3 11 | n_embd: int = 4 12 | n_head: int = 2 13 | n_layer: int = 2 14 | 15 | class MultiHeadSelfAttention(nn.Module): 16 | def __init__(self, config): 17 | super().__init__() 18 | self.config = config 19 | self.proj = nn.Linear(config.n_embd, config.n_embd * 3) 20 | 21 | def forward(self, x): 22 | B, T, C = x.size() # batch_size, seq_len, n_embd 23 | 24 | # 获得batch中每个输入的q, k, v,并将q, k, v分解为n_head组 25 | q, k, v = self.proj(x).chunk(3, dim=-1) 26 | k = k.view(B, T, self.config.n_head, -1).transpose(1, 2) 27 | q = q.view(B, T, self.config.n_head, -1).transpose(1, 2) 28 | v = v.view(B, T, self.config.n_head, -1).transpose(1, 2) 29 | 30 | # 计算自注意力: 31 | # (B, n_head, T, hs) x (B, n_head, hs, T) -> (B, n_head, T, T) 32 | attn = (q @ k.transpose(-2, -1)) / (k.size(-1) ** 0.5) 33 | attn = F.softmax(attn, dim=-1) 34 | y = attn @ v 35 | y = y.transpose(1, 2).reshape(B, T, C) 36 | return y 37 | 38 | class MLP(nn.Module): 39 | def __init__(self, config): 40 | super().__init__() 41 | self.fc1 = nn.Linear(config.n_embd, 4 * config.n_embd) 42 | self.gelu = nn.GELU() 43 | self.fc2 = nn.Linear(4 * config.n_embd, config.n_embd) 44 | 45 | def forward(self, x): 46 | x = self.fc1(x) 47 | x = self.gelu(x) 48 | x = self.fc2(x) 49 | return x 50 | 51 | class Block(nn.Module): 52 | def __init__(self, config): 53 | super().__init__() 54 | self.ln_1 = nn.LayerNorm(config.n_embd) 55 | self.attn = MultiHeadSelfAttention(config) 56 | self.ln_2 = nn.LayerNorm(config.n_embd) 57 | self.mlp = MLP(config) 58 | 59 | def forward(self, x): 60 | x = self.ln_1(x + self.attn(x)) 61 | x = self.ln_2(x + self.mlp(x)) 62 | 
return x 63 | 64 | class Transformer(nn.Module): 65 | def __init__(self, config): 66 | super().__init__() 67 | self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)]) 68 | 69 | def forward(self, x): 70 | for block in self.blocks: 71 | x = block(x) 72 | return x 73 | 74 | if __name__ == '__main__': 75 | config = Config() 76 | x = torch.randn(config.batch_size, config.seq_len, config.n_embd) 77 | self_attn = Transformer(config) 78 | y = self_attn(x) 79 | print(y, y.shape) 80 | -------------------------------------------------------------------------------- /chp4/transformer_postag.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.2 2 | 3 | import math 4 | import torch 5 | from torch import nn, optim 6 | from torch.nn import functional as F 7 | from torch.utils.data import Dataset, DataLoader 8 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence 9 | from collections import defaultdict 10 | from vocab import Vocab 11 | from utils import load_treebank, length_to_mask 12 | 13 | #tqdm是一个Pyth模块,能以进度条的方式显式迭代的进度 14 | from tqdm.auto import tqdm 15 | 16 | class TransformerDataset(Dataset): 17 | def __init__(self, data): 18 | self.data = data 19 | def __len__(self): 20 | return len(self.data) 21 | def __getitem__(self, i): 22 | return self.data[i] 23 | 24 | def collate_fn(examples): 25 | lengths = torch.tensor([len(ex[0]) for ex in examples]) 26 | inputs = [torch.tensor(ex[0]) for ex in examples] 27 | targets = [torch.tensor(ex[1]) for ex in examples] 28 | # 对batch内的样本进行padding,使其具有相同长度 29 | inputs = pad_sequence(inputs, batch_first=True, padding_value=vocab[""]) 30 | targets = pad_sequence(targets, batch_first=True, padding_value=vocab[""]) 31 | return inputs, lengths, targets, inputs != vocab[""] 32 | 33 | class PositionalEncoding(nn.Module): 34 | def __init__(self, d_model, dropout=0.1, max_len=512): 35 | super(PositionalEncoding, self).__init__() 36 | 37 | pe = torch.zeros(max_len, d_model) 38 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 39 | div_term2 = torch.pow(torch.tensor(10000.0), torch.arange(0, d_model, 2).float() / d_model) 40 | div_term1 = torch.pow(torch.tensor(10000.0), torch.arange(1, d_model, 2).float() / d_model) 41 | # 高级切片方式,即从0开始,两个步长取一个。即奇数和偶数位置赋值不一样。直观来看就是每一句话的 42 | pe[:, 0::2] = torch.sin(position * div_term2) 43 | pe[:, 1::2] = torch.cos(position * div_term1) 44 | pe = pe.unsqueeze(0).transpose(0, 1) 45 | self.register_buffer('pe', pe) 46 | 47 | def forward(self, x): 48 | x = x + self.pe[:x.size(0), :] 49 | return x 50 | 51 | class Transformer(nn.Module): 52 | def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class, 53 | dim_feedforward=512, num_head=2, num_layers=2, dropout=0.1, max_len=512, activation: str = "relu"): 54 | super(Transformer, self).__init__() 55 | # 词嵌入层 56 | self.embedding_dim = embedding_dim 57 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 58 | self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len) 59 | # 编码层:使用Transformer 60 | encoder_layer = nn.TransformerEncoderLayer(hidden_dim, num_head, dim_feedforward, dropout, activation) 61 | self.transformer = nn.TransformerEncoder(encoder_layer, num_layers) 62 | # 输出层 63 | self.output = nn.Linear(hidden_dim, num_class) 64 | 65 | def forward(self, inputs, lengths): 66 | inputs = torch.transpose(inputs, 0, 1) 67 | hidden_states = self.embeddings(inputs) 68 | hidden_states = self.position_embedding(hidden_states) 69 | attention_mask = 
length_to_mask(lengths) == False 70 | hidden_states = self.transformer(hidden_states, src_key_padding_mask=attention_mask).transpose(0, 1) 71 | logits = self.output(hidden_states) 72 | log_probs = F.log_softmax(logits, dim=-1) 73 | return log_probs 74 | 75 | embedding_dim = 128 76 | hidden_dim = 128 77 | batch_size = 32 78 | num_epoch = 5 79 | 80 | #加载数据 81 | train_data, test_data, vocab, pos_vocab = load_treebank() 82 | train_dataset = TransformerDataset(train_data) 83 | test_dataset = TransformerDataset(test_data) 84 | train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True) 85 | test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) 86 | 87 | num_class = len(pos_vocab) 88 | 89 | #加载模型 90 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 91 | model = Transformer(len(vocab), embedding_dim, hidden_dim, num_class) 92 | model.to(device) #将模型加载到GPU中(如果已经正确安装) 93 | 94 | #训练过程 95 | nll_loss = nn.NLLLoss() 96 | optimizer = optim.Adam(model.parameters(), lr=0.001) #使用Adam优化器 97 | 98 | model.train() 99 | for epoch in range(num_epoch): 100 | total_loss = 0 101 | for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"): 102 | inputs, lengths, targets, mask = [x.to(device) for x in batch] 103 | log_probs = model(inputs, lengths) 104 | loss = nll_loss(log_probs[mask], targets[mask]) 105 | optimizer.zero_grad() 106 | loss.backward() 107 | optimizer.step() 108 | total_loss += loss.item() 109 | print(f"Loss: {total_loss:.2f}") 110 | 111 | #测试过程 112 | acc = 0 113 | total = 0 114 | for batch in tqdm(test_data_loader, desc=f"Testing"): 115 | inputs, lengths, targets, mask = [x.to(device) for x in batch] 116 | with torch.no_grad(): 117 | output = model(inputs, lengths) 118 | acc += (output.argmax(dim=-1) == targets)[mask].sum().item() 119 | total += mask.sum().item() 120 | 121 | #输出在测试集上的准确率 122 | print(f"Acc: {acc / total:.2f}") 123 | -------------------------------------------------------------------------------- /chp4/transformer_sent_polarity.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | import math 4 | import torch 5 | from torch import nn, optim 6 | from torch.nn import functional as F 7 | from torch.utils.data import Dataset, DataLoader 8 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence 9 | from collections import defaultdict 10 | from vocab import Vocab 11 | from utils import load_sentence_polarity, length_to_mask 12 | 13 | # tqdm是一个Pyth模块,能以进度条的方式显式迭代的进度 14 | from tqdm.auto import tqdm 15 | 16 | class TransformerDataset(Dataset): 17 | def __init__(self, data): 18 | self.data = data 19 | def __len__(self): 20 | return len(self.data) 21 | def __getitem__(self, i): 22 | return self.data[i] 23 | 24 | def collate_fn(examples): 25 | lengths = torch.tensor([len(ex[0]) for ex in examples]) 26 | inputs = [torch.tensor(ex[0]) for ex in examples] 27 | targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long) 28 | # 对batch内的样本进行padding,使其具有相同长度 29 | inputs = pad_sequence(inputs, batch_first=True) 30 | return inputs, lengths, targets 31 | 32 | class PositionalEncoding(nn.Module): 33 | def __init__(self, d_model, dropout=0.1, max_len=512): 34 | super(PositionalEncoding, self).__init__() 35 | 36 | pe = torch.zeros(max_len, d_model) 37 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 38 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) 
/ d_model)) 39 | pe[:, 0::2] = torch.sin(position * div_term) 40 | pe[:, 1::2] = torch.cos(position * div_term) 41 | pe = pe.unsqueeze(0).transpose(0, 1) 42 | self.register_buffer('pe', pe) 43 | 44 | def forward(self, x): 45 | x = x + self.pe[:x.size(0), :] 46 | return x 47 | 48 | class Transformer(nn.Module): 49 | def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class, 50 | dim_feedforward=512, num_head=2, num_layers=2, dropout=0.1, max_len=128, activation: str = "relu"): 51 | super(Transformer, self).__init__() 52 | # 词嵌入层 53 | self.embedding_dim = embedding_dim 54 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 55 | self.position_embedding = PositionalEncoding(embedding_dim, dropout, max_len) 56 | # 编码层:使用Transformer 57 | encoder_layer = nn.TransformerEncoderLayer(hidden_dim, num_head, dim_feedforward, dropout, activation) 58 | self.transformer = nn.TransformerEncoder(encoder_layer, num_layers) 59 | # 输出层 60 | self.output = nn.Linear(hidden_dim, num_class) 61 | 62 | 63 | def forward(self, inputs, lengths): 64 | inputs = torch.transpose(inputs, 0, 1) 65 | hidden_states = self.embeddings(inputs) 66 | hidden_states = self.position_embedding(hidden_states) 67 | attention_mask = length_to_mask(lengths) == False 68 | hidden_states = self.transformer(hidden_states, src_key_padding_mask=attention_mask) 69 | hidden_states = hidden_states[0, :, :] 70 | output = self.output(hidden_states) 71 | log_probs = F.log_softmax(output, dim=1) 72 | return log_probs 73 | 74 | embedding_dim = 128 75 | hidden_dim = 128 76 | num_class = 2 77 | batch_size = 32 78 | num_epoch = 5 79 | 80 | # 加载数据 81 | train_data, test_data, vocab = load_sentence_polarity() 82 | train_dataset = TransformerDataset(train_data) 83 | test_dataset = TransformerDataset(test_data) 84 | train_data_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True) 85 | test_data_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=False) 86 | 87 | # 加载模型 88 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 89 | model = Transformer(len(vocab), embedding_dim, hidden_dim, num_class) 90 | model.to(device) # 将模型加载到GPU中(如果已经正确安装) 91 | 92 | # 训练过程 93 | nll_loss = nn.NLLLoss() 94 | optimizer = optim.Adam(model.parameters(), lr=0.001) # 使用Adam优化器 95 | 96 | model.train() 97 | for epoch in range(num_epoch): 98 | total_loss = 0 99 | for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"): 100 | inputs, lengths, targets = [x.to(device) for x in batch] 101 | log_probs = model(inputs, lengths) 102 | loss = nll_loss(log_probs, targets) 103 | optimizer.zero_grad() 104 | loss.backward() 105 | optimizer.step() 106 | total_loss += loss.item() 107 | print(f"Loss: {total_loss:.2f}") 108 | 109 | # 测试过程 110 | acc = 0 111 | for batch in tqdm(test_data_loader, desc=f"Testing"): 112 | inputs, lengths, targets = [x.to(device) for x in batch] 113 | with torch.no_grad(): 114 | output = model(inputs, lengths) 115 | acc += (output.argmax(dim=1) == targets).sum().item() 116 | 117 | # 输出在测试集上的准确率 118 | print(f"Acc: {acc / len(test_data_loader):.2f}") 119 | -------------------------------------------------------------------------------- /chp4/utils.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | import torch 4 | from vocab import Vocab 5 | 6 | def load_sentence_polarity(): 7 | from nltk.corpus import sentence_polarity 8 | 9 | vocab = Vocab.build(sentence_polarity.sents()) 10 | 11 | 
train_data = [(vocab.convert_tokens_to_ids(sentence), 0) 12 | for sentence in sentence_polarity.sents(categories='pos')[:4000]] \ 13 | + [(vocab.convert_tokens_to_ids(sentence), 1) 14 | for sentence in sentence_polarity.sents(categories='neg')[:4000]] 15 | 16 | test_data = [(vocab.convert_tokens_to_ids(sentence), 0) 17 | for sentence in sentence_polarity.sents(categories='pos')[4000:]] \ 18 | + [(vocab.convert_tokens_to_ids(sentence), 1) 19 | for sentence in sentence_polarity.sents(categories='neg')[4000:]] 20 | 21 | return train_data, test_data, vocab 22 | 23 | def length_to_mask(lengths): 24 | max_len = torch.max(lengths) 25 | mask = torch.arange(max_len, device=lengths.device).expand(lengths.shape[0], max_len) < lengths.unsqueeze(1) 26 | return mask 27 | 28 | def load_treebank(): 29 | from nltk.corpus import treebank 30 | sents, postags = zip(*(zip(*sent) for sent in treebank.tagged_sents())) 31 | 32 | vocab = Vocab.build(sents, reserved_tokens=[""]) 33 | 34 | tag_vocab = Vocab.build(postags) 35 | 36 | train_data = [(vocab.convert_tokens_to_ids(sentence), tag_vocab.convert_tokens_to_ids(tags)) for sentence, tags in zip(sents[:3000], postags[:3000])] 37 | test_data = [(vocab.convert_tokens_to_ids(sentence), tag_vocab.convert_tokens_to_ids(tags)) for sentence, tags in zip(sents[3000:], postags[3000:])] 38 | 39 | return train_data, test_data, vocab, tag_vocab 40 | 41 | -------------------------------------------------------------------------------- /chp4/vocab.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | from collections import defaultdict, Counter 4 | 5 | class Vocab: 6 | def __init__(self, tokens=None): 7 | self.idx_to_token = list() 8 | self.token_to_idx = dict() 9 | 10 | if tokens is not None: 11 | if "" not in tokens: 12 | tokens = tokens + [""] 13 | for token in tokens: 14 | self.idx_to_token.append(token) 15 | self.token_to_idx[token] = len(self.idx_to_token) - 1 16 | self.unk = self.token_to_idx[''] 17 | 18 | @classmethod 19 | def build(cls, text, min_freq=1, reserved_tokens=None): 20 | token_freqs = defaultdict(int) 21 | for sentence in text: 22 | for token in sentence: 23 | token_freqs[token] += 1 24 | uniq_tokens = [""] + (reserved_tokens if reserved_tokens else []) 25 | uniq_tokens += [token for token, freq in token_freqs.items() \ 26 | if freq >= min_freq and token != ""] 27 | return cls(uniq_tokens) 28 | 29 | def __len__(self): 30 | return len(self.idx_to_token) 31 | 32 | def __getitem__(self, token): 33 | return self.token_to_idx.get(token, self.unk) 34 | 35 | def convert_tokens_to_ids(self, tokens): 36 | return [self[token] for token in tokens] 37 | 38 | def convert_ids_to_tokens(self, indices): 39 | return [self.idx_to_token[index] for index in indices] 40 | 41 | 42 | def save_vocab(vocab, path): 43 | with open(path, 'w') as writer: 44 | writer.write("\n".join(vocab.idx_to_token)) 45 | 46 | 47 | def read_vocab(path): 48 | with open(path, 'r') as f: 49 | tokens = f.read().split('\n') 50 | return Vocab(tokens) 51 | 52 | -------------------------------------------------------------------------------- /chp5/ffnnlm.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 5.4.2 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.utils.data import Dataset 8 | from tqdm.auto import tqdm 9 | from utils import BOS_TOKEN, EOS_TOKEN 10 | from utils import load_reuters, 
save_pretrained, get_loader, init_weights 11 | 12 | class NGramDataset(Dataset): 13 | def __init__(self, corpus, vocab, context_size=2): 14 | self.data = [] 15 | self.bos = vocab[BOS_TOKEN] 16 | self.eos = vocab[EOS_TOKEN] 17 | for sentence in tqdm(corpus, desc="Dataset Construction"): 18 | # 插入句首句尾符号 19 | sentence = [self.bos] + sentence + [self.eos] 20 | if len(sentence) < context_size: 21 | continue 22 | for i in range(context_size, len(sentence)): 23 | # 模型输入:长为context_size的上文 24 | context = sentence[i-context_size:i] 25 | # 模型输出:当前词 26 | target = sentence[i] 27 | self.data.append((context, target)) 28 | 29 | def __len__(self): 30 | return len(self.data) 31 | 32 | def __getitem__(self, i): 33 | return self.data[i] 34 | 35 | def collate_fn(self, examples): 36 | # 从独立样本集合中构建batch输入输出 37 | inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long) 38 | targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long) 39 | return (inputs, targets) 40 | 41 | class FeedForwardNNLM(nn.Module): 42 | def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim): 43 | super(FeedForwardNNLM, self).__init__() 44 | # 词嵌入层 45 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 46 | # 线性变换:词嵌入层->隐含层 47 | self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim) 48 | # 线性变换:隐含层->输出层 49 | self.linear2 = nn.Linear(hidden_dim, vocab_size) 50 | # 使用ReLU激活函数 51 | self.activate = F.relu 52 | init_weights(self) 53 | 54 | def forward(self, inputs): 55 | embeds = self.embeddings(inputs).view((inputs.shape[0], -1)) 56 | hidden = self.activate(self.linear1(embeds)) 57 | output = self.linear2(hidden) 58 | # 根据输出层(logits)计算概率分布并取对数,以便于计算对数似然 59 | # 这里采用PyTorch库的log_softmax实现 60 | log_probs = F.log_softmax(output, dim=1) 61 | return log_probs 62 | 63 | embedding_dim = 64 64 | context_size = 2 65 | hidden_dim = 128 66 | batch_size = 512 67 | num_epoch = 5 68 | 69 | # 读取文本数据,构建FFNNLM训练数据集(n-grams) 70 | corpus, vocab = load_reuters() 71 | dataset = NGramDataset(corpus, vocab, context_size) 72 | data_loader = get_loader(dataset, batch_size) 73 | 74 | # 负对数似然损失函数 75 | nll_loss = nn.NLLLoss() 76 | # 构建FFNNLM,并加载至device(GPU) 77 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 78 | model = FeedForwardNNLM(len(vocab), embedding_dim, context_size, hidden_dim) 79 | model.to(device) 80 | # 使用Adam优化器 81 | optimizer = optim.Adam(model.parameters(), lr=0.001) 82 | 83 | model.train() 84 | total_losses = [] 85 | for epoch in range(num_epoch): 86 | total_loss = 0 87 | for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"): 88 | inputs, targets = [x.to(device) for x in batch] 89 | optimizer.zero_grad() 90 | log_probs = model(inputs) 91 | loss = nll_loss(log_probs, targets) 92 | loss.backward() 93 | optimizer.step() 94 | total_loss += loss.item() 95 | print(f"Loss: {total_loss:.2f}") 96 | total_losses.append(total_loss) 97 | 98 | # 保存词向量(model.embeddings) 99 | save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec") 100 | 101 | -------------------------------------------------------------------------------- /chp5/ngram-lm.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import defaultdict 3 | from nltk.corpus import reuters 4 | 5 | # 以trigram语言模型为例 6 | n = 3 7 | 8 | # 存储每个ngram的出现频次 9 | ngram_count = defaultdict(int) 10 | # 存储每个ngram的前缀出现频次 11 | ngram_precedings_count = defaultdict(int) 12 | # 存储每个ngram的前缀所对应的下一个词的列表及每个词出现的概率列表 13 | ngram_prob = {} 14 | 15 | # 获取句子中所有的ngram的列表及其前缀列表 
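# Illustrative note (added; the example values are assumptions, not output taken from
# this script): with n = 3 and sentence = ['the', 'cat', 'sat'], get_ngrams below is
# expected to return
#   precedings = [(BOS, BOS), (BOS, 'the'), ('the', 'cat'), ('cat', 'sat')]
#   ngrams     = [((BOS, BOS), 'the'), ((BOS, 'the'), 'cat'),
#                 (('the', 'cat'), 'sat'), (('cat', 'sat'), EOS)]
# where BOS/EOS stand for the sentence-boundary markers padded in by the function,
# i.e. each n-gram is stored as a (prefix_tuple, next_word) pair.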
16 | def get_ngrams(sentence, n): 17 | # 在句子前后加上开始符号和结束符号 18 | sentence = (n - 1) * [''] + sentence + [''] 19 | ngrams = [] 20 | precedings = [] 21 | for i in range(n - 1, len(sentence)): 22 | prec = tuple(sentence[i - n + 1:i]) 23 | ngram = tuple((prec, sentence[i])) 24 | precedings.append(prec) 25 | ngrams.append(ngram) 26 | 27 | return ngrams, precedings 28 | 29 | # 构建ngram及其前缀的出现频次 30 | def build_ngrams_precedings(text): 31 | for sentence in text: 32 | ngrams, precedings = get_ngrams(sentence, n) 33 | for i in range(len(ngrams)): 34 | ngram = ngrams[i] 35 | prec = precedings[i] 36 | ngram_count[ngram] += 1 37 | ngram_precedings_count[prec] += 1 38 | 39 | # 构建ngram的前缀所对应的下一个词的列表及每个词出现的概率列表 40 | def build_ngram_prob(): 41 | for ngram in ngram_count.keys(): 42 | prec, next = ngram 43 | prob = ngram_count[ngram] / ngram_precedings_count[prec] 44 | if prec in ngram_prob: 45 | ngram_prob[prec]['next'].append(next) 46 | ngram_prob[prec]['prob'].append(prob) 47 | else: 48 | ngram_prob[prec] = {'next': [next], 'prob': [prob]} 49 | 50 | # 构建语言模型 51 | def build_lm(): 52 | text = reuters.sents() 53 | build_ngrams_precedings(text) 54 | build_ngram_prob() 55 | 56 | # 生成句子 57 | def generate(length=10): 58 | word_list = (n - 1) * [''] 59 | for _ in range(length): 60 | try: 61 | prec = tuple(word_list[1 - n:]) 62 | next_choice = ngram_prob[prec] 63 | # 从下一个词的列表中根据概率随机选择一个词 64 | generated_word = random.choices(next_choice['next'], next_choice['prob'])[0] 65 | word_list.append(generated_word) 66 | except: 67 | break 68 | 69 | return word_list 70 | 71 | build_lm() 72 | word_list = generate(50) 73 | print(f'Word count: {len(word_list)}') 74 | print(f'Generated sentence: {" ".join(word_list)}') 75 | -------------------------------------------------------------------------------- /chp5/rnnlm.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 5.4.3 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.utils.data import Dataset 8 | from torch.nn.utils.rnn import pad_sequence 9 | from tqdm.auto import tqdm 10 | from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN 11 | from utils import load_reuters, save_pretrained, get_loader, init_weights 12 | 13 | class RnnlmDataset(Dataset): 14 | def __init__(self, corpus, vocab): 15 | self.data = [] 16 | self.bos = vocab[BOS_TOKEN] 17 | self.eos = vocab[EOS_TOKEN] 18 | self.pad = vocab[PAD_TOKEN] 19 | for sentence in tqdm(corpus, desc="Dataset Construction"): 20 | # 模型输入:BOS_TOKEN, w_1, w_2, ..., w_n 21 | input = [self.bos] + sentence 22 | # 模型输出:w_1, w_2, ..., w_n, EOS_TOKEN 23 | target = sentence + [self.eos] 24 | self.data.append((input, target)) 25 | 26 | def __len__(self): 27 | return len(self.data) 28 | 29 | def __getitem__(self, i): 30 | return self.data[i] 31 | 32 | def collate_fn(self, examples): 33 | # 从独立样本集合中构建batch输入输出 34 | inputs = [torch.tensor(ex[0]) for ex in examples] 35 | targets = [torch.tensor(ex[1]) for ex in examples] 36 | # 对batch内的样本进行padding,使其具有相同长度 37 | inputs = pad_sequence(inputs, batch_first=True, padding_value=self.pad) 38 | targets = pad_sequence(targets, batch_first=True, padding_value=self.pad) 39 | return (inputs, targets) 40 | 41 | class RNNLM(nn.Module): 42 | def __init__(self, vocab_size, embedding_dim, hidden_dim): 43 | super(RNNLM, self).__init__() 44 | # 词嵌入层 45 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 46 | # 循环神经网络:这里使用LSTM 47 | self.rnn = nn.LSTM(embedding_dim, hidden_dim, 
batch_first=True) 48 | # 输出层 49 | self.output = nn.Linear(hidden_dim, vocab_size) 50 | 51 | def forward(self, inputs): 52 | embeds = self.embeddings(inputs) 53 | # 计算每一时刻的隐含层表示 54 | hidden, _ = self.rnn(embeds) 55 | output = self.output(hidden) 56 | log_probs = F.log_softmax(output, dim=2) 57 | return log_probs 58 | 59 | embedding_dim = 64 60 | context_size = 2 61 | hidden_dim = 128 62 | batch_size = 512 63 | num_epoch = 5 64 | 65 | # 读取文本数据,构建FFNNLM训练数据集(n-grams) 66 | corpus, vocab = load_reuters() 67 | dataset = RnnlmDataset(corpus, vocab) 68 | data_loader = get_loader(dataset, batch_size) 69 | 70 | # 负对数似然损失函数,忽略pad_token处的损失 71 | nll_loss = nn.NLLLoss(ignore_index=dataset.pad) 72 | # 构建RNNLM,并加载至device 73 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 74 | model = RNNLM(len(vocab), embedding_dim, hidden_dim) 75 | model.to(device) 76 | # 使用Adam优化器 77 | optimizer = optim.Adam(model.parameters(), lr=0.001) 78 | 79 | model.train() 80 | for epoch in range(num_epoch): 81 | total_loss = 0 82 | for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"): 83 | inputs, targets = [x.to(device) for x in batch] 84 | optimizer.zero_grad() 85 | log_probs = model(inputs) 86 | loss = nll_loss(log_probs.view(-1, log_probs.shape[-1]), targets.view(-1)) 87 | loss.backward() 88 | optimizer.step() 89 | total_loss += loss.item() 90 | print(f"Loss: {total_loss:.2f}") 91 | 92 | save_pretrained(vocab, model.embeddings.weight.data, "rnnlm.vec") 93 | 94 | -------------------------------------------------------------------------------- /chp5/tflm/__init__.py: -------------------------------------------------------------------------------- 1 | from .train import train_tflm 2 | from .sample import sample_tflm 3 | -------------------------------------------------------------------------------- /chp5/tflm/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/chp5/tflm/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /chp5/tflm/__pycache__/dataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/chp5/tflm/__pycache__/dataset.cpython-38.pyc -------------------------------------------------------------------------------- /chp5/tflm/__pycache__/model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/chp5/tflm/__pycache__/model.cpython-38.pyc -------------------------------------------------------------------------------- /chp5/tflm/__pycache__/sample.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/chp5/tflm/__pycache__/sample.cpython-38.pyc -------------------------------------------------------------------------------- /chp5/tflm/__pycache__/train.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/chp5/tflm/__pycache__/train.cpython-38.pyc 
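# --- Illustrative usage sketch (added; not a file in the repository) ---
# chp5/tflm/__init__.py above exposes train_tflm and sample_tflm. A hedged example of
# calling the sampler from an interactive session: the prompt text and step count are
# made-up values, and a checkpoint named "tflm.model" is assumed to exist already
# (produced by train_tflm, whose arguments are not shown in this excerpt).
from tflm import sample_tflm

sample_tflm("the company said", steps=20, model_path="tflm.model", temperature=1.0)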
-------------------------------------------------------------------------------- /chp5/tflm/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | from utils import BOS_TOKEN, EOS_TOKEN 4 | from tqdm.auto import tqdm 5 | 6 | class TransformerDataset(Dataset): 7 | def __init__(self, corpus, vocab, context_size=16): 8 | self.data = [] 9 | self.bos = vocab[BOS_TOKEN] 10 | self.eos = vocab[EOS_TOKEN] 11 | for sentence in tqdm(corpus, desc="Dataset Construction"): 12 | # 插入句首句尾符号 13 | sentence = context_size * [self.bos] + sentence + [self.eos] 14 | for i in range(context_size, len(sentence)): 15 | # 模型输入:长为context_size的上文 16 | context = sentence[i - context_size:i] 17 | # 模型输出:模型输入的下一个词构成的长为context_size的序列 18 | target = sentence[i - context_size + 1: i + 1] 19 | self.data.append((context, target)) 20 | 21 | def __len__(self): 22 | return len(self.data) 23 | 24 | def __getitem__(self, i): 25 | return self.data[i] 26 | 27 | def collate_fn(self, examples): 28 | # 从独立样本集合中构建batch输入输出 29 | inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long) 30 | targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long) 31 | return (inputs, targets) 32 | -------------------------------------------------------------------------------- /chp5/tflm/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from utils import init_weights 5 | from dataclasses import dataclass 6 | 7 | @dataclass 8 | class Config: 9 | def __init__(self, vocab_size, context_size, n_embd=2, n_head=2, n_layer=2): 10 | """ 11 | 12 | :param vocab_size: 词表大小 13 | :param context_size: 最大序列长度, 即Transformer块的"大小" 14 | :param batch_size: 批次大小 15 | :param n_embd: 词向量维度 16 | :param n_head: 注意力头数 17 | :param n_layer: 注意力层数 18 | """ 19 | self.n_embd = n_embd 20 | self.n_head = n_head 21 | self.n_layer = n_layer 22 | self.vocab_size = vocab_size 23 | self.context_size = context_size 24 | 25 | class MultiHeadSelfAttention(nn.Module): 26 | def __init__(self, config): 27 | super().__init__() 28 | 29 | # 保存模型配置 30 | self.config = config 31 | 32 | # 保证n_embd可以被n_head整除 33 | assert config.n_embd % config.n_head == 0, "n_embd must be divisible by n_head" 34 | 35 | # 将向量映射到q/k/v 36 | self.proj = nn.Linear(config.n_embd, config.n_embd * 3) 37 | 38 | # 注意力掩码: 不对当前token之后的内容施加注意力, 避免模型看到未来的信息 39 | self.register_buffer("mask", torch.tril(torch.ones(config.context_size, config.context_size)) 40 | .view(1, 1, config.context_size, config.context_size)) 41 | 42 | def forward(self, x): 43 | B, T, C = x.size() # batch_size, seq_len, n_embd 44 | 45 | # 获得batch中每个输入的q, k, v 46 | # x(batch_size, seq_len, n_embd) --proj--> (batch_size, seq_len, n_embd*3) 47 | # --chunk--> q,k,v(batch_size, seq_len, n_embd) 48 | q, k, v = self.proj(x).chunk(3, dim=-1) 49 | 50 | # 将q, k, v分解为n_head组, 每个head对应的向量维度为n_embd/n_head, 在第四维 51 | k = k.view(B, T, self.config.n_head, -1).transpose(1, 2) 52 | q = q.view(B, T, self.config.n_head, -1).transpose(1, 2) 53 | v = v.view(B, T, self.config.n_head, -1).transpose(1, 2) 54 | 55 | # 计算自注意力分数 56 | # (B, n_head, T, hs) x (B, n_head, hs, T) -> (B, n_head, T, T) 57 | attn = (q @ k.transpose(-2, -1)) / (k.size(-1) ** 0.5) 58 | 59 | # 应用掩码 60 | attn = attn.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf')) 61 | # 将注意力分数转化为注意力分布 62 | attn = F.softmax(attn, dim=-1) 63 | 64 | # 注意力分布与v相乘, 得到注意力输出 65 | y = attn @ 
v 66 | 67 | # head组的输出拼接起来 68 | y = y.transpose(1, 2).reshape(B, T, C) 69 | 70 | return y 71 | 72 | 73 | class MLP(nn.Module): 74 | """ 75 | 两层全连接网络 76 | 用于为Transformer的每个Block添加非线性表示能力 77 | """ 78 | 79 | def __init__(self, config): 80 | super().__init__() 81 | # 隐层, 将向量映射到4倍的维度 82 | self.fc1 = nn.Linear(config.n_embd, 4 * config.n_embd) 83 | # 激活 84 | self.gelu = nn.GELU() 85 | # 输出层, 将向量映射回原来的维度 86 | self.fc2 = nn.Linear(4 * config.n_embd, config.n_embd) 87 | 88 | def forward(self, x): 89 | x = self.fc1(x) 90 | x = self.gelu(x) 91 | x = self.fc2(x) 92 | return x 93 | 94 | 95 | class Block(nn.Module): 96 | """ 97 | Transformer的基本单元 98 | 在每个子层的入口进行归一化和残差连接 99 | """ 100 | 101 | def __init__(self, config): 102 | super().__init__() 103 | # 归一化 104 | self.ln_1 = nn.LayerNorm(config.n_embd) 105 | # 多头自注意力块 106 | self.attn = MultiHeadSelfAttention(config) 107 | # 归一化 108 | self.ln_2 = nn.LayerNorm(config.n_embd) 109 | # 前馈网络 110 | self.mlp = MLP(config) 111 | 112 | def forward(self, x): 113 | # x: (batch_size, seq_len, n_embd) 114 | 115 | # self.attn(x) 对 x 应用多头自注意力 116 | # x + self.attn(x)的过程为残差连接 117 | # self.ln_1对残差连接的结果进行归一化 118 | x = self.ln_1(x + self.attn(x)) 119 | 120 | # 应用前馈网络, 并进行残差连接和归一化 121 | x = self.ln_2(x + self.mlp(x)) 122 | return x 123 | 124 | 125 | class Transformer(nn.Module): 126 | """ 127 | Transformer模型 128 | 输入部分: 词向量 + 位置向量 + dropout 129 | 编码部分: 由多个Block组成 130 | 输出部分: 归一化 + 线性映射 131 | """ 132 | 133 | def __init__(self, config): 134 | super().__init__() 135 | # 配置信息 136 | self.config = config 137 | 138 | # 词向量: 将输入的id映射为词向量 139 | self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd) 140 | # 位置向量: 将输入的位置映射为位置向量 141 | self.pos_emb = nn.Embedding(config.context_size, config.n_embd) 142 | # 层归一化: 对输入进行归一化(块间和块输出已经进行了归一化) 143 | self.ln_f = nn.LayerNorm(config.n_embd) 144 | 145 | # 编码层: 由多个Transformer块组成 146 | self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)]) 147 | 148 | # 解码层: 将输出的词向量映射为词id 149 | self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) 150 | 151 | def forward(self, x, y=None): 152 | # 要求输入序列长度不能大于块大小 153 | _, seq_len = x.size() 154 | assert seq_len <= self.config.context_size, "Cannot forward, model block size is exhausted." 
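# Shape walk-through (illustrative only, assuming the hyperparameters set in train.py:
# context_size=64, n_embd=128, n_head=4, n_layer=4):
#   x: (batch_size, 64) token ids
#   -> token_embeddings + position_embeddings: (batch_size, 64, 128)
#   -> after the 4 shape-preserving Blocks: (batch_size, 64, 128)
#   -> logits = head(x): (batch_size, 64, vocab_size)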
155 | 156 | # 获取词向量 157 | # x(batch_size, seq_len) --> token_embeddings: (batch_size, seq_len, n_embd) 158 | token_embeddings = self.tok_emb(x) 159 | 160 | # 获取位置向量 161 | pos = torch.arange(seq_len, dtype=torch.long).to(x.device) 162 | position_embeddings = self.pos_emb(pos) 163 | 164 | # 二者相加作为输入 165 | x = token_embeddings + position_embeddings 166 | 167 | x = self.ln_f(x) 168 | 169 | # 通过多个Transformer块进行编码 170 | for block in self.blocks: 171 | x = block(x) 172 | 173 | # 解码为对下一个token的回归预测 174 | # x(batch_size, seq_len, n_embd) --> logits(batch_size, seq_len, vocab_size) 175 | logits = self.head(x) 176 | 177 | # 如果有给定的目标输出, 则计算对数似然损失 178 | loss = None 179 | if y is not None: 180 | # 计算损失 181 | # x(batch_size, seq_len, vocab_size) --> x(batch_size*seq_len, vocab_size) 182 | # y(batch_size * seq_len) 183 | loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1)) 184 | 185 | return logits, loss 186 | -------------------------------------------------------------------------------- /chp5/tflm/sample.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional as F 3 | from utils import load_pretrained, save_pretrained, BOS_TOKEN, EOS_TOKEN 4 | from .model import Transformer 5 | 6 | @torch.no_grad() 7 | def sample(model, vocab, x, steps, temperature=1.0): 8 | """ 9 | 接收一个输入序列 x (形状为 (b, t))并预测序列中的下一个词元,每次将预测结果反馈给模型。 10 | 用temperature配合随机采样可以增加/减少随机性 11 | """ 12 | 13 | # 设置为评估模式 14 | model.eval() 15 | 16 | # 生成符合目标长度的序列 17 | for k in range(steps): 18 | # 如果对于Transformer, 如果上文过长, 截取前context_size个token 19 | if x.size(1) >= model.config.context_size: 20 | x_cond = x[:, -model.config.context_size:] 21 | # 如果上文不够长,在其末尾进行padding,由于掩码机制,这部分内容不会影响结果 22 | else: 23 | pad = torch.zeros(x.size(0), model.config.context_size - x.size(1)) 24 | x_cond = torch.cat((pad.long().to(x.device), x), dim=1) 25 | 26 | # 用模型进行预测 27 | logits = model(x_cond) 28 | # Transformer的输出是logit,loss,并且要取第input_length个数据的结果 29 | input_length = min(x_cond.size(1), model.config.context_size) 30 | logits = logits[0][:, input_length - 1, :] 31 | # 提取最后一步的输出结果并按温度缩放,温度越高,采样越随机 32 | probs = F.softmax(logits / temperature, dim=-1) 33 | 34 | # 根据prob进行多项式采样 35 | ix = torch.multinomial(probs, num_samples=1) 36 | if ix == vocab[EOS_TOKEN]: 37 | break 38 | 39 | # 将结果添加到序列并继续 40 | x = torch.cat((x, ix), dim=1) 41 | return x 42 | 43 | def sample_tflm(context, steps=10, model_path="tflm.model", temperature=1.0): 44 | # 判断是否有可用的GPU 45 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 46 | # 加载模型和词表到可用的设备上 47 | vocab, model = load_pretrained(model_path, map_location=device) 48 | # 将context全部小写化并按空格分割 49 | context = context.lower().split() 50 | context = model.config.context_size * [BOS_TOKEN] + context 51 | 52 | # 将输入内容转换为id序列 53 | x = torch.tensor([vocab.convert_tokens_to_ids(context)]).to(device) 54 | 55 | # 生成结果并转换为token序列 56 | y = sample(model, vocab, x, steps=steps, temperature=temperature)[0] 57 | y = vocab.convert_ids_to_tokens(y) 58 | 59 | print(" ".join(y)) 60 | -------------------------------------------------------------------------------- /chp5/tflm/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | import torch.optim as optim 5 | 6 | from .dataset import TransformerDataset 7 | from .model import Transformer, Config 8 | from utils import load_reuters, save_pretrained, device, get_loader 9 | 10 | from 
tqdm.auto import tqdm 11 | 12 | def train_tflm(batch_size, num_epoch): 13 | corpus, vocab = load_reuters() 14 | # 设置参数 15 | train_config = Config( 16 | vocab_size=len(vocab), 17 | context_size=64, 18 | n_embd=128, 19 | n_head=4, 20 | n_layer=4) 21 | 22 | dataset = TransformerDataset(corpus, vocab) 23 | data_loader = get_loader(dataset, batch_size) 24 | 25 | # 负对数似然损失函数,忽略pad_token处的损失 26 | nll_loss = nn.NLLLoss() 27 | # 构建TransformerLM,并加载至device 28 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 29 | model = Transformer(train_config) 30 | model.to(device) 31 | # 使用Adam优化器 32 | optimizer = optim.Adam(model.parameters(), lr=0.001) 33 | 34 | model.train() 35 | for epoch in range(num_epoch): 36 | total_loss = 0 37 | for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"): 38 | inputs, targets = [x.to(device) for x in batch] 39 | optimizer.zero_grad() 40 | # 生成并计算损失 41 | _, loss = model(inputs, targets) 42 | loss.backward() 43 | optimizer.step() 44 | total_loss += loss.item() 45 | print(f"Loss: {total_loss:.2f}") 46 | 47 | save_pretrained(vocab, model, "tflm.model") 48 | -------------------------------------------------------------------------------- /chp5/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset, DataLoader 3 | from vocab import Vocab 4 | 5 | # Constants 6 | BOS_TOKEN = "<bos>" 7 | EOS_TOKEN = "<eos>" 8 | PAD_TOKEN = "<pad>" 9 | BOW_TOKEN = "<bow>" 10 | EOW_TOKEN = "<eow>" 11 | 12 | WEIGHT_INIT_RANGE = 0.1 13 | 14 | def load_reuters(): 15 | from nltk.corpus import reuters 16 | text = reuters.sents() 17 | # lowercase (optional) 18 | text = [[word.lower() for word in sentence] for sentence in text] 19 | vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN]) 20 | corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text] 21 | 22 | return corpus, vocab 23 | 24 | def save_pretrained(vocab, embeds, save_path): 25 | """ 26 | Save pretrained token vectors in a unified format, where the first line 27 | specifies the `number_of_tokens` and `embedding_dim` followed with all 28 | token vectors, one token per line.
29 | """ 30 | with open(save_path, "w") as writer: 31 | writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n") 32 | for idx, token in enumerate(vocab.idx_to_token): 33 | vec = " ".join(["{:.4f}".format(x) for x in embeds[idx]]) 34 | writer.write(f"{token} {vec}\n") 35 | print(f"Pretrained embeddings saved to: {save_path}") 36 | 37 | def load_pretrained(load_path): 38 | with open(load_path, "r") as fin: 39 | # Optional: depending on the specific format of pretrained vector file 40 | n, d = map(int, fin.readline().split()) 41 | tokens = [] 42 | embeds = [] 43 | for line in fin: 44 | line = line.rstrip().split(' ') 45 | token, embed = line[0], list(map(float, line[1:])) 46 | tokens.append(token) 47 | embeds.append(embed) 48 | vocab = Vocab(tokens) 49 | embeds = torch.tensor(embeds, dtype=torch.float) 50 | return vocab, embeds 51 | 52 | def get_loader(dataset, batch_size, shuffle=True): 53 | data_loader = DataLoader( 54 | dataset, 55 | batch_size=batch_size, 56 | collate_fn=dataset.collate_fn, 57 | shuffle=shuffle 58 | ) 59 | return data_loader 60 | 61 | def init_weights(model): 62 | for name, param in model.named_parameters(): 63 | if "embedding" not in name: 64 | torch.nn.init.uniform_( 65 | param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE 66 | ) 67 | 68 | -------------------------------------------------------------------------------- /chp5/vocab.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | from collections import defaultdict, Counter 4 | 5 | class Vocab: 6 | def __init__(self, tokens=None): 7 | self.idx_to_token = list() 8 | self.token_to_idx = dict() 9 | 10 | if tokens is not None: 11 | if "" not in tokens: 12 | tokens = tokens + [""] 13 | for token in tokens: 14 | self.idx_to_token.append(token) 15 | self.token_to_idx[token] = len(self.idx_to_token) - 1 16 | self.unk = self.token_to_idx[''] 17 | 18 | @classmethod 19 | def build(cls, text, min_freq=1, reserved_tokens=None): 20 | token_freqs = defaultdict(int) 21 | for sentence in text: 22 | for token in sentence: 23 | token_freqs[token] += 1 24 | uniq_tokens = [""] + (reserved_tokens if reserved_tokens else []) 25 | uniq_tokens += [token for token, freq in token_freqs.items() \ 26 | if freq >= min_freq and token != ""] 27 | return cls(uniq_tokens) 28 | 29 | def __len__(self): 30 | return len(self.idx_to_token) 31 | 32 | def __getitem__(self, token): 33 | return self.token_to_idx.get(token, self.unk) 34 | 35 | def convert_tokens_to_ids(self, tokens): 36 | return [self[token] for token in tokens] 37 | 38 | def convert_ids_to_tokens(self, indices): 39 | return [self.idx_to_token[index] for index in indices] 40 | 41 | 42 | def save_vocab(vocab, path): 43 | with open(path, 'w') as writer: 44 | writer.write("\n".join(vocab.idx_to_token)) 45 | 46 | 47 | def read_vocab(path): 48 | with open(path, 'r') as f: 49 | tokens = f.read().split('\n') 50 | return Vocab(tokens) 51 | 52 | -------------------------------------------------------------------------------- /chp6/cbow.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 6.1.5 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.utils.data import Dataset 8 | from torch.nn.utils.rnn import pad_sequence 9 | from tqdm.auto import tqdm 10 | from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN 11 | from utils import load_reuters, save_pretrained, get_loader, init_weights 12 | 13 | class 
CbowDataset(Dataset): 14 | def __init__(self, corpus, vocab, context_size=2): 15 | self.data = [] 16 | self.bos = vocab[BOS_TOKEN] 17 | self.eos = vocab[EOS_TOKEN] 18 | for sentence in tqdm(corpus, desc="Dataset Construction"): 19 | sentence = [self.bos] + sentence+ [self.eos] 20 | if len(sentence) < context_size * 2 + 1: 21 | continue 22 | for i in range(context_size, len(sentence) - context_size): 23 | # 模型输入:左右分别取context_size长度的上下文 24 | context = sentence[i-context_size:i] + sentence[i+1:i+context_size+1] 25 | # 模型输出:当前词 26 | target = sentence[i] 27 | self.data.append((context, target)) 28 | 29 | def __len__(self): 30 | return len(self.data) 31 | 32 | def __getitem__(self, i): 33 | return self.data[i] 34 | 35 | def collate_fn(self, examples): 36 | inputs = torch.tensor([ex[0] for ex in examples]) 37 | targets = torch.tensor([ex[1] for ex in examples]) 38 | return (inputs, targets) 39 | 40 | class CbowModel(nn.Module): 41 | def __init__(self, vocab_size, embedding_dim): 42 | super(CbowModel, self).__init__() 43 | # 词嵌入层 44 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 45 | # 线性变换:隐含层->输出层 46 | self.output = nn.Linear(embedding_dim, vocab_size) 47 | init_weights(self) 48 | 49 | def forward(self, inputs): 50 | embeds = self.embeddings(inputs) 51 | # 计算隐含层:对上下文词向量求平均 52 | hidden = embeds.mean(dim=1) 53 | output = self.output(hidden) 54 | log_probs = F.log_softmax(output, dim=1) 55 | return log_probs 56 | 57 | embedding_dim = 64 58 | context_size = 2 59 | hidden_dim = 128 60 | batch_size = 512 61 | num_epoch = 5 62 | 63 | # 读取文本数据,构建CBOW模型训练数据集 64 | corpus, vocab = load_reuters() 65 | dataset = CbowDataset(corpus, vocab, context_size=context_size) 66 | data_loader = get_loader(dataset, batch_size) 67 | 68 | nll_loss = nn.NLLLoss() 69 | # 构建CBOW模型,并加载至device 70 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 71 | model = CbowModel(len(vocab), embedding_dim) 72 | model.to(device) 73 | optimizer = optim.Adam(model.parameters(), lr=0.001) 74 | 75 | model.train() 76 | for epoch in range(num_epoch): 77 | total_loss = 0 78 | for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"): 79 | inputs, targets = [x.to(device) for x in batch] 80 | optimizer.zero_grad() 81 | log_probs = model(inputs) 82 | loss = nll_loss(log_probs, targets) 83 | loss.backward() 84 | optimizer.step() 85 | total_loss += loss.item() 86 | print(f"Loss: {total_loss:.2f}") 87 | 88 | # 保存词向量(model.embeddings) 89 | save_pretrained(vocab, model.embeddings.weight.data, "cbow.vec") 90 | 91 | -------------------------------------------------------------------------------- /chp6/evaluate.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 6.1.6 2 | 3 | import torch 4 | from utils import load_pretrained 5 | 6 | def knn(W, x, k): 7 | similarities = torch.matmul(x, W.transpose(1, 0)) / (torch.norm(W, dim=1) * torch.norm(x) + 1e-9) 8 | knn = similarities.topk(k=k) 9 | return knn.values.tolist(), knn.indices.tolist() 10 | 11 | def find_similar_words(embeds, vocab, query, k=5): 12 | knn_values, knn_indices = knn(embeds, embeds[vocab[query]], k + 1) 13 | knn_words = vocab.convert_ids_to_tokens(knn_indices) 14 | print(f">>> Query word: {query}") 15 | for i in range(k): 16 | print(f"cosine similarity={knn_values[i + 1]:.4f}: {knn_words[i + 1]}") 17 | 18 | word_sim_queries = ["china", "august", "good", "paris"] 19 | vocab, embeds = load_pretrained("glove.vec") 20 | for w in word_sim_queries: 21 | find_similar_words(embeds, vocab, w) 22 | 23 | 24 | 
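# A minimal sketch: the same nearest-neighbour queries work for any vector file written
# by save_pretrained in this chapter, e.g. "cbow.vec", "skipgram.vec" or "sgns.vec",
# assuming the corresponding training scripts have been run first:
# for vec_path in ["cbow.vec", "skipgram.vec", "sgns.vec"]:
#     alt_vocab, alt_embeds = load_pretrained(vec_path)
#     find_similar_words(alt_embeds, alt_vocab, "china")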
def find_analogy(embeds, vocab, word_a, word_b, word_c): 25 | vecs = embeds[vocab.convert_tokens_to_ids([word_a, word_b, word_c])] 26 | x = vecs[2] + vecs[1] - vecs[0] 27 | knn_values, knn_indices = knn(embeds, x, k=1) 28 | analogies = vocab.convert_ids_to_tokens(knn_indices) 29 | print(f">>> Query: {word_a}, {word_b}, {word_c}") 30 | print(f"{analogies}") 31 | 32 | word_analogy_queries = [["brother", "sister", "man"], 33 | ["paris", "france", "berlin"]] 34 | vocab, embeds = load_pretrained("glove.vec") 35 | for w_a, w_b, w_c in word_analogy_queries: 36 | find_analogy(embeds, vocab, w_a, w_b, w_c) 37 | 38 | -------------------------------------------------------------------------------- /chp6/glove.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 6.1.5 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.utils.data import Dataset 8 | from torch.nn.utils.rnn import pad_sequence 9 | from tqdm.auto import tqdm 10 | from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN 11 | from utils import load_reuters, save_pretrained, get_loader, init_weights 12 | from collections import defaultdict 13 | 14 | class GloveDataset(Dataset): 15 | def __init__(self, corpus, vocab, context_size=2): 16 | # 记录词与上下文在给定语料中的共现次数 17 | self.cooccur_counts = defaultdict(float) 18 | self.bos = vocab[BOS_TOKEN] 19 | self.eos = vocab[EOS_TOKEN] 20 | for sentence in tqdm(corpus, desc="Dataset Construction"): 21 | sentence = [self.bos] + sentence + [self.eos] 22 | for i in range(1, len(sentence)-1): 23 | w = sentence[i] 24 | left_contexts = sentence[max(0, i - context_size):i] 25 | right_contexts = sentence[i+1:min(len(sentence), i + context_size)+1] 26 | # 共现次数随距离衰减: 1/d(w, c) 27 | for k, c in enumerate(left_contexts[::-1]): 28 | self.cooccur_counts[(w, c)] += 1 / (k + 1) 29 | for k, c in enumerate(right_contexts): 30 | self.cooccur_counts[(w, c)] += 1 / (k + 1) 31 | self.data = [(w, c, count) for (w, c), count in self.cooccur_counts.items()] 32 | 33 | def __len__(self): 34 | return len(self.data) 35 | 36 | def __getitem__(self, i): 37 | return self.data[i] 38 | 39 | def collate_fn(self, examples): 40 | words = torch.tensor([ex[0] for ex in examples]) 41 | contexts = torch.tensor([ex[1] for ex in examples]) 42 | counts = torch.tensor([ex[2] for ex in examples]) 43 | return (words, contexts, counts) 44 | 45 | class GloveModel(nn.Module): 46 | def __init__(self, vocab_size, embedding_dim): 47 | super(GloveModel, self).__init__() 48 | # 词嵌入及偏置向量 49 | self.w_embeddings = nn.Embedding(vocab_size, embedding_dim) 50 | self.w_biases = nn.Embedding(vocab_size, 1) 51 | # 上下文嵌入及偏置向量 52 | self.c_embeddings = nn.Embedding(vocab_size, embedding_dim) 53 | self.c_biases = nn.Embedding(vocab_size, 1) 54 | 55 | def forward_w(self, words): 56 | w_embeds = self.w_embeddings(words) 57 | w_biases = self.w_biases(words) 58 | return w_embeds, w_biases 59 | 60 | def forward_c(self, contexts): 61 | c_embeds = self.c_embeddings(contexts) 62 | c_biases = self.c_biases(contexts) 63 | return c_embeds, c_biases 64 | 65 | embedding_dim = 64 66 | context_size = 2 67 | batch_size = 512 68 | num_epoch = 5 69 | 70 | # 用以控制样本权重的超参数 71 | m_max = 100 72 | alpha = 0.75 73 | # 从文本数据中构建GloVe训练数据集 74 | corpus, vocab = load_reuters() 75 | dataset = GloveDataset( 76 | corpus, 77 | vocab, 78 | context_size=context_size 79 | ) 80 | data_loader = get_loader(dataset, batch_size) 81 | 82 | device = torch.device('cuda' if 
torch.cuda.is_available() else 'cpu') 83 | model = GloveModel(len(vocab), embedding_dim) 84 | model.to(device) 85 | optimizer = optim.Adam(model.parameters(), lr=0.001) 86 | 87 | model.train() 88 | for epoch in range(num_epoch): 89 | total_loss = 0 90 | for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"): 91 | words, contexts, counts = [x.to(device) for x in batch] 92 | # 提取batch内词、上下文的向量表示及偏置 93 | word_embeds, word_biases = model.forward_w(words) 94 | context_embeds, context_biases = model.forward_c(contexts) 95 | # 回归目标值:必要时可以使用log(counts+1)进行平滑 96 | log_counts = torch.log(counts) 97 | # 样本权重 98 | weight_factor = torch.clamp(torch.pow(counts / m_max, alpha), max=1.0) 99 | optimizer.zero_grad() 100 | # 计算batch内每个样本的L2损失 101 | loss = (torch.sum(word_embeds * context_embeds, dim=1, keepdim=True) + word_biases + context_biases - log_counts) ** 2 102 | # 样本加权损失 103 | wavg_loss = (weight_factor * loss).mean() 104 | wavg_loss.backward() 105 | optimizer.step() 106 | total_loss += wavg_loss.item() 107 | print(f"Loss: {total_loss:.2f}") 108 | 109 | # 合并词嵌入矩阵与上下文嵌入矩阵,作为最终的预训练词向量 110 | combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight 111 | save_pretrained(vocab, combined_embeds.data, "glove.vec") 112 | 113 | -------------------------------------------------------------------------------- /chp6/sgns.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 6.1.5 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.utils.data import Dataset 8 | from torch.nn.utils.rnn import pad_sequence 9 | from tqdm.auto import tqdm 10 | from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN 11 | from utils import load_reuters, save_pretrained, get_loader, init_weights 12 | 13 | class SGNSDataset(Dataset): 14 | def __init__(self, corpus, vocab, context_size=2, n_negatives=5, ns_dist=None): 15 | self.data = [] 16 | self.bos = vocab[BOS_TOKEN] 17 | self.eos = vocab[EOS_TOKEN] 18 | self.pad = vocab[PAD_TOKEN] 19 | for sentence in tqdm(corpus, desc="Dataset Construction"): 20 | sentence = [self.bos] + sentence + [self.eos] 21 | for i in range(1, len(sentence)-1): 22 | # 模型输入:(w, context) ;输出为0/1,表示context是否为负样本 23 | w = sentence[i] 24 | left_context_index = max(0, i - context_size) 25 | right_context_index = min(len(sentence), i + context_size) 26 | context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1] 27 | context += [self.pad] * (2 * context_size - len(context)) 28 | self.data.append((w, context)) 29 | 30 | # 负样本数量 31 | self.n_negatives = n_negatives 32 | # 负采样分布:若参数ns_dist为None,则使用uniform分布 33 | self.ns_dist = ns_dist if ns_dist is not None else torch.ones(len(vocab)) 34 | 35 | def __len__(self): 36 | return len(self.data) 37 | 38 | def __getitem__(self, i): 39 | return self.data[i] 40 | 41 | def collate_fn(self, examples): 42 | words = torch.tensor([ex[0] for ex in examples], dtype=torch.long) 43 | contexts = torch.tensor([ex[1] for ex in examples], dtype=torch.long) 44 | batch_size, context_size = contexts.shape 45 | neg_contexts = [] 46 | # 对batch内的样本分别进行负采样 47 | for i in range(batch_size): 48 | # 保证负样本不包含当前样本中的context 49 | ns_dist = self.ns_dist.index_fill(0, contexts[i], .0) 50 | neg_contexts.append(torch.multinomial(ns_dist, self.n_negatives * context_size, replacement=True)) 51 | neg_contexts = torch.stack(neg_contexts, dim=0) 52 | return words, contexts, neg_contexts 53 | 54 | class SGNSModel(nn.Module): 55 | def __init__(self, 
vocab_size, embedding_dim): 56 | super(SGNSModel, self).__init__() 57 | # 词嵌入 58 | self.w_embeddings = nn.Embedding(vocab_size, embedding_dim) 59 | # 上下文嵌入 60 | self.c_embeddings = nn.Embedding(vocab_size, embedding_dim) 61 | 62 | def forward_w(self, words): 63 | w_embeds = self.w_embeddings(words) 64 | return w_embeds 65 | 66 | def forward_c(self, contexts): 67 | c_embeds = self.c_embeddings(contexts) 68 | return c_embeds 69 | 70 | 71 | def get_unigram_distribution(corpus, vocab_size): 72 | # 从给定语料中统计unigram概率分布 73 | token_counts = torch.tensor([0] * vocab_size) 74 | total_count = 0 75 | for sentence in corpus: 76 | total_count += len(sentence) 77 | for token in sentence: 78 | token_counts[token] += 1 79 | unigram_dist = torch.div(token_counts.float(), total_count) 80 | return unigram_dist 81 | 82 | embedding_dim = 64 83 | context_size = 2 84 | hidden_dim = 128 85 | batch_size = 1024 86 | num_epoch = 10 87 | n_negatives = 10 88 | 89 | # 读取文本数据 90 | corpus, vocab = load_reuters() 91 | # 计算unigram概率分布 92 | unigram_dist = get_unigram_distribution(corpus, len(vocab)) 93 | # 根据unigram分布计算负采样分布: p(w)**0.75 94 | negative_sampling_dist = unigram_dist ** 0.75 95 | negative_sampling_dist /= negative_sampling_dist.sum() 96 | # 构建SGNS训练数据集 97 | dataset = SGNSDataset( 98 | corpus, 99 | vocab, 100 | context_size=context_size, 101 | n_negatives=n_negatives, 102 | ns_dist=negative_sampling_dist 103 | ) 104 | data_loader = get_loader(dataset, batch_size) 105 | 106 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 107 | model = SGNSModel(len(vocab), embedding_dim) 108 | model.to(device) 109 | optimizer = optim.Adam(model.parameters(), lr=0.001) 110 | 111 | model.train() 112 | for epoch in range(num_epoch): 113 | total_loss = 0 114 | for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"): 115 | words, contexts, neg_contexts = [x.to(device) for x in batch] 116 | optimizer.zero_grad() 117 | batch_size = words.shape[0] 118 | # 提取batch内词、上下文以及负样本的向量表示 119 | word_embeds = model.forward_w(words).unsqueeze(dim=2) 120 | context_embeds = model.forward_c(contexts) 121 | neg_context_embeds = model.forward_c(neg_contexts) 122 | # 正样本的分类(对数)似然 123 | context_loss = F.logsigmoid(torch.bmm(context_embeds, word_embeds).squeeze(dim=2)) 124 | context_loss = context_loss.mean(dim=1) 125 | # 负样本的分类(对数)似然 126 | neg_context_loss = F.logsigmoid(torch.bmm(neg_context_embeds, word_embeds).squeeze(dim=2).neg()) 127 | neg_context_loss = neg_context_loss.view(batch_size, -1, n_negatives).sum(dim=2) 128 | neg_context_loss = neg_context_loss.mean(dim=1) 129 | # 损失:负对数似然 130 | loss = -(context_loss + neg_context_loss).mean() 131 | loss.backward() 132 | optimizer.step() 133 | total_loss += loss.item() 134 | print(f"Loss: {total_loss:.2f}") 135 | 136 | # 合并词嵌入矩阵与上下文嵌入矩阵,作为最终的预训练词向量 137 | combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight 138 | save_pretrained(vocab, combined_embeds.data, "sgns.vec") 139 | -------------------------------------------------------------------------------- /chp6/skipgram.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 6.1.5 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.utils.data import Dataset 7 | from torch.nn.utils.rnn import pad_sequence 8 | from tqdm.auto import tqdm 9 | from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN 10 | from utils import load_reuters, save_pretrained, get_loader, init_weights 11 | 12 | class 
SkipGramDataset(Dataset): 13 | def __init__(self, corpus, vocab, context_size=2): 14 | self.data = [] 15 | self.bos = vocab[BOS_TOKEN] 16 | self.eos = vocab[EOS_TOKEN] 17 | for sentence in tqdm(corpus, desc="Dataset Construction"): 18 | sentence = [self.bos] + sentence + [self.eos] 19 | for i in range(1, len(sentence)-1): 20 | # 模型输入:当前词 21 | w = sentence[i] 22 | # 模型输出:一定窗口大小内的上下文 23 | left_context_index = max(0, i - context_size) 24 | right_context_index = min(len(sentence), i + context_size) 25 | context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1] 26 | self.data.extend([(w, c) for c in context]) 27 | 28 | def __len__(self): 29 | return len(self.data) 30 | 31 | def __getitem__(self, i): 32 | return self.data[i] 33 | 34 | def collate_fn(self, examples): 35 | inputs = torch.tensor([ex[0] for ex in examples]) 36 | targets = torch.tensor([ex[1] for ex in examples]) 37 | return (inputs, targets) 38 | 39 | class SkipGramModel(nn.Module): 40 | def __init__(self, vocab_size, embedding_dim): 41 | super(SkipGramModel, self).__init__() 42 | self.embeddings = nn.Embedding(vocab_size, embedding_dim) 43 | self.output = nn.Linear(embedding_dim, vocab_size) 44 | init_weights(self) 45 | 46 | def forward(self, inputs): 47 | embeds = self.embeddings(inputs) 48 | output = self.output(embeds) 49 | log_probs = F.log_softmax(output, dim=1) 50 | return log_probs 51 | 52 | embedding_dim = 64 53 | context_size = 2 54 | hidden_dim = 128 55 | batch_size = 1024 56 | num_epoch = 10 57 | 58 | # 读取文本数据,构建Skip-gram模型训练数据集 59 | corpus, vocab = load_reuters() 60 | dataset = SkipGramDataset(corpus, vocab, context_size=context_size) 61 | data_loader = get_loader(dataset, batch_size) 62 | 63 | nll_loss = nn.NLLLoss() 64 | # 构建Skip-gram模型,并加载至device 65 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 66 | model = SkipGramModel(len(vocab), embedding_dim) 67 | model.to(device) 68 | optimizer = optim.Adam(model.parameters(), lr=0.001) 69 | 70 | model.train() 71 | for epoch in range(num_epoch): 72 | total_loss = 0 73 | for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"): 74 | inputs, targets = [x.to(device) for x in batch] 75 | optimizer.zero_grad() 76 | log_probs = model(inputs) 77 | loss = nll_loss(log_probs, targets) 78 | loss.backward() 79 | optimizer.step() 80 | total_loss += loss.item() 81 | print(f"Loss: {total_loss:.2f}") 82 | 83 | # 保存词向量(model.embeddings) 84 | save_pretrained(vocab, model.embeddings.weight.data, "skipgram.vec") 85 | 86 | -------------------------------------------------------------------------------- /chp6/train_elmo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn.modules import Dropout 5 | import torch.optim as optim 6 | from torch.nn.utils.rnn import pad_sequence 7 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 8 | from torch.utils.data import Dataset 9 | from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN 10 | from utils import BOW_TOKEN, EOW_TOKEN 11 | from utils import get_loader 12 | from vocab import Vocab, save_vocab 13 | 14 | import codecs 15 | import json 16 | import os 17 | import numpy as np 18 | from tqdm.auto import tqdm 19 | from collections import defaultdict 20 | 21 | def load_corpus(path, max_tok_len=None, max_seq_len=None): 22 | # Read raw text file 23 | # and build vocabulary for both words and chars 24 | text = [] 25 | charset = {BOS_TOKEN, EOS_TOKEN, PAD_TOKEN, 
BOW_TOKEN, EOW_TOKEN} 26 | print(f"Loading corpus from {path}") 27 | with codecs.open(path, "r", encoding="utf-8") as f: 28 | for line in tqdm(f): 29 | tokens = line.rstrip().split(" ") 30 | if max_seq_len is not None and len(tokens) + 2 > max_seq_len: 31 | tokens = line[:max_seq_len-2] 32 | sent = [BOS_TOKEN] 33 | for token in tokens: 34 | if max_tok_len is not None and len(token) + 2 > max_tok_len: 35 | token = token[:max_tok_len-2] 36 | sent.append(token) 37 | for ch in token: 38 | charset.add(ch) 39 | sent.append(EOS_TOKEN) 40 | text.append(sent) 41 | 42 | # Build word and character vocabulary 43 | print("Building word-level vocabulary") 44 | vocab_w = Vocab.build( 45 | text, 46 | min_freq=2, 47 | reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN] 48 | ) 49 | print("Building char-level vocabulary") 50 | vocab_c = Vocab(tokens=list(charset)) 51 | 52 | # Construct corpus using word_voab and char_vocab 53 | corpus_w = [vocab_w.convert_tokens_to_ids(sent) for sent in text] 54 | corpus_c = [] 55 | bow = vocab_c[BOW_TOKEN] 56 | eow = vocab_c[EOW_TOKEN] 57 | for i, sent in enumerate(text): 58 | sent_c = [] 59 | for token in sent: 60 | if token == BOS_TOKEN or token == EOS_TOKEN: 61 | token_c = [bow, vocab_c[token], eow] 62 | else: 63 | token_c = [bow] + vocab_c.convert_tokens_to_ids(token) + [eow] 64 | sent_c.append(token_c) 65 | assert len(sent_c) == len(corpus_w[i]) 66 | corpus_c.append(sent_c) 67 | 68 | assert len(corpus_w) == len(corpus_c) 69 | return corpus_w, corpus_c, vocab_w, vocab_c 70 | 71 | # Dataset 72 | class BiLMDataset(Dataset): 73 | def __init__(self, corpus_w, corpus_c, vocab_w, vocab_c): 74 | super(BiLMDataset, self).__init__() 75 | self.pad_w = vocab_w[PAD_TOKEN] 76 | self.pad_c = vocab_c[PAD_TOKEN] 77 | 78 | self.data = [] 79 | for sent_w, sent_c in tqdm(zip(corpus_w, corpus_c)): 80 | self.data.append((sent_w, sent_c)) 81 | 82 | def __len__(self): 83 | return len(self.data) 84 | 85 | def __getitem__(self, i): 86 | return self.data[i] 87 | 88 | def collate_fn(self, examples): 89 | # lengths: batch_size 90 | seq_lens = torch.LongTensor([len(ex[0]) for ex in examples]) 91 | 92 | # inputs_w 93 | inputs_w = [torch.tensor(ex[0]) for ex in examples] 94 | inputs_w = pad_sequence(inputs_w, batch_first=True, padding_value=self.pad_w) 95 | 96 | # inputs_c: batch_size * max_seq_len * max_tok_len 97 | batch_size, max_seq_len = inputs_w.shape 98 | max_tok_len = max([max([len(tok) for tok in ex[1]]) for ex in examples]) 99 | 100 | inputs_c = torch.LongTensor(batch_size, max_seq_len, max_tok_len).fill_(self.pad_c) 101 | for i, (sent_w, sent_c) in enumerate(examples): 102 | for j, tok in enumerate(sent_c): 103 | inputs_c[i][j][:len(tok)] = torch.LongTensor(tok) 104 | 105 | # fw_input_indexes, bw_input_indexes = [], [] 106 | targets_fw = torch.LongTensor(inputs_w.shape).fill_(self.pad_w) 107 | targets_bw = torch.LongTensor(inputs_w.shape).fill_(self.pad_w) 108 | for i, (sent_w, sent_c) in enumerate(examples): 109 | targets_fw[i][:len(sent_w)-1] = torch.LongTensor(sent_w[1:]) 110 | targets_bw[i][1:len(sent_w)] = torch.LongTensor(sent_w[:len(sent_w)-1]) 111 | 112 | return inputs_w, inputs_c, seq_lens, targets_fw, targets_bw 113 | 114 | # Model Components 115 | class Highway(nn.Module): 116 | def __init__(self, input_dim, num_layers, activation=F.relu): 117 | super(Highway, self).__init__() 118 | self.input_dim = input_dim 119 | self.layers = torch.nn.ModuleList( 120 | [nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)] 121 | ) 122 | self.activation = activation 123 | for layer in 
self.layers: 124 | # set bias in the gates to be positive 125 | # such that the highway layer will be biased towards the input part 126 | layer.bias[input_dim:].data.fill_(1) 127 | 128 | def forward(self, inputs): 129 | curr_inputs = inputs 130 | for layer in self.layers: 131 | projected_inputs = layer(curr_inputs) 132 | hidden = self.activation(projected_inputs[:, 0:self.input_dim]) 133 | gate = torch.sigmoid(projected_inputs[:, self.input_dim:]) 134 | curr_inputs = gate * curr_inputs + (1 - gate) * hidden 135 | return curr_inputs 136 | 137 | 138 | class ConvTokenEmbedder(nn.Module): 139 | def __init__( 140 | self, 141 | vocab_c, 142 | char_embedding_dim, 143 | char_conv_filters, 144 | num_highways, 145 | output_dim, 146 | pad="" 147 | ): 148 | super(ConvTokenEmbedder, self).__init__() 149 | self.vocab_c = vocab_c 150 | 151 | self.char_embeddings = nn.Embedding( 152 | len(vocab_c), 153 | char_embedding_dim, 154 | padding_idx=vocab_c[pad] 155 | ) 156 | self.char_embeddings.weight.data.uniform_(-0.25, 0.25) 157 | 158 | self.convolutions = nn.ModuleList() 159 | for kernel_size, out_channels in char_conv_filters: 160 | conv = torch.nn.Conv1d( 161 | in_channels=char_embedding_dim, 162 | out_channels=out_channels, 163 | kernel_size=kernel_size, 164 | bias=True 165 | ) 166 | self.convolutions.append(conv) 167 | 168 | self.num_filters = sum(f[1] for f in char_conv_filters) 169 | self.num_highways = num_highways 170 | self.highways = Highway(self.num_filters, self.num_highways, activation=F.relu) 171 | 172 | self.projection = nn.Linear(self.num_filters, output_dim, bias=True) 173 | 174 | def forward(self, inputs): 175 | batch_size, seq_len, token_len = inputs.shape 176 | inputs = inputs.view(batch_size * seq_len, -1) 177 | char_embeds = self.char_embeddings(inputs) 178 | char_embeds = char_embeds.transpose(1, 2) 179 | 180 | conv_hiddens = [] 181 | for i in range(len(self.convolutions)): 182 | conv_hidden = self.convolutions[i](char_embeds) 183 | conv_hidden, _ = torch.max(conv_hidden, dim=-1) 184 | conv_hidden = F.relu(conv_hidden) 185 | conv_hiddens.append(conv_hidden) 186 | 187 | token_embeds = torch.cat(conv_hiddens, dim=-1) 188 | token_embeds = self.highways(token_embeds) 189 | token_embeds = self.projection(token_embeds) 190 | token_embeds = token_embeds.view(batch_size, seq_len, -1) 191 | 192 | return token_embeds 193 | 194 | class ELMoLstmEncoder(nn.Module): 195 | def __init__( 196 | self, 197 | input_dim, 198 | hidden_dim, 199 | num_layers, 200 | dropout_prob=0.0 201 | ): 202 | super(ELMoLstmEncoder, self).__init__() 203 | 204 | # set projection_dim==input_dim for ELMo usage 205 | self.projection_dim = input_dim 206 | self.num_layers = num_layers 207 | 208 | self.forward_layers = nn.ModuleList() 209 | self.backward_layers = nn.ModuleList() 210 | self.forward_projections = nn.ModuleList() 211 | self.backward_projections = nn.ModuleList() 212 | 213 | lstm_input_dim = input_dim 214 | for _ in range(num_layers): 215 | forward_layer = nn.LSTM( 216 | lstm_input_dim, 217 | hidden_dim, 218 | num_layers=1, 219 | batch_first=True 220 | ) 221 | forward_projection = nn.Linear(hidden_dim, self.projection_dim, bias=True) 222 | 223 | backward_layer = nn.LSTM( 224 | lstm_input_dim, 225 | hidden_dim, 226 | num_layers=1, 227 | batch_first=True 228 | ) 229 | backward_projection = nn.Linear(hidden_dim, self.projection_dim, bias=True) 230 | 231 | lstm_input_dim = self.projection_dim 232 | 233 | self.forward_layers.append(forward_layer) 234 | self.forward_projections.append(forward_projection) 235 | 
self.backward_layers.append(backward_layer) 236 | self.backward_projections.append(backward_projection) 237 | 238 | def forward(self, inputs, lengths): 239 | batch_size, seq_len, input_dim = inputs.shape 240 | rev_idx = torch.arange(seq_len).unsqueeze(0).repeat(batch_size, 1) 241 | for i in range(lengths.shape[0]): 242 | rev_idx[i,:lengths[i]] = torch.arange(lengths[i]-1, -1, -1) 243 | rev_idx = rev_idx.unsqueeze(2).expand_as(inputs) 244 | rev_idx = rev_idx.to(inputs.device) 245 | rev_inputs = inputs.gather(1, rev_idx) 246 | 247 | forward_inputs, backward_inputs = inputs, rev_inputs 248 | stacked_forward_states, stacked_backward_states = [], [] 249 | 250 | for layer_index in range(self.num_layers): 251 | # Transfer `lengths` to CPU to be compatible with latest PyTorch versions. 252 | packed_forward_inputs = pack_padded_sequence( 253 | forward_inputs, lengths.cpu(), batch_first=True, enforce_sorted=False) 254 | packed_backward_inputs = pack_padded_sequence( 255 | backward_inputs, lengths.cpu(), batch_first=True, enforce_sorted=False) 256 | 257 | # forward 258 | forward_layer = self.forward_layers[layer_index] 259 | packed_forward, _ = forward_layer(packed_forward_inputs) 260 | forward = pad_packed_sequence(packed_forward, batch_first=True)[0] 261 | forward = self.forward_projections[layer_index](forward) 262 | stacked_forward_states.append(forward) 263 | 264 | # backward 265 | backward_layer = self.backward_layers[layer_index] 266 | packed_backward, _ = backward_layer(packed_backward_inputs) 267 | backward = pad_packed_sequence(packed_backward, batch_first=True)[0] 268 | backward = self.backward_projections[layer_index](backward) 269 | # convert back to original sequence order using rev_idx 270 | stacked_backward_states.append(backward.gather(1, rev_idx)) 271 | 272 | forward_inputs, backward_inputs = forward, backward 273 | 274 | # stacked_forward_states: [batch_size, seq_len, projection_dim] * num_layers 275 | # stacked_backward_states: [batch_size, seq_len, projection_dim] * num_layers 276 | return stacked_forward_states, stacked_backward_states 277 | 278 | 279 | class BiLM(nn.Module): 280 | """ 281 | 多层双向语言模型。 282 | """ 283 | def __init__(self, configs, vocab_w, vocab_c): 284 | super(BiLM, self).__init__() 285 | self.dropout_prob = configs['dropout_prob'] 286 | self.num_classes = len(vocab_w) 287 | 288 | self.token_embedder = ConvTokenEmbedder( 289 | vocab_c, 290 | configs['char_embedding_dim'], 291 | configs['char_conv_filters'], 292 | configs['num_highways'], 293 | configs['projection_dim'] 294 | ) 295 | 296 | self.encoder = ELMoLstmEncoder( 297 | configs['projection_dim'], 298 | configs['hidden_dim'], 299 | configs['num_layers'] 300 | ) 301 | 302 | self.classifier = nn.Linear(configs['projection_dim'], self.num_classes) 303 | 304 | def forward(self, inputs, lengths): 305 | token_embeds = self.token_embedder(inputs) 306 | token_embeds = F.dropout(token_embeds, self.dropout_prob) 307 | forward, backward = self.encoder(token_embeds, lengths) 308 | 309 | return self.classifier(forward[-1]), self.classifier(backward[-1]) 310 | 311 | def save_pretrained(self, path): 312 | os.makedirs(path, exist_ok=True) 313 | torch.save(self.token_embedder.state_dict(), os.path.join(path, 'token_embedder.pth')) 314 | torch.save(self.encoder.state_dict(), os.path.join(path, 'encoder.pth')) 315 | torch.save(self.classifier.state_dict(), os.path.join(path, 'classifier.pth')) 316 | 317 | def load_pretrained(self, path): 318 | self.token_embedder.load_state_dict(torch.load(os.path.join(path, 
'token_embedder.pth'))) 319 | self.encoder.load_state_dict(torch.load(os.path.join(path, 'encoder.pth'))) 320 | self.classifier.load_state_dict(torch.load(os.path.join(path, 'classifier.pth'))) 321 | 322 | 323 | configs = { 324 | 'max_tok_len': 50, 325 | 'train_file': './train.txt', # path to your training file, line-by-line and tokenized 326 | 'model_path': './elmo_bilm', 327 | 'char_embedding_dim': 50, 328 | 'char_conv_filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]], 329 | 'num_highways': 2, 330 | 'projection_dim': 512, 331 | 'hidden_dim': 4096, 332 | 'num_layers': 2, 333 | 'batch_size': 32, 334 | 'dropout_prob': 0.1, 335 | 'learning_rate': 0.0004, 336 | 'clip_grad': 5, 337 | 'num_epoch': 10 338 | } 339 | 340 | corpus_w, corpus_c, vocab_w, vocab_c = load_corpus(configs['train_file']) 341 | train_data = BiLMDataset(corpus_w, corpus_c, vocab_w, vocab_c) 342 | train_loader = get_loader(train_data, configs['batch_size']) 343 | 344 | criterion = nn.CrossEntropyLoss( 345 | ignore_index=vocab_w[PAD_TOKEN], 346 | reduction="sum" 347 | ) 348 | print("Building BiLM model") 349 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 350 | model = BiLM(configs, vocab_w, vocab_c) 351 | print(model) 352 | model.to(device) 353 | 354 | optimizer = optim.Adam( 355 | filter(lambda x: x.requires_grad, model.parameters()), 356 | lr=configs['learning_rate'] 357 | ) 358 | 359 | model.train() 360 | for epoch in range(configs['num_epoch']): 361 | total_loss = 0 362 | total_tags = 0 # number of valid predictions 363 | for batch in tqdm(train_loader, desc=f"Training Epoch {epoch}"): 364 | batch = [x.to(device) for x in batch] 365 | inputs_w, inputs_c, seq_lens, targets_fw, targets_bw = batch 366 | 367 | optimizer.zero_grad() 368 | outputs_fw, outputs_bw = model(inputs_c, seq_lens) 369 | loss_fw = criterion( 370 | outputs_fw.view(-1, outputs_fw.shape[-1]), 371 | targets_fw.view(-1) 372 | ) 373 | loss_bw = criterion( 374 | outputs_bw.view(-1, outputs_bw.shape[-1]), 375 | targets_bw.view(-1) 376 | ) 377 | loss = (loss_fw + loss_bw) / 2.0 378 | loss.backward() 379 | 380 | torch.nn.utils.clip_grad_norm_(model.parameters(), configs['clip_grad']) 381 | optimizer.step() 382 | 383 | total_loss += loss_fw.item() 384 | total_tags += seq_lens.sum().item() 385 | 386 | train_ppl = np.exp(total_loss / total_tags) 387 | print(f"Train PPL: {train_ppl:.2f}") 388 | 389 | # save BiLM encoders 390 | model.save_pretrained(configs['model_path']) 391 | # save configs 392 | json.dump(configs, open(os.path.join(configs['model_path'], 'configs.json'), "w")) 393 | # save vocabularies 394 | save_vocab(vocab_w, os.path.join(configs['model_path'], 'word.dic')) 395 | save_vocab(vocab_c, os.path.join(configs['model_path'], 'char.dic')) 396 | 397 | -------------------------------------------------------------------------------- /chp6/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset, DataLoader 3 | from vocab import Vocab 4 | 5 | # Constants 6 | BOS_TOKEN = "" 7 | EOS_TOKEN = "" 8 | PAD_TOKEN = "" 9 | BOW_TOKEN = "" 10 | EOW_TOKEN = "" 11 | 12 | WEIGHT_INIT_RANGE = 0.1 13 | 14 | def load_reuters(): 15 | from nltk.corpus import reuters 16 | text = reuters.sents() 17 | # lowercase (optional) 18 | text = [[word.lower() for word in sentence] for sentence in text] 19 | vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN]) 20 | corpus = [vocab.convert_tokens_to_ids(sentence) for 
sentence in text] 21 | 22 | return corpus, vocab 23 | 24 | def save_pretrained(vocab, embeds, save_path): 25 | """ 26 | Save pretrained token vectors in a unified format, where the first line 27 | specifies the `number_of_tokens` and `embedding_dim` followed with all 28 | token vectors, one token per line. 29 | """ 30 | with open(save_path, "w") as writer: 31 | writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n") 32 | for idx, token in enumerate(vocab.idx_to_token): 33 | vec = " ".join(["{:.4f}".format(x) for x in embeds[idx]]) 34 | writer.write(f"{token} {vec}\n") 35 | print(f"Pretrained embeddings saved to: {save_path}") 36 | 37 | def load_pretrained(load_path): 38 | with open(load_path, "r") as fin: 39 | # Optional: depending on the specific format of pretrained vector file 40 | n, d = map(int, fin.readline().split()) 41 | tokens = [] 42 | embeds = [] 43 | for line in fin: 44 | line = line.rstrip().split(' ') 45 | token, embed = line[0], list(map(float, line[1:])) 46 | tokens.append(token) 47 | embeds.append(embed) 48 | vocab = Vocab(tokens) 49 | embeds = torch.tensor(embeds, dtype=torch.float) 50 | return vocab, embeds 51 | 52 | def get_loader(dataset, batch_size, shuffle=True): 53 | data_loader = DataLoader( 54 | dataset, 55 | batch_size=batch_size, 56 | collate_fn=dataset.collate_fn, 57 | shuffle=shuffle 58 | ) 59 | return data_loader 60 | 61 | def init_weights(model): 62 | for name, param in model.named_parameters(): 63 | if "embedding" not in name: 64 | torch.nn.init.uniform_( 65 | param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE 66 | ) 67 | 68 | -------------------------------------------------------------------------------- /chp6/vocab.py: -------------------------------------------------------------------------------- 1 | # Defined in Section 4.6.1 2 | 3 | from collections import defaultdict, Counter 4 | 5 | class Vocab: 6 | def __init__(self, tokens=None): 7 | self.idx_to_token = list() 8 | self.token_to_idx = dict() 9 | 10 | if tokens is not None: 11 | if "" not in tokens: 12 | tokens = tokens + [""] 13 | for token in tokens: 14 | self.idx_to_token.append(token) 15 | self.token_to_idx[token] = len(self.idx_to_token) - 1 16 | self.unk = self.token_to_idx[''] 17 | 18 | @classmethod 19 | def build(cls, text, min_freq=1, reserved_tokens=None): 20 | token_freqs = defaultdict(int) 21 | for sentence in text: 22 | for token in sentence: 23 | token_freqs[token] += 1 24 | uniq_tokens = [""] + (reserved_tokens if reserved_tokens else []) 25 | uniq_tokens += [token for token, freq in token_freqs.items() \ 26 | if freq >= min_freq and token != ""] 27 | return cls(uniq_tokens) 28 | 29 | def __len__(self): 30 | return len(self.idx_to_token) 31 | 32 | def __getitem__(self, token): 33 | return self.token_to_idx.get(token, self.unk) 34 | 35 | def convert_tokens_to_ids(self, tokens): 36 | return [self[token] for token in tokens] 37 | 38 | def convert_ids_to_tokens(self, indices): 39 | return [self.idx_to_token[index] for index in indices] 40 | 41 | 42 | def save_vocab(vocab, path): 43 | with open(path, 'w') as writer: 44 | writer.write("\n".join(vocab.idx_to_token)) 45 | 46 | 47 | def read_vocab(path): 48 | with open(path, 'r') as f: 49 | tokens = f.read().split('\n') 50 | return Vocab(tokens) 51 | 52 | -------------------------------------------------------------------------------- /chp7/README.md: -------------------------------------------------------------------------------- 1 | # 第7章:预训练语言模型 2 | ## 7.5 预训练模型的任务微调:NLU类任务 3 | ### 7.5.1 单句文本分类 4 | ``` 5 | python finetune_bert_ssc.py 6 | ``` 
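The commands above and below assume the Hugging Face stack is installed. A minimal setup sketch (package names inferred from the scripts' imports and metric names; versions left unpinned):

```
pip install torch transformers datasets evaluate seqeval sacrebleu sentencepiece
```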
7 | 8 | ### 7.5.2 句对文本分类 9 | ``` 10 | python finetune_bert_spc.py 11 | ``` 12 | 13 | ### 7.5.3 阅读理解 14 | ``` 15 | python finetune_bert_mrc.py 16 | ``` 17 | 18 | ### 7.5.4 序列标注(命名实体识别) 19 | ``` 20 | python finetune_bert_ner.py 21 | ``` 22 | 23 | ## 7.6 预训练模型的任务微调:NLG类任务 24 | ### 7.6.1 文本生成 25 | ``` 26 | python finetune_gpt2_tg.py 27 | ``` 28 | 29 | ### 7.6.2 机器翻译 30 | ``` 31 | python finetune_t5_mt.py 32 | ``` -------------------------------------------------------------------------------- /chp7/finetune_bert_mrc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datasets import load_dataset, load_metric 3 | from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer, default_data_collator 4 | 5 | # 加载训练数据、分词器、预训练模型以及评价方法 6 | dataset = load_dataset('squad') 7 | tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased') 8 | model = BertForQuestionAnswering.from_pretrained('bert-base-cased', return_dict=True) 9 | metric = load_metric('squad') 10 | 11 | # 准备训练数据并转换为feature 12 | def prepare_train_features(examples): 13 | tokenized_examples = tokenizer( 14 | examples["question"], # 问题文本 15 | examples["context"], # 篇章文本 16 | truncation="only_second", # 截断只发生在第二部分,即篇章 17 | max_length=384, # 设定最大长度为384 18 | stride=128, # 设定篇章切片步长为128 19 | return_overflowing_tokens=True, # 返回超出最大长度的标记,将篇章切成多片 20 | return_offsets_mapping=True, # 返回偏置信息,用于对齐答案位置 21 | padding="max_length", # 按最大长度进行补齐 22 | ) 23 | 24 | # 如果篇章很长,则可能会被切成多个小篇章,需要通过以下函数建立feature到example的映射关系 25 | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") 26 | # 建立token到原文的字符级映射关系,用于确定答案的开始和结束位置 27 | offset_mapping = tokenized_examples.pop("offset_mapping") 28 | 29 | # 获取开始和结束位置 30 | tokenized_examples["start_positions"] = [] 31 | tokenized_examples["end_positions"] = [] 32 | 33 | for i, offsets in enumerate(offset_mapping): 34 | # 获取输入序列的input_ids以及[CLS]标记的位置(在BERT中为第0位) 35 | input_ids = tokenized_examples["input_ids"][i] 36 | cls_index = input_ids.index(tokenizer.cls_token_id) 37 | 38 | # 获取哪些部分是问题,哪些部分是篇章 39 | sequence_ids = tokenized_examples.sequence_ids(i) 40 | 41 | # 获取答案在文本中的字符级开始和结束位置 42 | sample_index = sample_mapping[i] 43 | answers = examples["answers"][sample_index] 44 | start_char = answers["answer_start"][0] 45 | end_char = start_char + len(answers["text"][0]) 46 | 47 | # 获取在当前切片中的开始和结束位置 48 | token_start_index = 0 49 | while sequence_ids[token_start_index] != 1: 50 | token_start_index += 1 51 | token_end_index = len(input_ids) - 1 52 | while sequence_ids[token_end_index] != 1: 53 | token_end_index -= 1 54 | 55 | # 检测答案是否超出当前切片的范围 56 | if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): 57 | # 超出范围时,答案的开始和结束位置均设置为[CLS]标记的位置 58 | tokenized_examples["start_positions"].append(cls_index) 59 | tokenized_examples["end_positions"].append(cls_index) 60 | else: 61 | # 将token_start_index和token_end_index移至答案的两端 62 | while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: 63 | token_start_index += 1 64 | tokenized_examples["start_positions"].append(token_start_index - 1) 65 | while offsets[token_end_index][1] >= end_char: 66 | token_end_index -= 1 67 | tokenized_examples["end_positions"].append(token_end_index + 1) 68 | 69 | return tokenized_examples 70 | 71 | # 通过函数prepare_train_features,建立分词后的训练集 72 | tokenized_datasets = dataset.map(prepare_train_features, batched=True, remove_columns=dataset["train"].column_names) 73 | 74 | # 
定义训练参数TrainingArguments,默认使用AdamW优化器 75 | args = TrainingArguments( 76 | "ft-squad", # 输出路径,存放检查点和其他输出文件 77 | evaluation_strategy="epoch", # 定义每轮结束后进行评价 78 | learning_rate=2e-5, # 定义初始学习率 79 | per_device_train_batch_size=16, # 定义训练批次大小 80 | per_device_eval_batch_size=16, # 定义测试批次大小 81 | num_train_epochs=2, # 定义训练轮数 82 | ) 83 | 84 | # 定义Trainer,指定模型和训练参数,输入训练集、验证集、分词器以及评价函数 85 | trainer = Trainer( 86 | model, 87 | args, 88 | train_dataset=tokenized_datasets["train"], 89 | eval_dataset=tokenized_datasets["validation"], 90 | data_collator=default_data_collator, 91 | tokenizer=tokenizer, 92 | ) 93 | 94 | # 开始训练!(主流GPU上耗时约几小时) 95 | trainer.train() 96 | -------------------------------------------------------------------------------- /chp7/finetune_bert_ner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datasets import load_dataset, load_metric 3 | from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification 4 | 5 | # 加载CoNLL-2003数据集、分词器 6 | dataset = load_dataset('conll2003') 7 | tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased') 8 | 9 | # 将训练集转换为可训练的特征形式 10 | def tokenize_and_align_labels(examples): 11 | tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) 12 | labels = [] 13 | for i, label in enumerate(examples["ner_tags"]): 14 | word_ids = tokenized_inputs.word_ids(batch_index=i) 15 | previous_word_idx = None 16 | label_ids = [] 17 | for word_idx in word_ids: 18 | # 将特殊符号的标签设置为-100,以便在计算损失函数时自动忽略 19 | if word_idx is None: 20 | label_ids.append(-100) 21 | # 把标签设置到每个词的第一个token上 22 | elif word_idx != previous_word_idx: 23 | label_ids.append(label[word_idx]) 24 | # 对于每个词的其他token也设置为当前标签 25 | else: 26 | label_ids.append(label[word_idx]) 27 | previous_word_idx = word_idx 28 | 29 | labels.append(label_ids) 30 | tokenized_inputs["labels"] = labels 31 | return tokenized_inputs 32 | 33 | tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True, load_from_cache_file=False) 34 | 35 | # 获取标签列表,并加载预训练模型 36 | label_list = dataset["train"].features["ner_tags"].feature.names 37 | model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(label_list)) 38 | 39 | # 定义data_collator,并使用seqeval进行评价 40 | data_collator = DataCollatorForTokenClassification(tokenizer) 41 | metric = load_metric("seqeval") 42 | 43 | # 定义评价指标 44 | def compute_metrics(p): 45 | predictions, labels = p 46 | predictions = np.argmax(predictions, axis=2) 47 | 48 | # 移除需要忽略的下标(之前记为-100) 49 | true_predictions = [ 50 | [label_list[p] for (p, l) in zip(prediction, label) if l != -100] 51 | for prediction, label in zip(predictions, labels) 52 | ] 53 | true_labels = [ 54 | [label_list[l] for (p, l) in zip(prediction, label) if l != -100] 55 | for prediction, label in zip(predictions, labels) 56 | ] 57 | 58 | results = metric.compute(predictions=true_predictions, references=true_labels) 59 | return { 60 | "precision": results["overall_precision"], 61 | "recall": results["overall_recall"], 62 | "f1": results["overall_f1"], 63 | "accuracy": results["overall_accuracy"], 64 | } 65 | 66 | # 定义训练参数TrainingArguments和Trainer 67 | args = TrainingArguments( 68 | "ft-conll2003", # 输出路径,存放检查点和其他输出文件 69 | evaluation_strategy="epoch", # 定义每轮结束后进行评价 70 | learning_rate=2e-5, # 定义初始学习率 71 | per_device_train_batch_size=16, # 定义训练批次大小 72 | per_device_eval_batch_size=16, # 定义测试批次大小 73 | num_train_epochs=3, # 定义训练轮数 74 | ) 75 | 76 | 
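# Note (assumed default behaviour of the Hugging Face collator): DataCollatorForTokenClassification
# pads the labels together with the inputs, filling padded label positions with -100, so both the
# padding and the special-token positions marked above are ignored by the cross-entropy loss.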
trainer = Trainer( 77 | model, 78 | args, 79 | train_dataset=tokenized_datasets["train"], 80 | eval_dataset=tokenized_datasets["validation"], 81 | data_collator=data_collator, 82 | tokenizer=tokenizer, 83 | compute_metrics=compute_metrics 84 | ) 85 | 86 | # 开始训练!(主流GPU上耗时约几分钟) 87 | trainer.train() 88 | -------------------------------------------------------------------------------- /chp7/finetune_bert_spc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datasets import load_dataset, load_metric 3 | from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer 4 | 5 | # 加载训练数据、分词器、预训练模型以及评价方法 6 | dataset = load_dataset('glue', 'rte') 7 | tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased') 8 | model = BertForSequenceClassification.from_pretrained('bert-base-cased', return_dict=True) 9 | metric = load_metric('glue', 'rte') 10 | 11 | # 对训练集进行分词 12 | def tokenize(examples): 13 | return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length') 14 | dataset = dataset.map(tokenize, batched=True) 15 | encoded_dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True) 16 | 17 | # 将数据集格式化为torch.Tensor类型以训练PyTorch模型 18 | columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels'] 19 | encoded_dataset.set_format(type='torch', columns=columns) 20 | 21 | # 定义评价指标 22 | def compute_metrics(eval_pred): 23 | predictions, labels = eval_pred 24 | return metric.compute(predictions=np.argmax(predictions, axis=1), references=labels) 25 | 26 | # 定义训练参数TrainingArguments,默认使用AdamW优化器 27 | args = TrainingArguments( 28 | "ft-rte", # 输出路径,存放检查点和其他输出文件 29 | evaluation_strategy="epoch", # 定义每轮结束后进行评价 30 | learning_rate=2e-5, # 定义初始学习率 31 | per_device_train_batch_size=16, # 定义训练批次大小 32 | per_device_eval_batch_size=16, # 定义测试批次大小 33 | num_train_epochs=2, # 定义训练轮数 34 | ) 35 | 36 | # 定义Trainer,指定模型和训练参数,输入训练集、验证集、分词器以及评价函数 37 | trainer = Trainer( 38 | model, 39 | args, 40 | train_dataset=encoded_dataset["train"], 41 | eval_dataset=encoded_dataset["validation"], 42 | tokenizer=tokenizer, 43 | compute_metrics=compute_metrics 44 | ) 45 | 46 | # 开始训练!(主流GPU上耗时约几小时) 47 | trainer.train() 48 | -------------------------------------------------------------------------------- /chp7/finetune_bert_ssc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from datasets import load_dataset, load_metric 3 | from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer 4 | 5 | # 加载训练数据、分词器、预训练模型以及评价方法 6 | dataset = load_dataset('glue', 'sst2') 7 | tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased') 8 | model = BertForSequenceClassification.from_pretrained('bert-base-cased', return_dict=True) 9 | metric = load_metric('glue', 'sst2') 10 | 11 | # 对训练集进行分词 12 | def tokenize(examples): 13 | return tokenizer(examples['sentence'], truncation=True, padding='max_length') 14 | dataset = dataset.map(tokenize, batched=True) 15 | encoded_dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True) 16 | 17 | # 将数据集格式化为torch.Tensor类型以训练PyTorch模型 18 | columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels'] 19 | encoded_dataset.set_format(type='torch', columns=columns) 20 | 21 | # 定义评价指标 22 | def compute_metrics(eval_pred): 23 | predictions, labels = eval_pred 24 | return metric.compute(predictions=np.argmax(predictions, 
--------------------------------------------------------------------------------
/chp7/finetune_bert_spc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from datasets import load_dataset, load_metric
3 | from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
4 | 
5 | # Load the training data, tokenizer, pre-trained model, and evaluation method
6 | dataset = load_dataset('glue', 'rte')
7 | tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
8 | model = BertForSequenceClassification.from_pretrained('bert-base-cased', return_dict=True)
9 | metric = load_metric('glue', 'rte')
10 | 
11 | # Tokenize the training set
12 | def tokenize(examples):
13 |     return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length')
14 | dataset = dataset.map(tokenize, batched=True)
15 | encoded_dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
16 | 
17 | # Format the dataset as torch.Tensor to train the PyTorch model
18 | columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
19 | encoded_dataset.set_format(type='torch', columns=columns)
20 | 
21 | # Define the evaluation metric
22 | def compute_metrics(eval_pred):
23 |     predictions, labels = eval_pred
24 |     return metric.compute(predictions=np.argmax(predictions, axis=1), references=labels)
25 | 
26 | # Define the training arguments (TrainingArguments); the AdamW optimizer is used by default
27 | args = TrainingArguments(
28 |     "ft-rte",                            # output path for checkpoints and other output files
29 |     evaluation_strategy="epoch",         # evaluate at the end of each epoch
30 |     learning_rate=2e-5,                  # initial learning rate
31 |     per_device_train_batch_size=16,      # training batch size
32 |     per_device_eval_batch_size=16,       # evaluation batch size
33 |     num_train_epochs=2,                  # number of training epochs
34 | )
35 | 
36 | # Define the Trainer: specify the model and training arguments, and pass in the training set, validation set, tokenizer, and evaluation function
37 | trainer = Trainer(
38 |     model,
39 |     args,
40 |     train_dataset=encoded_dataset["train"],
41 |     eval_dataset=encoded_dataset["validation"],
42 |     tokenizer=tokenizer,
43 |     compute_metrics=compute_metrics
44 | )
45 | 
46 | # Start training! (takes roughly a few hours on a mainstream GPU)
47 | trainer.train()
48 | 
--------------------------------------------------------------------------------
/chp7/finetune_bert_ssc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from datasets import load_dataset, load_metric
3 | from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
4 | 
5 | # Load the training data, tokenizer, pre-trained model, and evaluation method
6 | dataset = load_dataset('glue', 'sst2')
7 | tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
8 | model = BertForSequenceClassification.from_pretrained('bert-base-cased', return_dict=True)
9 | metric = load_metric('glue', 'sst2')
10 | 
11 | # Tokenize the training set
12 | def tokenize(examples):
13 |     return tokenizer(examples['sentence'], truncation=True, padding='max_length')
14 | dataset = dataset.map(tokenize, batched=True)
15 | encoded_dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
16 | 
17 | # Format the dataset as torch.Tensor to train the PyTorch model
18 | columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
19 | encoded_dataset.set_format(type='torch', columns=columns)
20 | 
21 | # Define the evaluation metric
22 | def compute_metrics(eval_pred):
23 |     predictions, labels = eval_pred
24 |     return metric.compute(predictions=np.argmax(predictions, axis=1), references=labels)
25 | 
26 | # Define the training arguments (TrainingArguments); the AdamW optimizer is used by default
27 | args = TrainingArguments(
28 |     "ft-sst2",                           # output path for checkpoints and other output files
29 |     evaluation_strategy="epoch",         # evaluate at the end of each epoch
30 |     learning_rate=2e-5,                  # initial learning rate
31 |     per_device_train_batch_size=16,      # training batch size
32 |     per_device_eval_batch_size=16,       # evaluation batch size
33 |     num_train_epochs=2,                  # number of training epochs
34 | )
35 | 
36 | # Define the Trainer: specify the model and training arguments, and pass in the training set, validation set, tokenizer, and evaluation function
37 | trainer = Trainer(
38 |     model,
39 |     args,
40 |     train_dataset=encoded_dataset["train"],
41 |     eval_dataset=encoded_dataset["validation"],
42 |     tokenizer=tokenizer,
43 |     compute_metrics=compute_metrics
44 | )
45 | 
46 | # Start training! (takes roughly a few hours on a mainstream GPU)
47 | trainer.train()
48 | 
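Once either classification run above completes, the validation metrics can be recomputed on demand and the fine-tuned weights exported for later use. A minimal sketch, assuming the `trainer` object from finetune_bert_ssc.py is still in scope; the `eval_accuracy` key follows from the GLUE `sst2` metric returning an `accuracy` field.

```
# Re-run evaluation on the validation split with the in-memory Trainer.
metrics = trainer.evaluate()
print(metrics["eval_loss"], metrics["eval_accuracy"])

# Export the fine-tuned model (and tokenizer) so it can be reloaded with from_pretrained().
trainer.save_model("ft-sst2-export")
```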
--------------------------------------------------------------------------------
/chp7/finetune_gpt2_tg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import evaluate
3 | from datasets import load_dataset
4 | from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, TrainingArguments, Trainer
5 | 
6 | # Load and preprocess the dataset
7 | model_name = "gpt2"
8 | wikitext_data = load_dataset("wikitext", "wikitext-2-v1")
9 | tokenizer = AutoTokenizer.from_pretrained(model_name)
10 | block_size = 128
11 | 
12 | def preprocess_function(examples):
13 |     return tokenizer([" ".join(x) for x in examples["text"]])
14 | 
15 | def group_texts(examples):
16 |     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
17 |     total_length = len(concatenated_examples[list(examples.keys())[0]])
18 |     if total_length >= block_size:
19 |         total_length = (total_length // block_size) * block_size
20 |     result = {
21 |         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
22 |         for k, t in concatenated_examples.items()
23 |     }
24 |     result["labels"] = result["input_ids"].copy()
25 |     return result
26 | 
27 | tokenized_wikitext = wikitext_data.map(
28 |     preprocess_function,
29 |     batched=True,
30 |     num_proc=4,
31 |     remove_columns=wikitext_data["train"].column_names,
32 | )
33 | lm_dataset = tokenized_wikitext.map(group_texts, batched=True, num_proc=4)
34 | tokenizer.pad_token = tokenizer.eos_token
35 | data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
36 | 
37 | # Define the model and training hyperparameters
38 | model = AutoModelForCausalLM.from_pretrained("distilgpt2")
39 | 
40 | training_args = TrainingArguments(
41 |     output_dir="gpt2_wikitext_model",    # output path for checkpoints and other output files
42 |     evaluation_strategy="epoch",         # evaluate at the end of each epoch
43 |     learning_rate=2e-5,                  # initial learning rate
44 |     per_device_train_batch_size=32,      # training batch size
45 |     per_device_eval_batch_size=32,       # evaluation batch size
46 |     weight_decay=0.01,                   # weight decay coefficient for the optimizer
47 |     num_train_epochs=2,                  # number of training epochs
48 | )
49 | 
50 | trainer = Trainer(
51 |     model=model,
52 |     args=training_args,
53 |     train_dataset=lm_dataset["train"],
54 |     eval_dataset=lm_dataset["test"],
55 |     data_collator=data_collator,
56 | )
57 | 
58 | # Start training!
59 | trainer.train()
60 | 
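Because the causal language model is trained with a cross-entropy objective, a common follow-up is to report perplexity, i.e. the exponential of the evaluation loss. A minimal sketch, assuming the `trainer` object from finetune_gpt2_tg.py is still in scope.

```
import math

# Perplexity = exp(average cross-entropy loss) on the held-out split.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
```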
--------------------------------------------------------------------------------
/chp7/finetune_t5_mt.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import evaluate
3 | from datasets import load_dataset
4 | from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
5 | 
6 | # Load and preprocess the dataset
7 | model_name = "google/mt5-small"  # a larger model version can also be used here
8 | iwslt_data = load_dataset("iwslt2017", "iwslt2017-zh-en")
9 | tokenizer = AutoTokenizer.from_pretrained(model_name)
10 | 
11 | source_lang = "zh"
12 | target_lang = "en"
13 | prefix = "translate Chinese to English: "
14 | 
15 | def preprocess_function(examples):
16 |     inputs = [prefix + example[source_lang] for example in examples["translation"]]
17 |     targets = [example[target_lang] for example in examples["translation"]]
18 |     model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
19 |     return model_inputs
20 | 
21 | tokenized_data = iwslt_data.map(preprocess_function, batched=True)
22 | data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)
23 | 
24 | # Define the evaluation method
25 | metric = evaluate.load("sacrebleu")
26 | def postprocess_text(preds, labels):
27 |     preds = [pred.strip() for pred in preds]
28 |     labels = [[label.strip()] for label in labels]
29 |     return preds, labels
30 | 
31 | def compute_metrics(eval_preds):
32 |     preds, labels = eval_preds
33 |     if isinstance(preds, tuple):
34 |         preds = preds[0]
35 |     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
36 | 
37 |     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
38 |     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
39 | 
40 |     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
41 | 
42 |     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
43 |     result = {"bleu": result["score"]}
44 | 
45 |     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
46 |     result["gen_len"] = np.mean(prediction_lens)
47 |     result = {k: round(v, 4) for k, v in result.items()}
48 |     return result
49 | 
50 | # Define the model and training hyperparameters
51 | model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
52 | 
53 | training_args = Seq2SeqTrainingArguments(
54 |     output_dir="iwslt_zh_en_model",      # output path for checkpoints and other output files
55 |     evaluation_strategy="epoch",         # evaluate at the end of each epoch
56 |     learning_rate=2e-5,                  # initial learning rate
57 |     per_device_train_batch_size=64,      # training batch size
58 |     per_device_eval_batch_size=64,       # evaluation batch size
59 |     weight_decay=0.01,                   # weight decay coefficient for the optimizer
60 |     save_total_limit=3,                  # maximum number of checkpoints to keep
61 |     num_train_epochs=2,                  # number of training epochs
62 | )
63 | 
64 | trainer = Seq2SeqTrainer(
65 |     model=model,
66 |     args=training_args,
67 |     train_dataset=tokenized_data["train"],
68 |     eval_dataset=tokenized_data["test"],
69 |     tokenizer=tokenizer,
70 |     data_collator=data_collator,
71 |     compute_metrics=compute_metrics,
72 | )
73 | 
74 | # Start training!
75 | trainer.train()
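To translate a new sentence with the fine-tuned model, the same task prefix used during preprocessing must be prepended at inference time. A minimal sketch, assuming the `model`, `tokenizer`, and `prefix` objects from finetune_t5_mt.py are still in memory (a saved checkpoint directory could be loaded instead).

```
import torch

text = prefix + "今天的天气很好。"
inputs = tokenizer(text, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64, num_beams=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```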
--------------------------------------------------------------------------------
/chp9/README.md:
--------------------------------------------------------------------------------
1 | # Chapter 9: Adaptation of Large Language Models
2 | ### 9.6.1 Chinese Vocabulary Expansion
3 | 
4 | ```
5 | python merge_tokenizers.py --llama_tokenizer_file original_llama_tokenizer_file --chinese_sp_model_file zh_vocab.model
6 | ```
7 | 
8 | ### 9.7.1 Knowledge Distillation
9 | 
10 | ```
11 | python textbrewer_example.py
12 | ```
13 | 
14 | ### 9.7.2 Model Pruning
15 | 
16 | ```
17 | python textpruner_example.py
18 | ```
19 | 
--------------------------------------------------------------------------------
/chp9/chinese_sp.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/chp9/chinese_sp.model
--------------------------------------------------------------------------------
/chp9/merge_tokenizers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sentencepiece as spm
4 | import argparse
5 | from transformers import LlamaTokenizer
6 | from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
7 | 
8 | import logging
9 | logging.basicConfig(level=logging.INFO)
10 | 
11 | def load_model(model_file):
12 |     sp_model = spm.SentencePieceProcessor()
13 |     sp_model.Load(model_file)
14 |     return sp_model
15 | 
16 | def find_english_tokens_and_punctuations(model_proto):
17 |     en_words = {p.piece for p in model_proto.pieces if re.findall("[a-zA-Z]+", p.piece)}
18 |     punct_ps = {p.piece for p in model_proto.pieces if not re.search(r'(\w|\d)+', p.piece) and len(p.piece.lstrip('▁')) > 1}
19 |     return en_words, punct_ps
20 | 
21 | def merge_tokenizers(llama_model_proto, chinese_model_proto, en_words, punct_ps):
22 |     llama_tokens_set = {p.piece for p in llama_model_proto.pieces}
23 |     logging.info(f"Initial Llama tokenizer size: {len(llama_tokens_set)}")
24 | 
25 |     for p in chinese_model_proto.pieces:
26 |         if p.piece not in llama_tokens_set and p.piece not in en_words and p.piece not in punct_ps:
27 |             llama_model_proto.pieces.add(sp_pb2_model.ModelProto.SentencePiece(piece=p.piece, score=0))
28 |             if len(llama_model_proto.pieces) == 32000:
29 |                 llama_model_proto.pieces.add(sp_pb2_model.ModelProto.SentencePiece(piece='', score=0))
30 |                 break
31 | 
32 |     logging.info(f"New model pieces: {len(llama_model_proto.pieces)}")
33 | 
34 | def save_merged_model(model_proto, output_sp_dir, output_hf_dir):
35 |     os.makedirs(output_sp_dir, exist_ok=True)
36 |     with open(os.path.join(output_sp_dir, 'chinese_llama.model'), 'wb') as f:
37 |         f.write(model_proto.SerializeToString())
38 | 
39 |     tokenizer = LlamaTokenizer(vocab_file=os.path.join(output_sp_dir, 'chinese_llama.model'))
40 |     tokenizer.save_pretrained(output_hf_dir)
41 |     logging.info(f"Chinese-Llama tokenizer has been saved to {output_hf_dir}")
42 | 
43 | if __name__ == "__main__":
44 |     parser = argparse.ArgumentParser()
45 |     parser.add_argument('--llama_tokenizer_file', required=True)
46 |     parser.add_argument('--chinese_sp_model_file', default='./chinese_sp.model')
47 |     args = parser.parse_args()
48 | 
49 |     llama_sp_model = load_model(args.llama_tokenizer_file)
50 |     chinese_sp_model = load_model(args.chinese_sp_model_file)
51 | 
52 |     llama_sp_mp = sp_pb2_model.ModelProto()
53 |     llama_sp_mp.ParseFromString(llama_sp_model.serialized_model_proto())
54 |     chinese_uni_sp_mp = sp_pb2_model.ModelProto()
55 |     chinese_uni_sp_mp.ParseFromString(chinese_sp_model.serialized_model_proto())
56 | 
57 |     en_words, punct_ps = find_english_tokens_and_punctuations(chinese_uni_sp_mp)
58 |     merge_tokenizers(llama_sp_mp, chinese_uni_sp_mp, en_words, punct_ps)
59 | 
60 |     output_sp_dir = 'merged_tokenizer_sp'
61 |     output_hf_dir = 'merged_tokenizer_hf'
62 |     save_merged_model(llama_sp_mp, output_sp_dir, output_hf_dir)
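One way to check that the merge had the intended effect is to tokenize the same Chinese sentence with the original and the merged tokenizers and compare the resulting token counts. A minimal sketch, assuming merge_tokenizers.py has been run; `original_llama_tokenizer_dir` is a placeholder for wherever the original LLaMA tokenizer is stored.

```
from transformers import LlamaTokenizer

old_tokenizer = LlamaTokenizer.from_pretrained("original_llama_tokenizer_dir")  # placeholder path
new_tokenizer = LlamaTokenizer.from_pretrained("merged_tokenizer_hf")           # output of merge_tokenizers.py

sentence = "大语言模型的中文词表扩充"
print(len(old_tokenizer.tokenize(sentence)), old_tokenizer.tokenize(sentence))
print(len(new_tokenizer.tokenize(sentence)), new_tokenizer.tokenize(sentence))
```

The merged tokenizer should segment the sentence into noticeably fewer pieces, which is the point of the vocabulary expansion.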
--------------------------------------------------------------------------------
/chp9/t4tiny.json:
--------------------------------------------------------------------------------
1 | {
2 |   "temperature" : 8,
3 |   "hard_label_weight": 0,
4 |   "kd_loss_type":"ce",
5 |   "kd_loss_weight":1,
6 |   "probability_shift": false,
7 |   "is_caching_logits": false,
8 |   "intermediate_matches":[
9 |     {"layer_T":[0,0],  "layer_S":[0,0], "feature":"hidden", "loss":"mmd", "weight":1},
10 |     {"layer_T":[3,3],  "layer_S":[1,1], "feature":"hidden", "loss":"mmd", "weight":1},
11 |     {"layer_T":[6,6],  "layer_S":[2,2], "feature":"hidden", "loss":"mmd", "weight":1},
12 |     {"layer_T":[9,9],  "layer_S":[3,3], "feature":"hidden", "loss":"mmd", "weight":1},
13 |     {"layer_T":[12,12],"layer_S":[4,4], "feature":"hidden", "loss":"mmd", "weight":1},
14 |     {"layer_T":0,  "layer_S":0, "feature":"hidden", "loss":"hidden_mse", "weight":1, "proj":["linear",312,768]},
15 |     {"layer_T":3,  "layer_S":1, "feature":"hidden", "loss":"hidden_mse", "weight":1, "proj":["linear",312,768]},
16 |     {"layer_T":6,  "layer_S":2, "feature":"hidden", "loss":"hidden_mse", "weight":1, "proj":["linear",312,768]},
17 |     {"layer_T":9,  "layer_S":3, "feature":"hidden", "loss":"hidden_mse", "weight":1, "proj":["linear",312,768]},
18 |     {"layer_T":12, "layer_S":4, "feature":"hidden", "loss":"hidden_mse", "weight":1, "proj":["linear",312,768]}]
19 | }
--------------------------------------------------------------------------------
/chp9/textbrewer_example.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import textbrewer
3 | from textbrewer import GeneralDistiller, TrainingConfig, DistillationConfig
4 | from transformers import BertTokenizerFast, BertForSequenceClassification, DistilBertForSequenceClassification
5 | from datasets import load_dataset
6 | 
7 | # Load the data and build the DataLoader
8 | dataset = load_dataset('glue', 'sst2', split='train')
9 | tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
10 | 
11 | def encode(examples):
12 |     return tokenizer(examples['sentence'], truncation=True, padding='max_length')
13 | 
14 | dataset = dataset.map(encode, batched=True)
15 | encoded_dataset = dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
16 | columns = ['input_ids', 'attention_mask', 'labels']
17 | encoded_dataset.set_format(type='torch', columns=columns)
18 | 
19 | def collate_fn(examples):
20 |     return dict(tokenizer.pad(examples, return_tensors='pt'))
21 | dataloader = torch.utils.data.DataLoader(encoded_dataset, collate_fn=collate_fn, batch_size=8)
22 | 
23 | # Define the teacher model and the student model
24 | teacher_model = BertForSequenceClassification.from_pretrained('bert-base-cased')
25 | student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')
26 | 
27 | # Print the number of parameters of the teacher and student models (optional)
28 | print("\nteacher_model's parameters:")
29 | result, _ = textbrewer.utils.display_parameters(teacher_model, max_level=3)
30 | print(result)
31 | 
32 | print("student_model's parameters:")
33 | result, _ = textbrewer.utils.display_parameters(student_model, max_level=3)
34 | print(result)
35 | 
36 | # Define the optimizer
37 | optimizer = torch.optim.AdamW(student_model.parameters(), lr=1e-5)
38 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
39 | if device == 'cuda':
40 |     teacher_model.to(device)
41 |     student_model.to(device)
42 | 
43 | # Define the adaptor, training configuration, and distillation configuration
44 | def simple_adaptor(batch, model_outputs):
45 |     return {'logits': model_outputs[1]}
46 | train_config = TrainingConfig(device=device)
47 | distill_config = DistillationConfig()
48 | 
49 | # Define the distiller
50 | distiller = GeneralDistiller(
51 |     train_config=train_config, distill_config=distill_config,
52 |     model_T=teacher_model, model_S=student_model,
53 |     adaptor_T=simple_adaptor, adaptor_S=simple_adaptor)
54 | 
55 | # Start distillation!
56 | with distiller:
57 |     distiller.train(optimizer, dataloader,
58 |                     scheduler_class=None, scheduler_args=None,
59 |                     num_epochs=1, callback=None)
--------------------------------------------------------------------------------
/chp9/textpruner_example.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
3 | logger = logging.getLogger(__name__)
4 | 
5 | from transformers import XLMRobertaForSequenceClassification,XLMRobertaTokenizer
6 | from textpruner import summary, TransformerPruner, TransformerPruningConfig
7 | import sys, os
8 | 
9 | sys.path.insert(0, os.path.abspath('..'))
10 | 
11 | from classification_utils.dataloader_script import eval_dataset, dataloader, eval_langs, batch_size
12 | from classification_utils.predict_function import predict
13 | 
14 | model_path = 'ziqingyang/XLMRobertaBaseForPAWSX-en'
15 | model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
16 | tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
17 | 
18 | print("Before pruning:")
19 | print(summary(model))
20 | 
21 | transformer_pruning_config = TransformerPruningConfig(
22 |     target_ffn_size=2048, target_num_of_heads=8,
23 |     pruning_method='iterative', n_iters=4)
24 | pruner = TransformerPruner(model, transformer_pruning_config=transformer_pruning_config)
25 | pruner.prune(dataloader=dataloader, save_model=True)
26 | 
27 | # save the tokenizer to the same place
28 | tokenizer.save_pretrained(pruner.save_dir)
29 | 
30 | print("After pruning:")
31 | print(summary(model))
32 | 
33 | for i in range(12):
34 |     print((model.base_model.encoder.layer[i].intermediate.dense.weight.shape,
35 |            model.base_model.encoder.layer[i].intermediate.dense.bias.shape,
36 |            model.base_model.encoder.layer[i].attention.self.key.weight.shape))
37 | 
38 | 
39 | print("Measure performance")
40 | device = model.device
41 | eval_datasets = [eval_dataset.lang_datasets[lang] for lang in eval_langs]
42 | 
43 | predict(model, eval_datasets, eval_langs, device, batch_size)
44 | 
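Both the distillation and pruning scripts print framework-specific summaries; an implementation-agnostic cross-check is to count parameters directly in PyTorch. A minimal sketch, assuming the relevant model objects (e.g. `teacher_model` and `student_model` from textbrewer_example.py, or the pruned `model` from textpruner_example.py) are available in the current session.

```
def count_parameters(model):
    # Number of trainable parameters, in millions.
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6

print(f"teacher: {count_parameters(teacher_model):.1f}M parameters")
print(f"student: {count_parameters(student_model):.1f}M parameters")
```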
--------------------------------------------------------------------------------
/slides/01-绪论.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/01-绪论.pptx
--------------------------------------------------------------------------------
/slides/02-自然语言处理基础.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/02-自然语言处理基础.pptx
--------------------------------------------------------------------------------
/slides/03-基础工具集与常用数据集.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/03-基础工具集与常用数据集.pptx
--------------------------------------------------------------------------------
/slides/04-神经网络基础.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/04-神经网络基础.pptx
--------------------------------------------------------------------------------
/slides/05-语言模型.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/05-语言模型.pptx
--------------------------------------------------------------------------------
/slides/06-预训练词向量.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/06-预训练词向量.pptx
--------------------------------------------------------------------------------
/slides/07-预训练语言模型.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/07-预训练语言模型.pptx
--------------------------------------------------------------------------------
/slides/08-大语言模型的预训练.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/08-大语言模型的预训练.pptx
--------------------------------------------------------------------------------
/slides/09-大语言模型的适配.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/09-大语言模型的适配.pptx
--------------------------------------------------------------------------------
/slides/10-大语言模型的应用.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/10-大语言模型的应用.pptx
--------------------------------------------------------------------------------
/slides/11-大语言模型的能力评估.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/11-大语言模型的能力评估.pptx
--------------------------------------------------------------------------------
/slides/12-预训练语言模型的延伸.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/12-预训练语言模型的延伸.pptx
--------------------------------------------------------------------------------
/slides/13-DeepSeek.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HIT-SCIR/llm-nlp-book/6214b52fb9bafbe3162125be3e5a6c82d2aac052/slides/13-DeepSeek.pptx
--------------------------------------------------------------------------------