├── 01_NGram_BoW ├── BiGram.ipynb └── BoW.ipynb ├── 02_Word2Vec ├── CBOW.ipynb ├── SkipGram-Enbedding.ipynb └── SkipGram.ipynb ├── 03_NPLM ├── NPLM.ipynb └── RNN.ipynb ├── 04_Seq2Seq └── Seq2Seq.ipynb ├── 05_Attention ├── Dot-Product_Attention.ipynb ├── Multiheads-Self-Attention.ipynb ├── QKV.ipynb ├── Scaled_Dot-Product_Attention.ipynb ├── Self-Attention.ipynb └── Seq2Seq_with_Attention.ipynb ├── 06_Transformer └── Transformer.ipynb ├── 07_GPT ├── GPT.ipynb ├── Transformer_with_GreedyDecoder.ipynb ├── WikiGPT.ipynb └── lang.txt ├── 08_ChatGPT ├── .ipynb_checkpoints │ └── SelfTrain_ChatGPT-checkpoint.ipynb ├── GPT_Model.py ├── Pretrain_ChatGPT.ipynb ├── RLHF_Reward_ChatGPT.py ├── SelfTrain_ChatGPT.ipynb └── chat.txt ├── 09_OpenAI_API ├── ChatBot.ipynb └── ChatBot.py ├── README.md ├── images ├── Archive │ ├── 8daf31eae3eb392efbb4624feb3c53d.jpg │ ├── c41f10da370ac2ff5a3dcd63a55db06.jpg │ └── e4f961d4b53cd9f956d26784e39daa7.jpg ├── P109.png ├── book.png └── sales.png └── 勘误表.md /01_NGram_BoW/BiGram.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "2d56fa5d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# 构建一个玩具数据集\n", 11 | "corpus = [ \"我喜欢吃苹果\",\n", 12 | " \"我喜欢吃香蕉\",\n", 13 | " \"她喜欢吃葡萄\",\n", 14 | " \"他不喜欢吃香蕉\",\n", 15 | " \"他喜欢吃苹果\",\n", 16 | " \"她喜欢吃草莓\"]" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "id": "f31f3204", 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# 定义一个分词函数,将文本转换为单个字符的列表\n", 27 | "def tokenize(text):\n", 28 | " return [char for char in text] # 将文本拆分为字符列表\n", 29 | "# 对每个文本进行分词,并打印出对应的单字列表\n", 30 | "print(\"单字列表:\") \n", 31 | "for text in corpus:\n", 32 | " tokens = tokenize(text)\n", 33 | " print(tokens)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "id": "93cb1fa8", 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "bigram 词频:\n", 47 | "我: {'喜': 2}\n", 48 | "喜: {'欢': 6}\n", 49 | "欢: {'吃': 6}\n", 50 | "吃: {'苹': 2, '香': 2, '葡': 1, '草': 1}\n", 51 | "苹: {'果': 2}\n", 52 | "香: {'蕉': 2}\n", 53 | "她: {'喜': 2}\n", 54 | "葡: {'萄': 1}\n", 55 | "他: {'不': 1, '喜': 1}\n", 56 | "不: {'喜': 1}\n", 57 | "草: {'莓': 1}\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "# 定义计算 N-Gram 词频的函数\n", 63 | "from collections import defaultdict, Counter # 导入所需库\n", 64 | "def count_ngrams(corpus, n):\n", 65 | " ngrams_count = defaultdict(Counter) # 创建一个字典,存储 N-Gram 计数\n", 66 | " for text in corpus: # 遍历语料库中的每个文本\n", 67 | " tokens = tokenize(text) # 对文本进行分词\n", 68 | " for i in range(len(tokens) - n + 1): # 遍历分词结果,生成 N-Gram\n", 69 | " ngram = tuple(tokens[i:i+n]) # 创建一个 N-Gram 元组\n", 70 | " prefix = ngram[:-1] # 获取 N-Gram 的前缀\n", 71 | " token = ngram[-1] # 获取 N-Gram 的目标单字\n", 72 | " ngrams_count[prefix][token] += 1 # 更新 N-Gram 计数\n", 73 | " return ngrams_count\n", 74 | "bigram_counts = count_ngrams(corpus, 2) # 计算 bigram 词频\n", 75 | "print(\"bigram 词频:\") # 打印 bigram 词频\n", 76 | "for prefix, counts in bigram_counts.items():\n", 77 | " print(\"{}: {}\".format(\"\".join(prefix), dict(counts))) " 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "06678ebf", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "\n", 91 | "bigram 出现的概率 :\n", 92 | "我: {'喜': 1.0}\n", 93 | "喜: {'欢': 1.0}\n", 94 | "欢: {'吃': 1.0}\n", 95 | "吃: 
{'苹': 0.3333333333333333, '香': 0.3333333333333333, '葡': 0.16666666666666666, '草': 0.16666666666666666}\n", 96 | "苹: {'果': 1.0}\n", 97 | "香: {'蕉': 1.0}\n", 98 | "她: {'喜': 1.0}\n", 99 | "葡: {'萄': 1.0}\n", 100 | "他: {'不': 0.5, '喜': 0.5}\n", 101 | "不: {'喜': 1.0}\n", 102 | "草: {'莓': 1.0}\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "# 定义计算 N-Gram 出现概率的函数\n", 108 | "def ngram_probabilities(ngram_counts):\n", 109 | " ngram_probs = defaultdict(Counter) # 创建一个字典,存储 N-Gram 出现的概率\n", 110 | " for prefix, tokens_count in ngram_counts.items(): # 遍历 N-Gram 前缀\n", 111 | " total_count = sum(tokens_count.values()) # 计算当前前缀的 N-Gram 计数\n", 112 | " for token, count in tokens_count.items(): # 遍历每个前缀的 N-Gram\n", 113 | " ngram_probs[prefix][token] = count / total_count # 计算每个 N-Gram 出现的概率\n", 114 | " return ngram_probs\n", 115 | "bigram_probs = ngram_probabilities(bigram_counts) # 计算 bigram 出现的概率\n", 116 | "print(\"\\nbigram 出现的概率 :\") # 打印 bigram 概率\n", 117 | "for prefix, probs in bigram_probs.items():\n", 118 | " print(\"{}: {}\".format(\"\".join(prefix), dict(probs)))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "id": "fe633e97", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# 定义生成下一个词的函数\n", 129 | "def generate_next_token(prefix, ngram_probs):\n", 130 | " if not prefix in ngram_probs: # 如果前缀不在 N-Gram 中,返回 None\n", 131 | " return None\n", 132 | " next_token_probs = ngram_probs[prefix] # 获取当前前缀的下一个词的概率\n", 133 | " next_token = max(next_token_probs, \n", 134 | " key=next_token_probs.get) # 选择概率最大的词作为下一个词\n", 135 | " return next_token" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 6, 141 | "id": "12da246f", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# 定义生成连续文本的函数\n", 146 | "def generate_text(prefix, ngram_probs, n, length=6):\n", 147 | " tokens = list(prefix) # 将前缀转换为字符列表\n", 148 | " for _ in range(length - len(prefix)): # 根据指定长度生成文本 \n", 149 | " # 获取当前前缀的下一个词\n", 150 | " next_token = generate_next_token(tuple(tokens[-(n-1):]), ngram_probs) \n", 151 | " if not next_token: # 如果下一个词为 None,跳出循环\n", 152 | " break\n", 153 | " tokens.append(next_token) # 将下一个词添加到生成的文本中\n", 154 | " return \"\".join(tokens) # 将字符列表连接成字符串" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 7, 160 | "id": "7af72d40", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "\n", 168 | " 生成的文本: 我喜欢吃苹果\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "# 输入一个前缀,生成文本\n", 174 | "generated_text = generate_text(\"我\", bigram_probs, 2)\n", 175 | "print(\"\\n 生成的文本:\", generated_text) # 打印生成的文本" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "8d3ed012", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3 (ipykernel)", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.10.11" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 5 208 | } 209 | -------------------------------------------------------------------------------- /03_NPLM/NPLM.ipynb: 
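
Before moving on to NPLM, a short recap of the BiGram notebook above: because `count_ngrams`, `ngram_probabilities`, and `generate_text` are all parameterized by `n`, the same pipeline extends to higher-order N-Gram models with no changes to the helpers. A minimal sketch, assuming the toy `corpus` and the three helper functions from `01_NGram_BoW/BiGram.ipynb` are already in scope:

```python
# Reuse the bigram helpers with n=3: the prefix is now the previous two characters,
# so generation needs a two-character seed such as "我喜".
trigram_counts = count_ngrams(corpus, 3)             # (char1, char2) -> Counter of the next char
trigram_probs = ngram_probabilities(trigram_counts)  # normalize counts into conditional probabilities
print(generate_text("我喜", trigram_probs, 3))        # on this toy corpus, likely "我喜欢吃苹果"
```

The usual trade-off applies: a larger `n` captures more context but makes the counts sparser, so on a corpus this small most prefixes occur only once or twice.
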
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "11823a2e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " 词汇表: {'爸爸': 0, '我': 1, '玩具': 2, '爱': 3, '挨打': 4, '喜欢': 5, '讨厌': 6}\n", 14 | " 词汇表大小: 7\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "# 构建一个非常简单的数据集\n", 20 | "sentences = [\"我 喜欢 玩具\", \"我 爱 爸爸\", \"我 讨厌 挨打\"] \n", 21 | "# 将所有句子连接在一起,用空格分隔成多个词,再将重复的词去除,构建词汇表\n", 22 | "word_list = list(set(\" \".join(sentences).split())) \n", 23 | "# 创建一个字典,将每个词映射到一个唯一的索引\n", 24 | "word_to_idx = {word: idx for idx, word in enumerate(word_list)} \n", 25 | "# 创建一个字典,将每个索引映射到对应的词\n", 26 | "idx_to_word = {idx: word for idx, word in enumerate(word_list)} \n", 27 | "voc_size = len(word_list) # 计算词汇表的大小\n", 28 | "print(' 词汇表:', word_to_idx) # 打印词汇到索引的映射字典\n", 29 | "print(' 词汇表大小:', voc_size) # 打印词汇表大小" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "id": "ae1436d0", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | " 输入批处理数据: tensor([[1, 5],\n", 43 | " [1, 3]])\n", 44 | " 输入批处理数据对应的原始词: [['我', '喜欢'], ['我', '爱']]\n", 45 | " 目标批处理数据: tensor([2, 0])\n", 46 | " 目标批处理数据对应的原始词: ['玩具', '爸爸']\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "# 构建批处理数据\n", 52 | "import torch # 导入 PyTorch 库\n", 53 | "import random # 导入 random 库\n", 54 | "batch_size = 2 # 每批数据的大小\n", 55 | "def make_batch():\n", 56 | " input_batch = [] # 定义输入批处理列表\n", 57 | " target_batch = [] # 定义目标批处理列表\n", 58 | " selected_sentences = random.sample(sentences, batch_size) # 随机选择句子\n", 59 | " for sen in selected_sentences: # 遍历每个句子\n", 60 | " word = sen.split() # 用空格将句子分隔成多个词\n", 61 | " # 将除最后一个词以外的所有词的索引作为输入\n", 62 | " input = [word_to_idx[n] for n in word[:-1]] # 创建输入数据\n", 63 | " # 将最后一个词的索引作为目标\n", 64 | " target = word_to_idx[word[-1]] # 创建目标数据\n", 65 | " input_batch.append(input) # 将输入添加到输入批处理列表\n", 66 | " target_batch.append(target) # 将目标添加到目标批处理列表\n", 67 | " input_batch = torch.LongTensor(input_batch) # 将输入数据转换为张量\n", 68 | " target_batch = torch.LongTensor(target_batch) # 将目标数据转换为张量\n", 69 | " return input_batch, target_batch # 返回输入批处理和目标批处理数据\n", 70 | "input_batch, target_batch = make_batch() # 生成批处理数据\n", 71 | "print(\" 输入批处理数据:\",input_batch) # 打印输入批处理数据\n", 72 | "# 将输入批处理数据中的每个索引值转换为对应的原始词\n", 73 | "input_words = []\n", 74 | "for input_idx in input_batch:\n", 75 | " input_words.append([idx_to_word[idx.item()] for idx in input_idx])\n", 76 | "print(\" 输入批处理数据对应的原始词:\",input_words)\n", 77 | "print(\" 目标批处理数据:\",target_batch) # 打印目标批处理数据\n", 78 | "# 将目标批处理数据中的每个索引值转换为对应的原始词\n", 79 | "target_words = [idx_to_word[idx.item()] for idx in target_batch]\n", 80 | "print(\" 目标批处理数据对应的原始词:\",target_words)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "id": "5b4d9351", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import torch.nn as nn # 导入神经网络模块\n", 91 | "# 定义神经概率语言模型(NPLM)\n", 92 | "class NPLM(nn.Module):\n", 93 | " def __init__(self):\n", 94 | " super(NPLM, self).__init__() \n", 95 | " self.C = nn.Embedding(voc_size, embedding_size) # 定义一个词嵌入层\n", 96 | " # 第一个线性层,其输入大小为 n_step * embedding_size,输出大小为 n_hidden\n", 97 | " self.linear1 = nn.Linear(n_step * embedding_size, n_hidden) \n", 98 | " # 第二个线性层,其输入大小为 n_hidden,输出大小为 voc_size,即词汇表大小\n", 99 | " self.linear2 = nn.Linear(n_hidden, voc_size) \n", 100 | " def 
forward(self, X): # 定义前向传播过程\n", 101 | " # 输入数据 X 张量的形状为 [batch_size, n_step]\n", 102 | " X = self.C(X) # 将 X 通过词嵌入层,形状变为 [batch_size, n_step, embedding_size] \n", 103 | " X = X.view(-1, n_step * embedding_size) # 形状变为 [batch_size, n_step * embedding_size]\n", 104 | " # 通过第一个线性层并应用 ReLU 激活函数\n", 105 | " hidden = torch.tanh(self.linear1(X)) # hidden 张量形状为 [batch_size, n_hidden]\n", 106 | " # 通过第二个线性层得到输出 \n", 107 | " output = self.linear2(hidden) # output 形状为 [batch_size, voc_size]\n", 108 | " return output # 返回输出结果" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "id": "7c13a572", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | " NPLM 模型结构: NPLM(\n", 122 | " (C): Embedding(7, 2)\n", 123 | " (linear1): Linear(in_features=4, out_features=2, bias=True)\n", 124 | " (linear2): Linear(in_features=2, out_features=7, bias=True)\n", 125 | ")\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "n_step = 2 # 时间步数,表示每个输入序列的长度,也就是上下文长度 \n", 131 | "n_hidden = 2 # 隐藏层大小\n", 132 | "embedding_size = 2 # 词嵌入大小\n", 133 | "model = NPLM() # 创建神经概率语言模型实例\n", 134 | "print(' NPLM 模型结构:', model) # 打印模型的结构" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 5, 140 | "id": "f0f3514a", 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stderr", 145 | "output_type": "stream", 146 | "text": [ 147 | "c:\\Users\\huangj2.ARES\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 148 | " from .autonotebook import tqdm as notebook_tqdm\n" 149 | ] 150 | }, 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Epoch: 1000 cost = 0.000453\n", 156 | "Epoch: 2000 cost = 0.000156\n", 157 | "Epoch: 3000 cost = 0.000070\n", 158 | "Epoch: 4000 cost = 0.000032\n", 159 | "Epoch: 5000 cost = 0.000018\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "import torch.optim as optim # 导入优化器模块\n", 165 | "criterion = nn.CrossEntropyLoss() # 定义损失函数为交叉熵损失\n", 166 | "optimizer = optim.Adam(model.parameters(), lr=0.1) # 定义优化器为 Adam,学习率为 0.1\n", 167 | "# 训练模型\n", 168 | "for epoch in range(5000): # 设置训练迭代次数\n", 169 | " optimizer.zero_grad() # 清除优化器的梯度\n", 170 | " input_batch, target_batch = make_batch() # 创建输入和目标批处理数据\n", 171 | " output = model(input_batch) # 将输入数据传入模型,得到输出结果\n", 172 | " loss = criterion(output, target_batch) # 计算损失值\n", 173 | " if (epoch + 1) % 1000 == 0: # 每 1000 次迭代,打印损失值\n", 174 | " print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n", 175 | " loss.backward() # 反向传播计算梯度\n", 176 | " optimizer.step() # 更新模型参数" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 6, 182 | "id": "e6241b8e", 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "['我', '讨厌'] -> 挨打\n", 190 | "['我', '喜欢'] -> 玩具\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "# 进行预测\n", 196 | "input_strs = [['我', '讨厌'], ['我', '喜欢']] # 需要预测的输入序列\n", 197 | "# 将输入序列转换为对应的索引\n", 198 | "input_indices = [[word_to_idx[word] for word in seq] for seq in input_strs]\n", 199 | "# 将输入序列的索引转换为张量\n", 200 | "input_batch = torch.LongTensor(input_indices) \n", 201 | "# 对输入序列进行预测,取输出中概率最大的类别\n", 202 | "predict = model(input_batch).data.max(1)[1] \n", 203 | "# 将预测结果的索引转换为对应的词\n", 204 | "predict_strs = 
[idx_to_word[n.item()] for n in predict.squeeze()] \n", 205 | "for input_seq, pred in zip(input_strs, predict_strs):\n", 206 | " print(input_seq, '->', pred) # 打印输入序列和预测结果" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3 (ipykernel)", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.11.5" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 5 231 | } 232 | -------------------------------------------------------------------------------- /03_NPLM/RNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "87677c2a", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " 词汇表: {'讨厌': 0, '我': 1, '喜欢': 2, '挨打': 3, '爸爸': 4, '玩具': 5, '爱': 6}\n", 14 | " 词汇表大小: 7\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "# 构建一个非常简单的数据集\n", 20 | "sentences = [\"我 喜欢 玩具\", \"我 爱 爸爸\", \"我 讨厌 挨打\"] \n", 21 | "# 将所有句子连接在一起,用空格分隔成多个词,再将重复的词去除,构建词汇表\n", 22 | "word_list = list(set(\" \".join(sentences).split())) \n", 23 | "# 创建一个字典,将每个词映射到一个唯一的索引\n", 24 | "word_to_idx = {word: idx for idx, word in enumerate(word_list)} \n", 25 | "# 创建一个字典,将每个索引映射到对应的词\n", 26 | "idx_to_word = {idx: word for idx, word in enumerate(word_list)} \n", 27 | "voc_size = len(word_list) # 计算词汇表的大小\n", 28 | "print(' 词汇表:', word_to_idx) # 打印词汇到索引的映射字典\n", 29 | "print(' 词汇表大小:', voc_size) # 打印词汇表大小" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "id": "0fb285df", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | " 输入批处理数据: tensor([[1, 2],\n", 43 | " [1, 0]])\n", 44 | " 输入批处理数据对应的原始词: [['我', '喜欢'], ['我', '讨厌']]\n", 45 | " 目标批处理数据: tensor([5, 3])\n", 46 | " 目标批处理数据对应的原始词: ['玩具', '挨打']\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "# 构建批处理数据\n", 52 | "import torch # 导入 PyTorch 库\n", 53 | "import random # 导入 random 库\n", 54 | "batch_size = 2 # 每批数据的大小\n", 55 | "def make_batch():\n", 56 | " input_batch = [] # 定义输入批处理列表\n", 57 | " target_batch = [] # 定义目标批处理列表\n", 58 | " selected_sentences = random.sample(sentences, batch_size) # 随机选择句子\n", 59 | " for sen in selected_sentences: # 遍历每个句子\n", 60 | " word = sen.split() # 用空格将句子分隔成多个词\n", 61 | " # 将除最后一个词以外的所有词的索引作为输入\n", 62 | " input = [word_to_idx[n] for n in word[:-1]] # 创建输入数据\n", 63 | " # 将最后一个词的索引作为目标\n", 64 | " target = word_to_idx[word[-1]] # 创建目标数据\n", 65 | " input_batch.append(input) # 将输入添加到输入批处理列表\n", 66 | " target_batch.append(target) # 将目标添加到目标批处理列表\n", 67 | " input_batch = torch.LongTensor(input_batch) # 将输入数据转换为张量\n", 68 | " target_batch = torch.LongTensor(target_batch) # 将目标数据转换为张量\n", 69 | " return input_batch, target_batch # 返回输入批处理和目标批处理数据\n", 70 | "input_batch, target_batch = make_batch() # 生成批处理数据\n", 71 | "print(\" 输入批处理数据:\",input_batch) # 打印输入批处理数据\n", 72 | "# 将输入批处理数据中的每个索引值转换为对应的原始词\n", 73 | "input_words = []\n", 74 | "for input_idx in input_batch:\n", 75 | " input_words.append([idx_to_word[idx.item()] for idx in input_idx])\n", 76 | "print(\" 输入批处理数据对应的原始词:\",input_words)\n", 77 | "print(\" 目标批处理数据:\",target_batch) # 打印目标批处理数据\n", 78 | "# 
将目标批处理数据中的每个索引值转换为对应的原始词\n", 79 | "target_words = [idx_to_word[idx.item()] for idx in target_batch]\n", 80 | "print(\" 目标批处理数据对应的原始词:\",target_words)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "id": "87f165e1", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import torch.nn as nn # 导入神经网络模块\n", 91 | "# 定义神经概率语言模型(NPLM)\n", 92 | "class NPLM(nn.Module):\n", 93 | " def __init__(self):\n", 94 | " super(NPLM, self).__init__() # 调用父类的构造函数\n", 95 | " self.C = nn.Embedding(voc_size, embedding_size) # 定义一个词嵌入层\n", 96 | " # 用 LSTM 层替代第一个线性层,其输入大小为 embedding_size,隐藏层大小为 n_hidden\n", 97 | " self.lstm = nn.LSTM(embedding_size, n_hidden, batch_first=True) \n", 98 | " # 第二个线性层,其输入大小为 n_hidden,输出大小为 voc_size,即词汇表大小\n", 99 | " self.linear = nn.Linear(n_hidden, voc_size) \n", 100 | " def forward(self, X): # 定义前向传播过程\n", 101 | " # 输入数据 X 张量的形状为 [batch_size, n_step]\n", 102 | " X = self.C(X) # 将 X 通过词嵌入层,形状变为 [batch_size, n_step, embedding_size]\n", 103 | " # 通过 LSTM 层\n", 104 | " lstm_out, _ = self.lstm(X) # lstm_out 形状变为 [batch_size, n_step, n_hidden]\n", 105 | " # 只选择最后一个时间步的输出作为全连接层的输入,通过第二个线性层得到输出 \n", 106 | " output = self.linear(lstm_out[:, -1, :]) # output 的形状为 [batch_size, voc_size]\n", 107 | " return output # 返回输出结果" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "id": "9d214ced", 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | " NPLM 模型结构: NPLM(\n", 121 | " (C): Embedding(7, 2)\n", 122 | " (lstm): LSTM(2, 2, batch_first=True)\n", 123 | " (linear): Linear(in_features=2, out_features=7, bias=True)\n", 124 | ")\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "n_step = 2 # 时间步数,表示每个输入序列的长度,也就是上下文长度 \n", 130 | "n_hidden = 2 # 隐藏层大小\n", 131 | "embedding_size = 2 # 词嵌入大小\n", 132 | "model = NPLM() # 创建神经概率语言模型实例\n", 133 | "print(' NPLM 模型结构:', model) # 打印模型的结构" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 5, 139 | "id": "f3c3dfb5", 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Epoch: 1000 cost = 0.000698\n", 147 | "Epoch: 2000 cost = 0.000196\n", 148 | "Epoch: 3000 cost = 0.000068\n", 149 | "Epoch: 4000 cost = 0.000045\n", 150 | "Epoch: 5000 cost = 0.000028\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "import torch.optim as optim # 导入优化器模块\n", 156 | "criterion = nn.CrossEntropyLoss() # 定义损失函数为交叉熵损失\n", 157 | "optimizer = optim.Adam(model.parameters(), lr=0.1) # 定义优化器为 Adam,学习率为 0.1\n", 158 | "# 训练模型\n", 159 | "for epoch in range(5000): # 设置训练迭代次数\n", 160 | " optimizer.zero_grad() # 清除优化器的梯度\n", 161 | " input_batch, target_batch = make_batch() # 创建输入和目标批处理数据\n", 162 | " output = model(input_batch) # 将输入数据传入模型,得到输出结果\n", 163 | " loss = criterion(output, target_batch) # 计算损失值\n", 164 | " if (epoch + 1) % 1000 == 0: # 每 1000 次迭代,打印损失值\n", 165 | " print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))\n", 166 | " loss.backward() # 反向传播计算梯度\n", 167 | " optimizer.step() # 更新模型参数" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 6, 173 | "id": "fcfa4b9b", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "['我', '讨厌'] -> 挨打\n", 181 | "['我', '喜欢'] -> 玩具\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "# 进行预测\n", 187 | "input_strs = [['我', '讨厌'], ['我', '喜欢']] # 需要预测的输入序列\n", 188 | "# 将输入序列转换为对应的索引\n", 189 
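
For reference, the forward pass implemented in NPLM.ipynb above (embedding, concatenation, one hidden layer, output projection) computes the logits

$$\text{logits} = W_2\,\tanh\!\big(W_1\,[C(w_{t-2});\,C(w_{t-1})] + b_1\big) + b_2,$$

where $C$ is the embedding table and the two context-word embeddings are concatenated into one vector; the softmax is folded into `nn.CrossEntropyLoss` during training (note that the code applies `torch.tanh` at this step, even though one comment mentions ReLU). RNN.ipynb here keeps the same embedding and output layers but replaces the fixed-window hidden layer with an `nn.LSTM`, so the architecture itself no longer depends on a fixed context length, even though `make_batch` still feeds it two context words.
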
| "input_indices = [[word_to_idx[word] for word in seq] for seq in input_strs]\n", 190 | "# 将输入序列的索引转换为张量\n", 191 | "input_batch = torch.LongTensor(input_indices) \n", 192 | "# 对输入序列进行预测,取输出中概率最大的类别\n", 193 | "predict = model(input_batch).data.max(1)[1] \n", 194 | "# 将预测结果的索引转换为对应的词\n", 195 | "predict_strs = [idx_to_word[n.item()] for n in predict.squeeze()] \n", 196 | "for input_seq, pred in zip(input_strs, predict_strs):\n", 197 | " print(input_seq, '->', pred) # 打印输入序列和预测结果" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "id": "7660094b", 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3 (ipykernel)", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.10.11" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 5 230 | } 231 | -------------------------------------------------------------------------------- /04_Seq2Seq/Seq2Seq.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4d476dd6", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " 句子数量: 5\n", 14 | " 中文词汇表大小: 18\n", 15 | " 英文词汇表大小: 20\n", 16 | " 中文词汇到索引的字典: {'处理': 0, '小冰': 1, '深度学习': 2, '复杂': 3, '人工智能': 4, '我': 5, '喜欢': 6, '强大': 7, '非常': 8, '自然': 9, '学习': 10, '语言': 11, '改变': 12, '爱': 13, '神经网络': 14, '咖哥': 15, '很': 16, '世界': 17}\n", 17 | " 英文词汇到索引的字典: {'world': 0, 'are': 1, 'is': 2, 'changed': 3, 'Neural-Nets': 4, 'DL': 5, 'KaGe': 6, 'likes': 7, 'XiaoBing': 8, 'AI': 9, 'complex': 10, 'the': 11, 'love': 12, 'powerful': 13, 'I': 14, 'NLP': 15, '': 16, '': 17, 'studying': 18, 'so': 19}\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "# 构建语料库,每行包含中文、英文(解码器输入)和翻译成英文后的目标输出 3 个句子\n", 23 | "sentences = [\n", 24 | " ['咖哥 喜欢 小冰', ' KaGe likes XiaoBing', 'KaGe likes XiaoBing '],\n", 25 | " ['我 爱 学习 人工智能', ' I love studying AI', 'I love studying AI '],\n", 26 | " ['深度学习 改变 世界', ' DL changed the world', 'DL changed the world '],\n", 27 | " ['自然 语言 处理 很 强大', ' NLP is so powerful', 'NLP is so powerful '],\n", 28 | " ['神经网络 非常 复杂', ' Neural-Nets are complex', 'Neural-Nets are complex ']]\n", 29 | "word_list_cn, word_list_en = [], [] # 初始化中英文词汇表\n", 30 | "# 遍历每一个句子并将单词添加到词汇表中\n", 31 | "for s in sentences:\n", 32 | " word_list_cn.extend(s[0].split())\n", 33 | " word_list_en.extend(s[1].split())\n", 34 | " word_list_en.extend(s[2].split())\n", 35 | "# 去重,得到没有重复单词的词汇表\n", 36 | "word_list_cn = list(set(word_list_cn))\n", 37 | "word_list_en = list(set(word_list_en))\n", 38 | "# 构建单词到索引的映射\n", 39 | "word2idx_cn = {w: i for i, w in enumerate(word_list_cn)}\n", 40 | "word2idx_en = {w: i for i, w in enumerate(word_list_en)}\n", 41 | "# 构建索引到单词的映射\n", 42 | "idx2word_cn = {i: w for i, w in enumerate(word_list_cn)}\n", 43 | "idx2word_en = {i: w for i, w in enumerate(word_list_en)}\n", 44 | "# 计算词汇表的大小\n", 45 | "voc_size_cn = len(word_list_cn)\n", 46 | "voc_size_en = len(word_list_en)\n", 47 | "print(\" 句子数量:\", len(sentences)) # 打印句子数\n", 48 | "print(\" 中文词汇表大小:\", voc_size_cn) # 打印中文词汇表大小\n", 49 | "print(\" 英文词汇表大小:\", 
voc_size_en) # 打印英文词汇表大小\n", 50 | "print(\" 中文词汇到索引的字典:\", word2idx_cn) # 打印中文词汇到索引的字典\n", 51 | "print(\" 英文词汇到索引的字典:\", word2idx_en) # 打印英文词汇到索引的字典" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "id": "2cf92365", 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | " 原始句子: ['我 爱 学习 人工智能', ' I love studying AI', 'I love studying AI ']\n", 65 | " 编码器输入张量的形状: torch.Size([1, 4])\n", 66 | " 解码器输入张量的形状: torch.Size([1, 5])\n", 67 | " 目标张量的形状: torch.Size([1, 5])\n", 68 | " 编码器输入张量: tensor([[ 5, 13, 10, 4]])\n", 69 | " 解码器输入张量: tensor([[17, 14, 12, 18, 9]])\n", 70 | " 目标张量: tensor([[14, 12, 18, 9, 16]])\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "import numpy as np # 导入 numpy\n", 76 | "import torch # 导入 torch\n", 77 | "import random # 导入 random 库\n", 78 | "# 定义一个函数,随机选择一个句子和词汇表生成输入、输出和目标数据\n", 79 | "def make_data(sentences):\n", 80 | " # 随机选择一个句子进行训练\n", 81 | " random_sentence = random.choice(sentences)\n", 82 | " # 将输入句子中的单词转换为对应的索引\n", 83 | " encoder_input = np.array([[word2idx_cn[n] for n in random_sentence[0].split()]])\n", 84 | " # 将输出句子中的单词转换为对应的索引\n", 85 | " decoder_input = np.array([[word2idx_en[n] for n in random_sentence[1].split()]])\n", 86 | " # 将目标句子中的单词转换为对应的索引\n", 87 | " target = np.array([[word2idx_en[n] for n in random_sentence[2].split()]])\n", 88 | " # 将输入、输出和目标批次转换为 LongTensor\n", 89 | " encoder_input = torch.LongTensor(encoder_input)\n", 90 | " decoder_input = torch.LongTensor(decoder_input)\n", 91 | " target = torch.LongTensor(target)\n", 92 | " return encoder_input, decoder_input, target \n", 93 | "# 使用 make_data 函数生成输入、输出和目标张量\n", 94 | "encoder_input, decoder_input, target = make_data(sentences)\n", 95 | "for s in sentences: # 获取原始句子\n", 96 | " if all([word2idx_cn[w] in encoder_input[0] for w in s[0].split()]):\n", 97 | " original_sentence = s\n", 98 | " break\n", 99 | "print(\" 原始句子:\", original_sentence) # 打印原始句子\n", 100 | "print(\" 编码器输入张量的形状:\", encoder_input.shape) # 打印输入张量形状\n", 101 | "print(\" 解码器输入张量的形状:\", decoder_input.shape) # 打印输出张量形状\n", 102 | "print(\" 目标张量的形状:\", target.shape) # 打印目标张量形状\n", 103 | "print(\" 编码器输入张量:\", encoder_input) # 打印输入张量\n", 104 | "print(\" 解码器输入张量:\", decoder_input) # 打印输出张量\n", 105 | "print(\" 目标张量:\", target) # 打印目标张量\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 3, 111 | "id": "3b8b3d1d", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | " 编码器结构: Encoder(\n", 119 | " (embedding): Embedding(18, 128)\n", 120 | " (rnn): RNN(128, 128, batch_first=True)\n", 121 | ")\n", 122 | " 解码器结构: Decoder(\n", 123 | " (embedding): Embedding(20, 128)\n", 124 | " (rnn): RNN(128, 128, batch_first=True)\n", 125 | " (out): Linear(in_features=128, out_features=20, bias=True)\n", 126 | ")\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "import torch.nn as nn # 导入 torch.nn 库\n", 132 | "# 定义编码器类,继承自 nn.Module\n", 133 | "class Encoder(nn.Module):\n", 134 | " def __init__(self, input_size, hidden_size):\n", 135 | " super(Encoder, self).__init__() \n", 136 | " self.hidden_size = hidden_size # 设置隐藏层大小 \n", 137 | " self.embedding = nn.Embedding(input_size, hidden_size) # 创建词嵌入层 \n", 138 | " self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True) # 创建 RNN 层 \n", 139 | " def forward(self, inputs, hidden): # 前向传播函数\n", 140 | " embedded = self.embedding(inputs) # 将输入转换为嵌入向量 \n", 141 | " output, hidden = self.rnn(embedded, hidden) # 将嵌入向量输入 RNN 
层并获取输出\n", 142 | " return output, hidden\n", 143 | "# 定义解码器类,继承自 nn.Module\n", 144 | "class Decoder(nn.Module):\n", 145 | " def __init__(self, hidden_size, output_size):\n", 146 | " super(Decoder, self).__init__() \n", 147 | " self.hidden_size = hidden_size # 设置隐藏层大小 \n", 148 | " self.embedding = nn.Embedding(output_size, hidden_size) # 创建词嵌入层\n", 149 | " self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True) # 创建 RNN 层 \n", 150 | " self.out = nn.Linear(hidden_size, output_size) # 创建线性输出层 \n", 151 | " def forward(self, inputs, hidden): # 前向传播函数 \n", 152 | " embedded = self.embedding(inputs) # 将输入转换为嵌入向量 \n", 153 | " output, hidden = self.rnn(embedded, hidden) # 将嵌入向量输入 RNN 层并获取输出 \n", 154 | " output = self.out(output) # 使用线性层生成最终输出\n", 155 | " return output, hidden\n", 156 | "n_hidden = 128 # 设置隐藏层数量\n", 157 | "# 创建编码器和解码器\n", 158 | "encoder = Encoder(voc_size_cn, n_hidden)\n", 159 | "decoder = Decoder(n_hidden, voc_size_en)\n", 160 | "print(' 编码器结构:', encoder) # 打印编码器的结构\n", 161 | "print(' 解码器结构:', decoder) # 打印解码器的结构" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 4, 167 | "id": "82dd33e3", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "S2S 模型结构: Seq2Seq(\n", 175 | " (encoder): Encoder(\n", 176 | " (embedding): Embedding(18, 128)\n", 177 | " (rnn): RNN(128, 128, batch_first=True)\n", 178 | " )\n", 179 | " (decoder): Decoder(\n", 180 | " (embedding): Embedding(20, 128)\n", 181 | " (rnn): RNN(128, 128, batch_first=True)\n", 182 | " (out): Linear(in_features=128, out_features=20, bias=True)\n", 183 | " )\n", 184 | ")\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "class Seq2Seq(nn.Module):\n", 190 | " def __init__(self, encoder, decoder):\n", 191 | " super(Seq2Seq, self).__init__()\n", 192 | " # 初始化编码器和解码器\n", 193 | " self.encoder = encoder\n", 194 | " self.decoder = decoder\n", 195 | " def forward(self, enc_input, hidden, dec_input): # 定义前向传播函数\n", 196 | " # 使输入序列通过编码器并获取输出和隐藏状态\n", 197 | " encoder_output, encoder_hidden = self.encoder(enc_input, hidden)\n", 198 | " # 将编码器的隐藏状态传递给解码器作为初始隐藏状态\n", 199 | " decoder_hidden = encoder_hidden\n", 200 | " # 使解码器输入(目标序列)通过解码器并获取输出\n", 201 | " decoder_output, _ = self.decoder(dec_input, decoder_hidden)\n", 202 | " return decoder_output\n", 203 | "# 创建 Seq2Seq 架构\n", 204 | "model = Seq2Seq(encoder, decoder)\n", 205 | "print('S2S 模型结构:', model) # 打印模型的结构" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 5, 211 | "id": "4645634a", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "Epoch: 0040 cost = 0.540944\n", 219 | "Epoch: 0080 cost = 0.072166\n", 220 | "Epoch: 0120 cost = 0.030407\n", 221 | "Epoch: 0160 cost = 0.026792\n", 222 | "Epoch: 0200 cost = 0.017567\n", 223 | "Epoch: 0240 cost = 0.011450\n", 224 | "Epoch: 0280 cost = 0.012062\n", 225 | "Epoch: 0320 cost = 0.011834\n", 226 | "Epoch: 0360 cost = 0.007469\n", 227 | "Epoch: 0400 cost = 0.007511\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "# 定义训练函数\n", 233 | "def train_seq2seq(model, criterion, optimizer, epochs):\n", 234 | " for epoch in range(epochs):\n", 235 | " encoder_input, decoder_input, target = make_data(sentences) # 训练数据的创建\n", 236 | " hidden = torch.zeros(1, encoder_input.size(0), n_hidden) # 初始化隐藏状态 \n", 237 | " optimizer.zero_grad()# 梯度清零 \n", 238 | " output = model(encoder_input, hidden, decoder_input) # 获取模型输出 \n", 239 | " loss = 
criterion(output.view(-1, voc_size_en), target.view(-1)) # 计算损失 \n", 240 | " if (epoch + 1) % 40 == 0: # 打印损失\n", 241 | " print(f\"Epoch: {epoch + 1:04d} cost = {loss:.6f}\") \n", 242 | " loss.backward()# 反向传播 \n", 243 | " optimizer.step()# 更新参数\n", 244 | "# 训练模型\n", 245 | "epochs = 400 # 训练轮次\n", 246 | "criterion = nn.CrossEntropyLoss() # 损失函数\n", 247 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # 优化器\n", 248 | "train_seq2seq(model, criterion, optimizer, epochs) # 调用函数训练模型" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 8, 254 | "id": "9877b5c7", 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "咖哥 喜欢 小冰 -> ['KaGe', 'likes', 'XiaoBing']\n", 262 | "自然 语言 处理 很 强大 -> ['NLP', 'is', 'so', 'powerful', '']\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "# 定义测试函数\n", 268 | "def test_seq2seq(model, source_sentence):\n", 269 | " # 将输入的句子转换为索引\n", 270 | " encoder_input = np.array([[word2idx_cn[n] for n in source_sentence.split()]])\n", 271 | " # 构建输出的句子的索引,以 '' 开始,后面跟 '',长度与输入句子相同\n", 272 | " decoder_input = np.array([word2idx_en['']] + [word2idx_en['']]*(len(encoder_input[0])-1))\n", 273 | " # 转换为 LongTensor 类型\n", 274 | " encoder_input = torch.LongTensor(encoder_input)\n", 275 | " decoder_input = torch.LongTensor(decoder_input).unsqueeze(0) # 增加一维 \n", 276 | " hidden = torch.zeros(1, encoder_input.size(0), n_hidden) # 初始化隐藏状态 \n", 277 | " predict = model(encoder_input, hidden, decoder_input) # 获取模型输出 \n", 278 | " predict = predict.data.max(2, keepdim=True)[1] # 获取概率最大的索引\n", 279 | " # 打印输入的句子和预测的句子\n", 280 | " print(source_sentence, '->', [idx2word_en[n.item()] for n in predict.squeeze()])\n", 281 | "# 测试模型\n", 282 | "test_seq2seq(model, '咖哥 喜欢 小冰') \n", 283 | "test_seq2seq(model, '自然 语言 处理 很 强大')" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "id": "7cf88ce0", 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [] 293 | } 294 | ], 295 | "metadata": { 296 | "kernelspec": { 297 | "display_name": "Python 3 (ipykernel)", 298 | "language": "python", 299 | "name": "python3" 300 | }, 301 | "language_info": { 302 | "codemirror_mode": { 303 | "name": "ipython", 304 | "version": 3 305 | }, 306 | "file_extension": ".py", 307 | "mimetype": "text/x-python", 308 | "name": "python", 309 | "nbconvert_exporter": "python", 310 | "pygments_lexer": "ipython3", 311 | "version": "3.10.11" 312 | } 313 | }, 314 | "nbformat": 4, 315 | "nbformat_minor": 5 316 | } 317 | -------------------------------------------------------------------------------- /05_Attention/Dot-Product_Attention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "95acce4b", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import torch # 导入 torch\n", 11 | "import torch.nn.functional as F # 导入 nn.functional\n", 12 | "# 1. 创建两个张量 x1 和 x2\n", 13 | "x1 = torch.randn(2, 3, 4) # 形状 (batch_size, seq_len1, feature_dim)\n", 14 | "x2 = torch.randn(2, 5, 4) # 形状 (batch_size, seq_len2, feature_dim)\n", 15 | "# 2. 计算原始权重\n", 16 | "raw_weights = torch.bmm(x1, x2.transpose(1, 2)) # 形状 (batch_size, seq_len1, seq_len2)\n", 17 | "# 3. 用 softmax 函数对原始权重进行归一化\n", 18 | "attn_weights = F.softmax(raw_weights, dim=2) # 形状 (batch_size, seq_len1, seq_len2)\n", 19 | "# 4. 
将注意力权重与 x2 相乘,计算加权和\n", 20 | "attn_output = torch.bmm(attn_weights, x2) # 形状 (batch_size, seq_len1, feature_dim)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "8fa6cc98", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "x1: tensor([[[ 0.0971, -0.5204, 0.3034, -0.8555],\n", 34 | " [ 1.1608, -1.5700, -0.6016, 0.3667],\n", 35 | " [-1.5075, 0.0321, -1.5487, -0.5641]],\n", 36 | "\n", 37 | " [[-1.2064, 0.8690, -0.4074, 0.2248],\n", 38 | " [ 1.1715, 0.1240, -0.7232, 0.5079],\n", 39 | " [-0.4021, 1.6824, -0.2344, 0.5673]]])\n", 40 | "x2: tensor([[[ 0.5456, 1.2140, 0.9290, -1.3675],\n", 41 | " [ 0.9062, -1.7574, -0.1825, 0.4145],\n", 42 | " [-1.3821, -0.9140, 0.6957, 0.0766],\n", 43 | " [-0.3361, 0.7526, 1.0915, -0.3276],\n", 44 | " [ 0.4812, -0.1446, -0.6791, 0.5942]],\n", 45 | "\n", 46 | " [[-1.5356, -0.2674, 1.5787, 0.3518],\n", 47 | " [ 0.7072, -0.2149, 0.6725, -0.9577],\n", 48 | " [-0.4562, -0.9658, -0.2043, 0.4609],\n", 49 | " [ 1.3268, -0.7196, -0.6243, 1.6647],\n", 50 | " [ 0.0198, 1.0432, -0.1927, -2.1842]]])\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "# 创建两个张量 x1 和 x2\n", 56 | "x1 = torch.randn(2, 3, 4) # 形状 (batch_size, seq_len1, feature_dim)\n", 57 | "x2 = torch.randn(2, 5, 4) # 形状 (batch_size, seq_len2, feature_dim)\n", 58 | "print(\"x1:\", x1)\n", 59 | "print(\"x2:\", x2)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "id": "b79f661d", 66 | "metadata": { 67 | "scrolled": true 68 | }, 69 | "outputs": [ 70 | { 71 | "name": "stdout", 72 | "output_type": "stream", 73 | "text": [ 74 | " 原始权重: tensor([[[ 8.7287e-01, 5.9256e-01, 4.8711e-01, 1.8720e-01, -5.9238e-01],\n", 75 | " [-2.3330e+00, 4.0729e+00, -5.5992e-01, -2.3485e+00, 1.4121e+00],\n", 76 | " [-1.4509e+00, -1.3736e+00, 9.3359e-01, -9.7488e-01, -1.3552e-02]],\n", 77 | "\n", 78 | " [[ 1.0561e+00, -1.5290e+00, -1.0207e-01, -1.5974e+00, 4.7008e-01],\n", 79 | " [-2.7952e+00, -1.7101e-01, -2.7233e-01, 2.7622e+00, -8.1742e-01],\n", 80 | " [-2.9749e-03, -1.3468e+00, -1.1321e+00, -6.5329e-01, 5.5310e-01]]])\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "# 计算点积,得到原始权重,形状为 (batch_size, seq_len1, seq_len2)\n", 86 | "raw_weights = torch.bmm(x1, x2.transpose(1, 2))\n", 87 | "print(\" 原始权重:\", raw_weights) " 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "id": "7fe47f56", 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | " 归一化后的注意力权重: tensor([[[0.3154, 0.2383, 0.2145, 0.1589, 0.0729],\n", 101 | " [0.0015, 0.9234, 0.0090, 0.0015, 0.0645],\n", 102 | " [0.0533, 0.0576, 0.5788, 0.0858, 0.2245]],\n", 103 | "\n", 104 | " [[0.4959, 0.0374, 0.1558, 0.0349, 0.2760],\n", 105 | " [0.0034, 0.0470, 0.0424, 0.8826, 0.0246],\n", 106 | " [0.2597, 0.0678, 0.0840, 0.1356, 0.4530]]])\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "import torch.nn.functional as F # 导入 torch.nn.functional\n", 112 | "# 应用 softmax 函数,使权重的值在 0 和 1 之间,且每一行的和为 1\n", 113 | "attn_weights = F.softmax(raw_weights, dim=-1) # 归一化\n", 114 | "print(\" 归一化后的注意力权重:\", attn_weights)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "id": "765f34aa", 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | " 注意力输出 : tensor([[[ 0.0733, -0.1229, 0.5227, -0.3249],\n", 128 | " [ 0.8558, -1.6374, -0.2031, 0.4193],\n", 129 | " [-0.6394, -0.5333, 
0.3829, 0.1005]],\n", 130 | "\n", 131 | " [[-0.7544, -0.0283, 0.7013, -0.3343],\n", 132 | " [ 1.1801, -0.6614, -0.5275, 1.3913],\n", 133 | " [-0.2004, 0.2098, 0.2665, -0.6985]]])\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "# 与 x2 相乘,得到注意力分布的加权和,形状为 (batch_size, seq_len1, feature_dim)\n", 139 | "attn_output = torch.bmm(attn_weights, x2)\n", 140 | "print(\" 注意力输出 :\", attn_output)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "b84776b2", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "c6999321", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "d2bc56ee", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 3 (ipykernel)", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.10.11" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 5 189 | } 190 | -------------------------------------------------------------------------------- /05_Attention/Multiheads-Self-Attention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "dd0823d2", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | " 加权信息 : tensor([[[ 0.2412, -0.0175, 0.2173, -0.5284],\n", 14 | " [ 0.2514, -0.0224, 0.2202, -0.5313],\n", 15 | " [ 0.2364, -0.0168, 0.2185, -0.5227]],\n", 16 | "\n", 17 | " [[-0.0085, 0.0158, 0.0018, -0.3527],\n", 18 | " [-0.1217, 0.0532, -0.0055, -0.2765],\n", 19 | " [-0.0809, 0.0917, -0.0446, -0.4277]]], grad_fn=)\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "import torch\n", 25 | "import torch.nn.functional as F\n", 26 | "# 一个形状为 (batch_size, seq_len, feature_dim) 的张量 x\n", 27 | "x = torch.randn(2, 3, 4) # 形状 (batch_size, seq_len, feature_dim) \n", 28 | "# 定义头数和每个头的维度\n", 29 | "num_heads = 2\n", 30 | "head_dim = 2\n", 31 | "# feature_dim 必须是 num_heads * head_dim 的整数倍\n", 32 | "assert x.size(-1) == num_heads * head_dim\n", 33 | "# 定义线性层用于将 x 转换为 Q, K, V 向量\n", 34 | "linear_q = torch.nn.Linear(4, 4)\n", 35 | "linear_k = torch.nn.Linear(4, 4)\n", 36 | "linear_v = torch.nn.Linear(4, 4)\n", 37 | "# 通过线性层计算 Q, K, V\n", 38 | "Q = linear_q(x) # 形状 (batch_size, seq_len, feature_dim) \n", 39 | "K = linear_k(x) # 形状 (batch_size, seq_len, feature_dim) \n", 40 | "V = linear_v(x) # 形状 (batch_size, seq_len, feature_dim) \n", 41 | "# 将 Q, K, V 分割成 num_heads 个头\n", 42 | "def split_heads(tensor, num_heads):\n", 43 | " batch_size, seq_len, feature_dim = tensor.size()\n", 44 | " head_dim = feature_dim // num_heads\n", 45 | " output = tensor.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)\n", 46 | " return output # 形状 (batch_size, num_heads, seq_len, feature_dim)\n", 47 | "Q = split_heads(Q, num_heads) # 形状 (batch_size, num_heads, seq_len, head_dim)\n", 48 | "K = split_heads(K, num_heads) # 形状 (batch_size, num_heads, seq_len, head_dim)\n", 49 | "V = split_heads(V, num_heads) # 
形状 (batch_size, num_heads, seq_len, head_dim)\n", 50 | "# 计算 Q 和 K 的点积,作为相似度分数 , 也就是自注意力原始权重\n", 51 | "raw_weights = torch.matmul(Q, K.transpose(-2, -1)) # 形状 (batch_size, num_heads, seq_len, seq_len)\n", 52 | "# 对自注意力原始权重进行缩放\n", 53 | "scale_factor = K.size(-1) ** 0.5\n", 54 | "scaled_weights = raw_weights / scale_factor # 形状 (batch_size, num_heads, seq_len, seq_len)\n", 55 | "# 对缩放后的权重进行 softmax 归一化,得到注意力权重\n", 56 | "attn_weights = F.softmax(scaled_weights, dim=-1) # 形状 (batch_size, num_heads, seq_len, seq_len)\n", 57 | "# 将注意力权重应用于 V 向量,计算加权和,得到加权信息\n", 58 | "attn_outputs = torch.matmul(attn_weights, V) # 形状 (batch_size, num_heads, seq_len, head_dim)\n", 59 | "# 将所有头的结果拼接起来\n", 60 | "def combine_heads(tensor):\n", 61 | " batch_size, num_heads, seq_len, head_dim = tensor.size()\n", 62 | " feature_dim = num_heads * head_dim\n", 63 | " output = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, feature_dim)\n", 64 | " return output# 形状 : (batch_size, seq_len, feature_dim)\n", 65 | "attn_outputs = combine_heads(attn_outputs) # 形状 (batch_size, seq_len, feature_dim) \n", 66 | "# 对拼接后的结果进行线性变换\n", 67 | "linear_out = torch.nn.Linear(4, 4)\n", 68 | "attn_outputs = linear_out(attn_outputs) # 形状 (batch_size, seq_len, feature_dim) \n", 69 | "print(\" 加权信息 :\", attn_outputs)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "219601c1", 75 | "metadata": {}, 76 | "source": [] 77 | } 78 | ], 79 | "metadata": { 80 | "kernelspec": { 81 | "display_name": "Python 3 (ipykernel)", 82 | "language": "python", 83 | "name": "python3" 84 | }, 85 | "language_info": { 86 | "codemirror_mode": { 87 | "name": "ipython", 88 | "version": 3 89 | }, 90 | "file_extension": ".py", 91 | "mimetype": "text/x-python", 92 | "name": "python", 93 | "nbconvert_exporter": "python", 94 | "pygments_lexer": "ipython3", 95 | "version": "3.11.5" 96 | } 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 5 100 | } 101 | -------------------------------------------------------------------------------- /05_Attention/QKV.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "889f1070", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import torch\n", 11 | "import torch.nn.functional as F\n", 12 | "#1. 创建 Query、Key 和 Value 张量\n", 13 | "q = torch.randn(2, 3, 4) # 形状 (batch_size, seq_len1, feature_dim)\n", 14 | "k = torch.randn(2, 4, 4) # 形状 (batch_size, seq_len2, feature_dim)\n", 15 | "v = torch.randn(2, 4, 4) # 形状 (batch_size, seq_len2, feature_dim)\n", 16 | "# 2. 计算点积,得到原始权重,形状为 (batch_size, seq_len1, seq_len2)\n", 17 | "raw_weights = torch.bmm(q, k.transpose(1, 2))\n", 18 | "# 3. 将原始权重进行缩放(可选),形状仍为 (batch_size, seq_len1, seq_len2)\n", 19 | "scaling_factor = q.size(-1) ** 0.5\n", 20 | "scaled_weights = raw_weights / scaling_factor\n", 21 | "# 4. 应用 softmax 函数,使结果的值在 0 和 1 之间,且每一行的和为 1\n", 22 | "attn_weights = F.softmax(scaled_weights, dim=-1) # 形状仍为 (batch_size, seq_len1, seq_len2)\n", 23 | "# 5. 
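
Multiheads-Self-Attention.ipynb above implements the standard multi-head decomposition

$$\mathrm{MultiHead}(Q,K,V)=\mathrm{Concat}(\mathrm{head}_1,\dots,\mathrm{head}_h)\,W^{O},\qquad \mathrm{head}_i=\mathrm{Attention}(QW_i^{Q},\,KW_i^{K},\,VW_i^{V}).$$

In that code, a single `linear_q`/`linear_k`/`linear_v` projection is computed for all heads at once and `split_heads` reshapes it into `num_heads` heads of size `head_dim`; `combine_heads` performs the concatenation and `linear_out` plays the role of $W^{O}$.
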
与 Value 相乘,得到注意力分布的加权和 , 形状为 (batch_size, seq_len1, feature_dim)\n", 24 | "attn_output = torch.bmm(attn_weights, v)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "id": "10067227", 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "tensor([[[-0.5476, 0.1485, 0.0807, -0.2692],\n", 37 | " [-0.5118, -0.1278, -0.1846, -0.9184],\n", 38 | " [-0.4600, -0.0676, -0.1448, -0.7520]],\n", 39 | "\n", 40 | " [[-0.8415, 0.8045, 1.5253, 0.4558],\n", 41 | " [-0.4667, 0.6294, 0.8493, 0.4709],\n", 42 | " [-0.1052, 0.5399, -0.1296, 0.5918]]])" 43 | ] 44 | }, 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "attn_output" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "0c60484d", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3 (ipykernel)", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.10.11" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 5 84 | } 85 | -------------------------------------------------------------------------------- /05_Attention/Scaled_Dot-Product_Attention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "id": "ff9ab1cc", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import torch # 导入 torch\n", 11 | "import torch.nn.functional as F # 导入 nn.functional\n", 12 | "# 1. 创建两个张量 x1 和 x2\n", 13 | "x1 = torch.randn(2, 3, 4) # 形状 (batch_size, seq_len1, feature_dim)\n", 14 | "x2 = torch.randn(2, 5, 4) # 形状 (batch_size, seq_len2, feature_dim)\n", 15 | "# 2. 计算张量点积,得到原始权重\n", 16 | "raw_weights = torch.bmm(x1, x2.transpose(1, 2)) # 形状 (batch_size, seq_len1, seq_len2)\n", 17 | "# 3. 将原始权重除以缩放因子\n", 18 | "scaling_factor = x1.size(-1) ** 0.5\n", 19 | "scaled_weights = raw_weights / scaling_factor # 形状 (batch_size, seq_len1, seq_len2)\n", 20 | "# 4. 对原始权重进行归一化\n", 21 | "attn_weights = F.softmax(scaled_weights, dim=2) # 形 状 (batch_size, seq_len1, seq_len2)\n", 22 | "# 5. 
使用注意力权重对 x2 加权求和\n", 23 | "attn_output = torch.bmm(attn_weights, x2) # 形状 (batch_size, seq_len1, feature_dim)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 11, 29 | "id": "9973de74", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "tensor([[[-0.2783, 0.0243, -0.3903, -0.2246],\n", 36 | " [-0.1287, -0.1590, -0.7114, -0.2604],\n", 37 | " [ 0.1000, -0.0938, -0.5747, -0.1973]],\n", 38 | "\n", 39 | " [[-0.0887, 0.2160, -0.1005, 0.0643],\n", 40 | " [-0.6038, -0.5416, 0.2732, 0.0515],\n", 41 | " [-0.2174, -0.0502, -0.1419, -0.1929]]])" 42 | ] 43 | }, 44 | "execution_count": 11, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "attn_output" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "97ab79b9", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [] 60 | } 61 | ], 62 | "metadata": { 63 | "kernelspec": { 64 | "display_name": "Python 3 (ipykernel)", 65 | "language": "python", 66 | "name": "python3" 67 | }, 68 | "language_info": { 69 | "codemirror_mode": { 70 | "name": "ipython", 71 | "version": 3 72 | }, 73 | "file_extension": ".py", 74 | "mimetype": "text/x-python", 75 | "name": "python", 76 | "nbconvert_exporter": "python", 77 | "pygments_lexer": "ipython3", 78 | "version": "3.11.5" 79 | } 80 | }, 81 | "nbformat": 4, 82 | "nbformat_minor": 5 83 | } 84 | -------------------------------------------------------------------------------- /05_Attention/Self-Attention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5842113a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import torch\n", 11 | "import torch.nn.functional as F\n", 12 | "# 一个形状为 (batch_size, seq_len, feature_dim) 的张量 x\n", 13 | "x = torch.randn(2, 3, 4)\n", 14 | "# 计算原始权重,形状为 (batch_size, seq_len, seq_len)\n", 15 | "raw_weights = torch.bmm(x, x.transpose(1, 2))\n", 16 | "# 对原始权重进行 softmax 归一化,形状为 (batch_size, seq_len, seq_len)\n", 17 | "attn_weights = F.softmax(raw_weights, dim=2)\n", 18 | "# 计算加权和,形状为 (batch_size, seq_len, feature_dim) \n", 19 | "attn_outputs = torch.bmm(attn_weights, x)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "id": "b7ec4b56", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | " 加权信息 : tensor([[[ 0.5676, -0.0132, -0.8214, -0.0548],\n", 33 | " [ 0.5352, -0.1170, -0.5392, -0.0256],\n", 34 | " [ 0.6141, -0.1343, -0.5587, -0.0331]],\n", 35 | "\n", 36 | " [[ 0.5973, -0.2426, -0.3217, -0.0335],\n", 37 | " [ 0.5996, -0.1914, -0.2840, 0.0152],\n", 38 | " [ 0.6117, -0.2507, -0.3363, -0.0404]]], grad_fn=)\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "# 一个形状为 (batch_size, seq_len, feature_dim) 的张量 x\n", 44 | "x = torch.randn(2, 3, 4) # 形状 (batch_size, seq_len, feature_dim)\n", 45 | "# 定义线性层用于将 x 转换为 Q, K, V 向量\n", 46 | "linear_q = torch.nn.Linear(4, 4)\n", 47 | "linear_k = torch.nn.Linear(4, 4)\n", 48 | "linear_v = torch.nn.Linear(4, 4)\n", 49 | "# 通过线性层计算 Q, K, V\n", 50 | "Q = linear_q(x) # 形状 (batch_size, seq_len, feature_dim)\n", 51 | "K = linear_k(x) # 形状 (batch_size, seq_len, feature_dim)\n", 52 | "V = linear_v(x) # 形状 (batch_size, seq_len, feature_dim)\n", 53 | "# 计算 Q 和 K 的点积,作为相似度分数 , 也就是自注意力原始权重\n", 54 | "raw_weights = torch.bmm(Q, K.transpose(1, 2)) # 形状 (batch_size, seq_len, seq_len)\n", 55 | "# 将自注意力原始权重进行缩放\n", 
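
The scaling step on the next line (dividing the raw scores by $\sqrt{d_k}$ before the softmax) is the common thread of the attention notebooks in this folder; in matrix form they all compute

$$\mathrm{Attention}(Q,K,V)=\mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V,$$

where $d_k$ is the feature dimension of the keys (4 in these toy examples). Dividing by $\sqrt{d_k}$ keeps the dot products from growing with the dimension, so the softmax stays away from its saturated region and gradients remain usable.
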
56 | "scale_factor = K.size(-1) ** 0.5 # 这里是 4 ** 0.5\n", 57 | "scaled_weights = raw_weights / scale_factor # 形状 (batch_size, seq_len, seq_len)\n", 58 | "# 对缩放后的权重进行 softmax 归一化,得到注意力权重\n", 59 | "attn_weights = F.softmax(scaled_weights, dim=2) # 形状 (batch_size, seq_len, seq_len)\n", 60 | "# 将注意力权重应用于 V 向量,计算加权和,得到加权信息\n", 61 | "attn_outputs = torch.bmm(attn_weights, V) # 形状 (batch_size, seq_len, feature_dim)\n", 62 | "print(\" 加权信息 :\", attn_outputs)" 63 | ] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 3 (ipykernel)", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.10.11" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 5 87 | } 88 | -------------------------------------------------------------------------------- /07_GPT/GPT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8875c253", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np # 导入 numpy 库\n", 11 | "import torch # 导入 torch 库\n", 12 | "import torch.nn as nn # 导入 torch.nn 库\n", 13 | "d_k = 64 # K(=Q) 维度\n", 14 | "d_v = 64 # V 维度\n", 15 | "# 定义缩放点积注意力类\n", 16 | "class ScaledDotProductAttention(nn.Module):\n", 17 | " def __init__(self):\n", 18 | " super(ScaledDotProductAttention, self).__init__() \n", 19 | " def forward(self, Q, K, V, attn_mask):\n", 20 | " #------------------------- 维度信息 -------------------------------- \n", 21 | " # Q K V [batch_size, n_heads, len_q/k/v, dim_q=k/v] (dim_q=dim_k)\n", 22 | " # attn_mask [batch_size, n_heads, len_q, len_k]\n", 23 | " #----------------------------------------------------------------\n", 24 | " # 计算注意力分数(原始权重)[batch_size,n_heads,len_q,len_k]\n", 25 | " scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) \n", 26 | " #------------------------- 维度信息 -------------------------------- \n", 27 | " # scores [batch_size, n_heads, len_q, len_k]\n", 28 | " #----------------------------------------------------------------- \n", 29 | " # 使用注意力掩码,将 attn_mask 中值为 1 的位置的权重替换为极小值\n", 30 | " #------------------------- 维度信息 -------------------------------- \n", 31 | " # attn_mask [batch_size, n_heads, len_q, len_k], 形状和 scores 相同\n", 32 | " #----------------------------------------------------------------- \n", 33 | " scores.masked_fill_(attn_mask, -1e9) \n", 34 | " # 对注意力分数进行 softmax 归一化\n", 35 | " weights = nn.Softmax(dim=-1)(scores) \n", 36 | " #------------------------- 维度信息 -------------------------------- \n", 37 | " # weights [batch_size, n_heads, len_q, len_k], 形状和 scores 相同\n", 38 | " #----------------------------------------------------------------- \n", 39 | " # 计算上下文向量(也就是注意力的输出), 是上下文信息的紧凑表示\n", 40 | " context = torch.matmul(weights, V) \n", 41 | " #------------------------- 维度信息 -------------------------------- \n", 42 | " # context [batch_size, n_heads, len_q, dim_v]\n", 43 | " #----------------------------------------------------------------- \n", 44 | " return context, weights # 返回上下文向量和注意力分数" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "id": "76fb9e1e", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# 定义多头自注意力类\n", 55 | "d_embedding = 512 # 
Embedding 的维度\n", 56 | "n_heads = 8 # Multi-Head Attention 中头的个数\n", 57 | "batch_size = 3 # 每一批的数据大小\n", 58 | "class MultiHeadAttention(nn.Module):\n", 59 | " def __init__(self):\n", 60 | " super(MultiHeadAttention, self).__init__()\n", 61 | " self.W_Q = nn.Linear(d_embedding, d_k * n_heads) # Q的线性变换层\n", 62 | " self.W_K = nn.Linear(d_embedding, d_k * n_heads) # K的线性变换层\n", 63 | " self.W_V = nn.Linear(d_embedding, d_v * n_heads) # V的线性变换层\n", 64 | " self.linear = nn.Linear(n_heads * d_v, d_embedding)\n", 65 | " self.layer_norm = nn.LayerNorm(d_embedding)\n", 66 | " def forward(self, Q, K, V, attn_mask): \n", 67 | " #------------------------- 维度信息 -------------------------------- \n", 68 | " # Q K V [batch_size, len_q/k/v, embedding_dim] \n", 69 | " #----------------------------------------------------------------- \n", 70 | " residual, batch_size = Q, Q.size(0) # 保留残差连接\n", 71 | " # 将输入进行线性变换和重塑,以便后续处理\n", 72 | " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) \n", 73 | " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)\n", 74 | " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)\n", 75 | " #------------------------- 维度信息 -------------------------------- \n", 76 | " # q_s k_s v_s: [batch_size, n_heads, len_q/k/v, d_q=k/v]\n", 77 | " #----------------------------------------------------------------- \n", 78 | " # 将注意力掩码复制到多头 attn_mask: [batch_size, n_heads, len_q, len_k]\n", 79 | " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)\n", 80 | " #------------------------- 维度信息 -------------------------------- \n", 81 | " # attn_mask [batch_size, n_heads, len_q, len_k]\n", 82 | " #----------------------------------------------------------------- \n", 83 | " # 使用缩放点积注意力计算上下文和注意力权重\n", 84 | " context, weights = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n", 85 | " #------------------------- 维度信息 -------------------------------- \n", 86 | " # context [batch_size, n_heads, len_q, dim_v]\n", 87 | " # weights [batch_size, n_heads, len_q, len_k]\n", 88 | " #----------------------------------------------------------------- \n", 89 | " # 通过调整维度将多个头的上下文向量连接在一起\n", 90 | " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) \n", 91 | " #------------------------- 维度信息 -------------------------------- \n", 92 | " # context [batch_size, len_q, n_heads * dim_v]\n", 93 | " #----------------------------------------------------------------- \n", 94 | " # 用一个线性层把连接后的多头自注意力结果转换,原始地嵌入维度\n", 95 | " output = self.linear(context) \n", 96 | " #------------------------- 维度信息 -------------------------------- \n", 97 | " # output [batch_size, len_q, embedding_dim]\n", 98 | " #----------------------------------------------------------------- \n", 99 | " # 与输入 (Q) 进行残差链接,并进行层归一化后输出\n", 100 | " output = self.layer_norm(output + residual)\n", 101 | " #------------------------- 维度信息 -------------------------------- \n", 102 | " # output [batch_size, len_q, embedding_dim]\n", 103 | " #----------------------------------------------------------------- \n", 104 | " return output, weights # 返回层归一化的输出和注意力权重" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "id": "19035b9c", 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "# 定义逐位置前馈网络类\n", 115 | "class PoswiseFeedForwardNet(nn.Module):\n", 116 | " def __init__(self, d_ff=2048):\n", 117 | " super(PoswiseFeedForwardNet, self).__init__()\n", 118 | " # 定义一维卷积层 1,用于将输入映射到更高维度\n", 119 | " self.conv1 = 
nn.Conv1d(in_channels=d_embedding, out_channels=d_ff, kernel_size=1)\n", 120 | " # 定义一维卷积层 2,用于将输入映射回原始维度\n", 121 | " self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_embedding, kernel_size=1)\n", 122 | " # 定义层归一化\n", 123 | " self.layer_norm = nn.LayerNorm(d_embedding)\n", 124 | " def forward(self, inputs): \n", 125 | " #------------------------- 维度信息 -------------------------------- \n", 126 | " # inputs [batch_size, len_q, embedding_dim]\n", 127 | " #---------------------------------------------------------------- \n", 128 | " residual = inputs # 保留残差连接 \n", 129 | " # 在卷积层 1 后使用 ReLU 激活函数 \n", 130 | " output = nn.ReLU()(self.conv1(inputs.transpose(1, 2))) \n", 131 | " #------------------------- 维度信息 -------------------------------- \n", 132 | " # output [batch_size, d_ff, len_q]\n", 133 | " #----------------------------------------------------------------\n", 134 | " # 使用卷积层 2 进行降维 \n", 135 | " output = self.conv2(output).transpose(1, 2) \n", 136 | " #------------------------- 维度信息 -------------------------------- \n", 137 | " # output [batch_size, len_q, embedding_dim]\n", 138 | " #----------------------------------------------------------------\n", 139 | " # 与输入进行残差链接,并进行层归一化\n", 140 | " output = self.layer_norm(output + residual) \n", 141 | " #------------------------- 维度信息 -------------------------------- \n", 142 | " # output [batch_size, len_q, embedding_dim]\n", 143 | " #----------------------------------------------------------------\n", 144 | " return output # 返回加入残差连接后层归一化的结果" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "id": "222df7a8", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# 生成正弦位置编码表的函数,用于在 Transformer 中引入位置信息\n", 155 | "def get_sin_enc_table(n_position, embedding_dim):\n", 156 | " #------------------------- 维度信息 --------------------------------\n", 157 | " # n_position: 输入序列的最大长度\n", 158 | " # embedding_dim: 词嵌入向量的维度\n", 159 | " #----------------------------------------------------------------- \n", 160 | " # 根据位置和维度信息,初始化正弦位置编码表\n", 161 | " sinusoid_table = np.zeros((n_position, embedding_dim)) \n", 162 | " # 遍历所有位置和维度,计算角度值\n", 163 | " for pos_i in range(n_position):\n", 164 | " for hid_j in range(embedding_dim):\n", 165 | " angle = pos_i / np.power(10000, 2 * (hid_j // 2) / embedding_dim)\n", 166 | " sinusoid_table[pos_i, hid_j] = angle \n", 167 | " # 计算正弦和余弦值\n", 168 | " sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 偶数维\n", 169 | " sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 奇数维 \n", 170 | " #------------------------- 维度信息 --------------------------------\n", 171 | " # sinusoid_table 的维度是 [n_position, embedding_dim]\n", 172 | " #---------------------------------------------------------------- \n", 173 | " return torch.FloatTensor(sinusoid_table) # 返回正弦位置编码表" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 5, 179 | "id": "68799d07", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "# 定义填充注意力掩码函数\n", 184 | "def get_attn_pad_mask(seq_q, seq_k):\n", 185 | " #------------------------- 维度信息 --------------------------------\n", 186 | " # seq_q 的维度是 [batch_size, len_q]\n", 187 | " # seq_k 的维度是 [batch_size, len_k]\n", 188 | " #-----------------------------------------------------------------\n", 189 | " batch_size, len_q = seq_q.size()\n", 190 | " batch_size, len_k = seq_k.size()\n", 191 | " # 生成布尔类型张量\n", 192 | " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # token 的编码值为 0\n", 193 | " 
#------------------------- 维度信息 --------------------------------\n", 194 | " # pad_attn_mask 的维度是 [batch_size,1,len_k]\n", 195 | " #-----------------------------------------------------------------\n", 196 | " # 变形为与注意力分数相同形状的张量 \n", 197 | " pad_attn_mask = pad_attn_mask.expand(batch_size, len_q, len_k)\n", 198 | " #------------------------- 维度信息 --------------------------------\n", 199 | " # pad_attn_mask 的维度是 [batch_size,len_q,len_k]\n", 200 | " #-----------------------------------------------------------------\n", 201 | " return pad_attn_mask # 返回填充位置的注意力掩码" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 6, 207 | "id": "aa3cc68b", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# 定义编码器层类\n", 212 | "class EncoderLayer(nn.Module):\n", 213 | " def __init__(self):\n", 214 | " super(EncoderLayer, self).__init__() \n", 215 | " self.enc_self_attn = MultiHeadAttention() # 多头自注意力层 \n", 216 | " self.pos_ffn = PoswiseFeedForwardNet() # 位置前馈神经网络层\n", 217 | " def forward(self, enc_inputs, enc_self_attn_mask):\n", 218 | " #------------------------- 维度信息 --------------------------------\n", 219 | " # enc_inputs 的维度是 [batch_size, seq_len, embedding_dim]\n", 220 | " # enc_self_attn_mask 的维度是 [batch_size, seq_len, seq_len]\n", 221 | " #-----------------------------------------------------------------\n", 222 | " # 将相同的 Q,K,V 输入多头自注意力层 , 返回的 attn_weights 增加了头数 \n", 223 | " enc_outputs, attn_weights = self.enc_self_attn(enc_inputs, enc_inputs,\n", 224 | " enc_inputs, enc_self_attn_mask)\n", 225 | " #------------------------- 维度信息 --------------------------------\n", 226 | " # enc_outputs 的维度是 [batch_size, seq_len, embedding_dim] \n", 227 | " # attn_weights 的维度是 [batch_size, n_heads, seq_len, seq_len] \n", 228 | " # 将多头自注意力 outputs 输入位置前馈神经网络层\n", 229 | " enc_outputs = self.pos_ffn(enc_outputs) # 维度与 enc_inputs 相同\n", 230 | " #------------------------- 维度信息 --------------------------------\n", 231 | " # enc_outputs 的维度是 [batch_size, seq_len, embedding_dim] \n", 232 | " #-----------------------------------------------------------------\n", 233 | " return enc_outputs, attn_weights # 返回编码器输出和每层编码器注意力权重" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 7, 239 | "id": "574e723c", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# 定义编码器类\n", 244 | "n_layers = 6 # 设置 Encoder 的层数\n", 245 | "class Encoder(nn.Module):\n", 246 | " def __init__(self, corpus):\n", 247 | " super(Encoder, self).__init__() \n", 248 | " self.src_emb = nn.Embedding(len(corpus.src_vocab), d_embedding) # 词嵌入层\n", 249 | " self.pos_emb = nn.Embedding.from_pretrained( \\\n", 250 | " get_sin_enc_table(corpus.src_len+1, d_embedding), freeze=True) # 位置嵌入层\n", 251 | " self.layers = nn.ModuleList(EncoderLayer() for _ in range(n_layers))# 编码器层数\n", 252 | " def forward(self, enc_inputs): \n", 253 | " #------------------------- 维度信息 --------------------------------\n", 254 | " # enc_inputs 的维度是 [batch_size, source_len]\n", 255 | " #-----------------------------------------------------------------\n", 256 | " # 创建一个从 1 到 source_len 的位置索引序列\n", 257 | " pos_indices = torch.arange(1, enc_inputs.size(1) + 1).unsqueeze(0).to(enc_inputs)\n", 258 | " #------------------------- 维度信息 --------------------------------\n", 259 | " # pos_indices 的维度是 [1, source_len]\n", 260 | " #----------------------------------------------------------------- \n", 261 | " # 对输入进行词嵌入和位置嵌入相加 [batch_size, source_len,embedding_dim]\n", 262 | " enc_outputs = self.src_emb(enc_inputs) + 
self.pos_emb(pos_indices)\n", 263 | " #------------------------- 维度信息 --------------------------------\n", 264 | " # enc_outputs 的维度是 [batch_size, seq_len, embedding_dim]\n", 265 | " #-----------------------------------------------------------------\n", 266 | " # 生成自注意力掩码\n", 267 | " enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs) \n", 268 | " #------------------------- 维度信息 --------------------------------\n", 269 | " # enc_self_attn_mask 的维度是 [batch_size, len_q, len_k] \n", 270 | " #----------------------------------------------------------------- \n", 271 | " enc_self_attn_weights = [] # 初始化 enc_self_attn_weights\n", 272 | " # 通过编码器层 [batch_size, seq_len, embedding_dim]\n", 273 | " for layer in self.layers: \n", 274 | " enc_outputs, enc_self_attn_weight = layer(enc_outputs, enc_self_attn_mask)\n", 275 | " enc_self_attn_weights.append(enc_self_attn_weight)\n", 276 | " #------------------------- 维度信息 --------------------------------\n", 277 | " # enc_outputs 的维度是 [batch_size, seq_len, embedding_dim] 维度与 enc_inputs 相同\n", 278 | " # enc_self_attn_weights 是一个列表,每个元素的维度是 [batch_size, n_heads, seq_len, seq_len] \n", 279 | " #-----------------------------------------------------------------\n", 280 | " return enc_outputs, enc_self_attn_weights # 返回编码器输出和编码器注意力权重" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 8, 286 | "id": "fe3ae05e", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# 生成后续注意力掩码的函数,用于在多头自注意力计算中忽略未来信息\n", 291 | "def get_attn_subsequent_mask(seq):\n", 292 | " #------------------------- 维度信息 --------------------------------\n", 293 | " # seq 的维度是 [batch_size, seq_len(Q)=seq_len(K)]\n", 294 | " #-----------------------------------------------------------------\n", 295 | " # 获取输入序列的形状\n", 296 | " attn_shape = [seq.size(0), seq.size(1), seq.size(1)] \n", 297 | " #------------------------- 维度信息 --------------------------------\n", 298 | " # attn_shape 是一个一维张量 [batch_size, seq_len(Q), seq_len(K)]\n", 299 | " #-----------------------------------------------------------------\n", 300 | " # 使用 numpy 创建一个上三角矩阵(triu = triangle upper)\n", 301 | " subsequent_mask = np.triu(np.ones(attn_shape), k=1)\n", 302 | " #------------------------- 维度信息 --------------------------------\n", 303 | " # subsequent_mask 的维度是 [batch_size, seq_len(Q), seq_len(K)]\n", 304 | " #-----------------------------------------------------------------\n", 305 | " # 将 numpy 数组转换为 PyTorch 张量,并将数据类型设置为 byte(布尔值)\n", 306 | " subsequent_mask = torch.from_numpy(subsequent_mask).byte()\n", 307 | " #------------------------- 维度信息 --------------------------------\n", 308 | " # 返回的 subsequent_mask 的维度是 [batch_size, seq_len(Q), seq_len(K)]\n", 309 | " #-----------------------------------------------------------------\n", 310 | " return subsequent_mask # 返回后续位置的注意力掩码" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 9, 316 | "id": "0b1f63af", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "# 定义解码器层类\n", 321 | "class DecoderLayer(nn.Module):\n", 322 | " def __init__(self):\n", 323 | " super(DecoderLayer, self).__init__()\n", 324 | " self.self_attn = MultiHeadAttention() # 多头自注意力层\n", 325 | " self.feed_forward = PoswiseFeedForwardNet() # 逐位置前馈网络层\n", 326 | " self.norm1 = nn.LayerNorm(d_embedding) # 第一个层归一化\n", 327 | " self.norm2 = nn.LayerNorm(d_embedding) # 第二个层归一化\n", 328 | " def forward(self, dec_inputs, attn_mask=None):\n", 329 | " # 使用多头自注意力处理输入\n", 330 | " attn_output, _ = self.self_attn(dec_inputs, dec_inputs, 
dec_inputs, attn_mask)\n", 331 | " # 将注意力输出与输入相加并进行第一个层归一化\n", 332 | " norm1_outputs = self.norm1(dec_inputs + attn_output)\n", 333 | " # 将归一化后的输出输入到位置前馈神经网络\n", 334 | " ff_outputs = self.feed_forward(norm1_outputs)\n", 335 | " # 将前馈神经网络输出与第一次归一化后的输出相加并进行第二个层归一化\n", 336 | " dec_outputs = self.norm2(norm1_outputs + ff_outputs)\n", 337 | " return dec_outputs # 返回解码器层输出" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 10, 343 | "id": "e3a141f3", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# 定义解码器类\n", 348 | "n_layers = 6 # 设置 Decoder 的层数\n", 349 | "class Decoder(nn.Module):\n", 350 | " def __init__(self, vocab_size, max_seq_len):\n", 351 | " super(Decoder, self).__init__()\n", 352 | " # 词嵌入层(参数为词典维度)\n", 353 | " self.src_emb = nn.Embedding(vocab_size, d_embedding) \n", 354 | " # 位置编码层(参数为序列长度)\n", 355 | " self.pos_emb = nn.Embedding(max_seq_len, d_embedding)\n", 356 | " # 初始化 N 个解码器层 \n", 357 | " self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)]) \n", 358 | " def forward(self, dec_inputs): \n", 359 | " # 创建位置信息\n", 360 | " positions = torch.arange(len(dec_inputs), device=dec_inputs.device).unsqueeze(-1)\n", 361 | " # 将词嵌入与位置编码相加\n", 362 | " inputs_embedding = self.src_emb(dec_inputs) + self.pos_emb(positions)\n", 363 | " # 生成自注意力掩码\n", 364 | " attn_mask = get_attn_subsequent_mask(inputs_embedding).to(device)\n", 365 | " # 初始化解码器输入,这是第一层解码器层的输入 \n", 366 | " dec_outputs = inputs_embedding \n", 367 | " for layer in self.layers:\n", 368 | " # 将输入数据传递给解码器层,并返回解码器层的输出,作为下一层的输入\n", 369 | " dec_outputs = layer(dec_outputs, attn_mask) \n", 370 | " return dec_outputs # 返回解码器输出" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 11, 376 | "id": "db8425ba", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# 定义 GPT 模型\n", 381 | "class GPT(nn.Module):\n", 382 | " def __init__(self, vocab_size, max_seq_len):\n", 383 | " super(GPT, self).__init__()\n", 384 | " self.decoder = Decoder(vocab_size, max_seq_len) # 解码器,用于学习文本生成能力\n", 385 | " self.projection = nn.Linear(d_embedding, vocab_size) # 全连接层,输出预测结果\n", 386 | " def forward(self, dec_inputs): \n", 387 | " dec_outputs = self.decoder(dec_inputs) # 将输入数据传递给解码器\n", 388 | " logits = self.projection(dec_outputs) # 传递给全连接层以生成预测\n", 389 | " return logits # 返回预测结果" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 12, 395 | "id": "f83e1746", 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "# 构建语料库\n", 400 | "from collections import Counter\n", 401 | "class LanguageCorpus:\n", 402 | " def __init__(self, sentences):\n", 403 | " self.sentences = sentences\n", 404 | " # 计算语言的最大句子长度,并加 2 以容纳特殊符号 \n", 405 | " self.seq_len = max([len(sentence.split()) for sentence in sentences]) + 2\n", 406 | " self.vocab = self.create_vocabulary() # 创建源语言和目标语言的词汇表\n", 407 | " self.idx2word = {v: k for k, v in self.vocab.items()} # 创建索引到单词的映射\n", 408 | " def create_vocabulary(self):\n", 409 | " vocab = {'': 0, '': 1, '': 2}\n", 410 | " counter = Counter()\n", 411 | " # 统计语料库的单词频率\n", 412 | " for sentence in self.sentences:\n", 413 | " words = sentence.split()\n", 414 | " counter.update(words)\n", 415 | " # 创建词汇表,并为每个单词分配一个唯一的索引\n", 416 | " for word in counter:\n", 417 | " if word not in vocab:\n", 418 | " vocab[word] = len(vocab)\n", 419 | " return vocab\n", 420 | " def make_batch(self, batch_size, test_batch=False):\n", 421 | " input_batch, output_batch = [], [] # 初始化批数据\n", 422 | " sentence_indices = 
torch.randperm(len(self.sentences))[:batch_size] # 随机选择句子索引\n", 423 | " for index in sentence_indices:\n", 424 | " sentence = self.sentences[index]\n", 425 | " # 将句子转换为索引序列\n", 426 | " seq = [self.vocab['']] + [self.vocab[word] for word in sentence.split()] + [self.vocab['']]\n", 427 | " seq += [self.vocab['']] * (self.seq_len - len(seq)) # 对序列进行填充\n", 428 | " # 将处理好的序列添加到批次中\n", 429 | " input_batch.append(seq[:-1])\n", 430 | " output_batch.append(seq[1:])\n", 431 | " return torch.LongTensor(input_batch), torch.LongTensor(output_batch)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 13, 437 | "id": "8837f3ff", 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "name": "stdout", 442 | "output_type": "stream", 443 | "text": [ 444 | " 语料库词汇表大小 : 133\n", 445 | " 最长句子长度 : 17\n" 446 | ] 447 | } 448 | ], 449 | "source": [ 450 | "with open(\"lang.txt\", \"r\") as file: # 从文件中读入语料\n", 451 | " sentences = [line.strip() for line in file.readlines()]\n", 452 | "corpus = LanguageCorpus(sentences) # 创建语料库\n", 453 | "vocab_size = len(corpus.vocab) # 词汇表大小\n", 454 | "max_seq_len = corpus.seq_len # 最大句子长度(用于设置位置编码)\n", 455 | "print(f\" 语料库词汇表大小 : {vocab_size}\") # 打印词汇表大小\n", 456 | "print(f\" 最长句子长度 : {max_seq_len}\") # 打印最大序列长" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 14, 462 | "id": "13885e14", 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "name": "stderr", 467 | "output_type": "stream", 468 | "text": [ 469 | "C:\\Users\\huangj2.ARES\\AppData\\Local\\Temp\\ipykernel_27644\\1153561833.py:24: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at C:\\b\\abs_bao0hdcrdh\\croot\\pytorch_1675190257512\\work\\aten\\src\\ATen\\native\\TensorAdvancedIndexing.cpp:1582.)\n", 470 | " scores.masked_fill_(attn_mask, -1e9)\n", 471 | "C:\\Users\\huangj2.ARES\\AppData\\Local\\anaconda3\\lib\\site-packages\\torch\\autograd\\__init__.py:173: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. 
(Triggered internally at C:\\b\\abs_bao0hdcrdh\\croot\\pytorch_1675190257512\\work\\aten\\src\\ATen\\native\\TensorAdvancedIndexing.cpp:1582.)\n", 472 | " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n" 473 | ] 474 | }, 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "Epoch: 0100 cost = 0.497253\n", 480 | "Epoch: 0200 cost = 0.294283\n", 481 | "Epoch: 0300 cost = 0.261129\n", 482 | "Epoch: 0400 cost = 0.240861\n", 483 | "Epoch: 0500 cost = 0.242571\n" 484 | ] 485 | } 486 | ], 487 | "source": [ 488 | "import torch.optim as optim # 导入优化器\n", 489 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\" # 设置设备\n", 490 | "model = GPT(vocab_size, max_seq_len).to(device) # 创建 GPT 模型实例\n", 491 | "criterion = nn.CrossEntropyLoss() # 损失函数\n", 492 | "optimizer = optim.Adam(model.parameters(), lr=0.0001) # 优化器\n", 493 | "epochs = 500 # 训练轮次\n", 494 | "for epoch in range(epochs): # 训练 epochs 轮\n", 495 | " optimizer.zero_grad() # 梯度清零\n", 496 | " inputs, targets = corpus.make_batch(batch_size) # 创建训练数据\n", 497 | " inputs, targets = inputs.to(device), targets.to(device)\n", 498 | " outputs = model(inputs) # 获取模型输出 \n", 499 | " loss = criterion(outputs.view(-1, vocab_size), targets.view(-1)) # 计算损失\n", 500 | " if (epoch + 1) % 100 == 0: # 打印损失\n", 501 | " print(f\"Epoch: {epoch + 1:04d} cost = {loss:.6f}\")\n", 502 | " loss.backward() # 反向传播\n", 503 | " optimizer.step() # 更新参数" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 15, 509 | "id": "faad80bd", 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "name": "stderr", 514 | "output_type": "stream", 515 | "text": [ 516 | "C:\\Users\\huangj2.ARES\\AppData\\Local\\Temp\\ipykernel_27644\\1153561833.py:24: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. 
(Triggered internally at C:\\b\\abs_bao0hdcrdh\\croot\\pytorch_1675190257512\\work\\aten\\src\\ATen\\native\\TensorAdvancedIndexing.cpp:1582.)\n", 517 | " scores.masked_fill_(attn_mask, -1e9)\n" 518 | ] 519 | }, 520 | { 521 | "name": "stdout", 522 | "output_type": "stream", 523 | "text": [ 524 | " 生成的文本 : Python is a popular programming language.\n" 525 | ] 526 | } 527 | ], 528 | "source": [ 529 | "# 测试文本生成\n", 530 | "def generate_text(model, input_str, max_len=50):\n", 531 | " model.eval() # 将模型设置为评估(测试)模式,关闭 dropout 和 batch normalization 等训练相关的层\n", 532 | " # 将输入字符串中的每个 token 转换为其在词汇表中的索引\n", 533 | " input_tokens = [corpus.vocab[token] for token in input_str]\n", 534 | " # 创建一个新列表,将输入的 tokens 复制到输出 tokens 中 , 目前只有输入的词\n", 535 | " output_tokens = input_tokens.copy()\n", 536 | " with torch.no_grad(): # 禁用梯度计算,以节省内存并加速测试过程\n", 537 | " for _ in range(max_len): # 生成最多 max_len 个 tokens\n", 538 | " # 将输出的 token 转换为 PyTorch 张量,并增加一个代表批次的维度 [1, len(output_tokens)]\n", 539 | " inputs = torch.LongTensor(output_tokens).unsqueeze(0).to(device)\n", 540 | " outputs = model(inputs) # 输出 logits 形状为 [1, len(output_tokens), vocab_size]\n", 541 | " # 在最后一个维度上获取 logits 中的最大值,并返回其索引(即下一个 token)\n", 542 | " _, next_token = torch.max(outputs[:, -1, :], dim=-1) \n", 543 | " next_token = next_token.item() # 将张量转换为 Python 整数 \n", 544 | " if next_token == corpus.vocab[\"\"]:\n", 545 | " break # 如果生成的 token 是 EOS(结束符),则停止生成过程 \n", 546 | " output_tokens.append(next_token) # 将生成的 tokens 添加到 output_tokens 列表\n", 547 | " # 将输出 tokens 转换回文本字符串\n", 548 | " output_str = \" \".join([corpus.idx2word[token] for token in output_tokens])\n", 549 | " return output_str\n", 550 | "input_str = [\"Python\"] # 输入一个词:Python\n", 551 | "generated_text = generate_text(model, input_str) # 模型跟着这个词生成后续文本\n", 552 | "print(\" 生成的文本 :\", generated_text) # 打印预测文本" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "id": "40708d6e", 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "id": "d64ca095", 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [] 570 | } 571 | ], 572 | "metadata": { 573 | "kernelspec": { 574 | "display_name": "Python 3 (ipykernel)", 575 | "language": "python", 576 | "name": "python3" 577 | }, 578 | "language_info": { 579 | "codemirror_mode": { 580 | "name": "ipython", 581 | "version": 3 582 | }, 583 | "file_extension": ".py", 584 | "mimetype": "text/x-python", 585 | "name": "python", 586 | "nbconvert_exporter": "python", 587 | "pygments_lexer": "ipython3", 588 | "version": "3.10.11" 589 | } 590 | }, 591 | "nbformat": 4, 592 | "nbformat_minor": 5 593 | } 594 | -------------------------------------------------------------------------------- /07_GPT/WikiGPT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 54, 6 | "id": "ea9e29a9", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import torch\n", 12 | "import torch.nn as nn # 导入torch.nn库\n", 13 | "d_k = 64 # K(=Q)维度\n", 14 | "d_v = 64 # V维度\n", 15 | "# 定义缩放点积注意力类\n", 16 | "class ScaledDotProductAttention(nn.Module):\n", 17 | " def __init__(self):\n", 18 | " super(ScaledDotProductAttention, self).__init__() \n", 19 | " def forward(self, Q, K, V, attn_mask): \n", 20 | " # Q K V [batch_size, n_heads, len_q/k/v, dim_q=k/v] (dim_q=dim_k)\n", 21 | " # 
计算注意力分数(原始权重)[batch_size,n_heads,len_q,len_k]\n", 22 | " scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) \n", 23 | " # 使用注意力掩码,将attn_mask中值为1的位置的权重替换为极小值\n", 24 | " # attn_mask [batch_size,n_heads,len_q,len_k],形状和scores相同\n", 25 | " scores.masked_fill_(attn_mask, -1e9) \n", 26 | " # 对注意力分数进行softmax\n", 27 | " weights = nn.Softmax(dim=-1)(scores)\n", 28 | " # 计算上下文向量(也就是注意力的输出), 是上下文信息的紧凑表示\n", 29 | " context = torch.matmul(weights, V)\n", 30 | " return context, weights # 返回上下文向量和注意力分数" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 55, 36 | "id": "cc173797", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# 定义多头注意力类\n", 41 | "d_embedding = 512 # Embedding Size\n", 42 | "n_heads = 8 # number of heads in Multi-Head Attention\n", 43 | "batch_size = 3 # 每一批数据量\n", 44 | "class MultiHeadAttention(nn.Module):\n", 45 | " def __init__(self):\n", 46 | " super(MultiHeadAttention, self).__init__()\n", 47 | " self.W_Q = nn.Linear(d_embedding, d_k * n_heads) # Q的线性变换层\n", 48 | " self.W_K = nn.Linear(d_embedding, d_k * n_heads) # K的线性变换层\n", 49 | " self.W_V = nn.Linear(d_embedding, d_v * n_heads) # V的线性变换层\n", 50 | " self.linear = nn.Linear(n_heads * d_v, d_embedding)\n", 51 | " self.layer_norm = nn.LayerNorm(d_embedding)\n", 52 | "\n", 53 | " def forward(self, Q, K, V, attn_mask): \n", 54 | " # Q K V [batch_size,len_q/k/v,embedding_dim] \n", 55 | " residual, batch_size = Q, Q.size(0) # 保留残差连接\n", 56 | " # 将输入进行线性变换和重塑,以便后续处理\n", 57 | " # q_s k_s v_s: [batch_size,n_heads.,len_q/k/v,d_q=k/v]\n", 58 | " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) \n", 59 | " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)\n", 60 | " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)\n", 61 | " # 将注意力掩码复制到多头 [batch_size,n_heads,len_q,len_k]\n", 62 | " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)\n", 63 | " # 使用缩放点积注意力计算上下文和注意力权重\n", 64 | " context, weights = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n", 65 | " # 重塑上下文向量并进行线性变换,[batch_size,len_q,n_heads * dim_v]\n", 66 | " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) \n", 67 | " output = self.linear(context)\n", 68 | " # 与输入(Q)进行残差链接,并进行层归一化后输出[batch_size, len_q, embedding_dim]\n", 69 | " output = self.layer_norm(output + residual)\n", 70 | " return output, weights # 返回层归一化的输出和注意力权重" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 56, 76 | "id": "ea0d367d", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# 定义逐位置前向传播网络类\n", 81 | "class PoswiseFeedForwardNet(nn.Module):\n", 82 | " def __init__(self):\n", 83 | " super(PoswiseFeedForwardNet, self).__init__()\n", 84 | " # 定义一维卷积层1,用于将输入映射到更高维度\n", 85 | " self.conv1 = nn.Conv1d(in_channels=d_embedding, out_channels=2048, kernel_size=1)\n", 86 | " # 定义一维卷积层2,用于将输入映射回原始维度\n", 87 | " self.conv2 = nn.Conv1d(in_channels=2048, out_channels=d_embedding, kernel_size=1)\n", 88 | " # 定义层归一化\n", 89 | " self.layer_norm = nn.LayerNorm(d_embedding)\n", 90 | "\n", 91 | " def forward(self, inputs): \n", 92 | " # inputs: [batch_size, len_q, embedding_dim] \n", 93 | " residual = inputs # 保留残差连接\n", 94 | " # 在卷积层1后使用ReLU激活函数\n", 95 | " output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))\n", 96 | " # 使用卷积层2进行降维\n", 97 | " output = self.conv2(output).transpose(1, 2)\n", 98 | " # 与输入进行残差链接,并进行层归一化,[batch_size, len_q, embedding_dim]\n", 99 | " output = self.layer_norm(output + residual)\n", 100 | " return output # 返回层归一化后的输出加上残差连接的结果" 
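# ---------------------------------------------------------------------------
# 补充示意(非原笔记本内容):PoswiseFeedForwardNet 用 kernel_size=1 的 Conv1d
# 实现“逐位置”前馈,其效果与对每个位置共享同一个 nn.Linear 等价。
# 下面是一个最小验证示例,d_model、d_ff、seq_len 等数值均为演示假设。
import torch
import torch.nn as nn

d_model, d_ff, seq_len = 8, 32, 5                 # 演示用的小维度
x = torch.randn(2, seq_len, d_model)              # [batch_size, seq_len, d_model]

conv = nn.Conv1d(d_model, d_ff, kernel_size=1)    # 1x1 卷积:只在通道维做线性组合
lin = nn.Linear(d_model, d_ff)                    # 普通线性层
with torch.no_grad():                             # 让两者共享同一组参数
    lin.weight.copy_(conv.weight.squeeze(-1))     # [d_ff, d_model, 1] -> [d_ff, d_model]
    lin.bias.copy_(conv.bias)

y_conv = conv(x.transpose(1, 2)).transpose(1, 2)  # Conv1d 需要 [batch, channel, len]
y_lin = lin(x)                                    # Linear 直接作用在最后一维
print(torch.allclose(y_conv, y_lin, atol=1e-6))   # True:两种写法逐位置等价
# ---------------------------------------------------------------------------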
101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 57, 106 | "id": "f1dab244", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import numpy as np\n", 111 | "def get_sin_enc_table(n_position, embedding_dim):\n", 112 | " # 根据位置和维度信息,初始化正弦位置编码表\n", 113 | " sinusoid_table = np.zeros((n_position, embedding_dim)) \n", 114 | " # 遍历所有位置和维度,计算角度值\n", 115 | " for pos_i in range(n_position):\n", 116 | " for hid_j in range(embedding_dim):\n", 117 | " angle = pos_i / np.power(10000, 2 * (hid_j // 2) / embedding_dim)\n", 118 | " sinusoid_table[pos_i, hid_j] = angle \n", 119 | " # 计算正弦和余弦值\n", 120 | " sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 偶数维\n", 121 | " sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 奇数维 \n", 122 | " return torch.FloatTensor(sinusoid_table)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 58, 128 | "id": "f1f32613", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# 生成填充注意力掩码的函数,用于在多头自注意力计算中忽略填充部分\n", 133 | "def get_attn_pad_mask(seq_q, seq_k):\n", 134 | " batch_size, len_q = seq_q.size()\n", 135 | " batch_size, len_k = seq_k.size()\n", 136 | " # 生成布尔类型张量[batch_size,1,len_k(=len_q)]\n", 137 | " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # Token的编码值为0 \n", 138 | " # 变形为何注意力分数相同形状的张量 [batch_size,len_q,len_k]\n", 139 | " pad_attn_mask = pad_attn_mask.expand(batch_size, len_q, len_k) \n", 140 | " return pad_attn_mask # 形状[batch_size,len_q,len_k]" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 59, 146 | "id": "c2cb5167", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# 生成后续注意力掩码的函数,用于在多头自注意力计算中忽略未来信息\n", 151 | "def get_attn_subsequent_mask(seq):\n", 152 | " # 获取输入序列的形状 [batch_size, seq_len(len_q), seq_len(len_k)]\n", 153 | " attn_shape = [seq.size(0), seq.size(1), seq.size(1)]\n", 154 | " # 使用numpy创建一个上三角矩阵(triu = triangle upper)\n", 155 | " subsequent_mask = np.triu(np.ones(attn_shape), k=1)\n", 156 | " # 将numpy数组转换为PyTorch张量,并将数据类型设置为byte(布尔值)\n", 157 | " subsequent_mask = torch.from_numpy(subsequent_mask).byte()\n", 158 | " return subsequent_mask # [batch_size, seq_len(len_q), seq_len(len_k)]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 60, 164 | "id": "e8bfb20e", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# 构建解码器层\n", 169 | "class DecoderLayer(nn.Module):\n", 170 | " def __init__(self):\n", 171 | " super(DecoderLayer, self).__init__()\n", 172 | " self.self_attn = MultiHeadAttention() # 多头自注意力层\n", 173 | " self.feed_forward = PoswiseFeedForwardNet() # 位置前馈神经网络层\n", 174 | " self.norm1 = nn.LayerNorm(d_embedding) # 第一个层归一化\n", 175 | " self.norm2 = nn.LayerNorm(d_embedding) # 第二个层归一化\n", 176 | "\n", 177 | " def forward(self, dec_inputs, attn_mask=None):\n", 178 | " # 使用多头自注意力处理输入\n", 179 | " attn_output, _ = self.self_attn(dec_inputs, dec_inputs, dec_inputs, attn_mask)\n", 180 | " # 将注意力输出与输入相加并进行第一个层归一化\n", 181 | " norm1_outputs = self.norm1(dec_inputs + attn_output)\n", 182 | " # 将归一化后的输出输入到位置前馈神经网络\n", 183 | " ff_outputs = self.feed_forward(norm1_outputs)\n", 184 | " # 将前馈神经网络输出与第一次归一化后的输出相加并进行第二个层归一化\n", 185 | " dec_outputs = self.norm2(norm1_outputs + ff_outputs)\n", 186 | " return dec_outputs" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 61, 192 | "id": "33d35d08", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# 构建解码器\n", 197 | "n_layers = 6 # 设置Encoder/Decoder的层数\n", 198 | "device 
= \"cuda\" if torch.cuda.is_available() else \"cpu\" # 设置设备\n", 199 | "class Decoder(nn.Module):\n", 200 | " def __init__(self, vocab_size, max_seq_len):\n", 201 | " super(Decoder, self).__init__()\n", 202 | " self.src_emb = nn.Embedding(vocab_size, d_embedding) # 词嵌入层(参数为词典维度)\n", 203 | " self.pos_emb = nn.Embedding(max_seq_len, d_embedding) # 位置编码层(参数为序列长度) \n", 204 | " self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)]) # 初始化N个解码器层\n", 205 | "\n", 206 | " def forward(self, dec_inputs): \n", 207 | " positions = torch.arange(len(dec_inputs), device=dec_inputs.device).unsqueeze(-1) #位置信息 \n", 208 | " inputs_embedding = self.src_emb(dec_inputs) + self.pos_emb(positions) # 词嵌入与位置编码相加 \n", 209 | " attn_mask = get_attn_subsequent_mask(inputs_embedding).to(device) # 生成自注意力掩码 \n", 210 | " for layer in self.layers:\n", 211 | " dec_outputs = layer(inputs_embedding, attn_mask) # 将输入数据传递给解码器层\n", 212 | " return dec_outputs" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 62, 218 | "id": "82e6acd6", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "class GPT(nn.Module):\n", 223 | " def __init__(self, vocab_size, max_seq_len):\n", 224 | " super(GPT, self).__init__()\n", 225 | " self.decoder = Decoder(vocab_size, max_seq_len) # 解码器,用于学习文本生成能力\n", 226 | " self.projection = nn.Linear(d_embedding, vocab_size) # 全连接层,输出预测结果\n", 227 | "\n", 228 | " def forward(self, dec_inputs): \n", 229 | " dec_outputs = self.decoder(dec_inputs) # 将输入数据传递给解码器\n", 230 | " logits = self.projection(dec_outputs) # 传递给全连接层以生成预测\n", 231 | " return logits #返回预测结果" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 63, 237 | "id": "991a37e5", 238 | "metadata": {}, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "词汇表大小: 28785\n", 245 | "词汇示例(word to index): {'': 0, '': 1, '': 2, 'the': 3, 'apple': 11505}\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "from torchtext.datasets import WikiText2 # 导入WikiText2\n", 251 | "from torchtext.data.utils import get_tokenizer # 导入Tokenizer分词工具\n", 252 | "from torchtext.vocab import build_vocab_from_iterator # 导入Vocabulary工具\n", 253 | "from torch.utils.data import DataLoader, Dataset # 导入Pytorch的DataLoader和Dataset\n", 254 | "\n", 255 | "tokenizer = get_tokenizer(\"basic_english\") # 定义数据预处理所需的tokenizer\n", 256 | "\n", 257 | "train_iter = WikiText2(split='train') # 加载WikiText2数据集的训练部分\n", 258 | "valid_iter = WikiText2(split='valid') # 加载WikiText2数据集的验证部分\n", 259 | "\n", 260 | "# 定义一个生成器函数,用于将数据集中的文本转换为tokens\n", 261 | "def yield_tokens(data_iter):\n", 262 | " for item in data_iter:\n", 263 | " yield tokenizer(item)\n", 264 | "\n", 265 | "# 创建词汇表,包括特殊tokens:\"\", \"\", \"\"\n", 266 | "vocab = build_vocab_from_iterator(yield_tokens(train_iter), \n", 267 | " specials=[\"\", \"\", \"\"])\n", 268 | "vocab.set_default_index(vocab[\"\"])\n", 269 | "\n", 270 | "# 打印词汇表信息\n", 271 | "print(\"词汇表大小:\", len(vocab))\n", 272 | "print(\"词汇示例(word to index):\", \n", 273 | " {word: vocab[word] for word in [\"\", \"\", \"\", \"the\", \"apple\"]})" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 64, 279 | "id": "e33341df", 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "Dataset数据条目: 36718\n", 287 | "输入序列张量样例: tensor([ 1, 2659, 3478, 17569, 9098])\n", 288 | "目标序列张量样例: tensor([ 2659, 3478, 17569, 9098, 2])\n", 289 | "输入序列样例文本: 96 ammunition packing boxes\n", 290 
| "目标序列样例文本: 96 ammunition packing boxes \n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "from torch.utils.data import Dataset # 导入Dataset\n", 296 | "max_seq_len = 256 # 设置序列的最大长度\n", 297 | "\n", 298 | "# 定义一个处理WikiText2数据集的自定义数据集类\n", 299 | "class WikiDataset(Dataset):\n", 300 | " def __init__(self, data_iter, vocab, max_len=max_seq_len):\n", 301 | " self.data = [] \n", 302 | " for sentence in data_iter: # 遍历数据集,将文本转换为tokens\n", 303 | " # 对每个句子进行tokenization,并截取长度为max_len-2,为留出空间\n", 304 | " tokens = tokenizer(sentence)[:max_len - 2]\n", 305 | " tokens = [vocab[\"\"]] + vocab(tokens) + [vocab[\"\"]] # 添加 \n", 306 | " self.data.append(tokens) # 将处理好的tokens添加到数据集中\n", 307 | " \n", 308 | " def __len__(self): # 定义数据集的长度\n", 309 | " return len(self.data) \n", 310 | " \n", 311 | " def __getitem__(self, idx): # 定义数据集的索引方法 (即抽取数据条目) \n", 312 | " source = self.data[idx][:-1] # 获取当前数据,并将移除,作为source \n", 313 | " target = self.data[idx][1:] # 获取当前数据,并将移除,作为target(右移1位) \n", 314 | " return torch.tensor(source), torch.tensor(target) # 转换为tensor并返回\n", 315 | "\n", 316 | "train_dataset = WikiDataset(train_iter, vocab) # 创建训练数据集\n", 317 | "valid_dataset = WikiDataset(valid_iter, vocab) # 创建验证数据集\n", 318 | "print(f\"Dataset数据条目: {len(train_dataset)}\")\n", 319 | "sample_source, sample_target = train_dataset[100]\n", 320 | "print(f\"输入序列张量样例: {sample_source}\")\n", 321 | "print(f\"目标序列张量样例: {sample_target}\")\n", 322 | "decoded_source = ' '.join(vocab.lookup_tokens(sample_source.tolist()))\n", 323 | "decoded_target = ' '.join(vocab.lookup_tokens(sample_target.tolist()))\n", 324 | "print(f\"输入序列样例文本: {decoded_source}\")\n", 325 | "print(f\"目标序列样例文本: {decoded_target}\")" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 65, 331 | "id": "c61f26b8", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "from torch.utils.data import DataLoader # 导入Dataloader\n", 336 | "# 定义pad_sequence函数,用于将一批序列补齐到相同长度\n", 337 | "def pad_sequence(sequences, padding_value=0, length=None):\n", 338 | " # 计算最大序列长度,如果length参数未提供,则使用输入序列中的最大长度\n", 339 | " max_length = max(len(seq) for seq in sequences) if length is None else length \n", 340 | " # 创建一个具有适当形状的全零张量,用于存储补齐后的序列\n", 341 | " result = torch.full((len(sequences), max_length), padding_value, dtype=torch.long) \n", 342 | " # 遍历序列,将每个序列的内容复制到结果张量中\n", 343 | " for i, seq in enumerate(sequences):\n", 344 | " end = len(seq)\n", 345 | " result[i, :end] = seq[:end]\n", 346 | " return result\n", 347 | "\n", 348 | "# 定义collate_fn函数,用于将一个批次的数据整理成适当的形状\n", 349 | "def collate_fn(batch):\n", 350 | " # 从批次中分离源序列和目标序列\n", 351 | " sources, targets = zip(*batch) \n", 352 | " # 计算批次中的最大序列长度\n", 353 | " max_length = max(max(len(s) for s in sources), max(len(t) for t in targets)) \n", 354 | " # 使用pad_sequence函数补齐源序列和目标序列\n", 355 | " sources = pad_sequence(sources, padding_value=vocab[\"\"], length=max_length)\n", 356 | " targets = pad_sequence(targets, padding_value=vocab[\"\"], length=max_length) \n", 357 | " # 返回补齐后的源序列和目标序列\n", 358 | " return sources, targets\n", 359 | "\n", 360 | "# 创建一个训练数据加载器,使用自定义的collate_fn函数\n", 361 | "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, \n", 362 | " shuffle=True, collate_fn=collate_fn)\n", 363 | "# 创建一个验证数据加载器,使用自定义的collate_fn函数\n", 364 | "valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size,\n", 365 | " shuffle=False, collate_fn=collate_fn)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 66, 371 | "id": "54dd5b39", 372 | "metadata": { 373 | 
"scrolled": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "import torch.optim as optim # 导入优化器\n", 378 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\" # 设置设备\n", 379 | "model = GPT(len(vocab), max_seq_len).to(device) # 创建GPT模型实例\n", 380 | "criterion = nn.CrossEntropyLoss(ignore_index=vocab[\"\"])\n", 381 | "optimizer = optim.Adam(model.parameters(), lr=0.0001) # 优化器\n", 382 | "epochs = 2 # 训练轮次\n", 383 | "\n", 384 | "for epoch in range(epochs):\n", 385 | " epoch_loss = 0\n", 386 | " for batch_idx, (source, target) in enumerate(train_dataloader): # 用Dataloader加载数据\n", 387 | " inputs, targets = source.to(device), target.to(device)\n", 388 | " optimizer.zero_grad() # 梯度清零\n", 389 | " outputs = model(inputs) # 获取模型输出\n", 390 | " loss = criterion(outputs.view(-1, len(vocab)), targets.view(-1)) # 计算损失\n", 391 | " loss.backward() # 反向传播\n", 392 | " optimizer.step() # 更新参数\n", 393 | " epoch_loss += loss.item() \n", 394 | " if (batch_idx + 1) % 500 == 0: # 每500个批次打印一次损失\n", 395 | " print(f\"Batch {batch_idx + 1}/{len(train_dataloader)}, Loss: {loss.item()}\") \n", 396 | " epoch_loss /= len(train_dataloader) # 每轮打印一次损失\n", 397 | " print(f\"Epoch {epoch + 1}/{epochs}, Average Loss: {epoch_loss}\")" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 67, 403 | "id": "1401a8fd", 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "# import time\n", 408 | "# from datetime import datetime\n", 409 | "\n", 410 | "# # Save the trained model\n", 411 | "# timestamp = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')\n", 412 | "# model_file_name = f\"trained_model_{timestamp}.pt\"\n", 413 | "# torch.save(model.state_dict(), model_file_name)\n", 414 | "# print(f\"Model saved as {model_file_name}\")" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 91, 420 | "id": "5b58ecd2", 421 | "metadata": {}, 422 | "outputs": [ 423 | { 424 | "name": "stderr", 425 | "output_type": "stream", 426 | "text": [ 427 | "/tmp/ipykernel_1792588/278596197.py:16: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at /opt/conda/conda-bld/pytorch_1670525551200/work/aten/src/ATen/native/cuda/Indexing.cu:1435.)\n", 428 | " scores.masked_fill_(attn_mask, -1e9)\n" 429 | ] 430 | }, 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "生成的文本: how are you ' species nests . 
common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common starlings are common\n" 436 | ] 437 | } 438 | ], 439 | "source": [ 440 | "# Replace 'model_timestamp.pt' with your saved model's filename\n", 441 | "model.load_state_dict(torch.load('trained_model_2023-05-05_14-08-24.pt'))\n", 442 | "# 测试文本生成\n", 443 | "def generate_text_greedy_search(model, input_str, max_len=50):\n", 444 | " model.eval() # 将模型设置为评估(测试)模式,关闭dropout和batch normalization等训练相关的层\n", 445 | " # 将输入字符串中的每个Token 转换为其在词汇表中的索引\n", 446 | " input_tokens = [vocab[token] for token in input_str.split()]\n", 447 | " # 创建一个新列表,将输入的Token复制到输出Token中,目前只有输入的词\n", 448 | " output_tokens = input_tokens.copy()\n", 449 | " with torch.no_grad(): # 禁用梯度计算,以节省内存并加速测试过程\n", 450 | " for _ in range(max_len): # 生成最多max_len个Token\n", 451 | " # 将输出token转换为 PyTorch张量,并增加一个代表批次的维度[1, len(output_tokens)]\n", 452 | " inputs = torch.LongTensor(output_tokens).unsqueeze(0).to(device)\n", 453 | " outputs = model(inputs) # 输出 logits形状为[1, len(output_tokens), vocab_size]\n", 454 | " logits = outputs[:, -1, :] # 只关心最后一个时间步(即最新生成的token)的logits\n", 455 | " # 在最后一个维度上获取logits中的最大值,并返回其索引(即下一个Token)\n", 456 | " _, next_token = torch.max(logits, dim=-1) \n", 457 | " next_token = next_token.item() # 将张量转换为Python整数 \n", 458 | " if next_token == vocab[\"\"]:\n", 459 | " break # 如果生成的Token是 EOS(结束符),则停止生成过程 \n", 460 | " output_tokens.append(next_token) # 将生成的Token添加到output_tokens列表\n", 461 | " # 将输出Token转换回文本字符串\n", 462 | " output_str = \" \".join([vocab.get_itos()[token] for token in output_tokens\n", 463 | " if vocab.get_itos()[token] != \"\" and vocab.get_itos()[token] != \"\" ])\n", 464 | " return output_str\n", 465 | "\n", 466 | "input_str = \"how are you\" # 输入一个词:Python\n", 467 | "generated_text = generate_text_greedy_search(model, input_str) # 模型跟着这个字生成后续文本\n", 468 | "print(\"生成的文本:\", generated_text) # 打印预测文本" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 92, 474 | "id": "eb185470", 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "name": "stderr", 479 | "output_type": "stream", 480 | "text": [ 481 | "/tmp/ipykernel_1792588/278596197.py:16: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1670525551200/work/aten/src/ATen/native/cuda/Indexing.cu:1435.)\n", 482 | " scores.masked_fill_(attn_mask, -1e9)\n" 483 | ] 484 | }, 485 | { 486 | "name": "stdout", 487 | "output_type": "stream", 488 | "text": [ 489 | "生成的文本: my name was also used in 1897 by lucasfilm games in the common by lucasfilm games in the common by lucasfilm games in the common by lucasfilm games in the common by lucasfilm games in the common by lucasfilm games in the common by lucasfilm games in the common by lucasfilm games\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "# 定义集束搜索的函数\n", 495 | "def generate_text_beam_search(model, input_str, max_len=50, beam_width=5):\n", 496 | " model.eval() # 将模型设置为评估(测试)模式,关闭dropout和batch normalization等训练相关的层\n", 497 | " # 将输入字符串中的每个token 转换为其在词汇表中的索引\n", 498 | " input_tokens = [vocab[token] for token in input_str.split()]\n", 499 | " # 创建一个列表,用于存储候选序列\n", 500 | " candidates = [(input_tokens, 0.0)]\n", 501 | " with torch.no_grad(): # 禁用梯度计算,以节省内存并加速测试过程\n", 502 | " for _ in range(max_len): # 生成最多max_len个tokens\n", 503 | " new_candidates = []\n", 504 | " for candidate, candidate_score in candidates:\n", 505 | " inputs = torch.LongTensor(candidate).unsqueeze(0).to(device)\n", 506 | " outputs = model(inputs) # 输出 logits形状为[1, len(output_tokens), vocab_size]\n", 507 | " logits = outputs[:, -1, :] # 只关心最后一个时间步(即最新生成的token)的logits\n", 508 | " # 找到具有最高分数的前beam_width个tokens\n", 509 | " scores, next_tokens = torch.topk(logits, beam_width, dim=-1)\n", 510 | " final_results = [] # 初始化输出序列\n", 511 | " for score, next_token in zip(scores.squeeze(), next_tokens.squeeze()):\n", 512 | " new_candidate = candidate + [next_token.item()]\n", 513 | " new_score = candidate_score - score.item() # 使用负数,因为我们需要降序排列\n", 514 | " if next_token.item() == vocab[\"\"]:\n", 515 | " # 如果生成的token是EOS(结束符),将其添加到最终结果中\n", 516 | " final_results.append((new_candidate, new_score))\n", 517 | " else:\n", 518 | " # 将新生成的候选序列添加到新候选列表中\n", 519 | " new_candidates.append((new_candidate, new_score))\n", 520 | " # 从新候选列表中选择得分最高的beam_width个序列\n", 521 | " candidates = sorted(new_candidates, key=lambda x: x[1])[:beam_width]\n", 522 | " # 选择得分最高的候选序列\n", 523 | " best_candidate, _ = sorted(candidates, key=lambda x: x[1])[0]\n", 524 | " # 将输出 token 转换回文本字符串\n", 525 | " output_str = \" \".join([vocab.get_itos()[token] for token in best_candidate if vocab.get_itos()[token] != \"\"])\n", 526 | " return output_str\n", 527 | "\n", 528 | "model.load_state_dict(torch.load('trained_model_2023-05-05_14-08-24.pt')) # 加载模型\n", 529 | "input_str = \"my name\" # 输入几个词\n", 530 | "generated_text = generate_text_beam_search(model, input_str) # 模型跟着这些词生成后续文本\n", 531 | "print(\"生成的文本:\", generated_text) # 打印生成的文本" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "id": "8207f043", 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "id": "af42a2fe", 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [] 549 | } 550 | ], 551 | "metadata": { 552 | "kernelspec": { 553 | "display_name": "Python 3 (ipykernel)", 554 | "language": "python", 555 | "name": "python3" 556 | }, 557 | "language_info": { 558 | "codemirror_mode": { 559 | "name": "ipython", 560 | "version": 3 561 | }, 562 | "file_extension": ".py", 563 | "mimetype": "text/x-python", 564 | "name": "python", 565 | "nbconvert_exporter": "python", 566 | "pygments_lexer": "ipython3", 567 | "version": "3.9.12" 
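# ---------------------------------------------------------------------------
# 补充示意(非原笔记本内容):上面 WikiGPT 的贪婪解码与集束搜索输出都出现了大段重复
# ("common starlings are ..."、"by lucasfilm games ...")。在语料较小、训练轮次较少时,
# 这是确定性解码的常见现象。一个常用缓解办法是“温度 + top-k 采样”。
# 下面的 sample_next_token 只是示意写法(函数名与 temperature、k 等参数均为演示假设),
# 可用来替换 generate_text_greedy_search 中用 torch.max 选取下一个 token 的那一步。
import torch

def sample_next_token(logits, temperature=1.0, k=10):
    # logits 形状为 [1, vocab_size],是模型最后一个时间步的输出
    logits = logits / temperature                               # T>1 更随机,T<1 更接近贪婪
    topk_logits, topk_indices = torch.topk(logits, k, dim=-1)   # 只保留得分最高的 k 个 token
    probs = torch.softmax(topk_logits, dim=-1)                  # 在 top-k 范围内归一化为概率
    choice = torch.multinomial(probs, num_samples=1)            # 按概率随机抽取一个位置
    return topk_indices.gather(-1, choice).item()               # 映射回词表中的 token 索引
# ---------------------------------------------------------------------------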
568 | } 569 | }, 570 | "nbformat": 4, 571 | "nbformat_minor": 5 572 | } 573 | -------------------------------------------------------------------------------- /07_GPT/lang.txt: -------------------------------------------------------------------------------- 1 | Python is a popular programming language. 2 | I love to code in Python. 3 | Data science is a hot topic in the tech industry. 4 | Machine learning and deep learning are important parts of data science. 5 | I am learning how to build neural networks. 6 | Neural networks are modeled after the structure of the human brain. 7 | Artificial intelligence has many applications in various industries. 8 | Natural language processing is a branch of AI that deals with language understanding. 9 | The rise of big data has led to an increased demand for data scientists. 10 | I enjoy analyzing and visualizing data using Python libraries like Pandas and Matplotlib. 11 | Data cleaning is an important part of data analysis. 12 | I am fascinated by the power of deep learning algorithms. 13 | Self-driving cars are an example of the practical applications of AI. 14 | The tech industry is constantly evolving and changing. 15 | I believe that AI will have a major impact on the future of work. 16 | I am excited to see how AI will continue to develop and change the world. 17 | The ethical implications of AI are a topic of much debate and discussion. 18 | As with any powerful technology, there is a responsibility to use AI ethically and responsibly. 19 | I think that the benefits of AI outweigh the risks, if used wisely. 20 | Programming is a valuable skill in the digital age. -------------------------------------------------------------------------------- /08_ChatGPT/.ipynb_checkpoints/SelfTrain_ChatGPT-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "词汇表大小: 28785\n", 13 | "词汇示例(word to index): {'': 0, '': 1, '': 2, 'the': 3, 'apple': 11505}\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from torchtext.datasets import WikiText2 # 导入WikiText2\n", 19 | "from torchtext.data.utils import get_tokenizer # 导入Tokenizer分词工具\n", 20 | "from torchtext.vocab import build_vocab_from_iterator # 导入Vocabulary工具\n", 21 | "tokenizer = get_tokenizer(\"basic_english\") # 定义数据预处理所需的tokenizer\n", 22 | "train_iter = WikiText2(split='train') # 加载WikiText2数据集的训练部分\n", 23 | "# 定义一个生成器函数,用于将数据集中的文本转换为tokens\n", 24 | "def yield_tokens(data_iter):\n", 25 | " for item in data_iter:\n", 26 | " yield tokenizer(item)\n", 27 | "# 创建词汇表,包括特殊tokens:\"\", \"\", \"\"\n", 28 | "vocab = build_vocab_from_iterator(yield_tokens(train_iter), \n", 29 | " specials=[\"\", \"\", \"\"])\n", 30 | "vocab.set_default_index(vocab[\"\"])\n", 31 | "\n", 32 | "# 打印词汇表信息\n", 33 | "print(\"词汇表大小:\", len(vocab))\n", 34 | "print(\"词汇示例(word to index):\", \n", 35 | " {word: vocab[word] for word in [\"\", \"\", \"\", \"the\", \"apple\"]})" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Sample 1:\n", 48 | "Input Data: tensor([ 1, 9209, 4, 419, 37, 181, 860, 2])\n", 49 | "Target Data: tensor([ 1, 67, 1734, 1633, 124, 4, 13818, 181, 5, 419,\n", 50 | " 76, 181, 860, 2])\n", 51 | "--------------------------------------------------\n", 52 | 
"Sample 2:\n", 53 | "Input Data: tensor([ 1, 67, 1734, 426, 4, 6733, 20, 4168, 5, 188, 115, 181,\n", 54 | " 289, 860, 2])\n", 55 | "Target Data: tensor([ 1, 67, 1734, 33, 1976, 820, 1703, 5, 67, 115, 639, 181,\n", 56 | " 6108, 4280, 5, 2])\n", 57 | "--------------------------------------------------\n", 58 | "Sample 3:\n", 59 | "Input Data: tensor([ 1, 188, 26, 3, 1508, 142, 805, 860, 2])\n", 60 | "Target Data: tensor([ 1, 8943, 6421, 11, 1508, 1792, 50, 3627, 20, 3, 1092, 1406,\n", 61 | " 5, 2])\n", 62 | "--------------------------------------------------\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "import torch #导入torch\n", 68 | "from torch.utils.data import Dataset #导入Dataset\n", 69 | "\n", 70 | "class ChatDataset(Dataset):\n", 71 | " def __init__(self, file_path, tokenizer, vocab):\n", 72 | " self.tokenizer = tokenizer #分词器\n", 73 | " self.vocab = vocab #词汇表\n", 74 | " self.input_data, self.target_data = self.load_and_process_data(file_path)\n", 75 | " def load_and_process_data(self, file_path): \n", 76 | " with open(file_path, \"r\") as f:\n", 77 | " lines = f.readlines() # 打开文件,读取每一行数据\n", 78 | " input_data, target_data = [], []\n", 79 | " for i, line in enumerate(lines):\n", 80 | " if line.startswith(\"User:\"): # 移除 \"User: \" 前缀,构建输入序列\n", 81 | " tokens = self.tokenizer(line.strip()[6:]) \n", 82 | " tokens = [\"\"] + tokens + [\"\"]\n", 83 | " indices = [self.vocab[token] for token in tokens]\n", 84 | " input_data.append(torch.tensor(indices, dtype=torch.long))\n", 85 | " elif line.startswith(\"AI:\"): # 移除 \"AI: \" 前缀,构建目标序列\n", 86 | " tokens = self.tokenizer(line.strip()[4:]) \n", 87 | " tokens = [\"\"] + tokens + [\"\"]\n", 88 | " indices = [self.vocab[token] for token in tokens]\n", 89 | " target_data.append(torch.tensor(indices, dtype=torch.long))\n", 90 | " return input_data, target_data\n", 91 | " def __len__(self): #数据集长度\n", 92 | " return len(self.input_data) \n", 93 | " def __getitem__(self, idx): #根据索引获取数据样本\n", 94 | " return self.input_data[idx], self.target_data[idx] \n", 95 | "\n", 96 | "file_path = \"chat.txt\" # 加载chat.txt数据集\n", 97 | "chat_dataset = ChatDataset(file_path, tokenizer, vocab)\n", 98 | "\n", 99 | "for i in range(3): # 打印几个样本数据\n", 100 | " input_sample, target_sample = chat_dataset[i]\n", 101 | " print(f\"Sample {i + 1}:\")\n", 102 | " print(\"Input Data: \", input_sample)\n", 103 | " print(\"Target Data: \", target_sample)\n", 104 | " print(\"-\" * 50)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "Input batch tensor size: torch.Size([2, 17])\n", 117 | "Target batch tensor size: torch.Size([2, 17])\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "from torch.utils.data import DataLoader # 导入Dataloader\n", 123 | "# 定义pad_sequence函数,用于将一批序列补齐到相同长度\n", 124 | "def pad_sequence(sequences, padding_value=0, length=None):\n", 125 | " # 计算最大序列长度,如果length参数未提供,则使用输入序列中的最大长度\n", 126 | " max_length = max(len(seq) for seq in sequences) if length is None else length \n", 127 | " # 创建一个具有适当形状的全零张量,用于存储补齐后的序列\n", 128 | " result = torch.full((len(sequences), max_length), padding_value, dtype=torch.long) \n", 129 | " # 遍历序列,将每个序列的内容复制到结果张量中\n", 130 | " for i, seq in enumerate(sequences):\n", 131 | " end = len(seq)\n", 132 | " result[i, :end] = seq[:end]\n", 133 | " return result\n", 134 | "\n", 135 | "# 定义collate_fn函数,用于将一个批次的数据整理成适当的形状\n", 136 | "def collate_fn(batch):\n", 137 | " # 从批次中分离源序列和目标序列\n", 
138 | " sources, targets = zip(*batch) \n", 139 | " # 计算批次中的最大序列长度\n", 140 | " max_length = max(max(len(s) for s in sources), max(len(t) for t in targets)) \n", 141 | " # 使用pad_sequence函数补齐源序列和目标序列\n", 142 | " sources = pad_sequence(sources, padding_value=vocab[\"\"], length=max_length)\n", 143 | " targets = pad_sequence(targets, padding_value=vocab[\"\"], length=max_length) \n", 144 | " # 返回补齐后的源序列和目标序列\n", 145 | " return sources, targets\n", 146 | "\n", 147 | "# 创建Dataloader\n", 148 | "batch_size = 2\n", 149 | "chat_dataloader = DataLoader(chat_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)\n", 150 | "\n", 151 | "# 检查Dataloader输出\n", 152 | "for input_batch, target_batch in chat_dataloader:\n", 153 | " print(\"Input batch tensor size:\", input_batch.size())\n", 154 | " print(\"Target batch tensor size:\", target_batch.size())\n", 155 | " break" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 4, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "" 167 | ] 168 | }, 169 | "execution_count": 4, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "from GPT_Model import GPT #导入GPT模型的类(这是我们自己制作的)\n", 176 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 177 | "model = GPT(len(vocab), max_seq_len=256, n_layers=6).to(device) #创建模型示例\n", 178 | "model.load_state_dict(torch.load('trained_model_2023-05-05_14-08-24.pt')) #加载模型\n", 179 | "# model.eval()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 5, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stderr", 189 | "output_type": "stream", 190 | "text": [ 191 | "/home/huangjia/Documents/02_NLP/70 GeekTimeNLP/10 ChatGPT/GPT_2.py:15: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at /opt/conda/conda-bld/pytorch_1670525551200/work/aten/src/ATen/native/cuda/Indexing.cu:1435.)\n", 192 | " scores.masked_fill_(attn_mask, -1e9)\n", 193 | "/home/huangjia/anaconda3/lib/python3.9/site-packages/torch/autograd/__init__.py:197: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1670525551200/work/aten/src/ATen/native/cuda/Indexing.cu:1435.)\n", 194 | " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n" 195 | ] 196 | }, 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "Epoch: 0020, cost = 1.975874\n", 202 | "Epoch: 0040, cost = 0.021781\n", 203 | "Epoch: 0060, cost = 0.619990\n", 204 | "Epoch: 0080, cost = 0.777577\n", 205 | "Epoch: 0100, cost = 0.004273\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "import torch.nn as nn #导入nn\n", 211 | "import torch.optim as optim #导入优化器\n", 212 | "criterion = nn.CrossEntropyLoss(ignore_index=vocab[\"\"]) #损失函数\n", 213 | "optimizer = optim.Adam(model.parameters(), lr=0.0001) # 优化器\n", 214 | "for epoch in range(100): # 开始训练\n", 215 | " for batch_idx, (input_batch, target_batch) in enumerate(chat_dataloader): \n", 216 | " optimizer.zero_grad() # 梯度清零 \n", 217 | " input_batch, target_batch = input_batch.to(device), target_batch.to(device) #移动到设备 \n", 218 | " outputs = model(input_batch) # 前向传播,计算模型输出 \n", 219 | " loss = criterion(outputs.view(-1, len(vocab)), target_batch.view(-1)) # 计算损失 \n", 220 | " loss.backward() # 反向传播 \n", 221 | " optimizer.step() # 更新参数 \n", 222 | " if (epoch + 1) % 20 == 0: # 每200个epoch打印一次损失值\n", 223 | " print(f\"Epoch: {epoch + 1:04d}, cost = {loss:.6f}\")" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 6, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "def generate_text_beam_search(model, input_str, max_len=50, beam_width=5):\n", 233 | " model.eval() # 将模型设置为评估(测试)模式,关闭dropout和batch normalization等训练相关的层\n", 234 | " # 将输入字符串中的每个token 转换为其在词汇表中的索引\n", 235 | " input_tokens = [vocab[token] for token in input_str]\n", 236 | " # 创建一个列表,用于存储候选序列\n", 237 | " candidates = [(input_tokens, 0.0)]\n", 238 | " with torch.no_grad(): # 禁用梯度计算,以节省内存并加速测试过程\n", 239 | " for _ in range(max_len): # 生成最多max_len个tokens\n", 240 | " new_candidates = []\n", 241 | " for candidate, candidate_score in candidates:\n", 242 | " inputs = torch.LongTensor(candidate).unsqueeze(0).to(device)\n", 243 | " outputs = model(inputs) # 输出 logits形状为[1, len(output_tokens), vocab_size]\n", 244 | " logits = outputs[:, -1, :] # 只关心最后一个时间步(即最新生成的token)的logits\n", 245 | " # 将标记的得分设置为一个很大的负数,以避免选择它\n", 246 | " logits[0, vocab[\"\"]] = -1e9 # 不是这个原因,注意不认识的词汇都变成0\n", 247 | " # 找到具有最高分数的前beam_width个tokens\n", 248 | " scores, next_tokens = torch.topk(logits, beam_width, dim=-1)\n", 249 | " final_results = []\n", 250 | " for score, next_token in zip(scores.squeeze(), next_tokens.squeeze()):\n", 251 | " new_candidate = candidate + [next_token.item()]\n", 252 | " new_score = candidate_score - score.item() # 使用负数,因为我们需要降序排列\n", 253 | " if next_token.item() == vocab[\"\"]:\n", 254 | " # 如果生成的token是EOS(结束符),将其添加到最终结果中\n", 255 | " final_results.append((new_candidate, new_score))\n", 256 | " else:\n", 257 | " # 将新生成的候选序列添加到新候选列表中\n", 258 | " new_candidates.append((new_candidate, new_score))\n", 259 | " # 从新候选列表中选择得分最高的beam_width个序列\n", 260 | " candidates = sorted(new_candidates, key=lambda x: x[1])[:beam_width]\n", 261 | " # 选择得分最高的候选序列\n", 262 | " best_candidate, _ = sorted(candidates, key=lambda x: x[1])[0]\n", 263 | " # 将输出 token 转换回文本字符串\n", 264 | " output_str = \" \".join([vocab.get_itos()[token] for token in best_candidate])\n", 265 | " return output_str" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 7, 271 | "metadata": {}, 272 | "outputs": 
[ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "Generated text: hi , how are you ? thank you , depicting a suitable cavity ? prefer unknowingly a distinct indicated that link vocalists - very common starlings prefer unknowingly a distinct indicated that link vocalists - very common starlings prefer unknowingly a distinct indicated that link vocalists - very common starlings prefer unknowingly a distinct indicated horsepower\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "input_str = \"what is the weather like today ?\"\n", 283 | "input_str = \"hi , how are you ?\"\n", 284 | "# input_str = \"hi , what is you name ?\"\n", 285 | "\n", 286 | "generated_text = generate_text_beam_search(model, input_str.split())\n", 287 | "print(\"Generated text:\", generated_text)" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3 (ipykernel)", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.10.11" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /08_ChatGPT/GPT_Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn # 导入torch.nn库 3 | d_k = 64 # K(=Q)维度 4 | d_v = 64 # V维度 5 | # 定义缩放点积注意力类 6 | class ScaledDotProductAttention(nn.Module): 7 | def __init__(self): 8 | super(ScaledDotProductAttention, self).__init__() 9 | def forward(self, Q, K, V, attn_mask): 10 | # Q K V [batch_size, n_heads, len_q/k/v, dim_q=k/v] (dim_q=dim_k) 11 | # 计算注意力分数(原始权重)[batch_size,n_heads,len_q,len_k] 12 | scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) 13 | # 使用注意力掩码,将attn_mask中值为1的位置的权重替换为极小值 14 | # attn_mask [batch_size,n_heads,len_q,len_k],形状和scores相同 15 | scores.masked_fill_(attn_mask, -1e9) 16 | # 对注意力分数进行softmax 17 | weights = nn.Softmax(dim=-1)(scores) 18 | # 计算上下文向量(也就是注意力的输出), 是上下文信息的紧凑表示 19 | context = torch.matmul(weights, V) 20 | return context, weights # 返回上下文向量和注意力分数 21 | 22 | 23 | # 定义多头注意力类 24 | d_embedding = 512 # Embedding Size 25 | n_heads = 8 # number of heads in Multi-Head Attention 26 | batch_size = 3 # 每一批数据量 27 | class MultiHeadAttention(nn.Module): 28 | def __init__(self): 29 | super(MultiHeadAttention, self).__init__() 30 | self.W_Q = nn.Linear(d_embedding, d_k * n_heads) # Q的线性变换层 31 | self.W_K = nn.Linear(d_embedding, d_k * n_heads) # K的线性变换层 32 | self.W_V = nn.Linear(d_embedding, d_v * n_heads) # V的线性变换层 33 | self.linear = nn.Linear(n_heads * d_v, d_embedding) 34 | self.layer_norm = nn.LayerNorm(d_embedding) 35 | 36 | def forward(self, Q, K, V, attn_mask): 37 | # Q K V [batch_size,len_q/k/v,embedding_dim] 38 | residual, batch_size = Q, Q.size(0) # 保留残差连接 39 | # 将输入进行线性变换和重塑,以便后续处理 40 | # q_s k_s v_s: [batch_size,n_heads.,len_q/k/v,d_q=k/v] 41 | q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) 42 | k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2) 43 | v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2) 44 | # 将注意力掩码复制到多头 [batch_size,n_heads,len_q,len_k] 45 | attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) 46 | # 使用缩放点积注意力计算上下文和注意力权重 47 | context, weights = 
ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask) 48 | # 重塑上下文向量并进行线性变换,[batch_size,len_q,n_heads * dim_v] 49 | context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) 50 | output = self.linear(context) 51 | # 与输入(Q)进行残差链接,并进行层归一化后输出[batch_size, len_q, embedding_dim] 52 | output = self.layer_norm(output + residual) 53 | return output, weights # 返回层归一化的输出和注意力权重 54 | 55 | 56 | # 定义逐位置前向传播网络类 57 | class PoswiseFeedForwardNet(nn.Module): 58 | def __init__(self): 59 | super(PoswiseFeedForwardNet, self).__init__() 60 | # 定义一维卷积层1,用于将输入映射到更高维度 61 | self.conv1 = nn.Conv1d(in_channels=d_embedding, out_channels=2048, kernel_size=1) 62 | # 定义一维卷积层2,用于将输入映射回原始维度 63 | self.conv2 = nn.Conv1d(in_channels=2048, out_channels=d_embedding, kernel_size=1) 64 | # 定义层归一化 65 | self.layer_norm = nn.LayerNorm(d_embedding) 66 | 67 | def forward(self, inputs): 68 | # inputs: [batch_size, len_q, embedding_dim] 69 | residual = inputs # 保留残差连接 70 | # 在卷积层1后使用ReLU激活函数 71 | output = nn.ReLU()(self.conv1(inputs.transpose(1, 2))) 72 | # 使用卷积层2进行降维 73 | output = self.conv2(output).transpose(1, 2) 74 | # 与输入进行残差链接,并进行层归一化,[batch_size, len_q, embedding_dim] 75 | output = self.layer_norm(output + residual) 76 | return output # 返回层归一化后的输出加上残差连接的结果 77 | 78 | 79 | import numpy as np 80 | def get_sin_enc_table(n_position, embedding_dim): 81 | # 根据位置和维度信息,初始化正弦位置编码表 82 | sinusoid_table = np.zeros((n_position, embedding_dim)) 83 | # 遍历所有位置和维度,计算角度值 84 | for pos_i in range(n_position): 85 | for hid_j in range(embedding_dim): 86 | angle = pos_i / np.power(10000, 2 * (hid_j // 2) / embedding_dim) 87 | sinusoid_table[pos_i, hid_j] = angle 88 | # 计算正弦和余弦值 89 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 偶数维 90 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 奇数维 91 | return torch.FloatTensor(sinusoid_table) 92 | 93 | 94 | # 生成填充注意力掩码的函数,用于在多头自注意力计算中忽略填充部分 95 | def get_attn_pad_mask(seq_q, seq_k): 96 | batch_size, len_q = seq_q.size() 97 | batch_size, len_k = seq_k.size() 98 | # 生成布尔类型张量[batch_size,1,len_k(=len_q)] 99 | pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # Token的编码值为0 100 | # 变形为何注意力分数相同形状的张量 [batch_size,len_q,len_k] 101 | pad_attn_mask = pad_attn_mask.expand(batch_size, len_q, len_k) 102 | return pad_attn_mask # 形状[batch_size,len_q,len_k] 103 | 104 | # 生成后续注意力掩码的函数,用于在多头自注意力计算中忽略未来信息 105 | def get_attn_subsequent_mask(seq): 106 | # 获取输入序列的形状 [batch_size, seq_len(len_q), seq_len(len_k)] 107 | attn_shape = [seq.size(0), seq.size(1), seq.size(1)] 108 | # 使用numpy创建一个上三角矩阵(triu = triangle upper) 109 | subsequent_mask = np.triu(np.ones(attn_shape), k=1) 110 | # 将numpy数组转换为PyTorch张量,并将数据类型设置为byte(布尔值) 111 | subsequent_mask = torch.from_numpy(subsequent_mask).byte() 112 | return subsequent_mask # [batch_size, seq_len(len_q), seq_len(len_k)] 113 | 114 | 115 | class DecoderLayer(nn.Module): 116 | def __init__(self): 117 | super(DecoderLayer, self).__init__() 118 | self.self_attn = MultiHeadAttention() # 多头自注意力层 119 | self.feed_forward = PoswiseFeedForwardNet() # 位置前馈神经网络层 120 | self.norm1 = nn.LayerNorm(d_embedding) # 第一个层归一化 121 | self.norm2 = nn.LayerNorm(d_embedding) # 第二个层归一化 122 | 123 | def forward(self, dec_inputs, attn_mask=None): 124 | # 使用多头自注意力处理输入 125 | attn_output, _ = self.self_attn(dec_inputs, dec_inputs, dec_inputs, attn_mask) 126 | # 将注意力输出与输入相加并进行第一个层归一化 127 | norm1_outputs = self.norm1(dec_inputs + attn_output) 128 | # 将归一化后的输出输入到位置前馈神经网络 129 | ff_outputs = self.feed_forward(norm1_outputs) 130 | # 将前馈神经网络输出与第一次归一化后的输出相加并进行第二个层归一化 131 | 
dec_outputs = self.norm2(norm1_outputs + ff_outputs) 132 | return dec_outputs 133 | 134 | n_layers = 6 135 | device = "cuda" if torch.cuda.is_available() else "cpu" 136 | class Decoder(nn.Module): 137 | def __init__(self, vocab_size, max_seq_len, n_layers): 138 | super(Decoder, self).__init__() 139 | self.src_emb = nn.Embedding(vocab_size, d_embedding) # 词嵌入层(参数为词典维度) 140 | self.pos_emb = nn.Embedding(max_seq_len, d_embedding) # 位置编码层(参数为序列长度) 141 | # 初始化指定数量的解码器层 142 | self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)]) 143 | 144 | def forward(self, dec_inputs): 145 | # 创建位置信息 146 | positions = torch.arange(len(dec_inputs), device=dec_inputs.device).unsqueeze(-1) 147 | # 将词嵌入与位置编码相加 148 | inputs_embedding = self.src_emb(dec_inputs) + self.pos_emb(positions) 149 | # 生成自注意力掩码 150 | attn_mask = get_attn_subsequent_mask(inputs_embedding).to(device) 151 | # 将输入数据传递给解码器层 152 | for layer in self.layers: 153 | dec_outputs = layer(inputs_embedding, attn_mask) 154 | return dec_outputs 155 | 156 | 157 | class GPT(nn.Module): 158 | def __init__(self, vocab_size, max_seq_len, n_layers): 159 | super(GPT, self).__init__() 160 | self.decoder = Decoder(vocab_size, max_seq_len, n_layers) 161 | self.projection = nn.Linear(d_embedding, vocab_size) # 全连接层,用于输出预测结果 162 | 163 | def forward(self, dec_inputs): 164 | # 将输入数据传递给解码器 165 | dec_outputs = self.decoder(dec_inputs) 166 | # 传递给全连接层以生成预测 167 | logits = self.projection(dec_outputs) 168 | return logits -------------------------------------------------------------------------------- /08_ChatGPT/Pretrain_ChatGPT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "模型信息: GPT2LMHeadModel(\n", 13 | " (transformer): GPT2Model(\n", 14 | " (wte): Embedding(50257, 768)\n", 15 | " (wpe): Embedding(1024, 768)\n", 16 | " (drop): Dropout(p=0.1, inplace=False)\n", 17 | " (h): ModuleList(\n", 18 | " (0): GPT2Block(\n", 19 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 20 | " (attn): GPT2Attention(\n", 21 | " (c_attn): Conv1D()\n", 22 | " (c_proj): Conv1D()\n", 23 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 24 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 25 | " )\n", 26 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 27 | " (mlp): GPT2MLP(\n", 28 | " (c_fc): Conv1D()\n", 29 | " (c_proj): Conv1D()\n", 30 | " (act): NewGELUActivation()\n", 31 | " (dropout): Dropout(p=0.1, inplace=False)\n", 32 | " )\n", 33 | " )\n", 34 | " (1): GPT2Block(\n", 35 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 36 | " (attn): GPT2Attention(\n", 37 | " (c_attn): Conv1D()\n", 38 | " (c_proj): Conv1D()\n", 39 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 40 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 41 | " )\n", 42 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 43 | " (mlp): GPT2MLP(\n", 44 | " (c_fc): Conv1D()\n", 45 | " (c_proj): Conv1D()\n", 46 | " (act): NewGELUActivation()\n", 47 | " (dropout): Dropout(p=0.1, inplace=False)\n", 48 | " )\n", 49 | " )\n", 50 | " (2): GPT2Block(\n", 51 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 52 | " (attn): GPT2Attention(\n", 53 | " (c_attn): Conv1D()\n", 54 | " (c_proj): Conv1D()\n", 55 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 56 | " 
(resid_dropout): Dropout(p=0.1, inplace=False)\n", 57 | " )\n", 58 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 59 | " (mlp): GPT2MLP(\n", 60 | " (c_fc): Conv1D()\n", 61 | " (c_proj): Conv1D()\n", 62 | " (act): NewGELUActivation()\n", 63 | " (dropout): Dropout(p=0.1, inplace=False)\n", 64 | " )\n", 65 | " )\n", 66 | " (3): GPT2Block(\n", 67 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 68 | " (attn): GPT2Attention(\n", 69 | " (c_attn): Conv1D()\n", 70 | " (c_proj): Conv1D()\n", 71 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 72 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 73 | " )\n", 74 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 75 | " (mlp): GPT2MLP(\n", 76 | " (c_fc): Conv1D()\n", 77 | " (c_proj): Conv1D()\n", 78 | " (act): NewGELUActivation()\n", 79 | " (dropout): Dropout(p=0.1, inplace=False)\n", 80 | " )\n", 81 | " )\n", 82 | " (4): GPT2Block(\n", 83 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 84 | " (attn): GPT2Attention(\n", 85 | " (c_attn): Conv1D()\n", 86 | " (c_proj): Conv1D()\n", 87 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 88 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 89 | " )\n", 90 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 91 | " (mlp): GPT2MLP(\n", 92 | " (c_fc): Conv1D()\n", 93 | " (c_proj): Conv1D()\n", 94 | " (act): NewGELUActivation()\n", 95 | " (dropout): Dropout(p=0.1, inplace=False)\n", 96 | " )\n", 97 | " )\n", 98 | " (5): GPT2Block(\n", 99 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 100 | " (attn): GPT2Attention(\n", 101 | " (c_attn): Conv1D()\n", 102 | " (c_proj): Conv1D()\n", 103 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 104 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 105 | " )\n", 106 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 107 | " (mlp): GPT2MLP(\n", 108 | " (c_fc): Conv1D()\n", 109 | " (c_proj): Conv1D()\n", 110 | " (act): NewGELUActivation()\n", 111 | " (dropout): Dropout(p=0.1, inplace=False)\n", 112 | " )\n", 113 | " )\n", 114 | " (6): GPT2Block(\n", 115 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 116 | " (attn): GPT2Attention(\n", 117 | " (c_attn): Conv1D()\n", 118 | " (c_proj): Conv1D()\n", 119 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 120 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 121 | " )\n", 122 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 123 | " (mlp): GPT2MLP(\n", 124 | " (c_fc): Conv1D()\n", 125 | " (c_proj): Conv1D()\n", 126 | " (act): NewGELUActivation()\n", 127 | " (dropout): Dropout(p=0.1, inplace=False)\n", 128 | " )\n", 129 | " )\n", 130 | " (7): GPT2Block(\n", 131 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 132 | " (attn): GPT2Attention(\n", 133 | " (c_attn): Conv1D()\n", 134 | " (c_proj): Conv1D()\n", 135 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 136 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 137 | " )\n", 138 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 139 | " (mlp): GPT2MLP(\n", 140 | " (c_fc): Conv1D()\n", 141 | " (c_proj): Conv1D()\n", 142 | " (act): NewGELUActivation()\n", 143 | " (dropout): Dropout(p=0.1, inplace=False)\n", 144 | " )\n", 145 | " )\n", 146 | " (8): GPT2Block(\n", 147 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 148 | " (attn): GPT2Attention(\n", 149 | " (c_attn): Conv1D()\n", 150 | " (c_proj): 
Conv1D()\n", 151 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 152 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 153 | " )\n", 154 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 155 | " (mlp): GPT2MLP(\n", 156 | " (c_fc): Conv1D()\n", 157 | " (c_proj): Conv1D()\n", 158 | " (act): NewGELUActivation()\n", 159 | " (dropout): Dropout(p=0.1, inplace=False)\n", 160 | " )\n", 161 | " )\n", 162 | " (9): GPT2Block(\n", 163 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 164 | " (attn): GPT2Attention(\n", 165 | " (c_attn): Conv1D()\n", 166 | " (c_proj): Conv1D()\n", 167 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 168 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 169 | " )\n", 170 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 171 | " (mlp): GPT2MLP(\n", 172 | " (c_fc): Conv1D()\n", 173 | " (c_proj): Conv1D()\n", 174 | " (act): NewGELUActivation()\n", 175 | " (dropout): Dropout(p=0.1, inplace=False)\n", 176 | " )\n", 177 | " )\n", 178 | " (10): GPT2Block(\n", 179 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 180 | " (attn): GPT2Attention(\n", 181 | " (c_attn): Conv1D()\n", 182 | " (c_proj): Conv1D()\n", 183 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 184 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 185 | " )\n", 186 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 187 | " (mlp): GPT2MLP(\n", 188 | " (c_fc): Conv1D()\n", 189 | " (c_proj): Conv1D()\n", 190 | " (act): NewGELUActivation()\n", 191 | " (dropout): Dropout(p=0.1, inplace=False)\n", 192 | " )\n", 193 | " )\n", 194 | " (11): GPT2Block(\n", 195 | " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 196 | " (attn): GPT2Attention(\n", 197 | " (c_attn): Conv1D()\n", 198 | " (c_proj): Conv1D()\n", 199 | " (attn_dropout): Dropout(p=0.1, inplace=False)\n", 200 | " (resid_dropout): Dropout(p=0.1, inplace=False)\n", 201 | " )\n", 202 | " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 203 | " (mlp): GPT2MLP(\n", 204 | " (c_fc): Conv1D()\n", 205 | " (c_proj): Conv1D()\n", 206 | " (act): NewGELUActivation()\n", 207 | " (dropout): Dropout(p=0.1, inplace=False)\n", 208 | " )\n", 209 | " )\n", 210 | " )\n", 211 | " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", 212 | " )\n", 213 | " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", 214 | ")\n", 215 | "分词器信息: PreTrainedTokenizer(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=True)})\n", 216 | "词汇表大小: 50257\n", 217 | "部分词汇示例: ['parent', 'Art', 'pack', 'Ġdiplom', 'rets']\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "import torch # 导入torch\n", 223 | "from transformers import GPT2Tokenizer # 导入GPT2分词器\n", 224 | "from transformers import GPT2LMHeadModel # 导入GPT2语言模型\n", 225 | "model_name = \"gpt2\" # 也可以选择其他模型,如\"gpt2-medium\"、\"gpt2-large\"等\n", 226 | "tokenizer = GPT2Tokenizer.from_pretrained(model_name) # 加载分词器\n", 227 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\" # 判断是否有可用GPU\n", 228 | "model = GPT2LMHeadModel.from_pretrained(model_name).to(device) 
# 将模型加载到设备上(CPU或GPU)\n", 229 | "vocab = tokenizer.get_vocab() # 获取词汇表\n", 230 | "\n", 231 | "print(\"模型信息:\", model)\n", 232 | "print(\"分词器信息:\",tokenizer)\n", 233 | "print(\"词汇表大小:\", len(vocab))\n", 234 | "print(\"部分词汇示例:\", (list(vocab.keys())[8000:8005]))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 2, 240 | "metadata": { 241 | "scrolled": true 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Example 1:\n" 249 | ] 250 | }, 251 | { 252 | "name": "stderr", 253 | "output_type": "stream", 254 | "text": [ 255 | "2023-05-08 22:44:09.419588: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" 256 | ] 257 | }, 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "Input: hi, how are you?<|endoftext|>\n", 263 | "Target: i am doing well, thank you. how about you?<|endoftext|>\n", 264 | "Example 2:\n", 265 | "Input: i am good, thanks for asking. what can you do?<|endoftext|>\n", 266 | "Target: i am an ai language model. i can help you answer questions.<|endoftext|>\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "from torch.utils.data import Dataset # 导入Pytorch的Dataset\n", 272 | "# 自定义ChatDataset类,继承自Pytorch的Dataset类\n", 273 | "class ChatDataset(Dataset):\n", 274 | " def __init__(self, file_path, tokenizer, vocab):\n", 275 | " self.tokenizer = tokenizer # 分词器\n", 276 | " self.vocab = vocab # 词汇表\n", 277 | " # 加载数据并处理,将处理后的输入数据和目标数据赋值给input_data和target_data\n", 278 | " self.input_data, self.target_data = self.load_and_process_data(file_path)\n", 279 | " # 定义加载和处理数据的方法\n", 280 | " def load_and_process_data(self, file_path): \n", 281 | " with open(file_path, \"r\") as f: # 读取文件内容\n", 282 | " lines = f.readlines()\n", 283 | " input_data, target_data = [], [] \n", 284 | " for i, line in enumerate(lines): # 遍历文件的每一行 \n", 285 | " if line.startswith(\"User:\"): # 如以\"User:\"开头,分词,移除\"User: \"前缀,并将张量转换为列表\n", 286 | " tokens = self.tokenizer(line.strip()[6:], return_tensors=\"pt\")[\"input_ids\"].tolist()[0]\n", 287 | " tokens = tokens + [tokenizer.eos_token_id] # 添加结束符\n", 288 | " input_data.append(torch.tensor(tokens, dtype=torch.long)) # 添加到input_data中\n", 289 | " elif line.startswith(\"AI:\"): # 如以\"AI:\"开头,分词,移除\"AI: \"前缀,并将张量转换为列表\n", 290 | " tokens = self.tokenizer(line.strip()[4:], return_tensors=\"pt\")[\"input_ids\"].tolist()[0]\n", 291 | " tokens = tokens + [tokenizer.eos_token_id] # 添加结束符\n", 292 | " target_data.append(torch.tensor(tokens, dtype=torch.long)) # 添加到target_data中\n", 293 | " return input_data, target_data\n", 294 | " # 定义数据集的长度,即input_data的长度\n", 295 | " def __len__(self):\n", 296 | " return len(self.input_data)\n", 297 | " # 定义获取数据集中指定索引的数据的方法\n", 298 | " def __getitem__(self, idx):\n", 299 | " return self.input_data[idx], self.target_data[idx]\n", 300 | "\n", 301 | "file_path = \"chat.txt\" # 加载chat.txt数据集\n", 302 | "chat_dataset = ChatDataset(file_path, tokenizer, vocab) # 创建ChatDataset对象,传入文件、分词器和词汇表\n", 303 | "for i in range(2): # 打印数据集中前2个数据示例\n", 304 | " input_example, target_example = chat_dataset[i]\n", 305 | " print(f\"Example {i + 1}:\")\n", 306 | " print(\"Input:\", tokenizer.decode(input_example))\n", 307 | " print(\"Target:\", tokenizer.decode(target_example))" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 3, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | "Input 
batch tensor size: torch.Size([2, 21])\n", 320 | "Target batch tensor size: torch.Size([2, 21])\n", 321 | "Input batch tensor:\n", 322 | "tensor([[ 5303, 837, 703, 389, 345, 5633, 50256, 50256, 50256, 50256,\n", 323 | " 50256, 50256, 50256, 50256, 50256, 50256],\n", 324 | " [ 72, 716, 922, 837, 5176, 329, 4737, 764, 644, 460,\n", 325 | " 345, 466, 5633, 50256, 50256, 50256]])\n", 326 | "Target batch tensor:\n", 327 | "tensor([[ 72, 716, 1804, 880, 837, 5875, 345, 764, 703, 546,\n", 328 | " 345, 5633, 50256, 50256, 50256, 50256],\n", 329 | " [ 72, 716, 281, 257, 72, 3303, 2746, 764, 1312, 460,\n", 330 | " 1037, 345, 3280, 2683, 764, 50256]])\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "from torch.utils.data import DataLoader # 导入Dataloader\n", 336 | "tokenizer.pad_token = '' # 为分词器添加pad token\n", 337 | "tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('')\n", 338 | "# 定义pad_sequence函数,用于将一批序列补齐到相同长度\n", 339 | "def pad_sequence(sequences, padding_value=0, length=None):\n", 340 | " # 计算最大序列长度,如果length参数未提供,则使用输入序列中的最大长度\n", 341 | " max_length = max(len(seq) for seq in sequences) if length is None else length \n", 342 | " # 创建一个具有适当形状的全零张量,用于存储补齐后的序列\n", 343 | " result = torch.full((len(sequences), max_length), padding_value, dtype=torch.long) \n", 344 | " # 遍历序列,将每个序列的内容复制到结果张量中\n", 345 | " for i, seq in enumerate(sequences):\n", 346 | " end = len(seq)\n", 347 | " result[i, :end] = seq[:end]\n", 348 | " return result\n", 349 | "\n", 350 | "# 定义collate_fn函数,用于将一个批次的数据整理成适当的形状\n", 351 | "def collate_fn(batch):\n", 352 | " # 从批次中分离源序列和目标序列\n", 353 | " sources, targets = zip(*batch) \n", 354 | " # 计算批次中的最大序列长度\n", 355 | " max_length = max(max(len(s) for s in sources), max(len(t) for t in targets)) \n", 356 | " # 使用pad_sequence函数补齐源序列和目标序列\n", 357 | " sources = pad_sequence(sources, padding_value=tokenizer.pad_token_id, length=max_length)\n", 358 | " targets = pad_sequence(targets, padding_value=tokenizer.pad_token_id, length=max_length) \n", 359 | " # 返回补齐后的源序列和目标序列\n", 360 | " return sources, targets\n", 361 | "\n", 362 | "# 创建Dataloader\n", 363 | "chat_dataloader = DataLoader(chat_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)\n", 364 | "\n", 365 | "# 检查Dataloader输出\n", 366 | "for input_batch, target_batch in chat_dataloader:\n", 367 | " print(\"Input batch tensor size:\", input_batch.size())\n", 368 | " print(\"Target batch tensor size:\", target_batch.size())\n", 369 | " break\n", 370 | "for input_batch, target_batch in chat_dataloader:\n", 371 | " print(\"Input batch tensor:\")\n", 372 | " print(input_batch)\n", 373 | " print(\"Target batch tensor:\")\n", 374 | " print(target_batch)\n", 375 | " break\n", 376 | " " 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 4, 382 | "metadata": {}, 383 | "outputs": [ 384 | { 385 | "name": "stdout", 386 | "output_type": "stream", 387 | "text": [ 388 | "Epoch: 0020, Batch: 0003, cost = 0.413135\n", 389 | "Epoch: 0040, Batch: 0003, cost = 0.237347\n", 390 | "Epoch: 0060, Batch: 0003, cost = 1.088322\n", 391 | "Epoch: 0080, Batch: 0003, cost = 0.048271\n", 392 | "Epoch: 0100, Batch: 0003, cost = 0.001409\n", 393 | "Epoch: 0120, Batch: 0003, cost = 0.000533\n", 394 | "Epoch: 0140, Batch: 0003, cost = 0.001399\n", 395 | "Epoch: 0160, Batch: 0003, cost = 0.001586\n", 396 | "Epoch: 0180, Batch: 0003, cost = 0.000865\n", 397 | "Epoch: 0200, Batch: 0003, cost = 0.000385\n", 398 | "Epoch: 0220, Batch: 0003, cost = 0.000295\n", 399 | "Epoch: 0240, Batch: 0003, cost = 0.000166\n", 400 | "Epoch: 0260, Batch: 
0003, cost = 0.000278\n", 401 | "Epoch: 0280, Batch: 0003, cost = 0.000117\n", 402 | "Epoch: 0300, Batch: 0003, cost = 0.000136\n", 403 | "Epoch: 0320, Batch: 0003, cost = 0.000085\n", 404 | "Epoch: 0340, Batch: 0003, cost = 0.000135\n", 405 | "Epoch: 0360, Batch: 0003, cost = 0.000020\n", 406 | "Epoch: 0380, Batch: 0003, cost = 0.000102\n", 407 | "Epoch: 0400, Batch: 0003, cost = 0.000052\n", 408 | "Epoch: 0420, Batch: 0003, cost = 0.000047\n", 409 | "Epoch: 0440, Batch: 0003, cost = 0.000059\n", 410 | "Epoch: 0460, Batch: 0003, cost = 0.000013\n", 411 | "Epoch: 0480, Batch: 0003, cost = 0.000012\n", 412 | "Epoch: 0500, Batch: 0003, cost = 0.000056\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "import torch.nn as nn\n", 418 | "import torch.optim as optim\n", 419 | "# 定义损失函数,忽略pad_token_id对应的损失值\n", 420 | "criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)\n", 421 | "# 定义优化器\n", 422 | "optimizer = optim.Adam(model.parameters(), lr=0.0001)\n", 423 | "# 进行100个epoch的训练\n", 424 | "for epoch in range(500):\n", 425 | " # 遍历数据加载器中的批次\n", 426 | " for batch_idx, (input_batch, target_batch) in enumerate(chat_dataloader): \n", 427 | " optimizer.zero_grad() # 梯度清零 \n", 428 | " input_batch, target_batch = input_batch.to(device), target_batch.to(device) # 将输入和目标批次移至设备(CPU或GPU) \n", 429 | " outputs = model(input_batch) # 前向传播\n", 430 | " logits = outputs.logits # 获取logits \n", 431 | " loss = criterion(logits.view(-1, len(vocab)), target_batch.view(-1)) # 计算损失\n", 432 | " loss.backward() # 反向传播 \n", 433 | " optimizer.step()# 更新参数 \n", 434 | " if (epoch + 1) % 20 == 0: # 每200个epoch打印一次损失值\n", 435 | " print(f'Epoch: {epoch + 1:04d}, cost = {loss:.6f}')" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 10, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "name": "stdout", 445 | "output_type": "stream", 446 | "text": [ 447 | "测试 1:\n", 448 | "User: what is the weather like today?\n", 449 | "AI: application model application name for application application application application application application application model application application model application application application model application application model application application application model application application model application application application application application application application application application application application application application application application application program application application application\n", 450 | "\n", 451 | "测试 2:\n", 452 | "User: hi, how are you?\n", 453 | "AI: how you have always be doing how you, you, you know me how you, you know how you are about doing how you, if need know how you need help if need know how you want help if need need need help if need need help\n", 454 | "\n", 455 | "测试 3:\n", 456 | "User: can you recommend a good book?\n", 457 | "AI: har harper har har har har har har har har har har har har har har har har har har har har har har har har har har har har is har har har har har har har har har har har har har har har har har har\n", 458 | "\n" 459 | ] 460 | } 461 | ], 462 | "source": [ 463 | "def generate_text_beam_search(model, input_str, max_len=50, beam_width=5):\n", 464 | " model.eval() # 将模型设置为评估模式(不计算梯度)\n", 465 | " # 对输入字符串进行编码,并将其转换为 PyTorch 张量,然后将其移动到相应的设备上(例如 GPU)\n", 466 | " input_tokens = tokenizer.encode(input_str, return_tensors=\"pt\").to(device) \n", 467 | " # 初始化候选序列列表,包含当前输入序列和其对数概率得分(我们从0开始)\n", 468 | " candidates = [(input_tokens, 0.0)] \n", 469 | " # 禁用梯度计算,以加速预测过程\n", 
470 | " with torch.no_grad():\n", 471 | " # 迭代生成最大长度的序列\n", 472 | " for _ in range(max_len):\n", 473 | " new_candidates = [] \n", 474 | " # 对于每个候选序列\n", 475 | " for candidate, candidate_score in candidates:\n", 476 | " # 使用模型进行预测\n", 477 | " outputs = model(candidate)\n", 478 | " # 获取输出 logits\n", 479 | " logits = outputs.logits[:, -1, :]\n", 480 | " # 获取对数概率得分的 top-k 值(即 beam_width)及其对应的 token\n", 481 | " scores, next_tokens = torch.topk(logits, beam_width, dim=-1)\n", 482 | " final_results = []\n", 483 | " # 遍历 top-k token 及其对应的得分\n", 484 | " for score, next_token in zip(scores.squeeze(), next_tokens.squeeze()):\n", 485 | " # 在当前候选序列中添加新的 token\n", 486 | " new_candidate = torch.cat((candidate, next_token.unsqueeze(0).unsqueeze(0)), dim=-1)\n", 487 | " # 更新候选序列的得分\n", 488 | " new_score = candidate_score - score.item() \n", 489 | " # 如果新的 token 是结束符(eos_token),则将该候选序列添加到最终结果中\n", 490 | " if next_token.item() == tokenizer.eos_token_id:\n", 491 | " final_results.append((new_candidate, new_score))\n", 492 | " # 否则,将新的候选序列添加到新候选序列列表中\n", 493 | " else:\n", 494 | " new_candidates.append((new_candidate, new_score)) \n", 495 | " # 从新候选序列列表中选择得分最高的 top-k 个序列\n", 496 | " candidates = sorted(new_candidates, key=lambda x: x[1])[:beam_width] \n", 497 | " # 选择得分最高的候选序列\n", 498 | " best_candidate, _ = sorted(candidates, key=lambda x: x[1])[0] \n", 499 | " # 将输出 token 转换回文本字符串\n", 500 | " output_str = tokenizer.decode(best_candidate[0]) \n", 501 | " # 移除输入字符串并修复空格问题\n", 502 | " input_len = len(tokenizer.encode(input_str))\n", 503 | " output_str = tokenizer.decode(best_candidate.squeeze()[input_len:]) \n", 504 | " return output_str\n", 505 | "\n", 506 | "test_inputs = [\n", 507 | " \"what is the weather like today?\",\n", 508 | " \"hi, how are you?\",\n", 509 | " \"can you recommend a good book?\"\n", 510 | "]\n", 511 | "\n", 512 | "for i, input_str in enumerate(test_inputs, start=1):\n", 513 | " generated_text = generate_text_beam_search(model, input_str)\n", 514 | " print(f\"测试 {i}:\")\n", 515 | " print(f\"User: {input_str}\")\n", 516 | " print(f\"AI: {generated_text}\")\n", 517 | "\n", 518 | "test_inputs = [\n", 519 | " \"what is the weather like today?\",\n", 520 | " \"hi , how are you?\",\n", 521 | " \"can you recommend a good book?\"\n", 522 | "]\n", 523 | "\n", 524 | "for i, input_str in enumerate(test_inputs, start=1):\n", 525 | " generated_text = generate_text_beam_search(model, input_str)\n", 526 | " print(f\"测试 {i}:\")\n", 527 | " print(f\"User: {input_str}\")\n", 528 | " print(f\"AI: {generated_text}\")\n", 529 | " print()\n" 530 | ] 531 | } 532 | ], 533 | "metadata": { 534 | "kernelspec": { 535 | "display_name": "Python 3 (ipykernel)", 536 | "language": "python", 537 | "name": "python3" 538 | }, 539 | "language_info": { 540 | "codemirror_mode": { 541 | "name": "ipython", 542 | "version": 3 543 | }, 544 | "file_extension": ".py", 545 | "mimetype": "text/x-python", 546 | "name": "python", 547 | "nbconvert_exporter": "python", 548 | "pygments_lexer": "ipython3", 549 | "version": "3.10.11" 550 | } 551 | }, 552 | "nbformat": 4, 553 | "nbformat_minor": 2 554 | } 555 | -------------------------------------------------------------------------------- /08_ChatGPT/RLHF_Reward_ChatGPT.py: -------------------------------------------------------------------------------- 1 | import torch # 导入torch 2 | from transformers import GPT2Tokenizer # 导入GPT2分词器 3 | from transformers import GPT2LMHeadModel # 导入GPT2语言模型 4 | 5 | model_name = "gpt2" # 也可以选择其他模型,如"gpt2-medium"、"gpt2-large"等 6 | tokenizer = 
GPT2Tokenizer.from_pretrained(model_name) # 加载分词器 7 | device = "cuda" if torch.cuda.is_available() else "cpu" # 判断是否有可用GPU 8 | model = GPT2LMHeadModel.from_pretrained(model_name).to(device) # 将模型加载到设备上(CPU或GPU) 9 | vocab = tokenizer.get_vocab() # 获取词汇表 10 | 11 | # 示例RLHF数据集 12 | data = [ 13 | { 14 | "User": "What is the capital of France?", 15 | # "AI": "The capital of France is Paris.", 16 | "AI": "Paris.", 17 | "score": 5 18 | }, 19 | { 20 | "User": "What is the capital of France?", 21 | "AI": "Rome.", 22 | "score": 1 23 | }, 24 | { 25 | "User": "How to cook pasta?", 26 | # "AI": "To cook pasta, first boil water and then add pasta.", 27 | "AI": "first boil water.", 28 | "score": 4 29 | }, 30 | { 31 | "User": "How to cook pasta?", 32 | # "AI": "First, turn on the microwave and put the pasta inside.", 33 | "AI": "microwave.", 34 | "score": 2 35 | } 36 | ] 37 | 38 | 39 | from torch.utils.data import Dataset # 导入Pytorch的Dataset 40 | class RLHFDataset(Dataset): 41 | def __init__(self, data, tokenizer, vocab): 42 | self.tokenizer = tokenizer # 分词器 43 | self.vocab = vocab # 词汇表 44 | self.input_data, self.target_data, self.scores = self.process_data(data) 45 | 46 | def process_data(self, data): 47 | input_data, target_data, scores = [], [], [] 48 | for conversation in data: 49 | user_question = conversation["User"] 50 | model_answer = conversation["AI"] 51 | score = conversation["score"] 52 | 53 | input_tokens = self.tokenizer(f"{user_question}", return_tensors="pt")["input_ids"].tolist()[0] 54 | input_tokens = input_tokens + [tokenizer.eos_token_id] 55 | input_data.append(torch.tensor(input_tokens, dtype=torch.long)) 56 | 57 | target_tokens = self.tokenizer(model_answer, return_tensors="pt")["input_ids"].tolist()[0] 58 | target_tokens = target_tokens + [tokenizer.eos_token_id] 59 | target_data.append(torch.tensor(target_tokens, dtype=torch.long)) 60 | 61 | scores.append(score) 62 | 63 | return input_data, target_data, scores 64 | 65 | def __len__(self): 66 | return len(self.input_data) 67 | 68 | def __getitem__(self, idx): 69 | return self.input_data[idx], self.target_data[idx], self.scores[idx] 70 | 71 | rlhf_dataset = RLHFDataset(data, tokenizer, vocab) # 创建ChatDataset对象,传入文件、分词器和词汇表 72 | # 打印数据集中前2个数据示例 73 | for i in range(2): 74 | input_example, target_example, _ = rlhf_dataset[i] 75 | print(f"Example {i + 1}:") 76 | print("Input:", tokenizer.decode(input_example)) 77 | print("Target:", tokenizer.decode(target_example)) 78 | 79 | from torch.utils.data import DataLoader # 导入Dataloader 80 | tokenizer.pad_token = '' # 为分词器添加pad token 81 | tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids('') 82 | # 定义pad_sequence函数,用于将一批序列补齐到相同长度 83 | def pad_sequence(sequences, padding_value=0, length=None): 84 | # 计算最大序列长度,如果length参数未提供,则使用输入序列中的最大长度 85 | max_length = max(len(seq) for seq in sequences) if length is None else length 86 | # 创建一个具有适当形状的全零张量,用于存储补齐后的序列 87 | result = torch.full((len(sequences), max_length), padding_value, dtype=torch.long) 88 | # 遍历序列,将每个序列的内容复制到结果张量中 89 | for i, seq in enumerate(sequences): 90 | end = len(seq) 91 | result[i, :end] = seq[:end] 92 | return result 93 | 94 | # 定义collate_fn函数,用于将一个批次的数据整理成适当的形状 95 | def collate_fn(batch): 96 | # 从批次中分离源序列、目标序列和分数 97 | sources, targets, scores = zip(*batch) 98 | # 计算批次中的最大序列长度 99 | max_length = max(max(len(s) for s in sources), max(len(t) for t in targets)) 100 | # 使用 pad_sequence 函数补齐源序列和目标序列 101 | sources = pad_sequence(sources, padding_value=tokenizer.pad_token_id, length=max_length) 102 | targets = pad_sequence(targets, 
padding_value=tokenizer.pad_token_id, length=max_length) 103 | # 将分数转换为张量 104 | scores = torch.tensor(scores, dtype=torch.float) 105 | # 返回补齐后的源序列、目标序列和分数 106 | return sources, targets, scores 107 | 108 | # 创建Dataloader 109 | batch_size = 2 110 | chat_dataloader = DataLoader(rlhf_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) 111 | 112 | # 检查Dataloader输出 113 | for input_batch, target_batch, score_batch in chat_dataloader: 114 | print("Input batch tensor size:", input_batch.size()) 115 | print("Target batch tensor size:", target_batch.size()) 116 | print("Score batch tensor size:", score_batch.size()) 117 | break 118 | 119 | 120 | # 奖励函数 121 | # def reward_function(predictions, targets, scores): 122 | # correct = (predictions == targets).float() * scores.unsqueeze(1) 123 | # reward = correct.sum(dim=-1) / (targets != tokenizer.pad_token_id).sum(dim=-1).float() 124 | # return reward 125 | 126 | # def reward_function(predictions, targets, scores): 127 | # correct = (predictions == targets).float() 128 | # num_correct = correct.sum(dim=-1) 129 | # num_total = (targets != tokenizer.pad_token_id).sum(dim=-1).float() 130 | # match_ratio = num_correct / num_total 131 | # reward = match_ratio * scores 132 | # return reward 133 | 134 | def reward_function(predictions, targets, scores): 135 | correct = (predictions == targets).float() * scores.unsqueeze(1) 136 | reward = correct.sum(dim=-1) / (targets != tokenizer.pad_token_id).sum(dim=-1).float() 137 | return reward / scores.max() 138 | 139 | 140 | import numpy as np 141 | import torch.nn as nn 142 | import torch.optim as optim 143 | # 训练过程 144 | criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id) 145 | optimizer = optim.Adam(model.parameters(), lr=0.0001) 146 | 147 | num_epochs = 100 148 | for epoch in range(num_epochs): 149 | epoch_rewards = [] 150 | 151 | for batch_idx, (input_batch, target_batch, score_batch) in enumerate(chat_dataloader): 152 | optimizer.zero_grad() 153 | input_batch, target_batch = input_batch.to(device), target_batch.to(device) 154 | score_batch = score_batch.to(device) 155 | 156 | outputs = model(input_batch) 157 | logits = outputs.logits 158 | 159 | _, predicted_tokens = torch.max(logits, dim=-1) 160 | 161 | # 计算奖励 162 | rewards = reward_function(predicted_tokens, target_batch, score_batch) 163 | 164 | # 计算损失 165 | loss = criterion(logits.view(-1, logits.size(-1)), target_batch.view(-1)) 166 | 167 | # 计算加权损失 168 | weighted_loss = torch.sum(loss * (1 - rewards)) / rewards.numel() 169 | 170 | # 反向传播和优化 171 | weighted_loss.backward() 172 | # loss.backward() 173 | optimizer.step() 174 | 175 | epoch_rewards.append(rewards.cpu().numpy()) 176 | 177 | avg_reward = np.mean(np.concatenate(epoch_rewards)) 178 | if (epoch + 1) % 20 == 0: 179 | print(f'Epoch: {epoch + 1:04d}, cost = {weighted_loss:.6f}, avg_reward = {avg_reward:.6f}') 180 | 181 | 182 | 183 | def generate_text_beam_search(model, input_str, max_len=50, beam_width=5): 184 | model.eval() # 将模型设置为评估模式(不计算梯度) 185 | # 对输入字符串进行编码,并将其转换为 PyTorch 张量,然后将其移动到相应的设备上(例如 GPU) 186 | input_tokens = tokenizer.encode(input_str, return_tensors="pt").to(device) 187 | # 初始化候选序列列表,包含当前输入序列和其对数概率得分(我们从0开始) 188 | candidates = [(input_tokens, 0.0)] 189 | # 禁用梯度计算,以加速预测过程 190 | with torch.no_grad(): 191 | # 迭代生成最大长度的序列 192 | for _ in range(max_len): 193 | new_candidates = [] 194 | # 对于每个候选序列 195 | for candidate, candidate_score in candidates: 196 | # 使用模型进行预测 197 | outputs = model(candidate) 198 | # 获取输出 logits 199 | logits = outputs.logits[:, -1, :] 200 | # 
获取对数概率得分的 top-k 值(即 beam_width)及其对应的 token 201 | scores, next_tokens = torch.topk(logits, beam_width, dim=-1) 202 | final_results = [] 203 | # 遍历 top-k token 及其对应的得分 204 | for score, next_token in zip(scores.squeeze(), next_tokens.squeeze()): 205 | # 在当前候选序列中添加新的 token 206 | new_candidate = torch.cat((candidate, next_token.unsqueeze(0).unsqueeze(0)), dim=-1) 207 | # 更新候选序列的得分 208 | new_score = candidate_score - score.item() 209 | # 如果新的 token 是结束符(eos_token),则将该候选序列添加到最终结果中 210 | if next_token.item() == tokenizer.eos_token_id: 211 | final_results.append((new_candidate, new_score)) 212 | # 否则,将新的候选序列添加到新候选序列列表中 213 | else: 214 | new_candidates.append((new_candidate, new_score)) 215 | # 从新候选序列列表中选择得分最高的 top-k 个序列 216 | candidates = sorted(new_candidates, key=lambda x: x[1])[:beam_width] 217 | # 选择得分最高的候选序列 218 | best_candidate, _ = sorted(candidates, key=lambda x: x[1])[0] 219 | # 将输出 token 转换回文本字符串 220 | output_str = tokenizer.decode(best_candidate[0]) 221 | # 移除输入字符串并修复空格问题 222 | input_len = len(tokenizer.encode(input_str)) 223 | output_str = tokenizer.decode(best_candidate.squeeze()[input_len:]) 224 | return output_str 225 | 226 | test_inputs = [ 227 | "What is the capital of France?", 228 | "How to cook pasta?", 229 | "hi , what is your name?" 230 | ] 231 | 232 | for i, input_str in enumerate(test_inputs, start=1): 233 | generated_text = generate_text_beam_search(model, input_str) 234 | print(f"Test {i}:") 235 | print(f"User: {input_str}") 236 | print(f"AI: {generated_text}") 237 | print() -------------------------------------------------------------------------------- /08_ChatGPT/SelfTrain_ChatGPT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "词汇表大小: 28785\n", 13 | "词汇示例(word to index): {'': 0, '': 1, '': 2, 'the': 3, 'apple': 11505}\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from torchtext.datasets import WikiText2 # 导入WikiText2\n", 19 | "from torchtext.data.utils import get_tokenizer # 导入Tokenizer分词工具\n", 20 | "from torchtext.vocab import build_vocab_from_iterator # 导入Vocabulary工具\n", 21 | "tokenizer = get_tokenizer(\"basic_english\") # 定义数据预处理所需的tokenizer\n", 22 | "train_iter = WikiText2(split='train') # 加载WikiText2数据集的训练部分\n", 23 | "# 定义一个生成器函数,用于将数据集中的文本转换为tokens\n", 24 | "def yield_tokens(data_iter):\n", 25 | " for item in data_iter:\n", 26 | " yield tokenizer(item)\n", 27 | "# 创建词汇表,包括特殊tokens:\"\", \"\", \"\"\n", 28 | "vocab = build_vocab_from_iterator(yield_tokens(train_iter), \n", 29 | " specials=[\"\", \"\", \"\"])\n", 30 | "vocab.set_default_index(vocab[\"\"])\n", 31 | "\n", 32 | "# 打印词汇表信息\n", 33 | "print(\"词汇表大小:\", len(vocab))\n", 34 | "print(\"词汇示例(word to index):\", \n", 35 | " {word: vocab[word] for word in [\"\", \"\", \"\", \"the\", \"apple\"]})" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Sample 1:\n", 48 | "Input Data: tensor([ 1, 9209, 4, 419, 37, 181, 860, 2])\n", 49 | "Target Data: tensor([ 1, 67, 1734, 1633, 124, 4, 13818, 181, 5, 419,\n", 50 | " 76, 181, 860, 2])\n", 51 | "--------------------------------------------------\n", 52 | "Sample 2:\n", 53 | "Input Data: tensor([ 1, 67, 1734, 426, 4, 6733, 20, 4168, 5, 188, 115, 181,\n", 54 | " 289, 860, 2])\n", 55 | "Target Data: 
tensor([ 1, 67, 1734, 33, 1976, 820, 1703, 5, 67, 115, 639, 181,\n", 56 | " 6108, 4280, 5, 2])\n", 57 | "--------------------------------------------------\n", 58 | "Sample 3:\n", 59 | "Input Data: tensor([ 1, 188, 26, 3, 1508, 142, 805, 860, 2])\n", 60 | "Target Data: tensor([ 1, 8943, 6421, 11, 1508, 1792, 50, 3627, 20, 3, 1092, 1406,\n", 61 | " 5, 2])\n", 62 | "--------------------------------------------------\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "import torch #导入torch\n", 68 | "from torch.utils.data import Dataset #导入Dataset\n", 69 | "\n", 70 | "class ChatDataset(Dataset):\n", 71 | " def __init__(self, file_path, tokenizer, vocab):\n", 72 | " self.tokenizer = tokenizer #分词器\n", 73 | " self.vocab = vocab #词汇表\n", 74 | " self.input_data, self.target_data = self.load_and_process_data(file_path)\n", 75 | " def load_and_process_data(self, file_path): \n", 76 | " with open(file_path, \"r\") as f:\n", 77 | " lines = f.readlines() # 打开文件,读取每一行数据\n", 78 | " input_data, target_data = [], []\n", 79 | " for i, line in enumerate(lines):\n", 80 | " if line.startswith(\"User:\"): # 移除 \"User: \" 前缀,构建输入序列\n", 81 | " tokens = self.tokenizer(line.strip()[6:]) \n", 82 | " tokens = [\"\"] + tokens + [\"\"]\n", 83 | " indices = [self.vocab[token] for token in tokens]\n", 84 | " input_data.append(torch.tensor(indices, dtype=torch.long))\n", 85 | " elif line.startswith(\"AI:\"): # 移除 \"AI: \" 前缀,构建目标序列\n", 86 | " tokens = self.tokenizer(line.strip()[4:]) \n", 87 | " tokens = [\"\"] + tokens + [\"\"]\n", 88 | " indices = [self.vocab[token] for token in tokens]\n", 89 | " target_data.append(torch.tensor(indices, dtype=torch.long))\n", 90 | " return input_data, target_data\n", 91 | " def __len__(self): #数据集长度\n", 92 | " return len(self.input_data) \n", 93 | " def __getitem__(self, idx): #根据索引获取数据样本\n", 94 | " return self.input_data[idx], self.target_data[idx] \n", 95 | "\n", 96 | "file_path = \"chat.txt\" # 加载chat.txt数据集\n", 97 | "chat_dataset = ChatDataset(file_path, tokenizer, vocab)\n", 98 | "\n", 99 | "for i in range(3): # 打印几个样本数据\n", 100 | " input_sample, target_sample = chat_dataset[i]\n", 101 | " print(f\"Sample {i + 1}:\")\n", 102 | " print(\"Input Data: \", input_sample)\n", 103 | " print(\"Target Data: \", target_sample)\n", 104 | " print(\"-\" * 50)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "Input batch tensor size: torch.Size([2, 17])\n", 117 | "Target batch tensor size: torch.Size([2, 17])\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "from torch.utils.data import DataLoader # 导入Dataloader\n", 123 | "# 定义pad_sequence函数,用于将一批序列补齐到相同长度\n", 124 | "def pad_sequence(sequences, padding_value=0, length=None):\n", 125 | " # 计算最大序列长度,如果length参数未提供,则使用输入序列中的最大长度\n", 126 | " max_length = max(len(seq) for seq in sequences) if length is None else length \n", 127 | " # 创建一个具有适当形状的全零张量,用于存储补齐后的序列\n", 128 | " result = torch.full((len(sequences), max_length), padding_value, dtype=torch.long) \n", 129 | " # 遍历序列,将每个序列的内容复制到结果张量中\n", 130 | " for i, seq in enumerate(sequences):\n", 131 | " end = len(seq)\n", 132 | " result[i, :end] = seq[:end]\n", 133 | " return result\n", 134 | "\n", 135 | "# 定义collate_fn函数,用于将一个批次的数据整理成适当的形状\n", 136 | "def collate_fn(batch):\n", 137 | " # 从批次中分离源序列和目标序列\n", 138 | " sources, targets = zip(*batch) \n", 139 | " # 计算批次中的最大序列长度\n", 140 | " max_length = max(max(len(s) for s in sources), max(len(t) for t in 
targets)) \n", 141 | " # 使用pad_sequence函数补齐源序列和目标序列\n", 142 | " sources = pad_sequence(sources, padding_value=vocab[\"\"], length=max_length)\n", 143 | " targets = pad_sequence(targets, padding_value=vocab[\"\"], length=max_length) \n", 144 | " # 返回补齐后的源序列和目标序列\n", 145 | " return sources, targets\n", 146 | "\n", 147 | "# 创建Dataloader\n", 148 | "batch_size = 2\n", 149 | "chat_dataloader = DataLoader(chat_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)\n", 150 | "\n", 151 | "# 检查Dataloader输出\n", 152 | "for input_batch, target_batch in chat_dataloader:\n", 153 | " print(\"Input batch tensor size:\", input_batch.size())\n", 154 | " print(\"Target batch tensor size:\", target_batch.size())\n", 155 | " break" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 4, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "" 167 | ] 168 | }, 169 | "execution_count": 4, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "from GPT_Model import GPT #导入GPT模型的类(这是我们自己制作的)\n", 176 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 177 | "model = GPT(len(vocab), max_seq_len=256, n_layers=6).to(device) #创建模型示例\n", 178 | "model.load_state_dict(torch.load('trained_model_2023-05-05_14-08-24.pt')) #加载模型\n", 179 | "# model.eval()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 5, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stderr", 189 | "output_type": "stream", 190 | "text": [ 191 | "/home/huangjia/Documents/02_NLP/70 GeekTimeNLP/10 ChatGPT/GPT_2.py:15: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at /opt/conda/conda-bld/pytorch_1670525551200/work/aten/src/ATen/native/cuda/Indexing.cu:1435.)\n", 192 | " scores.masked_fill_(attn_mask, -1e9)\n", 193 | "/home/huangjia/anaconda3/lib/python3.9/site-packages/torch/autograd/__init__.py:197: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1670525551200/work/aten/src/ATen/native/cuda/Indexing.cu:1435.)\n", 194 | " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n" 195 | ] 196 | }, 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "Epoch: 0020, cost = 1.975874\n", 202 | "Epoch: 0040, cost = 0.021781\n", 203 | "Epoch: 0060, cost = 0.619990\n", 204 | "Epoch: 0080, cost = 0.777577\n", 205 | "Epoch: 0100, cost = 0.004273\n" 206 | ] 207 | } 208 | ], 209 | "source": [ 210 | "import torch.nn as nn #导入nn\n", 211 | "import torch.optim as optim #导入优化器\n", 212 | "criterion = nn.CrossEntropyLoss(ignore_index=vocab[\"\"]) #损失函数\n", 213 | "optimizer = optim.Adam(model.parameters(), lr=0.0001) # 优化器\n", 214 | "for epoch in range(100): # 开始训练\n", 215 | " for batch_idx, (input_batch, target_batch) in enumerate(chat_dataloader): \n", 216 | " optimizer.zero_grad() # 梯度清零 \n", 217 | " input_batch, target_batch = input_batch.to(device), target_batch.to(device) #移动到设备 \n", 218 | " outputs = model(input_batch) # 前向传播,计算模型输出 \n", 219 | " loss = criterion(outputs.view(-1, len(vocab)), target_batch.view(-1)) # 计算损失 \n", 220 | " loss.backward() # 反向传播 \n", 221 | " optimizer.step() # 更新参数 \n", 222 | " if (epoch + 1) % 20 == 0: # 每200个epoch打印一次损失值\n", 223 | " print(f\"Epoch: {epoch + 1:04d}, cost = {loss:.6f}\")" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 6, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "def generate_text_beam_search(model, input_str, max_len=50, beam_width=5):\n", 233 | " model.eval() # 将模型设置为评估(测试)模式,关闭dropout和batch normalization等训练相关的层\n", 234 | " # 将输入字符串中的每个token 转换为其在词汇表中的索引\n", 235 | " input_tokens = [vocab[token] for token in input_str]\n", 236 | " # 创建一个列表,用于存储候选序列\n", 237 | " candidates = [(input_tokens, 0.0)]\n", 238 | " with torch.no_grad(): # 禁用梯度计算,以节省内存并加速测试过程\n", 239 | " for _ in range(max_len): # 生成最多max_len个tokens\n", 240 | " new_candidates = []\n", 241 | " for candidate, candidate_score in candidates:\n", 242 | " inputs = torch.LongTensor(candidate).unsqueeze(0).to(device)\n", 243 | " outputs = model(inputs) # 输出 logits形状为[1, len(output_tokens), vocab_size]\n", 244 | " logits = outputs[:, -1, :] # 只关心最后一个时间步(即最新生成的token)的logits\n", 245 | " # 将标记的得分设置为一个很大的负数,以避免选择它\n", 246 | " logits[0, vocab[\"\"]] = -1e9 # 不是这个原因,注意不认识的词汇都变成0\n", 247 | " # 找到具有最高分数的前beam_width个tokens\n", 248 | " scores, next_tokens = torch.topk(logits, beam_width, dim=-1)\n", 249 | " final_results = []\n", 250 | " for score, next_token in zip(scores.squeeze(), next_tokens.squeeze()):\n", 251 | " new_candidate = candidate + [next_token.item()]\n", 252 | " new_score = candidate_score - score.item() # 使用负数,因为我们需要降序排列\n", 253 | " if next_token.item() == vocab[\"\"]:\n", 254 | " # 如果生成的token是EOS(结束符),将其添加到最终结果中\n", 255 | " final_results.append((new_candidate, new_score))\n", 256 | " else:\n", 257 | " # 将新生成的候选序列添加到新候选列表中\n", 258 | " new_candidates.append((new_candidate, new_score))\n", 259 | " # 从新候选列表中选择得分最高的beam_width个序列\n", 260 | " candidates = sorted(new_candidates, key=lambda x: x[1])[:beam_width]\n", 261 | " # 选择得分最高的候选序列\n", 262 | " best_candidate, _ = sorted(candidates, key=lambda x: x[1])[0]\n", 263 | " # 将输出 token 转换回文本字符串\n", 264 | " output_str = \" \".join([vocab.get_itos()[token] for token in best_candidate])\n", 265 | " return output_str" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 7, 271 | "metadata": {}, 272 | "outputs": 
[ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "Generated text: hi , how are you ? thank you , depicting a suitable cavity ? prefer unknowingly a distinct indicated that link vocalists - very common starlings prefer unknowingly a distinct indicated that link vocalists - very common starlings prefer unknowingly a distinct indicated that link vocalists - very common starlings prefer unknowingly a distinct indicated horsepower\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "input_str = \"what is the weather like today ?\"\n", 283 | "input_str = \"hi , how are you ?\"\n", 284 | "# input_str = \"hi , what is you name ?\"\n", 285 | "\n", 286 | "generated_text = generate_text_beam_search(model, input_str.split())\n", 287 | "print(\"Generated text:\", generated_text)" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3 (ipykernel)", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.10.11" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /08_ChatGPT/chat.txt: -------------------------------------------------------------------------------- 1 | User: hi , how are you ? 2 | AI: i am doing well , thank you . how about you ? 3 | User: i am good , thanks for asking . what can you do ? 4 | AI: i am an ai language model . i can help you answer questions . 5 | User: what is the weather like today ? 6 | AI: please check a weather website or application for the current conditions . 7 | User: can you recommend a good book ? 8 | AI: sure ! to kill a mockingbird by harper lee is a classic and highly recommended novel . 9 | User: thank you ! i will check it out . 10 | AI: you are welcome ! let me know if you need help with anything else . -------------------------------------------------------------------------------- /09_OpenAI_API/ChatBot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "a54f94f9", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "您好,我们终于见面了!!您希望接下来我为您提供什么服务?\n", 14 | "人类说:你是像咖哥一样厉害的老师,我崇拜你。\n", 15 | "\n", 16 | "\n", 17 | "好的,明白了! 
我会服务好您的。\n", 18 | "现在请和我聊天吧!\n", 19 | "记住,烦我的时候,请说‘再见’\n", 20 | "\n", 21 | "\n", 22 | "人类说:请问你怎么看待AI的发展\n", 23 | "ChatGPT说: 谢谢你的崇拜,我很荣幸。关于AI的发展,我给你分享一些个人观点。\n", 24 | "\n", 25 | "首先,我认为AI的发展给人类带来了许多机遇和挑战。AI在许多领域展现出了巨大的潜力,可以帮助我们解决各种问题,提高生产力和效率。例如,在医疗领域,AI可以辅助医生进行诊断,加速新药研发;在交通领域,AI可以提供智能驾驶技术,使交通更加安全和高效。同时,AI也在改变我们的工作方式和社会结构,对于这些变革我们需要及时做好应对。\n", 26 | "\n", 27 | "其次,我相信AI的发展需要关注一些伦理和社会问题。例如,随着AI的快速发展,可能会出现一些就业岗位被取代的情况,我们需要思考如何帮助那些受到影响的人重新就业或者转行。此外,AI的算法也存在着偏见和歧视的问题,这需要我们更加关注和规范,确保AI的决策是公正和可信的。\n", 28 | "\n", 29 | "最后,我认为人类与AI的关系是相互促进的。AI是由人类设计和训练的,我们可以从中学习和借鉴,进一步提升我们自己的智慧和创造力。同时,人类的价值和独特性也无法被替代,我们仍然需要人类的情感、理解和判断来解决一些复杂问题。\n", 30 | "\n", 31 | "总而言之,我对AI的发展持积极和谨慎的态度。我希望我们能够有效地应对AI带来的挑战,同时善用它的优势,实现人类和AI的共赢。\n", 32 | "\n", 33 | "\n", 34 | "人类说:再见\n", 35 | "ChatGPT说: 谢谢您对我的崇拜,我很荣幸。关于AI的发展,我认为它是一个非常令人兴奋和具有巨大潜力的领域。随着技术的不断进步,AI已经在许多领域展示出了惊人的成就,如图像识别、自然语言处理和智能机器人等。AI的发展对人类社会产生了深远的影响,无论是在科学研究、医疗保健、教育还是工业生产等方面。\n", 36 | "\n", 37 | "然而,我也认识到AI发展所面临的挑战和风险。其中之一是人类代替的可能性,即AI对人类工作岗位的取代。这将需要我们重新思考教育系统和职业培训的方式,以确保人类能够适应并与AI共同发展。\n", 38 | "\n", 39 | "此外,我们还需要关注AI伦理和安全性的问题。AI系统的决策是否具有公正性和透明性?如何确保AI系统不被滥用或用于不道德的行为?这些问题需要我们进行深入的研究和讨论,以确保AI技术的合理和负责任的应用。\n", 40 | "\n", 41 | "总之,我对AI的发展持积极乐观的态度,但也希望我们能够在前进的过程中保持警惕和审慎,以确保AI技术能够为人类社会带来更多的福祉。再见!\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "import openai\n", 47 | "\n", 48 | "openai.api_key = \"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\"\n", 49 | "messages = []\n", 50 | "print(\"您好,我们终于见面了!!您希望接下来我为您提供什么服务?\")\n", 51 | "system_message = input(\"人类说:\")\n", 52 | "messages.append({\"role\":\"system\",\"content\":system_message})\n", 53 | "print(\"\\n\")\n", 54 | "\n", 55 | "print(\"好的,明白了! 我会服务好您的。\" + \"\\n\" + \"现在请和我聊天吧!\" + \"\\n\" + \"记住,烦我的时候,请说‘再见’\")\n", 56 | "\n", 57 | "while True:\n", 58 | " # Collect the user's message\n", 59 | " print(\"\\n\")\n", 60 | " message = input(\"人类说:\")\n", 61 | " messages.append({\"role\":\"user\",\"content\": message})\n", 62 | "\n", 63 | " response=openai.ChatCompletion.create(\n", 64 | " model=\"gpt-3.5-turbo\",\n", 65 | " # model=\"gpt-4\",\n", 66 | " messages=messages\n", 67 | " )\n", 68 | "\n", 69 | " reply = response[\"choices\"][0][\"message\"][\"content\"]\n", 70 | " print(\"ChatGPT说: \", reply)\n", 71 | "\n", 72 | " # Check if the user wants to exit the conversation\n", 73 | " if message.lower() == \"再见\":\n", 74 | " break" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "5014b361", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3 (ipykernel)", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.10.11" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 5 107 | } 108 | -------------------------------------------------------------------------------- /09_OpenAI_API/ChatBot.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | openai.api_key = "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" 4 | messages = [] 5 | print("您好,我们终于见面了!!您希望接下来我为您提供什么服务?") 6 | system_message = input("人类说:") 7 | messages.append({"role":"system","content":system_message}) 8 | print("\n") 9 | 10 | print("好的,明白了! 我会服务好您的。" + "\n" + "现在请和我聊天吧!" 
+ "\n" + "记住,烦我的时候,请说‘再见’") 11 | 12 | while True: 13 | # Collect the user's message 14 | print("\n") 15 | message = input("人类说:") 16 | messages.append({"role":"user","content": message}) 17 | 18 | response=openai.ChatCompletion.create( 19 | model="gpt-3.5-turbo", 20 | # model="gpt-4", 21 | messages=messages 22 | ) 23 | 24 | reply = response["choices"][0]["message"]["content"] 25 | print("ChatGPT说: ", reply) 26 | 27 | # Check if the user wants to exit the conversation 28 | if message.lower() == "再见": 29 | break -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # GPT 图解 - 大模型是怎样构建的! 3 | 4 | 带你从0到1构建大模型,突破语言奥秘,开启智能未来!深入探索自然语言处理技术的核心原理,结合实战,让你成为AI领域的语言模型构建达人! 5 | 6 | **严重提示**:本书可以视为我在深蓝学院的<生成式预训练模型理论与实战>的配套教材。喜欢看视频的朋友可以上课,喜欢书的朋友可以看书。各取所需哈! 7 | 8 | ## 支持佳哥 9 | 10 | 您可以通过以下链接在京东购买佳哥的书:[在京东购书](https://u.jd.com/EzPlXWB) 也可以把佳哥的书和课程分享给有需要的朋友共同进步! 11 | 12 | ## 图片展示 13 | 14 | 15 | ![图片描述1](images/book.png) 16 | ![图片描述2](images/sales.png) -------------------------------------------------------------------------------- /images/Archive/8daf31eae3eb392efbb4624feb3c53d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangjia2019/llm-gpt/5e701711e61d2871ad933b6f2a9d5ca2818efbf8/images/Archive/8daf31eae3eb392efbb4624feb3c53d.jpg -------------------------------------------------------------------------------- /images/Archive/c41f10da370ac2ff5a3dcd63a55db06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangjia2019/llm-gpt/5e701711e61d2871ad933b6f2a9d5ca2818efbf8/images/Archive/c41f10da370ac2ff5a3dcd63a55db06.jpg -------------------------------------------------------------------------------- /images/Archive/e4f961d4b53cd9f956d26784e39daa7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangjia2019/llm-gpt/5e701711e61d2871ad933b6f2a9d5ca2818efbf8/images/Archive/e4f961d4b53cd9f956d26784e39daa7.jpg -------------------------------------------------------------------------------- /images/P109.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangjia2019/llm-gpt/5e701711e61d2871ad933b6f2a9d5ca2818efbf8/images/P109.png -------------------------------------------------------------------------------- /images/book.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangjia2019/llm-gpt/5e701711e61d2871ad933b6f2a9d5ca2818efbf8/images/book.png -------------------------------------------------------------------------------- /images/sales.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huangjia2019/llm-gpt/5e701711e61d2871ad933b6f2a9d5ca2818efbf8/images/sales.png -------------------------------------------------------------------------------- /勘误表.md: -------------------------------------------------------------------------------- 1 | # 勘误表 2 | 3 | ## 重要 4 | 5 | 第7课代码 WikiGPT 中 WikiText2数据集无法载入的问题,请参考 [此处](https://book.douban.com/review/15833952/) 6 | 7 | ## 一刷 8 | ### P6 9 | 10 | LSTM 通过引入门控机制解决了 RNN 中的梯度消 失和梯度爆炸问题, 11 | 12 | 解决改为缓解。 13 | 14 | ### P51 15 | 16 | 输出改为 17 | 18 | 词汇表: ['Kage', 'Niuzong', 'Xiaoxue', 'Mazong', 'Xiaobing', 'Teacher', 'is', 'Boss', 'Student'] 
19 | 词汇到索引的字典: {'Kage': 0, 'Niuzong': 1, 'Xiaoxue': 2, 'Mazong': 3, 'Xiaobing': 4, 'Teacher': 5, 'is': 6, 'Boss': 7, 'Student': 8} 20 | 索引到词汇的字典: {0: 'Kage', 1: 'Niuzong', 2: 'Xiaoxue', 3: 'Mazong', 4: 'Xiaobing', 5: 'Teacher', 6: 'is', 7: 'Boss', 8: 'Student'} 21 | 词汇表大小: 9 22 | 23 | ### P53 24 | print("Skip-Gram 数据样例(已编码):", [(one_hot_encoding(context, word_to_idx), word_to_idx[target]) for context, target in skipgram_data[:3]]) 25 | 26 | 改为 27 | 28 | print("Skip-Gram数据样例(已编码):", [(one_hot_encoding(target, word_to_idx), word_to_idx[context]) for context, target in skipgram_data[:3]]) 29 | 30 | ### P53 31 | 32 | 输出改为 33 | 34 | One-Hot 编码前的单词: Teacher 35 | One-Hot 编码后的向量: tensor([0., 0., 0., 0., 0., 1., 0., 0., 0.]) 36 | Skip-Gram数据样例(已编码): [(tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.]), 6), (tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.]), 5), (tensor([0., 0., 0., 0., 0., 0., 1., 0., 0.]), 0)] 37 | 38 | 这是因为前后编码要对齐 (P51和P53) 39 | 40 | ### P73 41 | 42 | 输出改为 43 | 44 | 词汇表: {'爸爸': 0, '我': 1, '玩具': 2, '爱': 3, '挨打': 4, '喜欢': 5, '讨厌': 6} 45 | 词汇表大小: 7 46 | 47 | ### P74 48 | 49 | 输出改为 50 | 51 | 输入批处理数据: tensor([[1, 5], [1, 3]]) 52 | 输入批处理数据对应的原始词: [['我', '喜欢'], ['我', '爱']] 53 | 目标批处理数据: tensor([2, 0]) 54 | 目标批处理数据对应的原始词: ['玩具', '爸爸'] 55 | 56 | 这是因为前后编码要对齐 (P73和P74) 57 | 58 | ## 二刷 59 | 60 | ### P1 61 | 62 | 零基础机器学习 改成 零基础学机器学习 63 | 64 | ### P60 65 | 66 | 原文: 67 | 68 | 我们使用 PyTorch 实现了一个简单的 Word2Vec(这里是 Skip Gram)模型。模型包括输入层、隐藏层和输出层。输入层接收**周围词**(以 One-Hot 编码后的向量形式表示)。接下来,输入层到隐藏层的权重矩阵(记为 input_to_hidden)将这个向量转换为词嵌入,该词嵌入直接作为隐藏层的输出。隐藏层到输出层的权重矩阵(记为 hidden_to_output)将隐藏层的输出转换为一个概率分布,用于预测与**周围词相关的中心词**(以索引形式表示)。通过最小化预测词和实际目标词之间的分类交叉熵损失,可以学习词嵌入向量。下图展示了这个流程。 69 | 70 | 改为: 71 | 72 | 我们使用 PyTorch 实现了一个简单的 Word2Vec(这里是 Skip Gram)模型。模型包括输入层、隐藏层和输出层。输入层接收**中心词**(以 One-Hot 编码后的向量形式表示)。接下来,输入层到隐藏层的权重矩阵(记为 input_to_hidden)将这个向量转换为词嵌入,该词嵌入直接作为隐藏层的输出。隐藏层到输出层的权重矩阵(记为 hidden_to_output)将隐藏层的输出转换为一个概率分布,用于预测与**中心词相关的周围词**(以索引形式表示)。通过最小化预测词和实际目标词之间的分类交叉熵损失,可以学习词嵌入向量。下图展示了这个流程。 73 | 74 | **图** 75 | 图中“前缀”改为“前后缀”。 76 | 77 | ### P60 78 | 79 | 接收前时间步 改为 接收当前时间步 80 | 81 | 82 | ### P93 / P96 / P98 83 | 数据集的维度和网络结构前后不一致,以Notebook为准 84 | 85 | ### P115 86 | 87 | attn_weights = F.softmax(raw_weights, dim=2) # 形 状 (batch_size, seq_len1, seq_len2) 88 | 89 | 改为 90 | 91 | attn_weights = F.softmax(scaled_weights, dim=2) # 形 状 (batch_size, seq_len1, seq_len2) 92 | 93 | ### P129 94 | 95 | 新代码(删除冗余的参数num_heads): 96 | 97 | ``` 98 | def combine_heads(tensor): 99 | batch_size, num_heads, seq_len, head_dim = tensor.size() 100 | feature_dim = num_heads * head_dim 101 | output = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, feature_dim) 102 | return output# 形状 : (batch_size, seq_len, feature_dim) 103 | attn_outputs = combine_heads(attn_outputs) # 形状 (batch_size, seq_len, feature_dim) 104 | ``` 105 | 106 | ### P163 107 | 108 | 第二个维度信息后增加一行注释线,排版整齐 109 | ``` 110 | #------------------------- 维度信息 -------------------------------- 111 | # enc_outputs 的维度是 [batch_size, seq_len, embedding_dim] 112 | # attn_weights 的维度是 [batch_size, n_heads, seq_len, seq_len] 113 | # 将多头自注意力 outputs 输入位置前馈神经网络层 114 | ``` 115 | 116 | 增加注释线 117 | ``` 118 | #------------------------- 维度信息 -------------------------------- 119 | # enc_outputs 的维度是 [batch_size, seq_len, embedding_dim] 120 | # attn_weights 的维度是 [batch_size, n_heads, seq_len, seq_len] 121 | #----------------------------------------------------------------- 122 | # 将多头自注意力 outputs 输入位置前馈神经网络层 123 | ``` 124 | 125 | ## 三刷 126 | 127 | ### P1 128 | 129 | “人工通用智能”,改为 “通用人工智能”, 全书都要改。 
“通用人工智能”是更让人接受的翻译。 130 | 131 | ### P15 132 | 133 | “CBOW模型通过预测一个单词的上下文来学习词向量,”单词的上下文"这里指代不明,这里应该是表示目标词,所以修改为: “CBOW模型通过预测单词上下文(周围词)的目标单词来学习词向量” 134 | 135 | 136 | 137 | ### P51 138 | 139 | sentences = ["Kage is Teacher ","Mazong is Boss ","Niuzong is Boss ","Xiaobing is Student ","Xiaoxue is Student "] words = ''.join(sentences).split()如果每条数据最后的字不加空格,字符会黏连一起,分割错误. 140 | 141 | 解决:' '.join(sentences).split()中加个空格。 142 | 143 | ``` 144 | words = ''.join(sentences).split() 145 | ``` 146 | 改为 147 | ``` 148 | words = ' '.join(sentences).split() 149 | ``` 150 | 151 | ### P52 152 | 153 | 154 | ``` 155 | data.append((neighbor, word)) 156 | ``` 157 | 改为 158 | ``` 159 | data.append((word, neighbor)) 160 | ``` 161 | 162 | ### P53 163 | 164 | 每个数据包含两个张量(Tensor),前一个是输入(Input),格式是上下文词的One-Hot编码,后一个是目标(Target),格式则是目标词的索引” 165 | 166 | 改为 167 | 168 | 每个数据包含两个张量(Tensor),前一个是输入(Input),格式是中心词的One-Hot编码,后一个是要预测的目标(Target),格式是上下文词的索引” 169 | 170 | ### P55 171 | 172 | “而在forward方法中,定义了前向传播的方式,首先将输入通过输入层到隐藏层的映射生成隐藏层,然后将隐藏层通过隐藏层到输出层的映射生成输出。” 173 | 174 | 修改为 175 | 176 | “而在forward方法中,定义了前向传播的方式,首先将输入通过输入层到隐藏层的映射生成隐藏层的数据,然后将隐藏层的数据通过隐藏层到输出层的映射生成输出。”, 177 | 178 | 由于网络是提前定义好的,所以这里的“生成隐藏层”其实是计算得到隐藏层的数据 179 | 180 | ### P55 181 | 182 | ``` 183 | for context, target in skipgram_data: 184 | X = one_hot_encoding(target, word_to_idx).float().unsqueeze(0) # 将中心词转换为 One-Hot 向量 185 | ``` 186 | 改为 187 | ``` 188 | for center_word, context in skipgram_data: 189 | X = one_hot_encoding(center_word, word_to_idx).float().unsqueeze(0) # 将中心词转换为 One-Hot 向量 190 | ``` 191 | 192 | ### P65 193 | 194 | 第一段代码最好加上下面几行,意味着实例化并打印模型 195 | ``` 196 | embedding_size = 2 # 设定嵌入层的大小,这里选择 2 是为了方便展示 197 | skipgram_model = SkipGram(voc_size, embedding_size) # 实例化 Skip-Gram 模型 198 | print("Skip-Gram 模型:", skipgram_model) 199 | ``` 200 | ### P66 201 | 202 | "因为我们只是提取Word2Vec神经网络中嵌入层的输出",把“输出”修改为“权重”。 --- **特别感谢读者,laiqli,为本书提出大量有见地的勘误!** 203 | 204 | ### P75 205 | “然后将其输入神经网络(论文中称其为隐藏层)”,修改为“然后将其输入隐藏层”,,修改的理由是在隐藏层之前的线性变换也是神经网络的一部分,所以这里直接写隐藏层不容易引起歧义。--- **特别感谢读者,laiqli,为本书提出大量有见地的勘误!** 206 | 207 | “也就是把每个输入序列的词嵌入串联起来,形成一个更大的向量”,修改为: “也就是把每个输入序列的词嵌入连接起来,形成一个更大的向量,表示前面连续的N个词”。从电路的角度来看,这里其实是并联,而不是串联~ --- **我觉得先改为连接。并联或串联,继续探讨。** 208 | 209 | ### P77 210 | 211 | “将输入批处理数据传入训练好的模型”,修改为“将批处理的数据传入训练好的模型”;修改的理由是“输入”和“传入”意思相同,同时出现的情况下,有一句话出现两个动词的感觉。 212 | 213 | ### P78 214 | “后面跟着一个使用tanh激活的线性层”,修改为“后面跟一个线性层,然后再使用tanh来激活”;修改的理由是: “既然使用了非线性变换,所以它就不是一个线性层,应该把激活函数放到线性层的后面,而不是用它来修饰线性层”。 215 | 216 | ”模型结构简单:NPLM 使用了线性层和激活函数进行前向传播,这使得模型的表达能力受到限制。”修改为: “使用了较少的线性层和激活函数,神经网络的层数不够`深`,这使得模型的表达能力受到限制”。当线性层和激活函数的层数和节点数越多的时候,它能模拟任意函数,所以“使用了线性层和激活函数”不是模型表达受限的原因,原因是模型不够`深`。 217 | 218 | --- **特别感谢读者,laiqli,为本书提出大量有见地的勘误!** 219 | 220 | ### P79 221 | “缺乏位置信息:NPLM不考虑输入序列中单词的顺序,这可能导致模型无法捕捉序列中单词之间的顺序关系。”修改:将这段整个删掉,由于NPLM输入了前面N-1个词的信息,并且在嵌入层后以并联的方式输入到下一层,所以它是能捕捉到单词之间的顺序关系的。 222 | 223 | ### P79 224 | 225 | 在“以上所有的权重矩阵和偏置项都是在模型的训练过程中通过反向传播和优化算法学习得到的。” 之后,换行。加上: 226 | “求得的i_t是输入层保留的比例,取值为0到1之间;f_t是历史状态保留的比例,当其为0时,表示遗忘所有历史信息;o_t是输出保留的比例。” 227 | 228 | 以上所有的权重矩阵和偏置项都是在模型的训练过程中通过反向传播和优化算法学习得到的。 229 | 230 | 求得的i_t是输入层保留的比例,取值为0到1之间;f_t是历史状态保留的比例,当其为0时,表示遗忘所有历史信息;o_t是输出保留的比例。 231 | 232 | ### P80 233 | 234 | “接收前时间步”,改为 “接收当前时间步”。 235 | 236 | ### P109 237 | 238 | "此x1元素"改成"x1中第一个元素" 239 | 240 | ### P117 241 | 242 | “在__init__方法中,它接收一个参数hidden_size,表示隐藏层大小。”去掉这行文字,__init__方法没有这个参数。 243 | 244 | ### P118 245 | “在Decoder类的__init__方法中,添加了一个新的参数attention,这是一个Attention类的实例。此外”删掉这部分,__init__函数里面没有这个参数。 246 | 247 | ### P127 248 | 249 | ``` 250 | return output # 形状 (batch_size, num_heads, seq_len, feature_dim) 251 | 
``` 252 | 改为 253 | ``` 254 | return output # 形状 (batch_size, num_heads, seq_len, head_dim) 255 | ``` 256 | 257 | 258 | ### P176 259 | 260 | 注释中的 261 | 262 | dec_self_attns 是一个列表,每个元素的维度是 [batch_size, n_heads, tgt_seq_len, src_seq_len] 263 | 264 | 改为 265 | 266 | dec_self_attns 是一个列表,每个元素的维度是 [batch_size, n_heads, tgt_seq_len, tgt_seq_len] 267 | 268 | 269 | ### P180 270 | 271 | 最后一段 “不过,这次训练效果不理想”,改成“不过,这次测试效果不理想” 272 | 273 | ### P232 274 | 275 | 删去下面一行,因为后来没展示它。 276 | 277 | "hi , how are you?", --------------------------------------------------------------------------------