├── 微博评论情感分析
│   ├── 基于堆叠transformer.ipynb
│   └── 情感分析-堆叠transformer.ipynb
└── 电商评论情感分析
    ├── README.md
    └── 双向LSTM+word2vec.ipynb
/微博评论情感分析/基于堆叠transformer.ipynb:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/电商评论情感分析/README.md:
--------------------------------------------------------------------------------
1 | ## Sentiment Analysis of E-commerce Reviews with a Bidirectional LSTM
2 |
3 | ### 1. Data Preparation
4 | #### 1.1 Source of the e-commerce review dataset
5 | $~~~~$ $~~~~$The data used in this project is a public dataset hosted on Baidu AI Studio (PaddlePaddle):
6 | 👉[商品评论情感预测数据集 (Product Review Sentiment Prediction dataset)](https://aistudio.baidu.com/datasetdetail/96333)
7 | #### 1.2 A quick look at the dataset
8 | $~~~~$ $~~~~$The download contains 5 files; this project only uses **训练集.csv** (the training set).
9 | $~~~~$ $~~~~$The file has the following columns (❌ = not used, ✔️ = used):
10 | | Data ID | User ID | Product ID | Review time | Review title | Review content | Rating |
11 | |--|--|--|--|--|--|--|
12 | | ❌ | ❌ | ❌ | ❌ | ✔️ | ✔️ | ✔️ |
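
$~~~~$ $~~~~$For a first look at the file, here is a minimal sketch of loading it with pandas and keeping only the three usable columns. The column names below are assumptions, so check them against the actual header of your copy of **训练集.csv** (the notebook itself parses the CSV by hand rather than with pandas):

~~~ python
import pandas as pd

# Hypothetical path and column names -- adjust to match your download.
df = pd.read_csv('训练集.csv')
df = df[['评论标题', '评论内容', '评分']]  # keep review title, review content and rating
print(df.head())
~~~
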
13 | #### 1.3 Pretrained Chinese word vectors
14 | $~~~~$ $~~~~$Pretrained Chinese word vectors are available from many sources; a good collection is [Chinese Word Vectors](https://github.com/Embedding/Chinese-Word-Vectors/blob/master/README_zh.md). Pick whichever embedding you like; I downloaded the Zhihu_QA "Word + Ngram" vectors, which seem to be the closest fit for this project 🤔.
15 | $~~~~$ $~~~~$❗Vectors downloaded elsewhere work too, only the data preprocessing may need to change slightly.
16 | $~~~~$ $~~~~$You could also train word vectors from scratch with BERT or the like, but that might take a tiny eternity 😭.
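
$~~~~$ $~~~~$The files in that repository are plain-text word2vec dumps compressed with bzip2: the first line is a header (vocabulary size and vector dimension) and every later line is a word followed by its vector. Below is a minimal sketch for peeking at the file without unpacking it, assuming the Zhihu "Word + Ngram" download; the file name is illustrative:

~~~ python
import bz2

# Hypothetical file name -- use the path of the archive you actually downloaded.
with bz2.open('sgns.zhihu.bigram.bz2', 'rt', encoding='utf-8') as fp:
    vocab_size, dim = fp.readline().split()        # header line: '<vocab size> <dimension>'
    word, *vector = fp.readline().rstrip().split(' ')
    print(vocab_size, dim, word, len(vector))      # first word and the length of its vector
~~~
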
17 | ### 2. Library versions
18 | I ran the code on two machines; the library versions for each are listed below (a quick way to check your own versions follows the list).
19 | - torch 1.13.1+cu117 **or** 2.0.1+cu117
20 | - d2l 0.17.6 **or** 1.0.3
21 | - jieba 0.42.1
22 | - sklearn 1.0.2 **or** 1.3.0
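
To check what your own environment has, a quick version print (assuming the four packages are installed):

~~~ python
import torch, d2l, jieba, sklearn
print(torch.__version__, d2l.__version__, jieba.__version__, sklearn.__version__)
~~~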
23 |
24 | ### ❗3. What to change before running the code❗
25 |
26 | In the **8th** code cell:
27 | ~~~ python
28 | comment_path = '' # path to the dataset file
29 | ~~~
30 | ~~~ python
31 | embed_path = '' # path to the word-vector file
32 | ~~~
33 | ~~~ python
34 | embed_size = 300 # dimension of the word vectors
35 | ~~~
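
Put together, with the dataset from 1.1 and the vectors from 1.3 next to the notebook, the three lines might look like this (both paths are purely illustrative):

~~~ python
comment_path = './训练集.csv'            # hypothetical path to the dataset file from section 1.1
embed_path = './sgns.zhihu.bigram.bz2'   # hypothetical path to the word-vector file from section 1.3
embed_size = 300                         # must match the dimension of the vectors you downloaded
~~~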
36 |
--------------------------------------------------------------------------------
/电商评论情感分析/双向LSTM+word2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import re\n",
12 | "from d2l import torch as d2l\n",
13 | "import torch\n",
14 | "import jieba\n",
15 | "from sklearn.model_selection import train_test_split\n",
16 | "import torch.nn as nn\n",
17 | "import bz2"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "outputs": [],
24 | "source": [
25 | "def read_comments(path):\n",
26 | " '''读取评论和对应的评分'''\n",
27 | " #这是我心血来潮写的数据预处理脑抽代码\n",
28 | " with open(path,\"r\",encoding='utf-8') as fp:\n",
29 | " #第一行是数据指标标签,不要,最后一行是一个空格,不要\n",
30 | " reviews = fp.read().split('\\n')[1:-1]\n",
31 | " #csv文件的每个格子以逗号划分\n",
32 | " all_split_dot = [[i.start() for i in re.finditer(',',review)] for review in reviews]\n",
33 | " #提取评分\n",
34 | " labels = torch.tensor([int(float(reviews[i][all_split_dot[i][-1]+1:])) for i in range(len(all_split_dot))]) -1\n",
35 | " #提取评论\n",
36 | " comments = [reviews[i][all_split_dot[i][3]+1:all_split_dot[i][-1]].replace('\"','').replace(' ',\"\") for i in range(len(all_split_dot))]\n",
37 | " #把每条评论进行分词分词\n",
38 | " comments = [[i for i in jieba.cut(comment,cut_all=False)]for comment in comments]\n",
39 | " return comments,labels"
40 | ],
41 | "metadata": {
42 | "collapsed": false,
43 | "pycharm": {
44 | "name": "#%%\n"
45 | }
46 | }
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "outputs": [],
52 | "source": [
53 | "def process_data(root,batch_size,num_steps=60):\n",
54 | " \"\"\"返回数据迭代器和评论数据集的词表\"\"\"\n",
55 | " #把评论和对应的评分取出来\n",
56 | " comments,labels = read_comments(root)\n",
57 | " #创建vocab,使得每个词都有对应的索引,将词频小于3的词不要了,不然这个vocab太大了\n",
58 | " vocab = d2l.Vocab(comments, min_freq=3,reserved_tokens=[''])\n",
59 | " #划分训练集和测试集\n",
60 | " train_comment, test_comment, train_label, test_label = train_test_split(comments,labels,test_size=0.2)\n",
61 | " #鉴于绝大部分评论都在60个词以下,把每一条评论都处理成相同长度(60个词),对短评论进行填充'pad',对长评论进行截断\n",
62 | " train_features = torch.tensor([d2l.truncate_pad(\n",
63 | " vocab[comment], num_steps, vocab['']) for comment in train_comment])\n",
64 | " test_features = torch.tensor([d2l.truncate_pad(\n",
65 | " vocab[comment], num_steps, vocab['']) for comment in test_comment])\n",
66 | " #返回数据迭代器\n",
67 | " train_iter = d2l.load_array((train_features,train_label),\n",
68 | " batch_size)\n",
69 | " test_iter = d2l.load_array((test_features,test_label),\n",
70 | " batch_size,\n",
71 | " is_train=False)\n",
72 | " return train_iter, test_iter, vocab"
73 | ],
74 | "metadata": {
75 | "collapsed": false,
76 | "pycharm": {
77 | "name": "#%%\n"
78 | }
79 | }
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "outputs": [],
85 | "source": [
86 | "class BiRNN(nn.Module):\n",
87 | " def __init__(self, vocab_size, embed_size, num_hiddens,\n",
88 | " num_layers, **kwargs):\n",
89 | " super(BiRNN, self).__init__(**kwargs)\n",
90 | " self.embedding = nn.Embedding(vocab_size, embed_size)\n",
91 | " # 将bidirectional设置为True以获取双向循环神经网络\n",
92 | " self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers,\n",
93 | " bidirectional=True)\n",
94 | " #这里评分有5个等级\n",
95 | " self.decoder = nn.Linear(4 * num_hiddens, 5)\n",
96 | "\n",
97 | " def forward(self, inputs):\n",
98 | " # inputs的形状是(批量大小,时间步数)\n",
99 | " # 因为长短期记忆网络要求其输入的第一个维度是时间维,\n",
100 | " # 所以在获得词元表示之前,输入会被转置。\n",
101 | " # 输出形状为(时间步数,批量大小,词向量维度)\n",
102 | " embeddings = self.embedding(inputs.T)\n",
103 | " self.encoder.flatten_parameters()\n",
104 | " # 返回上一个隐藏层在不同时间步的隐状态,\n",
105 | " # outputs的形状是(时间步数,批量大小,2*隐藏单元数)\n",
106 | " outputs, _ = self.encoder(embeddings)\n",
107 | " # 连结初始和最终时间步的隐状态,作为全连接层的输入,\n",
108 | " # 其形状为(批量大小,4*隐藏单元数)\n",
109 | " encoding = torch.cat((outputs[0], outputs[-1]), dim=1)\n",
110 | " outs = self.decoder(encoding)\n",
111 | " return outs"
112 | ],
113 | "metadata": {
114 | "collapsed": false,
115 | "pycharm": {
116 | "name": "#%%\n"
117 | }
118 | }
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "outputs": [],
124 | "source": [
125 | "class TokenEmbedding:\n",
126 | "\n",
127 | " def __init__(self, file_path):\n",
128 | "\n",
129 | " self.idx_to_token, self.idx_to_vec = self._load_embedding(\n",
130 | " file_path)\n",
131 | " self.unknown_idx = 0\n",
132 | " self.token_to_idx = {token: idx for idx, token in\n",
133 | " enumerate(self.idx_to_token)}\n",
134 | "\n",
135 | " def _load_embedding(self, file_path):\n",
136 | " idx_to_token, idx_to_vec = [''], []\n",
137 | "\n",
138 | " #!!!!!这里得用bz2.open打开文件\n",
139 | " with bz2.open(file_path, 'rb') as fp:\n",
140 | " word_vecs = fp.readlines()\n",
141 | " word_vecs = [i.decode('utf-8') for i in word_vecs][1:] #第一行信息没啥用,所以不要了\n",
142 | " for vec in word_vecs:\n",
143 | " #消除每一行后面的‘\\n’,已经后面多余空格,然后依据空格划分元素,形成一个列表\n",
144 | " elems = vec.rstrip().rstrip('\\n').split(' ')\n",
145 | " #每一行的第一个元素是词,剩余的元素是词向量\n",
146 | " token,elems = elems[0],[float(elem) for elem in elems[1:]]\n",
147 | " idx_to_token.append(token)\n",
148 | " idx_to_vec.append(elems)\n",
149 | "\n",
150 | " idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec\n",
151 | " return idx_to_token, d2l.tensor(idx_to_vec)\n",
152 | "\n",
153 | " def __getitem__(self, tokens):\n",
154 | " indices = [self.token_to_idx.get(token, self.unknown_idx)\n",
155 | " for token in tokens]\n",
156 | " vecs = self.idx_to_vec[d2l.tensor(indices)]\n",
157 | " return vecs\n",
158 | "\n",
159 | " def __len__(self):\n",
160 | " return len(self.idx_to_token)"
161 | ],
162 | "metadata": {
163 | "collapsed": false,
164 | "pycharm": {
165 | "name": "#%%\n"
166 | }
167 | }
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "outputs": [],
173 | "source": [
174 | "def train_one_epoch(net, X, y, loss, trainer, device):\n",
175 | "\n",
176 | " if isinstance(X, list):\n",
177 | " X = [x.to(device) for x in X]\n",
178 | " else:\n",
179 | " X = X.to(device)\n",
180 | "\n",
181 | " y = y.to(device)\n",
182 | " net.train()\n",
183 | " trainer.zero_grad()\n",
184 | " pred = net(X)\n",
185 | " l = loss(pred, y)\n",
186 | " l.sum().backward()\n",
187 | " trainer.step()\n",
188 | " train_loss_sum = l.sum()\n",
189 | " train_acc_sum = d2l.accuracy(pred, y)\n",
190 | " return train_loss_sum, train_acc_sum"
191 | ],
192 | "metadata": {
193 | "collapsed": false,
194 | "pycharm": {
195 | "name": "#%%\n"
196 | }
197 | }
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "outputs": [],
203 | "source": [
204 | "def train(net, train_iter, test_iter, loss, trainer, num_epochs,\n",
205 | " device=d2l.try_gpu()):\n",
206 | "\n",
207 | " #权重初始化\n",
208 | " def init_weights(m):\n",
209 | " if type(m) == nn.Linear:\n",
210 | " nn.init.xavier_uniform_(m.weight)\n",
211 | " if type(m) == nn.LSTM:\n",
212 | " for param in m._flat_weights_names:\n",
213 | " if \"weight\" in param:\n",
214 | " nn.init.xavier_uniform_(m._parameters[param])\n",
215 | "\n",
216 | " net.apply(init_weights)\n",
217 | "\n",
218 | " #将预训练好的词嵌入加载到net的embeding层里,并且不进行梯度回传\n",
219 | " net.embedding.weight.data.copy_(embeds)\n",
220 | " net.embedding.weight.requires_grad = False\n",
221 | "\n",
222 | "\n",
223 | " num_batches = len(train_iter)\n",
224 | " animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],\n",
225 | " legend=['train loss', 'train acc', 'test acc'])\n",
226 | " net.to(device)\n",
227 | " for epoch in range(num_epochs):\n",
228 | " #定义一个容器,里面存放着训练损失,训练准确度,样本数量\n",
229 | " metric = d2l.Accumulator(3)\n",
230 | " for i, (features, labels) in enumerate(train_iter):\n",
231 | "\n",
232 | "\n",
233 | " l, acc = train_one_epoch(net, features, labels, loss, trainer, device)\n",
234 | "\n",
235 | " metric.add(l, acc, labels.shape[0])\n",
236 | "\n",
237 | " if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:\n",
238 | " animator.add(epoch + (i + 1) / num_batches,\n",
239 | " (metric[0] / metric[2], metric[1] / metric[2],\n",
240 | " None))\n",
241 | "\n",
242 | " #查看测试集的准确率\n",
243 | " test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)\n",
244 | " animator.add(epoch + 1, (None, None, test_acc))\n",
245 | "\n",
246 | " print(f'平均损失: {metric[0] / metric[2]:.3f}, 训练准确度: '\n",
247 | " f'{metric[1] / metric[2]:.3f}, 测试准确度: {test_acc:.3f}')\n"
248 | ],
249 | "metadata": {
250 | "collapsed": false,
251 | "pycharm": {
252 | "name": "#%%\n"
253 | }
254 | }
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "outputs": [],
260 | "source": [
261 | "batch_size = 512 #批量大小\n",
262 | "lr = 0.0003 #学习率\n",
263 | "num_epochs = 20 #训练几轮\n",
264 | "embed_size = 300 #词嵌入维度,我选了300维的\n",
265 | "num_hiddens = 256 #循环神经网络,隐藏层单元数量\n",
266 | "num_layers = 2 #多少个隐藏层\n",
267 | "devices = d2l.try_gpu() #设备\n",
268 | "\n",
269 | "comment_path = '' #数据集文件路径\n",
270 | "train_iter, test_iter, vocab = process_data(comment_path,batch_size)\n",
271 | "\n",
272 | "embed_path = '' #词向量文件路径\n",
273 | "my_embedding = TokenEmbedding(embed_path)\n",
274 | "embeds = my_embedding[vocab.idx_to_token] #把词向量和我的vocab结合起来\n",
275 | "\n",
276 | "\n",
277 | "net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers) #定义网络\n",
278 | "trainer = torch.optim.Adam(net.parameters(), lr=lr) #优化器\n",
279 | "loss = nn.CrossEntropyLoss(reduction=\"none\") #损失函数"
280 | ],
281 | "metadata": {
282 | "collapsed": false,
283 | "pycharm": {
284 | "name": "#%%\n"
285 | }
286 | }
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "outputs": [],
292 | "source": [
293 | "train(net, train_iter, test_iter, loss, trainer,num_epochs,d2l.try_gpu())"
294 | ],
295 | "metadata": {
296 | "collapsed": false,
297 | "pycharm": {
298 | "name": "#%%\n"
299 | }
300 | }
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "outputs": [],
306 | "source": [
307 | "def predict(net, vocab, comment):\n",
308 | " net.eval()\n",
309 | " comment = torch.tensor(vocab[[i for i in jieba.cut(comment,cut_all=False)]], device=d2l.try_gpu())\n",
310 | " label = torch.argmax(net(comment.reshape(1, -1)), dim=1)\n",
311 | " return label+1"
312 | ],
313 | "metadata": {
314 | "collapsed": false,
315 | "pycharm": {
316 | "name": "#%%\n"
317 | }
318 | }
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "outputs": [],
324 | "source": [
325 | "predict(net,vocab,'包装很好,内容也不错')"
326 | ],
327 | "metadata": {
328 | "collapsed": false,
329 | "pycharm": {
330 | "name": "#%%\n"
331 | }
332 | }
333 | }
334 | ],
335 | "metadata": {
336 | "kernelspec": {
337 | "display_name": "Python 3",
338 | "language": "python",
339 | "name": "python3"
340 | },
341 | "language_info": {
342 | "codemirror_mode": {
343 | "name": "ipython",
344 | "version": 2
345 | },
346 | "file_extension": ".py",
347 | "mimetype": "text/x-python",
348 | "name": "python",
349 | "nbconvert_exporter": "python",
350 | "pygments_lexer": "ipython2",
351 | "version": "2.7.6"
352 | }
353 | },
354 | "nbformat": 4,
355 | "nbformat_minor": 0
356 | }
357 |
--------------------------------------------------------------------------------
/微博评论情感分析/情感分析-堆叠transformer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import math\n",
12 | "from d2l import torch as d2l\n",
13 | "import torch\n",
14 | "import jieba\n",
15 | "from sklearn.model_selection import train_test_split\n",
16 | "import torch.nn as nn\n",
17 | "from wordcloud import WordCloud\n",
18 | "import matplotlib.pyplot as plt"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 3,
24 | "outputs": [],
25 | "source": [
26 | "def read_comments(comment_path,mode=False):\n",
27 | " with open(comment_path,'r',encoding='utf-8') as fp:\n",
28 | " #第一行是数据指标标签,不要,最后一行是一个空格,不要\n",
29 | " raw_data = fp.read().split('\\n')[1:-1]\n",
30 | " #每一行数据,第一个字符是标签,后续字符是评论内容\n",
31 | " comments = [raw_data[i][2:] for i in range(len(raw_data))]\n",
32 | " lables = torch.tensor([int(raw_data[i][0]) for i in range(len(raw_data))])\n",
33 | " #对评论进行分词,cut_all为分词模式\n",
34 | " comments = [[i for i in jieba.cut(comment,cut_all=mode)]for comment in comments]\n",
35 | " #返回分词后的评论和对应的评分\n",
36 | " return comments,lables"
37 | ],
38 | "metadata": {
39 | "collapsed": false,
40 | "pycharm": {
41 | "name": "#%%\n"
42 | }
43 | }
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 11,
48 | "outputs": [],
49 | "source": [
50 | "def process_data(comment_path,batch_size,max_len,min_freq):\n",
51 | " \"\"\"返回数据迭代器和评论数据集的词表\"\"\"\n",
52 | " #把评论和对应的评分取出来\n",
53 | " comments,labels = read_comments(comment_path)\n",
54 | " #创建vocab,使得每个词都有对应的索引,将词频小于min_freq的词不要了,不然这个vocab太大了\n",
55 | " vocab = d2l.Vocab(comments, min_freq=min_freq)\n",
56 | " #划分训练集和测试集\n",
57 | " train_comment, test_comment, train_label, test_label = train_test_split(comments,labels,test_size=0.3,random_state=26,shuffle=True)\n",
58 | " #鉴于绝大部分评论都在max_len个词以下,把每一条评论都处理成相同长度(max_len个词),对短评论进行填充'',对长评论进行截断\n",
59 | " train_features = torch.tensor([d2l.truncate_pad(\n",
60 | " vocab[comment], max_len, vocab['']) for comment in train_comment])\n",
61 | " test_features = torch.tensor([d2l.truncate_pad(\n",
62 | " vocab[comment], max_len, vocab['']) for comment in test_comment])\n",
63 | " #返回数据迭代器\n",
64 | " train_iter = d2l.load_array((train_features,train_label),\n",
65 | " batch_size)\n",
66 | " test_iter = d2l.load_array((test_features,test_label),\n",
67 | " batch_size,\n",
68 | " is_train=False)\n",
69 | " return train_iter, test_iter, vocab"
70 | ],
71 | "metadata": {
72 | "collapsed": false,
73 | "pycharm": {
74 | "name": "#%%\n"
75 | }
76 | }
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 13,
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": "2000001"
85 | },
86 | "execution_count": 13,
87 | "metadata": {},
88 | "output_type": "execute_result"
89 | }
90 | ],
91 | "source": [
92 | "class TokenEmbedding:\n",
93 | "\n",
94 | " def __init__(self, file_path):\n",
95 | "\n",
96 | " self.idx_to_token, self.idx_to_vec = self._load_embedding(\n",
97 | " file_path)\n",
98 | " self.unknown_idx = 0\n",
99 | " self.token_to_idx = {token: idx for idx, token in\n",
100 | " enumerate(self.idx_to_token)}\n",
101 | "\n",
102 | " def _load_embedding(self, file_path):\n",
103 | " idx_to_token, idx_to_vec = [''], []\n",
104 | "\n",
105 | " with open(file_path, 'r') as fp:\n",
106 | " word_vecs = fp.readlines()\n",
107 | " word_vecs = [i for i in word_vecs][1:] #第一行信息没啥用,所以不要了\n",
108 | " for vec in word_vecs:\n",
109 | " #消除每一行后面的‘\\n’,已经后面多余空格,然后依据空格划分元素,形成一个列表\n",
110 | " elems = vec.rstrip().rstrip('\\n').split(' ')\n",
111 | " #每一行的第一个元素是词,剩余的元素是词向量\n",
112 | " token,elems = elems[0],[float(elem) for elem in elems[1:]]\n",
113 | " idx_to_token.append(token)\n",
114 | " idx_to_vec.append(elems)\n",
115 | "\n",
116 | " idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec\n",
117 | " return idx_to_token, d2l.tensor(idx_to_vec)\n",
118 | "\n",
119 | " def __getitem__(self, tokens):\n",
120 | " indices = [self.token_to_idx.get(token, self.unknown_idx)\n",
121 | " for token in tokens]\n",
122 | " vecs = self.idx_to_vec[d2l.tensor(indices)]\n",
123 | " return vecs\n",
124 | "\n",
125 | " def __len__(self):\n",
126 | " return len(self.idx_to_token)"
127 | ],
128 | "metadata": {
129 | "collapsed": false,
130 | "pycharm": {
131 | "name": "#%%\n"
132 | }
133 | }
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "outputs": [],
139 | "source": [
140 | "class AddNorm(nn.Module):\n",
141 | " \"\"\"\n",
142 | " Transformer架构里面的Add & Norm layer.\n",
143 | " \"\"\"\n",
144 | " def __init__(self, normalized_shape, dropout, eps=1e-5):\n",
145 | " super(AddNorm, self).__init__()\n",
146 | " self.dropout = nn.Dropout(dropout)\n",
147 | " self.ln = nn.LayerNorm(normalized_shape, eps=eps)\n",
148 | "\n",
149 | " def forward(self, X, Y):\n",
150 | " return self.ln(self.dropout(Y) + X)\n",
151 | "\n",
152 | "class PositionWiseFFN(nn.Module):\n",
153 | " \"\"\"\n",
154 | " FFN\n",
155 | " \"\"\"\n",
156 | " def __init__(self, ffn_num_input, ffn_num_hiddens, ffn_num_outputs):\n",
157 | " super(PositionWiseFFN, self).__init__()\n",
158 | " self.dense1 = nn.Linear(ffn_num_input, ffn_num_hiddens)\n",
159 | " self.relu = nn.ReLU()\n",
160 | " self.dense2 = nn.Linear(ffn_num_hiddens, ffn_num_outputs)\n",
161 | "\n",
162 | " def forward(self, X):\n",
163 | " return self.dense2(self.relu(self.dense1(X)))\n",
164 | "\n",
165 | "class TransformerBlock(nn.Module):\n",
166 | " \"\"\"\n",
167 | " Transformer encoder block.\n",
168 | " \"\"\"\n",
169 | " def __init__(self, embed_dim, num_heads, ffn_hidden, dropout):\n",
170 | " super(TransformerBlock, self).__init__()\n",
171 | " self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout)\n",
172 | " self.addnorm1 = AddNorm(embed_dim, dropout)\n",
173 | " self.ffn = PositionWiseFFN(embed_dim, ffn_hidden, embed_dim)\n",
174 | " self.addnorm2 = AddNorm(embed_dim, dropout)\n",
175 | "\n",
176 | " def forward(self, X):\n",
177 | " Y = self.addnorm1(X, self.attention(X, X, X, need_weights=False)[0])\n",
178 | " return self.addnorm2(Y, self.ffn(Y))"
179 | ],
180 | "metadata": {
181 | "collapsed": false,
182 | "pycharm": {
183 | "name": "#%%\n"
184 | }
185 | }
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "outputs": [],
191 | "source": [
192 | "class semi_bert(nn.Module):\n",
193 | " def __init__(self, vocab_size, embed_size, ffn_hiddens, num_heads,num_blks, dropout, max_len=80, **kwargs):\n",
194 | " super(semi_bert, self).__init__(**kwargs)\n",
195 | " self.token_embedding = nn.Embedding(vocab_size, embed_size)\n",
196 | " self.pos_embedding = nn.Parameter(torch.randn(1, max_len,\n",
197 | " embed_size)*0.01)\n",
198 | " self.blks = nn.Sequential()\n",
199 | "\n",
200 | " for i in range(num_blks):\n",
201 | " self.blks.add_module(f\"{i}\", TransformerBlock(\n",
202 | " embed_dim=embed_size, num_heads=num_heads, ffn_hidden=ffn_hiddens, dropout=dropout))\n",
203 | " self.output = nn.Linear(embed_size, 2)\n",
204 | "\n",
205 | " def forward(self, tokens):\n",
206 | "\n",
207 | " # X的shape:(batch size, max_length,num_hiddens)\n",
208 | " X = self.token_embedding(tokens) + self.pos_embedding\n",
209 | " for blk in self.blks:\n",
210 | " X = blk(X)\n",
211 | " #获取句子的平均表示,而不是提取第一个字符\n",
212 | " X = self.output(torch.mean(X, dim=1))\n",
213 | " return X"
214 | ],
215 | "metadata": {
216 | "collapsed": false,
217 | "pycharm": {
218 | "name": "#%%\n"
219 | }
220 | }
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "outputs": [],
226 | "source": [
227 | "def train_one_epoch(net, X, y, loss, trainer, device):\n",
228 | "\n",
229 | " if isinstance(X, list):\n",
230 | " X = [x.to(device) for x in X]\n",
231 | " else:\n",
232 | " X = X.to(device)\n",
233 | "\n",
234 | " y = y.to(device)\n",
235 | " net.train()\n",
236 | " trainer.zero_grad()\n",
237 | " pred = net(X)\n",
238 | "\n",
239 | " l = loss(pred, y)\n",
240 | " l.mean().backward()\n",
241 | " trainer.step()\n",
242 | "\n",
243 | " train_loss_sum = l.sum()\n",
244 | "\n",
245 | " train_acc_sum = d2l.accuracy(pred, y)\n",
246 | " return train_loss_sum, train_acc_sum"
247 | ],
248 | "metadata": {
249 | "collapsed": false,
250 | "pycharm": {
251 | "name": "#%%\n"
252 | }
253 | }
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 2,
258 | "outputs": [],
259 | "source": [
260 | "def train(net, train_iter, test_iter, loss, trainer, num_epochs,device=d2l.try_gpu()):\n",
261 | "\n",
262 | " #权重初始化\n",
263 | " def initialize_weights(model):\n",
264 | " for name, param in model.named_parameters():\n",
265 | " if name.startswith('token_embedding'):\n",
266 | " continue # 跳过词嵌入层\n",
267 | "\n",
268 | " # 处理 TransformerBlock 中的层\n",
269 | " if isinstance(param, nn.Linear):\n",
270 | " if 'attention' in name:\n",
271 | " # 多头注意力层的线性层\n",
272 | " nn.init.xavier_uniform_(param.weight)\n",
273 | " else:\n",
274 | " # FFN 中的线性层\n",
275 | " nn.init.kaiming_uniform_(param.weight, a=math.sqrt(5))\n",
276 | " if param.bias is not None:\n",
277 | " fan_in, _ = nn.init._calculate_fan_in_and_fan_out(param.weight)\n",
278 | " bound = 1 / math.sqrt(fan_in)\n",
279 | " nn.init.uniform_(param.bias, -bound, bound)\n",
280 | " elif isinstance(param, nn.LayerNorm):\n",
281 | " nn.init.ones_(param.weight)\n",
282 | " nn.init.zeros_(param.bias)\n",
283 | "\n",
284 | " net.apply(initialize_weights)\n",
285 | "\n",
286 | " #将预训练好的词嵌入加载到net的embeding层里,并且不进行梯度回传\n",
287 | " #当然,你也可以进行训练\n",
288 | " net.token_embedding.weight.data.copy_(embeds)\n",
289 | " net.token_embedding.weight.requires_grad = False\n",
290 | "\n",
291 | "\n",
292 | " num_batches = len(train_iter)\n",
293 | " animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],\n",
294 | " legend=['train loss', 'train acc', 'test acc'])\n",
295 | " net.to(device)\n",
296 | " for epoch in range(num_epochs):\n",
297 | " #定义一个容器,里面存放着训练损失,训练准确度,样本数量\n",
298 | " metric = d2l.Accumulator(3)\n",
299 | " for i, (features, labels) in enumerate(train_iter):\n",
300 | "\n",
301 | "\n",
302 | " l, acc = train_one_epoch(net, features, labels, loss, trainer, device)\n",
303 | "\n",
304 | " metric.add(l, acc, labels.shape[0])\n",
305 | "\n",
306 | " if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:\n",
307 | " animator.add(epoch + (i + 1) / num_batches,\n",
308 | " (metric[0] / metric[2], metric[1] / metric[2],\n",
309 | " None))\n",
310 | "\n",
311 | " #查看测试集的准确率\n",
312 | " test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)\n",
313 | " animator.add(epoch + 1, (None, None, test_acc))\n",
314 | "\n",
315 | " print(f'平均损失: {metric[0] / metric[2]:.3f}, 训练准确度: '\n",
316 | " f'{metric[1] / metric[2]:.3f}, 测试准确度: {test_acc:.3f}')\n"
317 | ],
318 | "metadata": {
319 | "collapsed": false,
320 | "pycharm": {
321 | "name": "#%%\n"
322 | }
323 | }
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 3,
328 | "outputs": [
329 | {
330 | "name": "stderr",
331 | "output_type": "stream",
332 | "text": [
333 | "Building prefix dict from the default dictionary ...\n",
334 | "Loading model from cache C:\\Users\\86180\\AppData\\Local\\Temp\\jieba.cache\n",
335 | "Loading model cost 0.875 seconds.\n",
336 | "Prefix dict has been built successfully.\n"
337 | ]
338 | }
339 | ],
340 | "source": [
341 | "batch_size = 512 #批量大小\n",
342 | "lr = 0.0003 #学习率\n",
343 | "num_epochs = 30 #训练几轮\n",
344 | "embed_size = 100 #词嵌入维度,我选了100维的\n",
345 | "ffn_hiddens = 64 #FFN,隐藏层单元数量\n",
346 | "num_heads = 4 #注意力头的个数\n",
347 | "num_blks = 1 #transformer_block的个数\n",
348 | "dropout = 0.5 #dropout率(用于正则化)\n",
349 | "max_len = 50 #每个句子的最大长度\n",
350 | "min_freq = 3 #最小词频阈值\n",
351 | "devices = d2l.try_gpu() #设备\n",
352 | "\n",
353 | "embedding_path = '腾讯词向量/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt' #词向量的位置\n",
354 | "comment_path = 'weibo_senti_100k.csv' #数据集的位置\n",
355 | "\n",
356 | "train_iter, test_iter, vocab = process_data(comment_path,batch_size,max_len=max_len,min_freq = min_freq)\n",
357 | "\n",
358 | "my_embedding = TokenEmbedding(embedding_path)\n",
359 | "embeds = my_embedding[vocab.idx_to_token] #把词向量和我的vocab结合起来\n",
360 | "\n",
361 | "\n"
362 | ],
363 | "metadata": {
364 | "collapsed": false,
365 | "pycharm": {
366 | "name": "#%%\n"
367 | }
368 | }
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": 9,
373 | "outputs": [
374 | {
375 | "data": {
376 | "text/plain": "['@',\n '张晓鹏',\n 'jonathan',\n ' ',\n '土耳其',\n '的',\n '事要',\n '认真对待',\n '[',\n '哈哈',\n ']',\n ',',\n '否则',\n '直接',\n '开除',\n '。',\n '@',\n '丁丁',\n '看',\n '世界',\n ' ',\n '很',\n '是',\n '细心',\n ',',\n '酒店',\n '都',\n '全部',\n 'OK',\n '啦',\n '。']"
377 | },
378 | "execution_count": 9,
379 | "metadata": {},
380 | "output_type": "execute_result"
381 | }
382 | ],
383 | "source": [
384 | "net = semi_bert(len(vocab), embed_size, ffn_hiddens, num_heads,num_blks,dropout,max_len) #定义网络\n",
385 | "trainer = torch.optim.Adam(net.parameters(), lr=0.0003,weight_decay=1e-4) #优化器,使用Adam\n",
386 | "loss = nn.CrossEntropyLoss(reduction=\"none\") #损失函数"
387 | ],
388 | "metadata": {
389 | "collapsed": false,
390 | "pycharm": {
391 | "name": "#%%\n"
392 | }
393 | }
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "outputs": [],
399 | "source": [
400 | "train(net, train_iter, test_iter, loss, trainer,num_epochs,d2l.try_gpu())"
401 | ],
402 | "metadata": {
403 | "collapsed": false,
404 | "pycharm": {
405 | "name": "#%%\n"
406 | }
407 | }
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": null,
412 | "outputs": [],
413 | "source": [
414 | "lable_map = ['负向评论','正向评论']\n",
415 | "def predict(net, vocab, comment):\n",
416 | " '''\n",
417 | " 对单个句子进行情感分析\n",
418 | " '''\n",
419 | " net.to(d2l.try_gpu())\n",
420 | " net.eval()\n",
421 | " comment = vocab[[i for i in jieba.cut(comment,cut_all=False)]]\n",
422 | " #将句子进行分词\n",
423 | " comment_pt = torch.tensor(d2l.truncate_pad(comment, max_len, vocab['']), device=d2l.try_gpu())\n",
424 | " label = torch.argmax(net(comment_pt.reshape(1, -1)), dim=1)\n",
425 | " return '该句子的情感是:' + lable_map[label]"
426 | ],
427 | "metadata": {
428 | "collapsed": false,
429 | "pycharm": {
430 | "name": "#%%\n"
431 | }
432 | }
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": null,
437 | "outputs": [],
438 | "source": [
439 | "predict(net, vocab, '堵心的我连灌饼都吃不下。。。。。。。')"
440 | ],
441 | "metadata": {
442 | "collapsed": false,
443 | "pycharm": {
444 | "name": "#%%\n"
445 | }
446 | }
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": null,
451 | "outputs": [],
452 | "source": [
453 | "def predict_doc_visualize(net,vocab,test_path,out = False,draw = False):\n",
454 | " '''\n",
455 | " test_path:你的csv文件路径,注意:每条评论需要在第一列\n",
456 | " 对一个文档的句子进行情感分析,并绘制词云\n",
457 | " out参数指明需不需要将句子和对应的情感输出出来\n",
458 | " draw参数指明需不需要绘制词云\n",
459 | " '''\n",
460 | " net.to(d2l.try_gpu())\n",
461 | " net.eval()\n",
462 | " with open(test_path,'r',encoding = 'gbk') as fp:\n",
463 | " raw_data = fp.read().split('\\n')[:-1]\n",
464 | "\n",
465 | " comments = [raw_data[i] for i in range(len(raw_data))]\n",
466 | " #对评论进行分词,cut_all为分词模式\n",
467 | " commentss = [vocab[[i for i in jieba.cut(j,cut_all=False)]] for j in comments]\n",
468 | " comment_pt = [torch.tensor(d2l.truncate_pad(i, max_len, vocab['']), device=d2l.try_gpu()) for i in commentss]\n",
469 | " #预测出来的情感\n",
470 | " labels = [torch.argmax(net(i.reshape(1, -1)), dim=1) for i in comment_pt]\n",
471 | "\n",
472 | " def generate_wordcloud(comments, sentiment):\n",
473 | " text = \" \".join(comments)\n",
474 | "\n",
475 | " #font_path是字体文件,需要自己下载\n",
476 | " wordcloud = WordCloud(background_color=\"white\",font_path=\"字体家AI造字特隶.ttf\").generate(text)\n",
477 | " plt.figure(figsize=(8, 8), facecolor=None)\n",
478 | " plt.imshow(wordcloud)\n",
479 | " plt.axis(\"off\")\n",
480 | " plt.tight_layout(pad=0)\n",
481 | " plt.title(f\"Word Cloud - {sentiment} Sentiment\")\n",
482 | " plt.show()\n",
483 | "\n",
484 | " if draw:\n",
485 | " #正向评论\n",
486 | " positive_comments = []\n",
487 | " #负向评论\n",
488 | " negative_comments = []\n",
489 | " for i, label in enumerate(labels):\n",
490 | " if label == 1:\n",
491 | " positive_comments.append(comments[i])\n",
492 | " else:\n",
493 | " negative_comments.append(comments[i])\n",
494 | " generate_wordcloud(positive_comments, \"Positive\")\n",
495 | " generate_wordcloud(negative_comments, \"Negative\")\n",
496 | "\n",
497 | "\n",
498 | "\n",
499 | " if out:\n",
500 | " for i in range(len(comments)):\n",
501 | " print(comments[i])\n",
502 | " print('\\n')\n",
503 | " print('*情感*--->'+lable_map[labels[i]])\n",
504 | " print('\\n')\n",
505 | " print('--------------------------------------------------------')"
506 | ],
507 | "metadata": {
508 | "collapsed": false,
509 | "pycharm": {
510 | "name": "#%%\n"
511 | }
512 | }
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "outputs": [],
518 | "source": [
519 | "test_path = '微博测试.csv'\n",
520 | "predict_doc_visualize(net,vocab,test_path,out = False,draw = True)"
521 | ],
522 | "metadata": {
523 | "collapsed": false,
524 | "pycharm": {
525 | "name": "#%%\n"
526 | }
527 | }
528 | },
529 | {
530 | "cell_type": "markdown",
531 | "source": [],
532 | "metadata": {
533 | "collapsed": false
534 | }
535 | }
536 | ],
537 | "metadata": {
538 | "kernelspec": {
539 | "display_name": "Python 3",
540 | "language": "python",
541 | "name": "python3"
542 | },
543 | "language_info": {
544 | "codemirror_mode": {
545 | "name": "ipython",
546 | "version": 2
547 | },
548 | "file_extension": ".py",
549 | "mimetype": "text/x-python",
550 | "name": "python",
551 | "nbconvert_exporter": "python",
552 | "pygments_lexer": "ipython2",
553 | "version": "2.7.6"
554 | }
555 | },
556 | "nbformat": 4,
557 | "nbformat_minor": 0
558 | }
--------------------------------------------------------------------------------