├── README.md ├── deeplearning ├── resnet.ipynb └── resnet.py └── nlp ├── Bert_Blend_CNN.ipynb ├── at.py ├── bert_blend_cnn.py ├── bert_classify.ipynb ├── bert_classify.py └── focal_loss.py /README.md: -------------------------------------------------------------------------------- 1 | # 存放博客代码,希望能坚持下去呀hh 2 | 3 | - [博客园](https://www.cnblogs.com/qingyao/) 4 | - [知乎](https://www.zhihu.com/people/sheng-jian-93-86) 5 | 6 | -------------------------------------------------------------------------------- /deeplearning/resnet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "resnet.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "sSw_-JAWi_nk" 23 | }, 24 | "source": [ 25 | "# -*- coding:utf-8 -*-\n", 26 | "# handwritten digits recognition\n", 27 | "# Data: MINIST\n", 28 | "# model: resnet\n", 29 | "# date: 2021.10.8 14:18\n", 30 | "\n", 31 | "import math\n", 32 | "import torch\n", 33 | "import torchvision\n", 34 | "import torchvision.transforms as transforms\n", 35 | "import torch.nn as nn\n", 36 | "import torch.utils.data as Data\n", 37 | "import torch.optim as optim\n", 38 | "import pandas as pd\n", 39 | "import matplotlib.pyplot as plt" 40 | ], 41 | "execution_count": 1, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "wAMLAt-7jlnL" 48 | }, 49 | "source": [ 50 | "train_curve = []\n", 51 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" 52 | ], 53 | "execution_count": 2, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "metadata": { 59 | "id": "MjTOGiQaqw4A" 60 | }, 61 | "source": [ 62 | "# param\n", 63 | "batch_size = 100\n", 64 | "n_class = 10\n", 65 | "padding_size = 15\n", 66 | "epoches = 10" 67 | ], 68 | "execution_count": 20, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "metadata": { 74 | "id": "Pz0nsjE-jwVt" 75 | }, 76 | "source": [ 77 | "train_dataset = torchvision.datasets.MNIST('./data/', train=True, transform=transforms.ToTensor(), download=True)\n", 78 | "test_dataset = torchvision.datasets.MNIST('./data/', train=False, transform=transforms.ToTensor(), download=False)\n", 79 | "train = Data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=5)\n", 80 | "test = Data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=5)" 81 | ], 82 | "execution_count": null, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "metadata": { 88 | "id": "5t9GsIw5CCBx" 89 | }, 90 | "source": [ 91 | "def gelu(x):\n", 92 | " \"Implementation of the gelu activation function by Hugging Face\"\n", 93 | " return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))" 94 | ], 95 | "execution_count": 5, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "metadata": { 101 | "id": "VHOBzUgztqQE" 102 | }, 103 | "source": [ 104 | "class ResBlock(nn.Module):\n", 105 | " def __init__(self, in_size, out_size1, out_size2):\n", 106 | " super(ResBlock, self).__init__()\n", 107 | " self.conv1 = nn.Conv2d(\n", 108 | " in_channels = in_size,\n", 109 | " out_channels = out_size1,\n", 110 | " kernel_size = 3,\n", 111 | " stride = 2,\n", 112 | " padding = padding_size\n", 113 | " )\n", 114 | " self.conv2 = nn.Conv2d(\n", 115 | " in_channels = out_size1,\n", 116 | " out_channels = out_size2,\n", 117 | " kernel_size = 3,\n", 118 | " stride = 2,\n", 119 | " padding = padding_size\n", 120 | " )\n", 121 | " self.batchnorm1 = nn.BatchNorm2d(out_size1)\n", 122 | " self.batchnorm2 = nn.BatchNorm2d(out_size2)\n", 123 | " \n", 124 | " def conv(self, x):\n", 125 | " x = gelu(self.batchnorm1(self.conv1(x)))\n", 126 | " x = gelu(self.batchnorm2(self.conv2(x)))\n", 127 | " return x\n", 128 | " \n", 129 | " def forward(self, x):\n", 130 | " return x + self.conv(x)" 131 | ], 132 | "execution_count": 6, 133 | "outputs": [] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "metadata": { 138 | "id": "rWj7lSNRqj7a" 139 | }, 140 | "source": [ 141 | "# resnet\n", 142 | "class Resnet(nn.Module):\n", 143 | " def __init__(self, n_class = n_class):\n", 144 | " super(Resnet, self).__init__()\n", 145 | " self.res1 = ResBlock(1, 8, 16)\n", 146 | " self.res2 = ResBlock(16, 32, 16)\n", 147 | " self.conv = nn.Conv2d(\n", 148 | " in_channels = 16,\n", 149 | " out_channels = n_class,\n", 150 | " kernel_size = 3,\n", 151 | " stride = 2,\n", 152 | " padding = padding_size\n", 153 | " )\n", 154 | " self.batchnorm = nn.BatchNorm2d(n_class)\n", 155 | " self.max_pooling = nn.AdaptiveAvgPool2d(1)\n", 156 | "\n", 157 | " def forward(self, x):\n", 158 | " x = x.view(-1, 1, 28, 28)\n", 159 | " x = self.res1(x)\n", 160 | " x = self.res2(x)\n", 161 | " x = self.max_pooling(self.batchnorm(self.conv(x)))\n", 162 | "\n", 163 | " return x.view(x.size(0), -1)\n" 164 | ], 165 | "execution_count": 7, 166 | "outputs": [] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "metadata": { 171 | "colab": { 172 | "base_uri": "https://localhost:8080/" 173 | }, 174 | "id": "jk63HsdJ5Pv_", 175 | "outputId": "d1a36e02-2bc5-46c1-ab8d-cf04ef5b3e3f" 176 | }, 177 | "source": [ 178 | "resnet = Resnet().to(device)\n", 179 | "resnet" 180 | ], 181 | "execution_count": 11, 182 | "outputs": [ 183 | { 184 | "output_type": "execute_result", 185 | "data": { 186 | "text/plain": [ 187 | "Resnet(\n", 188 | " (res1): ResBlock(\n", 189 | " (conv1): Conv2d(1, 8, kernel_size=(3, 3), stride=(2, 2), padding=(15, 15))\n", 190 | " (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(15, 15))\n", 191 | " (batchnorm1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 192 | " (batchnorm2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 193 | " )\n", 194 | " (res2): ResBlock(\n", 195 | " (conv1): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(15, 15))\n", 196 | " (conv2): Conv2d(32, 16, kernel_size=(3, 3), stride=(2, 2), padding=(15, 15))\n", 197 | " (batchnorm1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 198 | " (batchnorm2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 199 | " )\n", 200 | " (conv): Conv2d(16, 10, kernel_size=(3, 3), stride=(2, 2), padding=(15, 15))\n", 201 | " (batchnorm): BatchNorm2d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 202 | " (max_pooling): AdaptiveAvgPool2d(output_size=1)\n", 203 | ")" 204 | ] 205 | }, 206 | "metadata": {}, 207 | "execution_count": 11 208 | } 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "metadata": { 214 | "id": "br_L1zb65YZr" 215 | }, 216 | "source": [ 217 | "loss_fn = nn.CrossEntropyLoss()\n", 218 | "optimizer = optim.SGD(params=resnet.parameters(), lr=1e-2, momentum=0.9)" 219 | ], 220 | "execution_count": 16, 221 | "outputs": [] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "metadata": { 226 | "colab": { 227 | "base_uri": "https://localhost:8080/" 228 | }, 229 | "id": "2_DGwLWC6x4p", 230 | "outputId": "d0a9cc2d-a12a-4e15-972b-2a7bd1bef09d" 231 | }, 232 | "source": [ 233 | "# train\n", 234 | "total_step = len(train)\n", 235 | "sum_loss = 0\n", 236 | "for epoch in range(epoches):\n", 237 | " for i, (images, targets) in enumerate(train):\n", 238 | " optimizer.zero_grad()\n", 239 | " images = images.to(device)\n", 240 | " targets = targets.to(device)\n", 241 | " preds = resnet(images)\n", 242 | " \n", 243 | " loss = loss_fn(preds, targets)\n", 244 | " sum_loss += loss.item()\n", 245 | " loss.backward()\n", 246 | " optimizer.step()\n", 247 | " if (i+1)%100==0:\n", 248 | " print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch+1, epoches, i+1, total_step, loss.item()))\n", 249 | " train_curve.append(sum_loss)\n", 250 | " sum_loss = 0\n", 251 | " " 252 | ], 253 | "execution_count": 22, 254 | "outputs": [ 255 | { 256 | "output_type": "stream", 257 | "name": "stderr", 258 | "text": [ 259 | "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py:481: UserWarning: This DataLoader will create 5 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", 260 | " cpuset_checked))\n" 261 | ] 262 | }, 263 | { 264 | "output_type": "stream", 265 | "name": "stdout", 266 | "text": [ 267 | "[1|10] step:100/600 loss:0.0104\n", 268 | "[1|10] step:200/600 loss:0.0106\n", 269 | "[1|10] step:300/600 loss:0.0732\n", 270 | "[1|10] step:400/600 loss:0.0317\n", 271 | "[1|10] step:500/600 loss:0.1365\n", 272 | "[1|10] step:600/600 loss:0.0139\n", 273 | "[2|10] step:100/600 loss:0.1351\n", 274 | "[2|10] step:200/600 loss:0.0275\n", 275 | "[2|10] step:300/600 loss:0.0894\n", 276 | "[2|10] step:400/600 loss:0.0616\n", 277 | "[2|10] step:500/600 loss:0.0297\n", 278 | "[2|10] step:600/600 loss:0.0832\n", 279 | "[3|10] step:100/600 loss:0.0885\n", 280 | "[3|10] step:200/600 loss:0.0124\n", 281 | "[3|10] step:300/600 loss:0.0781\n", 282 | "[3|10] step:400/600 loss:0.0477\n", 283 | "[3|10] step:500/600 loss:0.0048\n", 284 | "[3|10] step:600/600 loss:0.0412\n", 285 | "[4|10] step:100/600 loss:0.0146\n", 286 | "[4|10] step:200/600 loss:0.0193\n", 287 | "[4|10] step:300/600 loss:0.0526\n", 288 | "[4|10] step:400/600 loss:0.0025\n", 289 | "[4|10] step:500/600 loss:0.0876\n", 290 | "[4|10] step:600/600 loss:0.0551\n", 291 | "[5|10] step:100/600 loss:0.0240\n", 292 | "[5|10] step:200/600 loss:0.0036\n", 293 | "[5|10] step:300/600 loss:0.0077\n", 294 | "[5|10] step:400/600 loss:0.0169\n", 295 | "[5|10] step:500/600 loss:0.0079\n", 296 | "[5|10] step:600/600 loss:0.0342\n", 297 | "[6|10] step:100/600 loss:0.0029\n", 298 | "[6|10] step:200/600 loss:0.0772\n", 299 | "[6|10] step:300/600 loss:0.0368\n", 300 | "[6|10] step:400/600 loss:0.0408\n", 301 | "[6|10] step:500/600 loss:0.0082\n", 302 | "[6|10] step:600/600 loss:0.0354\n", 303 | "[7|10] step:100/600 loss:0.0042\n", 304 | "[7|10] step:200/600 loss:0.0313\n", 305 | "[7|10] step:300/600 loss:0.0376\n", 306 | "[7|10] step:400/600 loss:0.0500\n", 307 | "[7|10] step:500/600 loss:0.0020\n", 308 | "[7|10] step:600/600 loss:0.0330\n", 309 | "[8|10] step:100/600 loss:0.0175\n", 310 | "[8|10] step:200/600 loss:0.0473\n", 311 | "[8|10] step:300/600 loss:0.0029\n", 312 | "[8|10] step:400/600 loss:0.0147\n", 313 | "[8|10] step:500/600 loss:0.0300\n", 314 | "[8|10] step:600/600 loss:0.0006\n", 315 | "[9|10] step:100/600 loss:0.0401\n", 316 | "[9|10] step:200/600 loss:0.0286\n", 317 | "[9|10] step:300/600 loss:0.0258\n", 318 | "[9|10] step:400/600 loss:0.0167\n", 319 | "[9|10] step:500/600 loss:0.0201\n", 320 | "[9|10] step:600/600 loss:0.0574\n", 321 | "[10|10] step:100/600 loss:0.0066\n", 322 | "[10|10] step:200/600 loss:0.0097\n", 323 | "[10|10] step:300/600 loss:0.0616\n", 324 | "[10|10] step:400/600 loss:0.0515\n", 325 | "[10|10] step:500/600 loss:0.0019\n", 326 | "[10|10] step:600/600 loss:0.0381\n" 327 | ] 328 | } 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "metadata": { 334 | "colab": { 335 | "base_uri": "https://localhost:8080/" 336 | }, 337 | "id": "qbngK_gf9RBl", 338 | "outputId": "cdbcda1b-3562-48fd-b52e-a77e141e4d22" 339 | }, 340 | "source": [ 341 | "# test\n", 342 | "resnet.eval()\n", 343 | "with torch.no_grad():\n", 344 | " correct = 0\n", 345 | " total = 0\n", 346 | " for images, labels in test:\n", 347 | " images = images.to(device)\n", 348 | " labels = labels.to(device)\n", 349 | " outputs = resnet(images)\n", 350 | " _, maxIndexes = torch.max(outputs, dim=1)\n", 351 | " correct += (maxIndexes==labels).sum().item()\n", 352 | " total += labels.size(0)\n", 353 | " \n", 354 | " print('in 1w test_data correct rate = {:.4f}'.format((correct/total)*100))" 355 | ], 356 | "execution_count": 23, 357 | "outputs": [ 358 | { 359 | "output_type": "stream", 360 | "name": "stderr", 361 | "text": [ 362 | "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py:481: UserWarning: This DataLoader will create 5 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n", 363 | " cpuset_checked))\n" 364 | ] 365 | }, 366 | { 367 | "output_type": "stream", 368 | "name": "stdout", 369 | "text": [ 370 | "in 1w test_data correct rate = 98.5100\n" 371 | ] 372 | } 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "metadata": { 378 | "colab": { 379 | "base_uri": "https://localhost:8080/", 380 | "height": 283 381 | }, 382 | "id": "418J4zPhBlu6", 383 | "outputId": "041cfecc-0197-40fa-c716-3a1505515baf" 384 | }, 385 | "source": [ 386 | "pd.DataFrame(train_curve).plot() # loss曲线" 387 | ], 388 | "execution_count": 24, 389 | "outputs": [ 390 | { 391 | "output_type": "execute_result", 392 | "data": { 393 | "text/plain": [ 394 | "" 395 | ] 396 | }, 397 | "metadata": {}, 398 | "execution_count": 24 399 | }, 400 | { 401 | "output_type": "display_data", 402 | "data": { 403 | "image/png": "\n", 404 | "text/plain": [ 405 | "
" 406 | ] 407 | }, 408 | "metadata": { 409 | "needs_background": "light" 410 | } 411 | } 412 | ] 413 | } 414 | ] 415 | } -------------------------------------------------------------------------------- /deeplearning/resnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # handwritten digits recognition 3 | # Data: MINIST 4 | # model: resnet 5 | # date: 2021.10.8 14:18 6 | 7 | import math 8 | import torch 9 | import torchvision 10 | import torchvision.transforms as transforms 11 | import torch.nn as nn 12 | import torch.utils.data as Data 13 | import torch.optim as optim 14 | import pandas as pd 15 | import matplotlib.pyplot as plt 16 | 17 | train_curve = [] 18 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 19 | 20 | # param 21 | batch_size = 100 22 | n_class = 10 23 | padding_size = 15 24 | epoches = 10 25 | 26 | train_dataset = torchvision.datasets.MNIST('./data/', train=True, transform=transforms.ToTensor(), download=True) 27 | test_dataset = torchvision.datasets.MNIST('./data/', train=False, transform=transforms.ToTensor(), download=False) 28 | train = Data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=5) 29 | test = Data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=5) 30 | 31 | def gelu(x): 32 | "Implementation of the gelu activation function by Hugging Face" 33 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 34 | 35 | class ResBlock(nn.Module): 36 | # 残差块 37 | def __init__(self, in_size, out_size1, out_size2): 38 | super(ResBlock, self).__init__() 39 | self.conv1 = nn.Conv2d( 40 | in_channels = in_size, 41 | out_channels = out_size1, 42 | kernel_size = 3, 43 | stride = 2, 44 | padding = padding_size 45 | ) 46 | self.conv2 = nn.Conv2d( 47 | in_channels = out_size1, 48 | out_channels = out_size2, 49 | kernel_size = 3, 50 | stride = 2, 51 | padding = padding_size 52 | ) 53 | self.batchnorm1 = nn.BatchNorm2d(out_size1) 54 | self.batchnorm2 = nn.BatchNorm2d(out_size2) 55 | 56 | def conv(self, x): 57 | # gelu效果比relu好呀哈哈 58 | x = gelu(self.batchnorm1(self.conv1(x))) 59 | x = gelu(self.batchnorm2(self.conv2(x))) 60 | return x 61 | 62 | def forward(self, x): 63 | # 残差连接 64 | return x + self.conv(x) 65 | 66 | # resnet 67 | class Resnet(nn.Module): 68 | def __init__(self, n_class = n_class): 69 | super(Resnet, self).__init__() 70 | self.res1 = ResBlock(1, 8, 16) 71 | self.res2 = ResBlock(16, 32, 16) 72 | self.conv = nn.Conv2d( 73 | in_channels = 16, 74 | out_channels = n_class, 75 | kernel_size = 3, 76 | stride = 2, 77 | padding = padding_size 78 | ) 79 | self.batchnorm = nn.BatchNorm2d(n_class) 80 | self.max_pooling = nn.AdaptiveAvgPool2d(1) 81 | 82 | def forward(self, x): 83 | # x: [bs, 1, h, w] 84 | # x = x.view(-1, 1, 28, 28) 85 | x = self.res1(x) 86 | x = self.res2(x) 87 | x = self.max_pooling(self.batchnorm(self.conv(x))) 88 | 89 | return x.view(x.size(0), -1) 90 | 91 | resnet = Resnet().to(device) 92 | 93 | loss_fn = nn.CrossEntropyLoss() 94 | optimizer = optim.SGD(params=resnet.parameters(), lr=1e-2, momentum=0.9) 95 | 96 | # train 97 | total_step = len(train) 98 | sum_loss = 0 99 | for epoch in range(epoches): 100 | for i, (images, targets) in enumerate(train): 101 | optimizer.zero_grad() 102 | images = images.to(device) 103 | targets = targets.to(device) 104 | preds = resnet(images) 105 | 106 | loss = loss_fn(preds, targets) 107 | sum_loss += loss.item() 108 | loss.backward() 109 | optimizer.step() 110 | if (i+1)%100==0: 111 | print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch+1, epoches, i+1, total_step, loss.item())) 112 | train_curve.append(sum_loss) 113 | sum_loss = 0 114 | 115 | # test 116 | resnet.eval() 117 | with torch.no_grad(): 118 | correct = 0 119 | total = 0 120 | for images, labels in test: 121 | images = images.to(device) 122 | labels = labels.to(device) 123 | outputs = resnet(images) 124 | _, maxIndexes = torch.max(outputs, dim=1) 125 | correct += (maxIndexes==labels).sum().item() 126 | total += labels.size(0) 127 | 128 | print('in 1w test_data correct rate = {:.4f}'.format((correct/total)*100)) 129 | 130 | pd.DataFrame(train_curve).plot() # loss曲线 131 | -------------------------------------------------------------------------------- /nlp/Bert_Blend_CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Bert_Blend-CNN.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "UahhCDlxtpWr" 23 | }, 24 | "source": [ 25 | "# -*- coding:utf-8 -*-\n", 26 | "# bert融合textcnn思想的Bert+Blend-CNN\n", 27 | "# model: Bert+Blend-CNN\n", 28 | "# date: 2021.10.11 18:06:11\n", 29 | "\n", 30 | "import os\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "import torch\n", 34 | "import torch.nn as nn\n", 35 | "import torch.utils.data as Data\n", 36 | "import torch.nn.functional as F\n", 37 | "import torch.optim as optim\n", 38 | "import transformers\n", 39 | "from transformers import AutoModel, AutoTokenizer\n", 40 | "import matplotlib.pyplot as plt" 41 | ], 42 | "execution_count": 62, 43 | "outputs": [] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "metadata": { 48 | "id": "dL4eT_MTS9JY" 49 | }, 50 | "source": [ 51 | "train_curve = []\n", 52 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" 53 | ], 54 | "execution_count": 63, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "id": "ZdVB3Lt6TAEs" 61 | }, 62 | "source": [ 63 | "# # 定义一些参数,模型选择了最基础的bert中文模型\n", 64 | "batch_size = 2\n", 65 | "epoches = 100\n", 66 | "model = \"bert-base-chinese\"\n", 67 | "hidden_size = 768\n", 68 | "n_class = 2\n", 69 | "maxlen = 8\n", 70 | "\n", 71 | "encode_layer=12\n", 72 | "filter_sizes = [2, 2, 2]\n", 73 | "num_filters = 3" 74 | ], 75 | "execution_count": 64, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "metadata": { 81 | "id": "gQ3SK8_rTFGX" 82 | }, 83 | "source": [ 84 | "# data,构造一些训练数据\n", 85 | "sentences = [\"我喜欢打篮球\", \"这个相机很好看\", \"今天玩的特别开心\", \"我不喜欢你\", \"太糟糕了\", \"真是件令人伤心的事情\"]\n", 86 | "labels = [1, 1, 1, 0, 0, 0] # 1积极, 0消极." 87 | ], 88 | "execution_count": 65, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "d8eBhZbDTJSz" 95 | }, 96 | "source": [ 97 | "class MyDataset(Data.Dataset):\n", 98 | " def __init__(self, sentences, labels=None, with_labels=True,):\n", 99 | " self.tokenizer = AutoTokenizer.from_pretrained(model)\n", 100 | " self.with_labels = with_labels\n", 101 | " self.sentences = sentences\n", 102 | " self.labels = labels\n", 103 | " def __len__(self):\n", 104 | " return len(sentences)\n", 105 | "\n", 106 | " def __getitem__(self, index):\n", 107 | " # Selecting sentence1 and sentence2 at the specified index in the data frame\n", 108 | " sent = self.sentences[index]\n", 109 | "\n", 110 | " # Tokenize the pair of sentences to get token ids, attention masks and token type ids\n", 111 | " encoded_pair = self.tokenizer(sent,\n", 112 | " padding='max_length', # Pad to max_length\n", 113 | " truncation=True, # Truncate to max_length\n", 114 | " max_length=maxlen, \n", 115 | " return_tensors='pt') # Return torch.Tensor objects\n", 116 | "\n", 117 | " token_ids = encoded_pair['input_ids'].squeeze(0) # tensor of token ids\n", 118 | " attn_masks = encoded_pair['attention_mask'].squeeze(0) # binary tensor with \"0\" for padded values and \"1\" for the other values\n", 119 | " token_type_ids = encoded_pair['token_type_ids'].squeeze(0) # binary tensor with \"0\" for the 1st sentence tokens & \"1\" for the 2nd sentence tokens\n", 120 | "\n", 121 | " if self.with_labels: # True if the dataset has labels\n", 122 | " label = self.labels[index]\n", 123 | " return token_ids, attn_masks, token_type_ids, label\n", 124 | " else:\n", 125 | " return token_ids, attn_masks, token_type_ids" 126 | ], 127 | "execution_count": 66, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "q-fhfJi7xkXd" 134 | }, 135 | "source": [ 136 | "train = Data.DataLoader(dataset=MyDataset(sentences, labels), batch_size=batch_size, shuffle=True, num_workers=1)" 137 | ], 138 | "execution_count": 67, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "aqjKaE6-UPE1" 145 | }, 146 | "source": [ 147 | "class TextCNN(nn.Module):\n", 148 | " def __init__(self):\n", 149 | " super(TextCNN, self).__init__()\n", 150 | " self.num_filter_total = num_filters * len(filter_sizes)\n", 151 | " self.Weight = nn.Linear(self.num_filter_total, n_class, bias=False)\n", 152 | " self.bias = nn.Parameter(torch.ones([n_class]))\n", 153 | " self.filter_list = nn.ModuleList([\n", 154 | " nn.Conv2d(1, num_filters, kernel_size=(size, hidden_size)) for size in filter_sizes\n", 155 | " ])\n", 156 | "\n", 157 | " def forward(self, x):\n", 158 | " # x: [bs, seq, hidden]\n", 159 | " x = x.unsqueeze(1) # [bs, channel=1, seq, hidden]\n", 160 | " \n", 161 | " pooled_outputs = []\n", 162 | " for i, conv in enumerate(self.filter_list):\n", 163 | " h = F.relu(conv(x)) # [bs, channel=1, seq-kernel_size+1, 1]\n", 164 | " mp = nn.MaxPool2d(\n", 165 | " kernel_size = (encode_layer-filter_sizes[i]+1, 1)\n", 166 | " )\n", 167 | " # mp: [bs, channel=3, w, h]\n", 168 | " pooled = mp(h).permute(0, 3, 2, 1) # [bs, h=1, w=1, channel=3]\n", 169 | " pooled_outputs.append(pooled)\n", 170 | " \n", 171 | " h_pool = torch.cat(pooled_outputs, len(filter_sizes)) # [bs, h=1, w=1, channel=3 * 3]\n", 172 | " h_pool_flat = torch.reshape(h_pool, [-1, self.num_filter_total])\n", 173 | " \n", 174 | " output = self.Weight(h_pool_flat) + self.bias # [bs, n_class]\n", 175 | "\n", 176 | " return output" 177 | ], 178 | "execution_count": 68, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "metadata": { 184 | "id": "AAqolyEvTNYJ" 185 | }, 186 | "source": [ 187 | "# model\n", 188 | "class Bert_Blend_CNN(nn.Module):\n", 189 | " def __init__(self):\n", 190 | " super(Bert_Blend_CNN, self).__init__()\n", 191 | " self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True)\n", 192 | " self.linear = nn.Linear(hidden_size, n_class)\n", 193 | " self.textcnn = TextCNN()\n", 194 | " \n", 195 | " def forward(self, X):\n", 196 | " input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]\n", 197 | " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) # 返回一个output字典\n", 198 | " # 取每一层encode出来的向量\n", 199 | " # outputs.pooler_output: [bs, hidden_size]\n", 200 | " hidden_states = outputs.hidden_states # 13*[bs, seq_len, hidden] 第一层是embedding层不需要\n", 201 | " cls_embeddings = hidden_states[1][:, 0, :].unsqueeze(1) # [bs, 1, hidden]\n", 202 | " # 将每一层的第一个token(cls向量)提取出来,拼在一起当作textcnn的输入\n", 203 | " for i in range(2, 13):\n", 204 | " cls_embeddings = torch.cat((cls_embeddings, hidden_states[i][:, 0, :].unsqueeze(1)), dim=1)\n", 205 | " # cls_embeddings: [bs, encode_layer=12, hidden]\n", 206 | " logits = self.textcnn(cls_embeddings)\n", 207 | " return logits" 208 | ], 209 | "execution_count": 69, 210 | "outputs": [] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "metadata": { 215 | "id": "_E6qpfSATZd1", 216 | "outputId": "baa36957-5382-42f5-b0d0-9d334070732b", 217 | "colab": { 218 | "base_uri": "https://localhost:8080/" 219 | } 220 | }, 221 | "source": [ 222 | "bert_blend_cnn = Bert_Blend_CNN().to(device)" 223 | ], 224 | "execution_count": 70, 225 | "outputs": [ 226 | { 227 | "output_type": "stream", 228 | "name": "stderr", 229 | "text": [ 230 | "Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']\n", 231 | "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 232 | "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" 233 | ] 234 | } 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "metadata": { 240 | "id": "QblwR6DexAFe" 241 | }, 242 | "source": [ 243 | "optimizer = optim.Adam(bert_blend_cnn.parameters(), lr=1e-3, weight_decay=1e-2)\n", 244 | "loss_fn = nn.CrossEntropyLoss()" 245 | ], 246 | "execution_count": 71, 247 | "outputs": [] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "metadata": { 252 | "id": "bBzQBHt8xbKm", 253 | "outputId": "a43c4a60-17af-4d63-f3c4-afd329aa8605", 254 | "colab": { 255 | "base_uri": "https://localhost:8080/" 256 | } 257 | }, 258 | "source": [ 259 | "# train\n", 260 | "sum_loss = 0\n", 261 | "total_step = len(train)\n", 262 | "for epoch in range(epoches):\n", 263 | " for i, batch in enumerate(train):\n", 264 | " optimizer.zero_grad()\n", 265 | " batch = tuple(p.to(device) for p in batch)\n", 266 | " pred = bert_blend_cnn([batch[0], batch[1], batch[2]])\n", 267 | " loss = loss_fn(pred, batch[3])\n", 268 | " sum_loss += loss.item()\n", 269 | "\n", 270 | " loss.backward()\n", 271 | " optimizer.step()\n", 272 | " if epoch % 10 == 0:\n", 273 | " print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch+1, epoches, i+1, total_step, loss.item()))\n", 274 | " train_curve.append(sum_loss)\n", 275 | " sum_loss = 0" 276 | ], 277 | "execution_count": 72, 278 | "outputs": [ 279 | { 280 | "output_type": "stream", 281 | "name": "stdout", 282 | "text": [ 283 | "[1|100] step:1/3 loss:1.3587\n", 284 | "[1|100] step:2/3 loss:0.5860\n", 285 | "[1|100] step:3/3 loss:1.3804\n", 286 | "[11|100] step:1/3 loss:0.7330\n", 287 | "[11|100] step:2/3 loss:0.9912\n", 288 | "[11|100] step:3/3 loss:0.5007\n", 289 | "[21|100] step:1/3 loss:0.6944\n", 290 | "[21|100] step:2/3 loss:0.6947\n", 291 | "[21|100] step:3/3 loss:0.6936\n", 292 | "[31|100] step:1/3 loss:0.7441\n", 293 | "[31|100] step:2/3 loss:0.6923\n", 294 | "[31|100] step:3/3 loss:0.6735\n", 295 | "[41|100] step:1/3 loss:0.6875\n", 296 | "[41|100] step:2/3 loss:0.7020\n", 297 | "[41|100] step:3/3 loss:0.6898\n", 298 | "[51|100] step:1/3 loss:0.4228\n", 299 | "[51|100] step:2/3 loss:0.2038\n", 300 | "[51|100] step:3/3 loss:0.0154\n", 301 | "[61|100] step:1/3 loss:0.0064\n", 302 | "[61|100] step:2/3 loss:0.0058\n", 303 | "[61|100] step:3/3 loss:0.0060\n", 304 | "[71|100] step:1/3 loss:0.0039\n", 305 | "[71|100] step:2/3 loss:0.0036\n", 306 | "[71|100] step:3/3 loss:0.0039\n", 307 | "[81|100] step:1/3 loss:0.0021\n", 308 | "[81|100] step:2/3 loss:0.0020\n", 309 | "[81|100] step:3/3 loss:0.0020\n", 310 | "[91|100] step:1/3 loss:0.0029\n", 311 | "[91|100] step:2/3 loss:0.0025\n", 312 | "[91|100] step:3/3 loss:0.0256\n" 313 | ] 314 | } 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "id": "QkDiBPY3xhxK", 321 | "outputId": "f1cfed2a-c23d-4d2e-8269-90c1437c7831", 322 | "colab": { 323 | "base_uri": "https://localhost:8080/" 324 | } 325 | }, 326 | "source": [ 327 | "# test\n", 328 | "bert_blend_cnn.eval()\n", 329 | "with torch.no_grad():\n", 330 | " test_text = ['我不喜欢打篮球']\n", 331 | " test = MyDataset(test_text, labels=None, with_labels=False)\n", 332 | " x = test.__getitem__(0)\n", 333 | " x = tuple(p.unsqueeze(0).to(device) for p in x)\n", 334 | " pred = bert_blend_cnn([x[0], x[1], x[2]])\n", 335 | " pred = pred.data.max(dim=1, keepdim=True)[1]\n", 336 | " if pred[0][0] == 0:\n", 337 | " print('消极')\n", 338 | " else:\n", 339 | " print('积极')" 340 | ], 341 | "execution_count": 74, 342 | "outputs": [ 343 | { 344 | "output_type": "stream", 345 | "name": "stdout", 346 | "text": [ 347 | "消极\n" 348 | ] 349 | } 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "metadata": { 355 | "id": "qELdtXhu2qOw", 356 | "outputId": "b8bfc850-d8e8-4c5b-b624-7ae77b0b7bfb", 357 | "colab": { 358 | "base_uri": "https://localhost:8080/", 359 | "height": 283 360 | } 361 | }, 362 | "source": [ 363 | "pd.DataFrame(train_curve).plot() # loss曲线" 364 | ], 365 | "execution_count": 75, 366 | "outputs": [ 367 | { 368 | "output_type": "execute_result", 369 | "data": { 370 | "text/plain": [ 371 | "" 372 | ] 373 | }, 374 | "metadata": {}, 375 | "execution_count": 75 376 | }, 377 | { 378 | "output_type": "display_data", 379 | "data": { 380 | "image/png": "\n", 381 | "text/plain": [ 382 | "
" 383 | ] 384 | }, 385 | "metadata": { 386 | "needs_background": "light" 387 | } 388 | } 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "metadata": { 394 | "id": "gIaPMvXe2wrS" 395 | }, 396 | "source": [ 397 | "" 398 | ], 399 | "execution_count": null, 400 | "outputs": [] 401 | } 402 | ] 403 | } -------------------------------------------------------------------------------- /nlp/at.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class FGM(): 5 | def __init__(self, model): 6 | self.model = model 7 | self.backup = {} 8 | 9 | def attack(self, epsilon=1., emb_name='emb'): 10 | for name, param in self.model.named_parameters(): 11 | if param.requires_grad and emb_name in name: 12 | self.backup[name] = param.data.clone() 13 | norm = torch.norm(param.grad) 14 | if norm != 0: 15 | r_at = epsilon * param.grad / norm 16 | param.data.add_(r_at) 17 | 18 | def restore(self, emb_name='emb'): 19 | for name, param in self.model.named_parameters(): 20 | if param.requires_grad and emb_name in name: 21 | assert name in self.backup 22 | param.data = self.backup[name] 23 | self.backup = {} 24 | 25 | 26 | # 初始化 27 | fgm = FGM(model) 28 | for batch_input, batch_label in data: 29 | # 正常训练 30 | loss = model(batch_input, batch_label) 31 | loss.backward() 32 | # 对抗训练 33 | fgm.attack() # 修改embedding 34 | # optimizer.zero_grad() # 梯度累加,不累加去掉注释 35 | loss_sum = model(batch_input, batch_label) 36 | loss_sum.backward() # 累加对抗训练的梯度 37 | fgm.restore() # 恢复Embedding的参数 38 | 39 | optimizer.step() 40 | optimizer.zero_grad() 41 | -------------------------------------------------------------------------------- /nlp/bert_blend_cnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # bert融合textcnn思想的Bert+Blend-CNN 3 | # model: Bert+Blend-CNN 4 | # date: 2021.10.11 18:06:11 5 | 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | import torch 10 | import torch.nn as nn 11 | import torch.utils.data as Data 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | import transformers 15 | from transformers import AutoModel, AutoTokenizer 16 | import matplotlib.pyplot as plt 17 | 18 | train_curve = [] 19 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 20 | 21 | # # 定义一些参数,模型选择了最基础的bert中文模型 22 | batch_size = 2 23 | epoches = 100 24 | model = "bert-base-chinese" 25 | hidden_size = 768 26 | n_class = 2 27 | maxlen = 8 28 | 29 | encode_layer=12 30 | filter_sizes = [2, 2, 2] 31 | num_filters = 3 32 | 33 | # data,构造一些训练数据 34 | sentences = ["我喜欢打篮球", "这个相机很好看", "今天玩的特别开心", "我不喜欢你", "太糟糕了", "真是件令人伤心的事情"] 35 | labels = [1, 1, 1, 0, 0, 0] # 1积极, 0消极. 36 | 37 | class MyDataset(Data.Dataset): 38 | def __init__(self, sentences, labels=None, with_labels=True,): 39 | self.tokenizer = AutoTokenizer.from_pretrained(model) 40 | self.with_labels = with_labels 41 | self.sentences = sentences 42 | self.labels = labels 43 | def __len__(self): 44 | return len(sentences) 45 | 46 | def __getitem__(self, index): 47 | # Selecting sentence1 and sentence2 at the specified index in the data frame 48 | sent = self.sentences[index] 49 | 50 | # Tokenize the pair of sentences to get token ids, attention masks and token type ids 51 | encoded_pair = self.tokenizer(sent, 52 | padding='max_length', # Pad to max_length 53 | truncation=True, # Truncate to max_length 54 | max_length=maxlen, 55 | return_tensors='pt') # Return torch.Tensor objects 56 | 57 | token_ids = encoded_pair['input_ids'].squeeze(0) # tensor of token ids 58 | attn_masks = encoded_pair['attention_mask'].squeeze(0) # binary tensor with "0" for padded values and "1" for the other values 59 | token_type_ids = encoded_pair['token_type_ids'].squeeze(0) # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens 60 | 61 | if self.with_labels: # True if the dataset has labels 62 | label = self.labels[index] 63 | return token_ids, attn_masks, token_type_ids, label 64 | else: 65 | return token_ids, attn_masks, token_type_ids 66 | 67 | train = Data.DataLoader(dataset=MyDataset(sentences, labels), batch_size=batch_size, shuffle=True, num_workers=1) 68 | 69 | class TextCNN(nn.Module): 70 | def __init__(self): 71 | super(TextCNN, self).__init__() 72 | self.num_filter_total = num_filters * len(filter_sizes) 73 | self.Weight = nn.Linear(self.num_filter_total, n_class, bias=False) 74 | self.bias = nn.Parameter(torch.ones([n_class])) 75 | self.filter_list = nn.ModuleList([ 76 | nn.Conv2d(1, num_filters, kernel_size=(size, hidden_size)) for size in filter_sizes 77 | ]) 78 | 79 | def forward(self, x): 80 | # x: [bs, seq, hidden] 81 | x = x.unsqueeze(1) # [bs, channel=1, seq, hidden] 82 | 83 | pooled_outputs = [] 84 | for i, conv in enumerate(self.filter_list): 85 | h = F.relu(conv(x)) # [bs, channel=1, seq-kernel_size+1, 1] 86 | mp = nn.MaxPool2d( 87 | kernel_size = (encode_layer-filter_sizes[i]+1, 1) 88 | ) 89 | # mp: [bs, channel=3, w, h] 90 | pooled = mp(h).permute(0, 3, 2, 1) # [bs, h=1, w=1, channel=3] 91 | pooled_outputs.append(pooled) 92 | 93 | h_pool = torch.cat(pooled_outputs, len(filter_sizes)) # [bs, h=1, w=1, channel=3 * 3] 94 | h_pool_flat = torch.reshape(h_pool, [-1, self.num_filter_total]) 95 | 96 | output = self.Weight(h_pool_flat) + self.bias # [bs, n_class] 97 | 98 | return output 99 | 100 | # model 101 | class Bert_Blend_CNN(nn.Module): 102 | def __init__(self): 103 | super(Bert_Blend_CNN, self).__init__() 104 | self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True) 105 | self.linear = nn.Linear(hidden_size, n_class) 106 | self.textcnn = TextCNN() 107 | 108 | def forward(self, X): 109 | input_ids, attention_mask, token_type_ids = X[0], X[1], X[2] 110 | outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) # 返回一个output字典 111 | # 取每一层encode出来的向量 112 | # outputs.pooler_output: [bs, hidden_size] 113 | hidden_states = outputs.hidden_states # 13*[bs, seq_len, hidden] 第一层是embedding层不需要 114 | cls_embeddings = hidden_states[1][:, 0, :].unsqueeze(1) # [bs, 1, hidden] 115 | # 将每一层的第一个token(cls向量)提取出来,拼在一起当作textcnn的输入 116 | for i in range(2, 13): 117 | cls_embeddings = torch.cat((cls_embeddings, hidden_states[i][:, 0, :].unsqueeze(1)), dim=1) 118 | # cls_embeddings: [bs, encode_layer=12, hidden] 119 | logits = self.textcnn(cls_embeddings) 120 | return logits 121 | 122 | bert_blend_cnn = Bert_Blend_CNN().to(device) 123 | 124 | optimizer = optim.Adam(bert_blend_cnn.parameters(), lr=1e-3, weight_decay=1e-2) 125 | loss_fn = nn.CrossEntropyLoss() 126 | 127 | # train 128 | sum_loss = 0 129 | total_step = len(train) 130 | for epoch in range(epoches): 131 | for i, batch in enumerate(train): 132 | optimizer.zero_grad() 133 | batch = tuple(p.to(device) for p in batch) 134 | pred = bert_blend_cnn([batch[0], batch[1], batch[2]]) 135 | loss = loss_fn(pred, batch[3]) 136 | sum_loss += loss.item() 137 | 138 | loss.backward() 139 | optimizer.step() 140 | if epoch % 10 == 0: 141 | print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch+1, epoches, i+1, total_step, loss.item())) 142 | train_curve.append(sum_loss) 143 | sum_loss = 0 144 | 145 | # test 146 | bert_blend_cnn.eval() 147 | with torch.no_grad(): 148 | test_text = ['我不喜欢打篮球'] 149 | test = MyDataset(test_text, labels=None, with_labels=False) 150 | x = test.__getitem__(0) 151 | x = tuple(p.unsqueeze(0).to(device) for p in x) 152 | pred = bert_blend_cnn([x[0], x[1], x[2]]) 153 | pred = pred.data.max(dim=1, keepdim=True)[1] 154 | if pred[0][0] == 0: 155 | print('消极') 156 | else: 157 | print('积极') 158 | 159 | pd.DataFrame(train_curve).plot() # loss曲线 160 | 161 | -------------------------------------------------------------------------------- /nlp/bert_classify.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "bert_classify.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "c3EUFvW1YZ23" 23 | }, 24 | "source": [ 25 | "# -*- coding:utf-8 -*-\n", 26 | "# bert文本分类baseline模型\n", 27 | "# model: bert\n", 28 | "# date: 2021.10.10 10:01\n", 29 | "\n", 30 | "import os\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "import torch\n", 34 | "import torch.nn as nn\n", 35 | "import torch.utils.data as Data\n", 36 | "import torch.optim as optim\n", 37 | "import transformers\n", 38 | "from transformers import AutoModel, AutoTokenizer\n", 39 | "import matplotlib.pyplot as plt" 40 | ], 41 | "execution_count": 38, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "honSDblyKbxc" 48 | }, 49 | "source": [ 50 | "train_curve = []\n", 51 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" 52 | ], 53 | "execution_count": 39, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "metadata": { 59 | "id": "hNAVAq8o7Xbs" 60 | }, 61 | "source": [ 62 | "batch_size = 2\n", 63 | "epoches = 100\n", 64 | "model = \"bert-base-chinese\"\n", 65 | "hidden_size = 768\n", 66 | "n_class = 2\n", 67 | "maxlen = 8" 68 | ], 69 | "execution_count": 40, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "metadata": { 75 | "id": "wiq4EFV9LKO1" 76 | }, 77 | "source": [ 78 | "# data\n", 79 | "sentences = [\"我喜欢打篮球\", \"这个相机很好看\", \"今天玩的特别开心\", \"我不喜欢你\", \"太糟糕了\", \"真是件令人伤心的事情\"]\n", 80 | "labels = [1, 1, 1, 0, 0, 0] # 1积极, 0消极.\n", 81 | "\n", 82 | "# word_list = ' '.join(sentences).split()\n", 83 | "# word_list = list(set(word_list))\n", 84 | "# word_dict = {w: i for i, w in enumerate(word_list)}\n", 85 | "# num_dict = {i: w for w, i in word_dict.items()}\n", 86 | "# vocab_size = len(word_list)" 87 | ], 88 | "execution_count": 41, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "xG31wox1EKVX" 95 | }, 96 | "source": [ 97 | "class MyDataset(Data.Dataset):\n", 98 | " def __init__(self, sentences, labels=None, with_labels=True,):\n", 99 | " self.tokenizer = AutoTokenizer.from_pretrained(model)\n", 100 | " self.with_labels = with_labels\n", 101 | " self.sentences = sentences\n", 102 | " self.labels = labels\n", 103 | " def __len__(self):\n", 104 | " return len(sentences)\n", 105 | "\n", 106 | " def __getitem__(self, index):\n", 107 | " # Selecting sentence1 and sentence2 at the specified index in the data frame\n", 108 | " sent = self.sentences[index]\n", 109 | "\n", 110 | " # Tokenize the pair of sentences to get token ids, attention masks and token type ids\n", 111 | " encoded_pair = self.tokenizer(sent,\n", 112 | " padding='max_length', # Pad to max_length\n", 113 | " truncation=True, # Truncate to max_length\n", 114 | " max_length=maxlen, \n", 115 | " return_tensors='pt') # Return torch.Tensor objects\n", 116 | "\n", 117 | " token_ids = encoded_pair['input_ids'].squeeze(0) # tensor of token ids\n", 118 | " attn_masks = encoded_pair['attention_mask'].squeeze(0) # binary tensor with \"0\" for padded values and \"1\" for the other values\n", 119 | " token_type_ids = encoded_pair['token_type_ids'].squeeze(0) # binary tensor with \"0\" for the 1st sentence tokens & \"1\" for the 2nd sentence tokens\n", 120 | "\n", 121 | " if self.with_labels: # True if the dataset has labels\n", 122 | " label = self.labels[index]\n", 123 | " return token_ids, attn_masks, token_type_ids, label\n", 124 | " else:\n", 125 | " return token_ids, attn_masks, token_type_ids" 126 | ], 127 | "execution_count": 42, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "metadata": { 133 | "id": "nMRNgixZLXLj" 134 | }, 135 | "source": [ 136 | "train = Data.DataLoader(dataset=MyDataset(sentences, labels), batch_size=batch_size, shuffle=True, num_workers=1)" 137 | ], 138 | "execution_count": 43, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "id": "u25BycPb8IhR" 145 | }, 146 | "source": [ 147 | "# model\n", 148 | "class BertClassify(nn.Module):\n", 149 | " def __init__(self):\n", 150 | " super(BertClassify, self).__init__()\n", 151 | " self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True)\n", 152 | " self.linear = nn.Linear(hidden_size, n_class)\n", 153 | " self.dropout = nn.Dropout(0.5)\n", 154 | " \n", 155 | " def forward(self, X):\n", 156 | " input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]\n", 157 | " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) # 返回一个output字典\n", 158 | " # 用最后一层cls向量做分类\n", 159 | " # outputs.pooler_output: [bs, hidden_size]\n", 160 | " logits = self.linear(self.dropout(outputs.pooler_output))\n", 161 | " \n", 162 | " return logits" 163 | ], 164 | "execution_count": 44, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "metadata": { 170 | "colab": { 171 | "base_uri": "https://localhost:8080/" 172 | }, 173 | "id": "DWmpHgE38dPM", 174 | "outputId": "2a0d44a5-0d89-4478-8f33-d0e326aeb1d8" 175 | }, 176 | "source": [ 177 | "bc = BertClassify().to(device)" 178 | ], 179 | "execution_count": 45, 180 | "outputs": [ 181 | { 182 | "output_type": "stream", 183 | "name": "stderr", 184 | "text": [ 185 | "Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']\n", 186 | "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", 187 | "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" 188 | ] 189 | } 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "id": "oKn15V6oKG2B" 196 | }, 197 | "source": [ 198 | "optimizer = optim.Adam(bc.parameters(), lr=1e-3, weight_decay=1e-2)\n", 199 | "loss_fn = nn.CrossEntropyLoss()" 200 | ], 201 | "execution_count": 46, 202 | "outputs": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "metadata": { 207 | "colab": { 208 | "base_uri": "https://localhost:8080/" 209 | }, 210 | "id": "sFFxX8t2C4G0", 211 | "outputId": "38cff6f7-c1a9-48fb-ad18-dd28626382ca" 212 | }, 213 | "source": [ 214 | "# train\n", 215 | "sum_loss = 0\n", 216 | "total_step = len(train)\n", 217 | "for epoch in range(epoches):\n", 218 | " for i, batch in enumerate(train):\n", 219 | " optimizer.zero_grad()\n", 220 | " batch = tuple(p.to(device) for p in batch)\n", 221 | " pred = bc([batch[0], batch[1], batch[2]])\n", 222 | " loss = loss_fn(pred, batch[3])\n", 223 | " sum_loss += loss.item()\n", 224 | "\n", 225 | " loss.backward()\n", 226 | " optimizer.step()\n", 227 | " if epoch % 10 == 0:\n", 228 | " print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch+1, epoches, i+1, total_step, loss.item()))\n", 229 | " train_curve.append(sum_loss)\n", 230 | " sum_loss = 0" 231 | ], 232 | "execution_count": 47, 233 | "outputs": [ 234 | { 235 | "output_type": "stream", 236 | "name": "stdout", 237 | "text": [ 238 | "[1|100] step:1/3 loss:1.1500\n", 239 | "[1|100] step:2/3 loss:1.2046\n", 240 | "[1|100] step:3/3 loss:2.5194\n", 241 | "[11|100] step:1/3 loss:1.1471\n", 242 | "[11|100] step:2/3 loss:0.7017\n", 243 | "[11|100] step:3/3 loss:0.8367\n", 244 | "[21|100] step:1/3 loss:1.0529\n", 245 | "[21|100] step:2/3 loss:1.4519\n", 246 | "[21|100] step:3/3 loss:1.4188\n", 247 | "[31|100] step:1/3 loss:1.2525\n", 248 | "[31|100] step:2/3 loss:1.2894\n", 249 | "[31|100] step:3/3 loss:0.6796\n", 250 | "[41|100] step:1/3 loss:1.2206\n", 251 | "[41|100] step:2/3 loss:0.7903\n", 252 | "[41|100] step:3/3 loss:0.9456\n", 253 | "[51|100] step:1/3 loss:0.4095\n", 254 | "[51|100] step:2/3 loss:0.4227\n", 255 | "[51|100] step:3/3 loss:1.7919\n", 256 | "[61|100] step:1/3 loss:2.2079\n", 257 | "[61|100] step:2/3 loss:1.0230\n", 258 | "[61|100] step:3/3 loss:1.0512\n", 259 | "[71|100] step:1/3 loss:0.5453\n", 260 | "[71|100] step:2/3 loss:0.2571\n", 261 | "[71|100] step:3/3 loss:1.0071\n", 262 | "[81|100] step:1/3 loss:1.1741\n", 263 | "[81|100] step:2/3 loss:0.6857\n", 264 | "[81|100] step:3/3 loss:0.2532\n", 265 | "[91|100] step:1/3 loss:0.5687\n", 266 | "[91|100] step:2/3 loss:0.4335\n", 267 | "[91|100] step:3/3 loss:0.8573\n" 268 | ] 269 | } 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "metadata": { 275 | "colab": { 276 | "base_uri": "https://localhost:8080/" 277 | }, 278 | "id": "y03sGa73O-Xb", 279 | "outputId": "b2b0f256-c55e-42c7-b9e1-121823ba5f67" 280 | }, 281 | "source": [ 282 | "# test\n", 283 | "bc.eval()\n", 284 | "with torch.no_grad():\n", 285 | " test_text = ['我不喜欢打篮球']\n", 286 | " test = MyDataset(test_text, labels=None, with_labels=False)\n", 287 | " x = test.__getitem__(0)\n", 288 | " x = tuple(p.unsqueeze(0).to(device) for p in x)\n", 289 | " pred = bc([x[0], x[1], x[2]])\n", 290 | " pred = pred.data.max(dim=1, keepdim=True)[1]\n", 291 | " if pred[0][0] == 0:\n", 292 | " print('消极')\n", 293 | " else:\n", 294 | " print('积极')" 295 | ], 296 | "execution_count": 52, 297 | "outputs": [ 298 | { 299 | "output_type": "stream", 300 | "name": "stdout", 301 | "text": [ 302 | "消极\n" 303 | ] 304 | } 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "metadata": { 310 | "colab": { 311 | "base_uri": "https://localhost:8080/", 312 | "height": 285 313 | }, 314 | "id": "u_d5OUCPSXgH", 315 | "outputId": "d266ed38-b7fa-4590-8640-c828d255ece6" 316 | }, 317 | "source": [ 318 | "pd.DataFrame(train_curve).plot() # loss曲线" 319 | ], 320 | "execution_count": 51, 321 | "outputs": [ 322 | { 323 | "output_type": "execute_result", 324 | "data": { 325 | "text/plain": [ 326 | "" 327 | ] 328 | }, 329 | "metadata": {}, 330 | "execution_count": 51 331 | }, 332 | { 333 | "output_type": "display_data", 334 | "data": { 335 | "image/png": "\n", 336 | "text/plain": [ 337 | "
" 338 | ] 339 | }, 340 | "metadata": { 341 | "needs_background": "light" 342 | } 343 | } 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "metadata": { 349 | "id": "Tlda0rBjX0vN" 350 | }, 351 | "source": [ 352 | "" 353 | ], 354 | "execution_count": null, 355 | "outputs": [] 356 | } 357 | ] 358 | } -------------------------------------------------------------------------------- /nlp/bert_classify.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # bert文本分类baseline模型 3 | # model: bert 4 | # date: 2021.10.10 10:01 5 | 6 | import os 7 | import numpy as np 8 | import pandas as pd 9 | import torch 10 | import torch.nn as nn 11 | import torch.utils.data as Data 12 | import torch.optim as optim 13 | import transformers 14 | from transformers import AutoModel, AutoTokenizer 15 | import matplotlib.pyplot as plt 16 | 17 | train_curve = [] 18 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 19 | 20 | batch_size = 2 21 | epoches = 100 22 | model = "bert-base-chinese" 23 | hidden_size = 768 24 | n_class = 2 25 | maxlen = 8 26 | 27 | # data 28 | sentences = ["我喜欢打篮球", "这个相机很好看", "今天玩的特别开心", "我不喜欢你", "太糟糕了", "真是件令人伤心的事情"] 29 | labels = [1, 1, 1, 0, 0, 0] # 1积极, 0消极. 30 | 31 | # word_list = ' '.join(sentences).split() 32 | # word_list = list(set(word_list)) 33 | # word_dict = {w: i for i, w in enumerate(word_list)} 34 | # num_dict = {i: w for w, i in word_dict.items()} 35 | # vocab_size = len(word_list) 36 | 37 | class MyDataset(Data.Dataset): 38 | def __init__(self, sentences, labels=None, with_labels=True,): 39 | self.tokenizer = AutoTokenizer.from_pretrained(model) 40 | self.with_labels = with_labels 41 | self.sentences = sentences 42 | self.labels = labels 43 | def __len__(self): 44 | return len(sentences) 45 | 46 | def __getitem__(self, index): 47 | # Selecting sentence1 and sentence2 at the specified index in the data frame 48 | sent = self.sentences[index] 49 | 50 | # Tokenize the pair of sentences to get token ids, attention masks and token type ids 51 | encoded_pair = self.tokenizer(sent, 52 | padding='max_length', # Pad to max_length 53 | truncation=True, # Truncate to max_length 54 | max_length=maxlen, 55 | return_tensors='pt') # Return torch.Tensor objects 56 | 57 | token_ids = encoded_pair['input_ids'].squeeze(0) # tensor of token ids 58 | attn_masks = encoded_pair['attention_mask'].squeeze(0) # binary tensor with "0" for padded values and "1" for the other values 59 | token_type_ids = encoded_pair['token_type_ids'].squeeze(0) # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens 60 | 61 | if self.with_labels: # True if the dataset has labels 62 | label = self.labels[index] 63 | return token_ids, attn_masks, token_type_ids, label 64 | else: 65 | return token_ids, attn_masks, token_type_ids 66 | 67 | train = Data.DataLoader(dataset=MyDataset(sentences, labels), batch_size=batch_size, shuffle=True, num_workers=1) 68 | 69 | # model 70 | class BertClassify(nn.Module): 71 | def __init__(self): 72 | super(BertClassify, self).__init__() 73 | self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True) 74 | self.linear = nn.Linear(hidden_size, n_class) 75 | self.dropout = nn.Dropout(0.5) 76 | 77 | def forward(self, X): 78 | input_ids, attention_mask, token_type_ids = X[0], X[1], X[2] 79 | outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) # 返回一个output字典 80 | # 用最后一层cls向量做分类 81 | # outputs.pooler_output: [bs, hidden_size] 82 | logits = self.linear(self.dropout(outputs.pooler_output)) 83 | 84 | return logits 85 | 86 | bc = BertClassify().to(device) 87 | 88 | optimizer = optim.Adam(bc.parameters(), lr=1e-3, weight_decay=1e-2) 89 | loss_fn = nn.CrossEntropyLoss() 90 | 91 | # train 92 | sum_loss = 0 93 | total_step = len(train) 94 | for epoch in range(epoches): 95 | for i, batch in enumerate(train): 96 | optimizer.zero_grad() 97 | batch = tuple(p.to(device) for p in batch) 98 | pred = bc([batch[0], batch[1], batch[2]]) 99 | loss = loss_fn(pred, batch[3]) 100 | sum_loss += loss.item() 101 | 102 | loss.backward() 103 | optimizer.step() 104 | if epoch % 10 == 0: 105 | print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch+1, epoches, i+1, total_step, loss.item())) 106 | train_curve.append(sum_loss) 107 | sum_loss = 0 108 | 109 | # test 110 | bc.eval() 111 | with torch.no_grad(): 112 | test_text = ['我不喜欢打篮球'] 113 | test = MyDataset(test_text, labels=None, with_labels=False) 114 | x = test.__getitem__(0) 115 | x = tuple(p.unsqueeze(0).to(device) for p in x) 116 | pred = bc([x[0], x[1], x[2]]) 117 | pred = pred.data.max(dim=1, keepdim=True)[1] 118 | if pred[0][0] == 0: 119 | print('消极') 120 | else: 121 | print('积极') 122 | 123 | pd.DataFrame(train_curve).plot() # loss曲线 124 | 125 | -------------------------------------------------------------------------------- /nlp/focal_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class FocalLoss(nn.Module): 5 | """Multi-class Focal loss implementation""" 6 | def __init__(self, gamma=2, weight=None, reduction='mean', ignore_index=-100): 7 | super(FocalLoss, self).__init__() 8 | self.gamma = gamma 9 | self.weight = weight 10 | self.ignore_index = ignore_index 11 | self.reduction = reduction 12 | 13 | def forward(self, input, target): 14 | """ 15 | input: [N, C] 16 | target: [N, ] 17 | """ 18 | log_pt = torch.log_softmax(input, dim=1) 19 | pt = torch.exp(log_pt) 20 | log_pt = (1 - pt) ** self.gamma * log_pt 21 | loss = torch.nn.functional.nll_loss(log_pt, target, self.weight, reduction=self.reduction, ignore_index=self.ignore_index) 22 | return loss --------------------------------------------------------------------------------