├── Computer Vision
│   ├── CNN
│   │   ├── DenseNet.ipynb
│   │   ├── EfficientNet.ipynb
│   │   ├── GoogLeNet.ipynb
│   │   ├── MobileNet_구현_실습.ipynb
│   │   ├── README.md
│   │   ├── ResNet.ipynb
│   │   └── Xception.ipynb
│   └── README.md
├── Multimodal Models
│   ├── FLAVA
│   │   ├── Interacting with FLAVA.ipynb
│   │   └── README.md
│   └── README.md
├── Natural Language Processing
│   ├── ALBERT
│   │   ├── ALBERT.ipynb
│   │   └── README.md
│   ├── BERT
│   │   ├── BERT_model.ipynb
│   │   ├── BERT_구현_복습.ipynb
│   │   └── README.md
│   ├── ELECTRA
│   │   ├── ELECTRA.ipynb
│   │   └── README.md
│   ├── ELMo
│   │   ├── ELMo.ipynb
│   │   ├── README.md
│   │   ├── char_cnn.ipynb
│   │   └── character_dataset.ipynb
│   ├── GPT-1
│   │   ├── GPT-1 Implementation.ipynb
│   │   └── README.md
│   ├── README.md
│   ├── RoBERTa
│   │   ├── README.md
│   │   └── RoBERTa.ipynb
│   ├── Transformer-XL
│   │   ├── README.md
│   │   └── Transformer_XL_구현_실습.ipynb
│   ├── Transformer
│   │   ├── README.md
│   │   ├── Transformer_구현_복습.ipynb
│   │   └── Transformer_구현_실습.ipynb
│   └── XLNet
│       ├── README.md
│       └── XLNet.ipynb
└── README.md
/Computer Vision/CNN/DenseNet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyP5VtNzKVdcgFotI0cRex0h",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "sGveVsqEBvXg"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import re\n",
38 | "import torch\n",
39 | "import torch.nn as nn\n",
40 | "import torch.nn.functional as F\n",
41 | "import torch.utils.checkpoint as cp\n",
42 | "from collections import OrderedDict\n",
43 | "#from .utils import load_state_dict_from_url\n",
44 | "from torch import Tensor\n",
45 | "from torch.jit.annotations import List\n",
46 | "\n",
47 | "__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']\n",
48 | "\n",
49 | "model_urls = {\n",
50 | " 'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',\n",
51 | " 'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',\n",
52 | " 'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',\n",
53 | " 'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',\n",
54 | "}\n",
55 | "\n",
56 | "#Dense Layer\n",
57 | "class _DenseLayer(nn.Module):\n",
58 | " def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, memory_efficient = False):\n",
59 | " super(_DenseLayer, self).__init__()\n",
60 | " self.add_module('norm1', nn.BatchNorm2d(num_input_features)),\n",
61 | " self.add_module('relu1', nn.ReLU(inplace = True)),\n",
62 | " self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size = 1, \n",
63 | " stride = 1, bias = False)),\n",
64 | " self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),\n",
65 | " self.add_module('relu2', nn.ReLU(inplace = True)),\n",
66 | " self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size = 3,\n",
67 | " stride = 1, padding = 1, bias = False)),\n",
68 | " self.drop_rate = float(drop_rate)\n",
69 | " self.memory_efficient = memory_efficient\n",
70 | "\n",
71 | " #Bacth Normalization 하는 부분\n",
72 | " def bn_function(self, inputs):\n",
73 | " # type: List[tensor] -> tensor\n",
74 | " concated_features = torch.cat(inputs, 1)\n",
75 | " bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features)))\n",
76 | " return bottleneck_output\n",
77 | "\n",
78 | " def any_requires_grad(self, input):\n",
79 | " # type: List[tensor] -> bool\n",
80 | " for tensor in input:\n",
81 | " if tensor.requires_grad:\n",
82 | " return True\n",
83 | " return False\n",
84 | "\n",
85 | " @torch.jit.unused\n",
86 | " def call_checkpoint_bottleneck(self, input):\n",
87 | " # type: List[tensor] -> tensor\n",
88 | " def closure(*inputs):\n",
89 | " return self.bn_function(inputs)\n",
90 | "\n",
91 | " return cp.checkpoint(closure, *input)\n",
92 | "\n",
93 | " @torch.jit._overload_method\n",
94 | " def forward(self, input):\n",
95 | " # type: List[tensor] -> tensor\n",
96 | " pass\n",
97 | "\n",
98 | " @torch.jit._overload_method\n",
99 | " def forward(self, input):\n",
100 | " # type: Tensor -> Tensor\n",
101 | " pass\n",
102 | "\n",
103 | " #아직 torchscript는 *args를 지원하지 않기 때문에, List[Tensor] 또는 single tensor를\n",
104 | " #오버로드 하는 방법을 사용\n",
105 | " #순전파\n",
106 | " def forward(self, input):\n",
107 | " if isinstance(input, Tensor):\n",
108 | " prev_features = [input]\n",
109 | " else:\n",
110 | " prev_features = input\n",
111 | "\n",
112 | " if self.memory_efficient and self.any_requires_grad(prev_features):\n",
113 | " if torch.jit.is_scripting():\n",
114 | " raise Exception('Memory Efficient not supported in JIT')\n",
115 | "\n",
116 | " bottleneck_output = self.call_checkpoint_bottleneck(prev_features)\n",
117 | "\n",
118 | " else:\n",
119 | " bottleneck_output = self.bn_function(prev_features)\n",
120 | "\n",
121 | " new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))\n",
122 | " if self.drop_rate > 0:\n",
123 | " new_features = F.dropout(new_features, p = self.drop_rate, training = self.training)\n",
124 | " \n",
125 | " return new_features\n",
126 | "\n",
127 | "#DenseBlock layer\n",
128 | "class _DenseBlock(nn.ModuleDict):\n",
129 | " _version = 2\n",
130 | "\n",
131 | " def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate,\n",
132 | " memory_efficient = False):\n",
133 | " super(_DenseBlock, self).__init__()\n",
134 | " for i in range(num_layers):\n",
135 | " layer = _DenseLayer(\n",
136 | " num_input_features + i * growth_rate, growth_rate = growth_rate, bn_size = bn_size,\n",
137 | " drop_rate = drop_rate, memory_efficient = memory_efficient,\n",
138 | " )\n",
139 | " self.add_module('denselayer%d' % (i + 1), layer)\n",
140 | "\n",
141 | " def forward(self, init_features):\n",
142 | " features = [init_features]\n",
143 | " for name, layer in self.items():\n",
144 | " new_features = layer(features)\n",
145 | " features.append(new_features)\n",
146 | " return torch.cat(features, 1)\n",
147 | "\n",
148 | "#Transition layer\n",
149 | "class _Transition(nn.Sequential):\n",
150 | " def __init__(self, num_input_features, num_output_features):\n",
151 | " super(_Transition, self).__init__()\n",
152 | " self.add_module('norm', nn.BacthNorm2d(num_input_features))\n",
153 | " self.add_module('relu', nn.ReLU(inplace = True))\n",
154 | " self.add_module('conv', nn.Conv2d(num_input_features, num_output_features, kernel_size = 1,\n",
155 | " stride = 1, bias = False))\n",
156 | " self.add_module('pool', nn.AvgPool2d(kernel_size = 2, stride = 2))\n",
157 | "\n",
158 | "class DenseNet(nn.Module):\n",
159 | " #growth_rate: 각 레이어에 얼만큼의 필터를 추가할지 (논문에서는 'k'로 표현)\n",
160 | " #block_config: 각 풀링 계층에서 얼마나 많은 레이어를 사용할지\n",
161 | " #num_init_features: 첫 합성곱 레이어에서 얼만큼의 필터를 배울지\n",
162 | " #bn_size: bottleneck layer의 숫자에 대한 factor\n",
163 | " #drop_rate: 각 dense layer 이후의 dropout rate\n",
164 | " #num_classes: 분류 클래스의 수\n",
165 | " #memort_efficient: True면 checkpoint 사용\n",
166 | "\n",
167 | " def __init__(self, growth_rate = 32, block_config = (6, 12, 24, 16),\n",
168 | " num_init_features = 64, bn_size = 4, drop_rate = 0, num_classes = 1000,\n",
169 | " memory_efficient = False):\n",
170 | " super(DenseNet, self).__init__()\n",
171 | "\n",
172 | " #첫 번째 convolution\n",
173 | " self.features = nn.Sequential(OrderedDict([\n",
174 | " ('conv0', nn.Conv2d(3, num_init_features, kernel_size = 7, stride = 2,\n",
175 | " padding = 3, bias = False)),\n",
176 | " ('norm0', nn.BactNorm2d(num_init_features)),\n",
177 | " ('relu0', nn.ReLU(inplace = True)),\n",
178 | " ('pool0', nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1)),\n",
179 | " ]))\n",
180 | "\n",
181 | " #각 dense block\n",
182 | " num_features = num_init_features\n",
183 | " for i, num_layers in enumerate(block_config):\n",
184 | " block = _DenseBlock(\n",
185 | " num_layers = num_layers,\n",
186 | " num_input_features = num_features,\n",
187 | " bn_size = bn_size,\n",
188 | " growth_rate = growth_rate,\n",
189 | " drop_rate = drop_rate,\n",
190 | " memory_efficient = memory_efficient\n",
191 | " )\n",
192 | " self.features.add_module('denseblock%d' % (i + 1), block)\n",
193 | " num_features = num_features + num_layers * growth_rate\n",
194 | " if i != len(block_config) - 1:\n",
195 | " trans = _Transition(num_input_featurs = num_features,\n",
196 | " num_output_features = num_features // 2)\n",
197 | " self.featrues.add_module('transition%d' % (i + 1), trans)\n",
198 | " num_features = num_features // 2\n",
199 | "\n",
200 | " #마지막 batch norm\n",
201 | " self.features.add_module('norm5', nn.BatchNorm2d(num_features))\n",
202 | "\n",
203 | " #Liunear Layer\n",
204 | " self.classifier = nn.Linear(num_features, num_classes)\n",
205 | "\n",
206 | " for m in self.modules():\n",
207 | " if siinstance(m, nn.Conv2d):\n",
208 | " nn.init.kaiming_normal_(m.weight)\n",
209 | " elif isinstance(m, nn.BatchNorm2d):\n",
210 | " nn.init.constant_(m.weight, 1)\n",
211 | " nn.init.constant_(m.bias, 0)\n",
212 | " elif isinstance(m, nn.Linear):\n",
213 | " nn.init.constant_(m.bias, 0)\n",
214 | "\n",
215 | " def forward(self, x):\n",
216 | " features = self.features(x)\n",
217 | " out = F.relu(features, inplace = True)\n",
218 | " out = F.adaptive_avg_pool2d(out, (1, 1))\n",
219 | " out = torch.flatten(out, 1)\n",
220 | " out = self.classifier(out)\n",
221 | " return out\n",
222 | "\n",
223 | "def _load_state_dict(model, model_url, progress):\n",
224 | " pattern = re.compile(\n",
225 | " r'^(.*denselayer\\d+\\.(?:norm|relu|conv))\\.((?:[12])\\.(?:weight|bias|running_mean|running_var))$')\n",
226 | "\n",
227 | " state_dict = load_state_dict_from_url(model_url, progress=progress)\n",
228 | " for key in list(state_dict.keys()):\n",
229 | " res = pattern.match(key)\n",
230 | " if res:\n",
231 | " new_key = res.group(1) + res.group(2)\n",
232 | " state_dict[new_key] = state_dict[key]\n",
233 | " del state_dict[key]\n",
234 | " model.load_state_dict(state_dict)\n",
235 | "\n",
236 | "def _densenet(arch, growth_rate, block_config, num_init_features, pretrained, progress,\n",
237 | " **kwargs):\n",
238 | " model = DenseNet(growth_rate, block_config, num_init_features, **kwargs)\n",
239 | " if pretrained:\n",
240 | " _load_state_dict(model, model_urls[arch], progress)\n",
241 | " return model\n",
242 | "\n",
243 | "def densenet121(pretrained = False, progress = True, **kwargs):\n",
244 | " return _densenet('densenet121', 32, (6, 12, 24, 16), 64, pretrained, progress, **kwargs)\n",
245 | "\n",
246 | "def densenet161(pretrained = False, progress = True, **kwargs):\n",
247 | " return _densenet('dnesenet161', 48, (6, 12, 36, 24), 96, pretrained, progress, **kwargs)\n",
248 | "\n",
249 | "def densenet169(pretrained = False, progress = True, **kwargs):\n",
250 | " return _densenet('densenet169', 32, (6, 12, 32, 32), 64, pretrained, progress, **kwargs)\n",
251 | "\n",
252 | "def densenet201(pretrained = False, progress = True, **kwargs):\n",
253 | " return _densenet('densenet201', 32, (6, 12, 48, 32), 64, pretrained, progress, **kwargs)"
254 | ]
255 | }
256 | ]
257 | }
--------------------------------------------------------------------------------
/Computer Vision/CNN/EfficientNet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyMZpVwLqfasaEfsqwf3UBeE",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "ghfyI8deSjb_"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import torch\n",
38 | "from torch import nn\n",
39 | "from torch.nn import functional as F\n",
40 | "from .utils import (\n",
41 | " round_filters,\n",
42 | " round_repeats,\n",
43 | " drop_connect,\n",
44 | " get_same_padding_conv2d,\n",
45 | " get_model_params,\n",
46 | " efficientnet_params,\n",
47 | " load_pretrained_weights,\n",
48 | " Swish,\n",
49 | " MemoryEfficientSwish,\n",
50 | " calculate_output_image_size\n",
51 | ")\n",
52 | "\n",
53 | "VALID_MODELS = (\n",
54 | " 'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3',\n",
55 | " 'efficientnet-b4', 'efficientnet-b5', 'efficientnet-b6', 'efficientnet-b7',\n",
56 | " 'efficientnet-b8',\n",
57 | "\n",
58 | " # Support the construction of 'efficientnet-l2' without pretrained weights\n",
59 | " 'efficientnet-l2'\n",
60 | ")\n",
61 | "\n",
62 | "class MBConvBlock(nn.Module):\n",
63 | " #Mobile Inverted Residual Bottleneck Block\n",
64 | "\n",
65 | " def __init__(self, block_args, global_params, image_size = None):\n",
66 | " super().__init__()\n",
67 | " self.block_args = block_args\n",
68 | " self._bn_mom = 1 - global_aprams.batch_norm_momentum\n",
69 | " self._bn_eps = global_params.batch_norm_epsilon\n",
70 | " self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ration <= 1)\n",
71 | " self.id_skip = block_args.id_skip #use skip connection and drop connect\n",
72 | "\n",
73 | " #Expansion phase\n",
74 | " inp = self._block_args.input_filters #number of input channels\n",
75 | " oup = self._block_args.input_filters * self._block_args.expand_ratio #number of output channels\n",
76 | "\n",
77 | " if self._block_args.expand_ratio != 1:\n",
78 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n",
79 | " self._expand_conv = Conv2d(in_channels = inp, output_channels = oup, kernel_size = 1,\n",
80 | " bias = False)\n",
81 | " self._bn0 = nn.BatchNorm2d(num_features = oup, momentum = self._bn_mom, eps = self._bn_eps)\n",
82 | " #image_size = calculate_output_image_size(image_size, 1)\n",
83 | "\n",
84 | " #Depthwise convolution phase\n",
85 | " k = self._block_args.kernel_size\n",
86 | " s = self._block_args.stride\n",
87 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n",
88 | " self._depthwise_conv = Conv2d(\n",
89 | " in_channels = oup, out_channels = oup, groups = oup, #groups가 depthwise를 만듦\n",
90 | " kernel_size = k, strides = s, bias = False\n",
91 | " )\n",
92 | " self._bn1 = nn.BatchNorm2d(num_features = oup, momentum = self._bn_mom, \n",
93 | " eps = self._bn_eps)\n",
94 | " image_size = calculate_output_image_size(image_size, s)\n",
95 | "\n",
96 | " #Squeeze and Excitation layer\n",
97 | " if self.has_se:\n",
98 | " Conv2d = get_same_padding_conv2d(image_size = (1, 1))\n",
99 | " num_squeezed_channels = max(1, int(self.block_args.input_filters * \n",
100 | " self._block_args.se-ratio))\n",
101 | " self._se_reduce = Conv2d(in_channels = oup, out_channel = num_squeezed_channels,\n",
102 | " kernel_size = 1)\n",
103 | " self._se_expand = Conv2d(in_channels = num_squeezed_channels, out_channel = oup,\n",
104 | " kernel_size = 1)\n",
105 | " \n",
106 | " #Pointwise Convolution\n",
107 | " final_oup = self._block_args.output_filters\n",
108 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n",
109 | " self._project_conv = Conv2d(in_channels = oup, out_channels = final_oup, \n",
110 | " kernel_size = 1, bias = False)\n",
111 | " self._bn2 = nn.BatchNorm2d(num_features = final_oup, momentum = self._bn_mom,\n",
112 | " eps = self._bn_eps)\n",
113 | " self._swish = MemoryEfficientSwish()\n",
114 | "\n",
115 | " def forward(self, inputs, drop_connect_rate = None):\n",
116 | " #Expansion & Depthwise Convolution\n",
117 | " x = inputs\n",
118 | " if self._block_args.expand_ratio != 1:\n",
119 | " x = self.expand_conv(inputs)\n",
120 | " x = self._bn0(x)\n",
121 | " x = self._swish(x)\n",
122 | "\n",
123 | " x = self._depthwise_conv(x)\n",
124 | " x = self._bn1(x)\n",
125 | " x = self._swish(x)\n",
126 | "\n",
127 | " #Squeeze & Excitation\n",
128 | " if self.has_se:\n",
129 | " x_squeezed = F.adaptive_avg_pool2d(x, 1)\n",
130 | " x_squeezed = self._se_reduce(x_squeezed)\n",
131 | " x_squeezed = self._swish(x_squeezed)\n",
132 | " x_squeezed = self._se_expand(x_squeezed)\n",
133 | " x = torch.sigmoid(x_squeezed) * x\n",
134 | "\n",
135 | " #Pointwise Convolution\n",
136 | " x = self._project_conv(x)\n",
137 | " x = self._bn2(x)\n",
138 | "\n",
139 | " #Skip connection & drop connect\n",
140 | " input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters\n",
141 | "\n",
142 | " if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:\n",
143 | " #skip connection과 drop connect는 stochastic depth를 가져온다\n",
144 | " if drop_connect_rate:\n",
145 | " x = drop_connect(x, p = drop_connect_rate, training = self.training)\n",
146 | " x = x + inputs #skip connection\n",
147 | " \n",
148 | " return x\n",
149 | "\n",
150 | " def set_swish(self, memory_efficient = True):\n",
151 | " #memory efficient를 위한 swish 설정\n",
152 | "\n",
153 | " self._swish = MemoryEfficientSwish() if memory_efficient else Swish()\n",
154 | "\n",
155 | "class EfficientNet(nn.Module):\n",
156 | "\n",
157 | " def __init__(self, blocks_args = None, global_params = None):\n",
158 | " super().__init__()\n",
159 | " assert isinstance(block_args, list), 'blocks_args should be a list'\n",
160 | " assert len(block_args) > 0, 'block args must be greater than 0'\n",
161 | " self._global_params = global_params\n",
162 | " self._block_args = block_args\n",
163 | "\n",
164 | " #BatchNorm parameters\n",
165 | " bn_mom = 1 - self._global_params.batch_norm_momentum\n",
166 | " bn_eps = self._global_params.batch_norm_epsilon\n",
167 | "\n",
168 | " #이미지 크기에 따라서 정적 또는 동적 convolution을 함\n",
169 | " image_size = global_params.image_size\n",
170 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n",
171 | "\n",
172 | " #Stem\n",
173 | " in_channels = 3 #rgb\n",
174 | " out_channels = round_filters(32, self._global_params) #number of output channels\n",
175 | " self._conv_stem = Conv2d(in_channels, out_channels, kernel_size = 3, stride = 2,\n",
176 | " bias = False)\n",
177 | " self._bn0 = nn.BatchNorm2d(num_features = out_channels, momentum = bn_mom, eps = bn_eps)\n",
178 | " image_size = calculate_output_image_size(image_size, 2)\n",
179 | "\n",
180 | " #블록 쌓기\n",
181 | " self._blocks = nn.ModuleList([])\n",
182 | " for block_args in self._block_args:\n",
183 | " #depth multiplier에 따라 입력과 출력 필터 업데이트\n",
184 | " block_args = block_args._replace(\n",
185 | " input_filters = round_filters(block_args.input_filters, self._global_params),\n",
186 | " output_filter = round_filters(block_args.output_filters, self._global_params),\n",
187 | " num_repeat = round_filters(block_args.num_repeates, self._global_params)\n",
188 | " )\n",
189 | "\n",
190 | " #첫 번째 블록은 stride와 filter size 증가를 관리할 필요가 있음\n",
191 | " self._blocks.append(MBConvBlock(block_args, self._global_params, image_size = image_size))\n",
192 | " image_size = calculate_output_image_size(image_size, block_args.stride)\n",
193 | " if block_args.num_repeat > 1: #block_args를 조정해서 똑같은 output size 유지\n",
194 | " block_args = block_args._replace(input_filters = block_args.output_filters, stride = 1)\n",
195 | "\n",
196 | " for _ in range(block_args.num_repeat - 1):\n",
197 | " self._blocks.append(MBConvBlock(block_args, self._global_params, image_size = image_size))\n",
198 | "\n",
199 | " #Head\n",
200 | " in_channels = block_args.output_filters #output of final block\n",
201 | " out_channels = round_filters(1280, self._global_params)\n",
202 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n",
203 | " self._conv_head = Conv2d(in_channels, out_channels, kernel_size = 1, bias = False)\n",
204 | " self._bn1 = nn.BatchNorm2d(num_features = out_channels, momentum = bn_mom, eps = bn_eps)\n",
205 | "\n",
206 | " #Final Linear Layer\n",
207 | " self._avg_pooling = nn.AdaptiveAvgPool2d(1)\n",
208 | " self._dropout = nn.Dropout(self._global_params.dropout_rate)\n",
209 | " self._fc = nn.Linear(out_channels, self._global_params.num_classes)\n",
210 | " self._swish = MemoryEfficientSwish()\n",
211 | "\n",
212 | " def set_swish(self, memory_efficient = True):\n",
213 | " self._swish = MemoryEfficientSwish() if memory_efficient else Swish()\n",
214 | " for block in self._blocks:\n",
215 | " block.set_swish(memory_efficient)\n",
216 | "\n",
217 | " def extract_endpoints(self, inputs):\n",
218 | " #Convolution layer을 사용해서 feature을 extract\n",
219 | "\n",
220 | " endpoints = dict()\n",
221 | "\n",
222 | " #Stem\n",
223 | " x = self._swish(self._bn0(self._conv_stem(inputs)))\n",
224 | " prev_x = x\n",
225 | "\n",
226 | " #Blocks\n",
227 | " for idx, block in enumerate(self._blocks):\n",
228 | " drop_connect_rate = self._global_params.drop_connect_rate\n",
229 | " if drop_connect_rate:\n",
230 | " drop_connect_rate *= float(idx) / len(self._blocks) #scale drop connect_rate\n",
231 | " x = block(x, drop_connect_rate = drop_connect_rate)\n",
232 | " if prev_x.size(2) > x.size(2):\n",
233 | " endpoints[f'reduction_{len(endpoints)+1}'] = prev_x\n",
234 | " prev_x = x\n",
235 | "\n",
236 | " #Head\n",
237 | " x = self._swish(self._bn1(self._conv_head(x)))\n",
238 | " endpoints[f'reduction_{len(endpoints) + 1}'] = x\n",
239 | "\n",
240 | " return endpoints\n",
241 | "\n",
242 | " def extract_features(self, inputs):\n",
243 | " #Convolution layer을 사용해서 feature을 추출\n",
244 | "\n",
245 | " #Stem\n",
246 | " x = self._swish(self._bn0(self._conv_stem(inputs)))\n",
247 | "\n",
248 | " #Blocks\n",
249 | " for idx, block in enumerate(self._blocks):\n",
250 | " drop_connect_rate = self._global_params.drop_connect_rate\n",
251 | " if drop_connect_rate:\n",
252 | " drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect rate\n",
253 | " x = block(x, drop_connect_rate = drop_connect_rate)\n",
254 | "\n",
255 | " #Head\n",
256 | " x = self._swish(self._bn1(self._conv_head(x)))\n",
257 | "\n",
258 | " return x\n",
259 | "\n",
260 | " def forward(self, inputs):\n",
261 | " #EfficientNet의 순전파\n",
262 | "\n",
263 | " #Convolution Layers\n",
264 | " x = self.extract_features(inputs)\n",
265 | "\n",
266 | " #Pooling & final linear_layers\n",
267 | " x = self._avg_pooling(x)\n",
268 | " x = x.flatten(start_dim = 1)\n",
269 | " x = self._dropout(x)\n",
270 | " x = self._fc(x)\n",
271 | "\n",
272 | " return x\n",
273 | "\n",
274 | " @classmethod\n",
275 | " def from_name(cls, model_name, in_channels = 3, **override_params):\n",
276 | " #이름에 따라서 EfficientNet 생성\n",
277 | "\n",
278 | " cls._check_model_name_is_valid(model_name)\n",
279 | " blocks_args, clobal_params = get_model_params(model_name, override_params)\n",
280 | " model = cls(blocks_args, global_params)\n",
281 | " model._change_in_channels(in_channels)\n",
282 | " return model\n",
283 | "\n",
284 | " @classmethod\n",
285 | " def from_pretrained(cls, model_naem, weights_path = None, advprop = False,\n",
286 | " in_channels = 3, num_classes = 1000, **override_params):\n",
287 | " model = cls.from_name(model_name, num_classes = num_classes, **override_params)\n",
288 | " load_pretrained_weights(model, model_name, weights_path = weights_path, \n",
289 | " load_fc = (num_calss == 1000), advprop = advprop)\n",
290 | " model._change_in_channels(in_channels)\n",
291 | " return model\n",
292 | "\n",
293 | " @clasmethod\n",
294 | " def get_image_size(cls, model_name):\n",
295 | " #입력 이미지의 크기를 가져옴\n",
296 | "\n",
297 | " cls._check_model_name_is_valid(model_name)\n",
298 | " _, _, res, _ = efficientnet_params(model_name)\n",
299 | " return res\n",
300 | "\n",
301 | " @classmethod\n",
302 | " def _check_model_name_is_valid(cls, model_name):\n",
303 | " #model name check\n",
304 | "\n",
305 | " if model_name not in VALID_MODELS:\n",
306 | " raise ValueError('model_name should be one of: ' + ', '.join(VALID_MODELS))\n",
307 | "\n",
308 | " def _change_in_channels(self, in_channels):\n",
309 | " #첫 번째 합성곱 레이어에 사용되는 in_channels가 3이 아니라면, 조정\n",
310 | "\n",
311 | " if in_channels != 3:\n",
312 | " Conv2d = get_same_padding_conv2d(image_size = self._global_params.image_size)\n",
313 | " out_channels = round_filters(32, self._global_params)\n",
314 | " self._conv_stem = Conv2d(in_channels, out_channels, kernel_size = 3, stride = 2, bias = False)"
315 | ]
316 | }
317 | ]
318 | }
--------------------------------------------------------------------------------
/Computer Vision/CNN/GoogLeNet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyMtfpaygJNZbBpUa0WxvN2p",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "1f7qWOK7AQC5"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import warnings\n",
38 | "from collections import namedtuple\n",
39 | "import torch\n",
40 | "import torch.nn as nn\n",
41 | "import torch.nn.functional as F\n",
42 | "from torch.jit.annotations import Optional, Tuple\n",
43 | "from torch import Tensor\n",
44 | "\n",
45 | "\n",
46 | "__all__ = ['GoogLeNet', 'googlenet', 'GoogLeNetOutputs', '_GoogLeNetOutputs']\n",
47 | "\n",
48 | "model_urls = {'googlenet': 'https://download.pytorch.org/models/googlenet-1378be20.pth'}\n",
49 | "\n",
50 | "GoogLeNetOutputs = namedtuple('GoogLeNetOutputs', ['logits', 'aux_logits2', 'aux_logits1'])\n",
51 | "GoogLeNetOutputs.__annotations__ = {'logits': Tensor, 'aux_logits2': Optional[Tensor], \n",
52 | " 'aux_logits1': Optional[Tensor]}\n",
53 | "\n",
54 | "#역전파를 위한 GoogLeNet outputs 설정\n",
55 | "_GoogLeNetOutputs = GoogLeNetOutputs\n",
56 | "\n",
57 | "\n",
58 | "def googlenet(pretrained = False, progress = True, **kwargs):\n",
59 | " #pretraind: True면 ImageNet으로 pretrained된 모델 반환\n",
60 | " #progress: True면 download bar 보여주기\n",
61 | " #aux_logits: True면 두 개의 추가적인 branch 더해줌 --> 성능 향상에 도움 됌\n",
62 | " #transform input: True면 입력을 preprocessing\n",
63 | "\n",
64 | " if pretrained:\n",
65 | " if 'transform_input' not in kwargs:\n",
66 | " kwargs['transform_input'] = True\n",
67 | " if 'aux_logits' not in kwargs:\n",
68 | " kwargs['aux_logits'] = False\n",
69 | " if kwargs['aux_logits']:\n",
70 | " warnings.warn('auxiliary heads in the pretrained googlenet model are NOT pretrained, ')\n",
71 | "\n",
72 | " original_aux_logits = kwargs['aux_logits']\n",
73 | " kwargs['aux_logits'] = True\n",
74 | " kwargs['init_weights'] = False\n",
75 | " model = GoogLeNet(**kwargs)\n",
76 | " state_dict = load_state_dict_from_url(model_urls['googlenet'], progress = progress)\n",
77 | " model.load_state_dict(state_dict)\n",
78 | " if not original_aux_logits:\n",
79 | " model.aux_logits = False\n",
80 | " model.aux1 = None\n",
81 | " model.aux2 = None\n",
82 | " return model\n",
83 | "\n",
84 | " return GoogLeNet(**kwargs)\n",
85 | "\n",
86 | "class GoogLeNet(nn.Module):\n",
87 | " __constants__ = ['aux_logits', 'transform_input']\n",
88 | "\n",
89 | " def __init__(self, num_classes = 1000, aux_logits = True, transform_input = False,\n",
90 | " init_weights = None, blocks = None):\n",
91 | " super(GoogLeNet, self).__init__()\n",
92 | " if blocks is None:\n",
93 | " blocks = [BasicConv2d, Inception, InceptionAux]\n",
94 | " if init_weights is None:\n",
95 | " warnings.warn('The default weight initialization of GoogLeNet will be changed in future releases of')\n",
96 | " init_weights = True\n",
97 | " assert len(blocks) == 3\n",
98 | " conv_block = blocks[0]\n",
99 | " inception_block = blocks[1]\n",
100 | " inception_aux_block = blocks[2]\n",
101 | "\n",
102 | " self.aux_logits = aux_logits\n",
103 | " self.transform_input = transform_input\n",
104 | "\n",
105 | " self.conv1 = conv_block(3, 64, kernel_size = 7, stride = 2, padding = 3)\n",
106 | " self.maxpool1 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n",
107 | " self.conv2 = conv_block(64, 64, kernel_size = 1)\n",
108 | " self.conv3 = conv_block(64, 192, kernel_size = 3, padding = 1)\n",
109 | " self.maxpool2 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n",
110 | "\n",
111 | " self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32)\n",
112 | " self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64)\n",
113 | " self.maxpool3 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n",
114 | "\n",
115 | " self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64)\n",
116 | " self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64)\n",
117 | " self.inception4c = inception_block(512, 128, 127, 256, 24, 64, 64)\n",
118 | " self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64)\n",
119 | " self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128)\n",
120 | " self.maxpool4 = nn.MaxPool2d(2, stride = 2, ceil_mode = True)\n",
121 | "\n",
122 | " self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128)\n",
123 | " self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128)\n",
124 | "\n",
125 | " if aux_logits:\n",
126 | " self.aux1 = inception_aux_block(512, num_classes)\n",
127 | " self.aux2 = inception_aux_block(528, num_classes)\n",
128 | " else:\n",
129 | " self.aux1 = None\n",
130 | " self.aux2 = None\n",
131 | "\n",
132 | " self.avgpool = nn.AdaptiveAvgPool2d((1, 1))\n",
133 | " self.dropout = nn.Dropout(0.2)\n",
134 | " self.fc = nn.Linear(1024, num_classes)\n",
135 | "\n",
136 | " if init_weights:\n",
137 | " self._initialize_weights()\n",
138 | "\n",
139 | " def _initialize_weights(self):\n",
140 | " for m in self.modules():\n",
141 | " if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):\n",
142 | " import scipy.stats as stats\n",
143 | " X = stats.truncnorm(-2, 2, scale = 0.01)\n",
144 | " values = torch.as_tensor(X.rvs(m.weight.numel()), dtype = m.weight.dtype)\n",
145 | " values = values.view(m.weight.size())\n",
146 | " with torch.no_grad():\n",
147 | " m.weight.copy_(values)\n",
148 | " elif isinstance(m, nn.BatchNorm2d):\n",
149 | " nn.init.constant_(m.weight, 1)\n",
150 | " nn.init.constant_(m.bias, 0)\n",
151 | "\n",
152 | " def _transform_input(self, x):\n",
153 | " #(Tensor) --> Tensor\n",
154 | " if self.transform_input:\n",
155 | " x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5\n",
156 | " x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5\n",
157 | " x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5\n",
158 | " x = torch.cat((x_ch0, x_ch1, x_ch2), 1)\n",
159 | " return x\n",
160 | "\n",
161 | " def _forward(self, x):\n",
162 | " #type: (Tensor) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]\n",
163 | " #N x 3 x 224 x 224\n",
164 | " x = self.conv1(x)\n",
165 | "\n",
166 | " #N x 64 x 112 x 112\n",
167 | " x = self.maxpool1(x)\n",
168 | "\n",
169 | " #N x 64 x 56 x 56\n",
170 | " x = self.conv2(x)\n",
171 | "\n",
172 | " #N x 64 x 56 x 56\n",
173 | " x = self.conv3(x)\n",
174 | "\n",
175 | " #N x 192 x 56 x 56\n",
176 | " x = self.maxpool2(x)\n",
177 | "\n",
178 | " #N x 192 x 28 x 28\n",
179 | " x = self.inception3a(x)\n",
180 | "\n",
181 | " #N x 256 x 28 x 28\n",
182 | " x = self.inception3b(x)\n",
183 | "\n",
184 | " #N x 480 x 28 x 28\n",
185 | " x = self.maxpool3(x)\n",
186 | "\n",
187 | " #N x 480 x 14 x 14\n",
188 | " x = self.inception4a(x)\n",
189 | "\n",
190 | " # N x 512 x 14 x 14\n",
191 | " aux1 = torch.hit.annotate(Optional[Tensor], None)\n",
192 | " if self.aux1 is not None:\n",
193 | " if self.training:\n",
194 | " aux1 = self.aux1(x)\n",
195 | "\n",
196 | " x = self.inception4b(x)\n",
197 | "\n",
198 | " #N x 512 x 14 x 14\n",
199 | " x = self.inception4c(x)\n",
200 | "\n",
201 | " #N x 512 x 14 x 14\n",
202 | " x = self.inception4d(x)\n",
203 | "\n",
204 | " #N x 528 x 14 x 14\n",
205 | " x = self.inception4e(x)\n",
206 | "\n",
207 | " #N x 832 x 14 x 14\n",
208 | " x = self.maxpool4(x)\n",
209 | "\n",
210 | " #N x 832 x 7 x 7\n",
211 | " x = self.inception5a(x)\n",
212 | "\n",
213 | " #N x 832 x 7 x 7\n",
214 | " x = self.inception5b(x)\n",
215 | " #N x 1024 x 7 x 7\n",
216 | "\n",
217 | " x = self.avgpool(x)\n",
218 | " #N x 1024 x 1 x 1\n",
219 | "\n",
220 | " x = torch.flatten(x, 1)\n",
221 | " # N x 1024\n",
222 | "\n",
223 | " x = self.dropout(x)\n",
224 | " x = self.fc(x)\n",
225 | " #N x 1000 (num_classes)\n",
226 | " return x, aux2, aux1\n",
227 | " \n",
228 | " @torch.jit.unused\n",
229 | " def eager_outputs(self, x, aux2, aux1):\n",
230 | " # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> GoogLeNetOutputs\n",
231 | " if self.training and self.aux_logits:\n",
232 | " return _GoogLeNetOutputs(x, aux2, aux1)\n",
233 | " else:\n",
234 | " return x\n",
235 | "\n",
236 | " def forward(self, x):\n",
237 | " # type: (Tensor) -> GoogLeNetOutputs\n",
238 | " x = self._transform_input(x)\n",
239 | " x, aux1, aux2 = self._forward(x)\n",
240 | " aux_defined = self.training and self.aux_logits\n",
241 | " if torch.jit.is_scripting():\n",
242 | " if not aux_defined:\n",
243 | " warnings.warn('Scripted Googlenet alwatd returns GoogleNetOutputs Tuple')\n",
244 | " return GoogLeNetOutputs(x, aux2, aux1)\n",
245 | " else:\n",
246 | " return self.eager_outputs(x, aux2, aux1)\n",
247 | "\n",
248 | "class Inception(nn.Module):\n",
249 | "\n",
250 | " def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj,\n",
251 | " conv_block = None):\n",
252 | " super(Inception, self).__init__()\n",
253 | " if conv_block is None:\n",
254 | " conv_block = BasicConv2d\n",
255 | " self.branch1 = conv_block(in_channels, ch1x1, kernel_size = 1)\n",
256 | "\n",
257 | " self.branch2 = nn.Sequential(\n",
258 | " conv_block(in_channels, ch3x3red, kernel_size = 1),\n",
259 | " conv_block(ch3x3red, ch3x3, kernel_size = 3, padding = 1)\n",
260 | " )\n",
261 | "\n",
262 | " self.branch3 = nn.Sequential(\n",
263 | " conv_block(in_channels, ch5x5red, kernel_size = 1),\n",
264 | " conv_block(ch5x5red, ch5x5, kernel_size = 3, padding = 1)\n",
265 | " )\n",
266 | "\n",
267 | " self.branch4 = nn.Sequential(\n",
268 | " nn.MaxPool2d(kernel_size = 3, stride = 1, padding = 1, ceil_mode = True),\n",
269 | " conv_block(in_channels, pool_proj, kernel_size = 1)\n",
270 | " )\n",
271 | "\n",
272 | " def _forward(self, x):\n",
273 | " branch1 = self.branch1(x)\n",
274 | " branch2 = self.branch2(x)\n",
275 | " branch3 = self.branch3(x)\n",
276 | " branch4 = self.branch4(x)\n",
277 | "\n",
278 | " outputs = [branch1, branch2, branch3, branch4]\n",
279 | " return outputs\n",
280 | "\n",
281 | " def forward(self, x):\n",
282 | " outputs = self._forward(x)\n",
283 | " return torch.cat(outputs, 1)\n",
284 | "\n",
285 | "class InceptionAux(nn.Module):\n",
286 | "\n",
287 | " def __init__(self, in_channels, num_classes, conv_block = None):\n",
288 | " super(InceptionAux, self).__init__()\n",
289 | " if conv_block is None:\n",
290 | " conv_block = BasicConv2d\n",
291 | " self.conv = conv_block(in_channels, 128, kernel_size = 1)\n",
292 | "\n",
293 | " self.fc1 = nn.Linear(2048, 1024)\n",
294 | " self.fc2 = nn.Linear(1024, num_classes)\n",
295 | "\n",
296 | " def forward(self, x):\n",
297 | " #aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14\n",
298 | " x = F.adaptive_avg_pool2d(x, (4, 4))\n",
299 | " \n",
300 | " #aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4\n",
301 | " x = self.conv(x)\n",
302 | "\n",
303 | " #N x 128 x 4 x 4\n",
304 | " x = self.torch.flatten(x, 1)\n",
305 | "\n",
306 | " #N x 2048\n",
307 | " x = F.relu(self.fc1(x), inplace = True)\n",
308 | "\n",
309 | " #N x 1024\n",
310 | " x = F.dropout(x, 0.7, training = self.training)\n",
311 | "\n",
312 | " #N x 1024\n",
313 | " x = self.fc2(x)\n",
314 | "\n",
315 | " # N x 1000 (num_classes)\n",
316 | "\n",
317 | " return x\n",
318 | "\n",
319 | "class BasicConv2d(nn.Module):\n",
320 | "\n",
321 | " def __init__(self, in_channels, out_channels, **kwargs):\n",
322 | " super(BasicConv2d, self).__init__()\n",
323 | " self.conv = nn.Conv2d(in_channels, out_channels, bias = False, **kwargs)\n",
324 | " self.bn = nn.BatchNorm2d(out_channels, eps = 0.001)\n",
325 | "\n",
326 | " def forward(self, x):\n",
327 | " x = self.conv(x)\n",
328 | " x = self.bn(x)\n",
329 | " return F.relu(x, inplace = True)"
330 | ]
331 | }
332 | ]
333 | }
--------------------------------------------------------------------------------
/Computer Vision/CNN/MobileNet_구현_실습.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyPvHJUiMPbRb/mjrrJcBBef",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "source": [
32 | "#MobileNet 설명\n",
33 | "#서로 다른 크기의 input layer와 width factor에서 사용 가능\n",
34 | "#서로 다른 width를 사용함으로써 cost를 줄일 수 있음\n",
35 | "#MobileNet은 32x32보다 큰 입력 이미지면 어떤 이미지든 가능\n",
36 | "#더 큰 크기의 이미지는 더욱 향상된 성능을 가져옴\n",
37 | "\n",
38 | "#파라미터 수와 multiply-adds는 alpha에 의해 결정됌\n",
39 | "#alpha는 각 레이어에서 필터의 수를 증감함\n",
40 | "\n",
41 | "from tensorflow.python.keras.layers.recurrent import layer_serialization\n",
42 | "from __future__ import absolute_import\n",
43 | "from __future__ import division\n",
44 | "from __future__ import print_function\n",
45 | "\n",
46 | "from tensorflow.python.keras import backend\n",
47 | "from tensorflow.python.keras.applications import imagenet_utils\n",
48 | "from tensorflow.python.keras.engine import training\n",
49 | "from tensorflow.python.keras.layers import VersionAwareLayers\n",
50 | "from tensorflow.python.keras.utils import data_utils\n",
51 | "from tensorflow.python.keras.utils import layer_utils\n",
52 | "from tensorflow.python.lib.io import file_io\n",
53 | "from tensorflow.python.platform import tf_logging as logging\n",
54 | "from tensorflow.python.util.tf_export import keras_export\n",
55 | "\n",
56 | "BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/')\n",
57 | "layers = None\n",
58 | "\n",
59 | "@keras_export('keras.applications.mobilenet.MobileNet', 'keras.applications.MobileNet')\n",
60 | "\n",
61 | "def MobileNet(input_shape = None, alpha = 1.0, depth_multiplier = 1, dropout = 1e-3, include_top = True, \n",
62 | " weights = 'imagenet', input_tensor = None, pooling = None, classes = 1000,\n",
63 | " classifier_activation = 'softmax', **kwargs):\n",
64 | " #input_shape: 옵션적 shape tuple\n",
65 | " #alpha: network의 width 조절 -> width multiplier\n",
66 | " #1.0이면, 각 레이어에서 비율적으로 필터의 수를 줄임\n",
67 | " #depth_multiplier: resolution multiplier\n",
68 | " #dropout: dropout rate 조정\n",
69 | " #include_top: network의 맨 위에서 fc-layer을 사용할지 결정\n",
70 | " #weights: 재량껏 weights를 사용 가능\n",
71 | " #input_tensor: 옵션적 keras tensor\n",
72 | " #pooling: 어떤 방식으로 풀링을 할 지 결정\n",
73 | " #classes: 분류해야 하는 class 수 결정\n",
74 | " \n",
75 | " global layer_s\n",
76 | " if 'layers' in kwargs:\n",
77 | " layers = kwargs.pop('layers')\n",
78 | " else:\n",
79 | " layers = VersionAwareLayers()\n",
80 | " if kwargs:\n",
81 | " raise ValueError('Unknown argument(s): %s' % (kwargs,))\n",
82 | " if not (weights in {'imagenet', None} or file_io.file_exists_v2(weights)):\n",
83 | " raise ValueError('The `weights` argument should be either '\n",
84 | " '`None` (random initialization), `imagenet` '\n",
85 | " '(pre-training on ImageNet), '\n",
86 | " 'or the path to the weights file to be loaded.')\n",
87 | " \n",
88 | " if weights == 'imagenet' and include_top and classes != 1000:\n",
89 | " raise ValueError('If using `weights` as `\"imagenet\"` with `include_top` '\n",
90 | " 'as true, `classes` should be 1000')\n",
91 | " \n",
92 | " #적절한 입력 shape과 기본 크기\n",
93 | " if input_shape is None:\n",
94 | " default_size = 224\n",
95 | " else:\n",
96 | " if backend.image_data_format() == 'channels_first':\n",
97 | " rows = input_shape[1]\n",
98 | " cols = input_shape[2]\n",
99 | " else:\n",
100 | " rows = input_shape[0]\n",
101 | " cols = input_shape[1]\n",
102 | "\n",
103 | " if rows == cols and rows in [128, 160, 192, 224]:\n",
104 | " default_size = rows\n",
105 | " else:\n",
106 | " default_size = 224\n",
107 | "\n",
108 | " input_shape = imagenet_utils.obtain_input_shape(input_shape, default_size = default_size,\n",
109 | " min_size = 32, data_format = backend.image_data_format(),\n",
110 | " require_flatten = include_top, weights = weights)\n",
111 | " \n",
112 | " if backend.image_data_format() == 'channels_last':\n",
113 | " row_axis, col_axis = (0, 1)\n",
114 | " else:\n",
115 | " row_axis, col_axis = (1, 2)\n",
116 | " rows = input_shape[row_axis]\n",
117 | " cols = input_shape[col_axis]\n",
118 | "\n",
119 | " if weights == 'imagenet':\n",
120 | " if depth_multiplier != 1:\n",
121 | " raise ValueError('If imagenet weights are being loaded, '\n",
122 | " 'depth multiplier must be 1')\n",
123 | " \n",
124 | " if alpha not in [0.25, 0.50, 0.75, 1.0]:\n",
125 | " raise ValueError('If imagenet weights are being loaded, '\n",
126 | " 'alpha can be one of'\n",
127 | " '`0.25`, `0.50`, `0.75` or `1.0` only.')\n",
128 | " \n",
129 | " if rows != cols or rows not in [128, 160, 192, 224]:\n",
130 | " rows = 224\n",
131 | " logging.warning('`input_shape` is undefined or non-square, '\n",
132 | " 'or `rows` is not in [128, 160, 192, 224]. '\n",
133 | " 'Weights for input shape (224, 224) will be'\n",
134 | " ' loaded as the default.')\n",
135 | " \n",
136 | " if input_tensor is None:\n",
137 | " img_input = layers.Input(shape = input_shape)\n",
138 | " else:\n",
139 | " if not backend.is_keras_tensor(input_tensor):\n",
140 | " img_input = layers.Input(tensor = input_tensor, shape = input_shape)\n",
141 | " else:\n",
142 | " img_input = input_tensor\n",
143 | "\n",
144 | " x = _conv_block(img_input, 32, alpha, stirdes = (2, 2))\n",
145 | " x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id = 1)\n",
146 | " x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, strides = (2, 2), block_id = 2)\n",
147 | "\n",
148 | " x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id = 3)\n",
149 | " x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, strides = (2, 2), block_id = 4)\n",
150 | "\n",
151 | " x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id = 5)\n",
152 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, strides = (2, 2), block_id = 6)\n",
153 | "\n",
154 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 7)\n",
155 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 8)\n",
156 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 9)\n",
157 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 10)\n",
158 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 11)\n",
159 | "\n",
160 | " x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, strides = (2, 2), block_id = 12)\n",
161 | " x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id = 13)\n",
162 | "\n",
163 | " if include_top:\n",
164 | " if backend.image_data_format() == 'channels_first':\n",
165 | " shape = (int(1024 * alpha), 1, 1)\n",
166 | " else:\n",
167 | " shape = (1, 1, int(1024 * alpha))\n",
168 | "\n",
169 | " x = layers.GlobalAvergarePooling2D()(x)\n",
170 | " x = layers.Reshape(shape, name = 'reshape_1')(x)\n",
171 | " x = layers.Dropout(dropout, name = 'dropout')(x)\n",
172 | " x = layers.Conv2D(classes, (1, 1), padding = 'same', name = 'conv_preds')(x)\n",
173 | " x = layers.Reshape((classes,), name = 'reshape_2')(x)\n",
174 | " imagenet_utils.validate_activation(classifier_activation, weights)\n",
175 | " x = layers.Activation(activation = classifier_activation, name = 'predictions')(x)\n",
176 | "\n",
177 | " else:\n",
178 | " if pooling == 'avg':\n",
179 | " x = layers.GlobalAveragePooling2D()(x)\n",
180 | " elif pooling == 'max':\n",
181 | " x = layers.GlobalMaxPooling2D()(x)\n",
182 | "\n",
183 | " if input_tensor is not None:\n",
184 | " inputs = layer_utils.get_source_inputs(input_tensor)\n",
185 | " else:\n",
186 | " inputs = img_input\n",
187 | "\n",
188 | " #모델 생성\n",
189 | " model = training.Model(inputs, x, name = 'mobilent_%0.2f_%s' % (alpha, rows))\n",
190 | "\n",
191 | " #가중치 불러오기\n",
192 | " if weights == 'imagenet':\n",
193 | " if alpha == 1.0:\n",
194 | " alpha_test = '1_0'\n",
195 | " elif alpha == 0.75:\n",
196 | " aplha_text = '7_5'\n",
197 | " elif alpha == 0.50:\n",
198 | " alpha_text = '5_0'\n",
199 | " else:\n",
200 | " alpha_text = '2_5'\n",
201 | "\n",
202 | " if include_top:\n",
203 | " model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows)\n",
204 | " weight_path = BASE_WEIGHT_PATH + model_name\n",
205 | " weights_path = data_utils.get_file(model_name, weight_path, cache_subdir = 'models')\n",
206 | " else:\n",
207 | " model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows)\n",
208 | " weight_path = BASE_WEIGHT_PATH + model_name\n",
209 | " weights_path = data_utils.get_file(model_name, weight_path, cache_subdir = 'models')\n",
210 | " model.load_weights(weights_path)\n",
211 | " elif weights is not None:\n",
212 | " model.load_weights(weights)\n",
213 | "\n",
214 | " return model\n",
215 | "\n",
216 | "def _conv_block(inputs, filters, alpha, kernel = (3, 3), strides = (1, 1)):\n",
217 | " #inputs: 'channels_last'면 (rows, cols, 3) / 'channels_first'면 (3, rows, cols) 식으로 입력 조정\n",
218 | " #filters: output space의 차원수\n",
219 | " #alpha: network의 width 조정. alpha가 1.0보다 작으면 각 레이어의 필터 수 줄어듬\n",
220 | " #반면에, alpha가 1.0 보다 크다면 각 레이어의 필터 수가 증가함\n",
221 | " #kernel: 합성곱 윈도우의 height와 width 조정\n",
222 | " #strides: stride 정의\n",
223 | " \n",
224 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n",
225 | " filters = int(filters * alpha)\n",
226 | " x = layers.Conv2D(filters, kernel, padding = 'same', use_bias = False,\n",
227 | " strides = strides, name = 'conv1')(inputs)\n",
228 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv1_bn')(x)\n",
229 | " return layers.ReLU(6., name = 'conv1_relu')(x)\n",
230 | "\n",
231 | "def _depthwise_conv_block(inputs, pointwise_conv_filters, alpha, depth_multiplier = 1,\n",
232 | " strides = (1, 1), block_id = 1):\n",
233 | " #input: 입력 텐서의 모양. 이전의 정의와 동일\n",
234 | " #pointwise_conv_filters: output space의 차원수\n",
235 | " #alpha: 이전의 정의와 동일\n",
236 | " #depth_multiplier: 각 입력 채널에 대한 depthwise convolution output channel의 수\n",
237 | " #strides: 이전의 정의와 동일\n",
238 | " #block_id: block의 수를 관리하기 위한 특별한 integer\n",
239 | "\n",
240 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n",
241 | " filters = int(filters * alpha)\n",
242 | " \n",
243 | " if strides == (1, 1):\n",
244 | " x = inputs\n",
245 | " else:\n",
246 | " x = layers.ZeroPadding2D(((0, 1), (0, 1)), name = 'conv_pad_%d' % block_id)(inputs)\n",
247 | "\n",
248 | " x = layers.DepthwiseConv2D((3, 3), padding = 'same' if strides == (1, 1) else 'valid',\n",
249 | " depth_multiplier = depth_multiplier, strides = strides,\n",
250 | " use_bias = False, name = 'conv_dw_%d' % block_id)(x)\n",
251 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv_dw_%d_bn' % block_id)(x)\n",
252 | " x = layers.ReLU(6., name = 'conv_dw_%d_relu' % block_id)(x)\n",
253 | " \n",
254 | " x = layers.Conv2D(pointwise_conv_filters, (1, 1), padding = 'same', use_bias = False, \n",
255 | " strides = (1, 1), name = 'conv_dw_%d' % block_id)(x)\n",
256 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv_dw_%d_bn' % block_id)(x)\n",
257 | " return layers.ReLU(6., name = 'conv_dw_%d_relu' % block_id)(x)\n",
258 | "\n",
259 | "@keras_export('keras.applications.mobilenet.preprocess_input')\n",
260 | "def preprocess_input(x, data_format = None):\n",
261 | " return imagenet_utils.preprocess_input(x, data_format = data_format, mode = 'tf')\n",
262 | "\n",
263 | "@keras_export('keras.applications.mobilenet.decode_predictions')\n",
264 | "def decode_predictions(preds, top = 5):\n",
265 | " return imagenet_utils.decode_predictions(preds, top = top)\n",
266 | "\n",
267 | "preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format(\n",
268 | " mode='',\n",
269 | " ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF,\n",
270 | " error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC)\n",
271 | "decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__"
272 | ],
273 | "metadata": {
274 | "id": "bcnEZ5GDo6cq"
275 | },
276 | "execution_count": null,
277 | "outputs": []
278 | }
279 | ]
280 | }
--------------------------------------------------------------------------------
/Computer Vision/CNN/README.md:
--------------------------------------------------------------------------------
1 | # Various CNN model implementations
2 | 
3 | I implemented GoogLeNet, ResNet, DenseNet, EfficientNet, and MobileNet.
4 | 
5 | You can check out my CNN paper review here -> https://cartinoe5930.tistory.com/entry/CNN-network%EC%9D%98-%EC%97%AD%EC%82%AC
6 |
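7 | Below is a minimal usage sketch for the DenseNet notebook. It assumes the notebook's code has been copied into a local module (here called `densenet.py`, a hypothetical name):
8 | 
9 | ```python
10 | import torch
11 | from densenet import densenet121  # hypothetical module holding the notebook's code
12 | 
13 | # build the model without downloading pretrained weights
14 | model = densenet121(pretrained = False, num_classes = 1000)
15 | x = torch.randn(1, 3, 224, 224)  # a single 224x224 RGB image
16 | logits = model(x)                # shape: (1, 1000)
17 | ```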
--------------------------------------------------------------------------------
/Computer Vision/CNN/ResNet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyOi4qpO/A/t6wycK3+hwkbI",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "el5SGMXsyXow"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import torch\n",
38 | "import torch.nn as nn\n",
39 | "#from .utils import load_state_from_url\n",
40 | "\n",
41 | "#ResNet 모델 종류\n",
42 | "__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',\n",
43 | " 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',\n",
44 | " 'wide_resnet50_2', 'wide_resnet101_2']\n",
45 | "\n",
46 | "#ResNet 모델별 URL\n",
47 | "model_urls = {\n",
48 | " 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',\n",
49 | " 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',\n",
50 | " 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',\n",
51 | " 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',\n",
52 | " 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',\n",
53 | " 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',\n",
54 | " 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',\n",
55 | " 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',\n",
56 | " 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',\n",
57 | "}\n",
58 | "\n",
59 | "#3X3 conv layer 구현\n",
60 | "def conv3x3(in_planes, out_planes, stride = 1, groups = 1, dilation = 1):\n",
61 | " #padding과 함께 3x3 conv layer 구현\n",
62 | " return nn.Conv2d(in_planes, out_planes, kernel_size = 3, stride = stride, padding = dilation, groups = groups, bias = False, dilation = dilation)\n",
63 | "\n",
64 | "def conv1x1(in_planes, out_planes, stride = 1):\n",
65 | " #1X1 conv layer 구현\n",
66 | " return nn.Conv2d(in_planes, out_planes, kernel_size = 1, stride = stride, bias = True)\n",
67 | "\n",
68 | "class BasicBlock(nn.Module):\n",
69 | " expansion = 1\n",
70 | "\n",
71 | " def __init__(self, inplanes, planes, stride = 1, downsample = None, groups = 1,\n",
72 | " base_width = 64, dilation = 1, norm_layers = None):\n",
73 | " super(BasicBlock, self).__init__()\n",
74 | " if norm_layer is None:\n",
75 | " norm_layer = nn.BatchNorm2d\n",
76 | " if groups != 1 or base_width != 64:\n",
77 | " raise ValueError('BasicBlock only supports groups = 1 and base_width = 64')\n",
78 | " if dilation > 1:\n",
79 | " raise NotImplementedError('Dilation > 1 not supported in BasicBlock')\n",
80 | "\n",
81 | " #stride가 1일 때, self.conv layer와 self.downsample layer는 입력을 downsample함\n",
82 | " self.conv1 = conv3x3(inplanes, planes, stride)\n",
83 | " self.bn1 = norm_layer(planes)\n",
84 | " self.relu = nn.ReLU(inplace = True)\n",
85 | " self.conv2 = conv3x3(planes, planes)\n",
86 | " self.bn2 = norm_layer(planes)\n",
87 | " self.downsample = downsample\n",
88 | " self.stride = stride\n",
89 | "\n",
90 | " def forward(self, x):\n",
91 | " identity = x\n",
92 | "\n",
93 | " out = self.conv1(x)\n",
94 | " out = self.bn1(out)\n",
95 | " out = self.relu(out)\n",
96 | " \n",
97 | " out = self.conv2(out)\n",
98 | " out = self.bn2(out)\n",
99 | "\n",
100 | " if self.downsample is not None:\n",
101 | " identity = self.downsample(x)\n",
102 | "\n",
103 | " out += identity\n",
104 | " out = self.relu(out)\n",
105 | "\n",
106 | " return out\n",
107 | "\n",
108 | "\n",
109 | "class Bottleneck(nn.Module):\n",
110 | " expansion = 4\n",
111 | "\n",
112 | " def __init__(self, inplanes, planes, stride = 1, downsample = None, groups = 1, \n",
113 | " base_width = 64, dilation = 1, norm_layer = None):\n",
114 | " super(Bottleneck, self).__init__()\n",
115 | " if norm_layer is None:\n",
116 | " norm_layer = nn.BatchNorm2d\n",
117 | " width = int(planes * (base_width / 64.)) * groups\n",
118 | " self.conv1 = conv1x1(inplanes, width)\n",
119 | " self.bn1 = norm_layer(width)\n",
120 | " self.conv2 = conv3x3(width, width, stride, groups, dilation)\n",
121 | " self.bn2 = norm_layer(width)\n",
122 | " self.conv3 = conv1x1(width, planes * self.expansion)\n",
123 | " self.bn3 = norm_layer(planes * self.expansion)\n",
124 | " self.relu = nn.ReLU(inplace = True)\n",
125 | " self.downsample = downsample\n",
126 | " self.stride = stride\n",
127 | "\n",
128 | " def forward(self, x):\n",
129 | " identity = x\n",
130 | "\n",
131 | " out = self.conv1(x)\n",
132 | " out = self.bn1(out)\n",
133 | " out = self.relu(out)\n",
134 | "\n",
135 | " out = self.conv2(out)\n",
136 | " out = self.bn2(out)\n",
137 | " out = self.relu(out)\n",
138 | "\n",
139 | " out = self.conv3(out)\n",
140 | " out = self.bn3(out)\n",
141 | "\n",
142 | " if self.downsample is not None:\n",
143 | " identity = self.downsample(x)\n",
144 | "\n",
145 | " out += identity\n",
146 | " out = self.relu(out)\n",
147 | "\n",
148 | " return out\n",
149 | "\n",
150 | "class ResNet(nn.Module):\n",
151 | "\n",
152 | " def __init__(self, block, layers, num_classes = 1000, zero_init_residual = False,\n",
153 | " groups = 1, width_per_group = 64, replace_stride_width_dilation = None):\n",
154 | " super(ResNet, self).__init__()\n",
155 | " if norm_layer is None:\n",
156 | " norm_layer = nn.BacthNorm2d\n",
157 | " self.norm_layer = norm_layer\n",
158 | "\n",
159 | " self.inplanes = 64\n",
160 | " self.dilation = 1\n",
161 | " if replace_stride_width_dilation is None:\n",
162 | " replace_stride_width_dilation = [False, False, False]\n",
163 | " if len(replace_stride_width_dilation) != 3:\n",
164 | " raise ValueError(\"replace_stride_width_dilation should be None\"\n",
165 | " \"of a 3-element tuple, got {}\".format(replace_stride_width_dilation))\n",
166 | " self.groups = groups\n",
167 | " self.base_width = width_per_group\n",
168 | " self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size = 7, stride = 2, padding = 3, bias = True)\n",
169 | " self.bn1 = norm_layer(self.inplanes)\n",
170 | " self.relu = nn.ReLU(inplace = True)\n",
171 | " self.maxpool = nn.MaxPool2D(kernel_size = 3, stride = 2, padding = 1)\n",
172 | " self.layer1 = self._make_layer(block, 64, layers[0])\n",
173 | "        self.layer2 = self._make_layer(block, 128, layers[1], stride = 2, \n",
174 | "                                       dilate = replace_stride_with_dilation[0])\n",
175 | "        self.layer3 = self._make_layer(block, 256, layers[2], stride = 2, \n",
176 | "                                       dilate = replace_stride_with_dilation[1])\n",
177 | "        self.layer4 = self._make_layer(block, 512, layers[3], stride = 2, \n",
178 | "                                       dilate = replace_stride_with_dilation[2])\n",
179 | " self.avgpool = nn.AdaptiveAvgPool2d((1, 1))\n",
180 | " self.fc = nn.Linear(512 * block.expansion, num_classes)\n",
181 | "\n",
182 | " for m in self.modules():\n",
183 | " if isinstance(m, nn.Conv2d):\n",
184 | " nn.init.kaiming_normal_(m.weight, mode = 'fan_out', nonlinearity = 'relu')\n",
185 | " elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):\n",
186 | " nn.init.constant_(m.weight, 1)\n",
187 | " nn.init.constant_(m.bias, 0)\n",
188 | "\n",
189 | " if zero_init_residual:\n",
190 | " for m in self.modules():\n",
191 | " if isinstance(m, Bottleneck):\n",
192 | " nn.init.constant_(m.bn3.weight, 0)\n",
193 | " elif isinstance(m, BasicBlock):\n",
194 | " nn.init.constant_(m.bn2.weight, 0)\n",
195 | "\n",
196 | " def _make_layer(self, block, planes, blocks, stride = 1, dilate = False):\n",
197 | " norm_layer = self._norm_layer\n",
198 | " downsample = None\n",
199 | " previous_dilation = self.dilation\n",
200 | " if dilate:\n",
201 | " self.dilation *= stride\n",
202 | " stride = 1\n",
203 | " if stride != 1 or self.inplanes != planes * block.expansion:\n",
204 | " downsample = nn.Sequential(\n",
205 | " conv1x1(self.inplanes, planes * block.expansion, stride), \n",
206 | " norm_layer(planes * block.expansion),\n",
207 | " )\n",
208 | " \n",
209 | " layers = []\n",
210 | " layers.append(block(self.inplanes, planes, stride, downsample, self.groups,\n",
211 | " self.base_width, previous_dilation, norm_layer))\n",
212 | " self.inplanes = planes * block.expansion\n",
213 | "\n",
214 | " for _ in range(1, blocks):\n",
215 | " layers.append(block(self.inplanes, planes, groups = self.groups,\n",
216 | " base_width = self.base_width, dilation = self.dilation,\n",
217 | " norm_layer = norm_layer))\n",
218 | " \n",
219 | " return nn.Sequential(*layers)\n",
220 | "\n",
221 | " def _forward_impl(self, x):\n",
222 | " x = self.conv1(x)\n",
223 | " x = self.bn1(x)\n",
224 | " x = self.relu(x)\n",
225 | " x = self.maxpool(x)\n",
226 | "\n",
227 | " x = self.layer1(x)\n",
228 | " x = self.layer2(x)\n",
229 | " x = self.layer3(x)\n",
230 | " x = self.layer4(x)\n",
231 | "\n",
232 | " x = self.avgpool(x)\n",
233 | " x = torch.flatten(x, 1)\n",
234 | " x = self.fc(x)\n",
235 | "\n",
236 | " return x\n",
237 | "\n",
238 | " def forward(self, x):\n",
239 | " return self._forward_impl(x)\n",
240 | "\n",
241 | "def _resnet(arch, block, layers, pretrained, progress, **kwargs):\n",
242 | " model = ResNet(block, layers, **kwargs)\n",
243 | " if pretrained:\n",
244 | " state_dict = load_state_dict_from_url(model_urls[arch], progress = progress)\n",
245 | " model.load_state_dict(state_dict)\n",
246 | " return model\n",
247 | "\n",
248 | "def resnext50_32x4d(pretrained = False, progress = True, **kwargs):\n",
249 | " kwargs['groups'] = 32\n",
250 | " kwargs['width_per_group'] = 4\n",
251 | " return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],\n",
252 | " pretrained, progress, **kwargs)\n",
253 | " \n",
254 | "def resnext101_32x8d(pretrained = False, progress = True, **kwargs):\n",
255 | "    kwargs['groups'] = 32\n",
256 | "    kwargs['width_per_group'] = 8\n",
257 | "    return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],\n",
258 | "                   pretrained, progress, **kwargs)\n",
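"\n",
"# A minimal usage sketch (not part of the original code): build an untrained\n",
"# ResNeXt-50 from the classes above and push a dummy batch through it.\n",
"# model = resnext50_32x4d(pretrained = False)\n",
"# out = model(torch.randn(1, 3, 224, 224))\n",
"# print(out.shape)  # torch.Size([1, 1000])"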
259 | ]
260 | }
261 | ]
262 | }
--------------------------------------------------------------------------------
/Computer Vision/CNN/Xception.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyOnDX2S9zG8B6oYDzT8Z794",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "source": [
32 | "# Notes on the Xception model\n",
33 | "# At the time of publication it was state of the art on ImageNet\n",
34 | "# Unlike VGG16 and ResNet (224x224), it uses 299x299 input images\n",
35 | "# The preprocessing is also different (same as Inception V3)\n",
36 | "\n",
37 | "from __future__ import absolute_import\n",
38 | "from __future__ import division\n",
39 | "from __future__ import print_function\n",
40 | "\n",
41 | "import os\n",
42 | "import warnings\n",
43 | "\n",
44 | "import keras\n",
45 | "from keras import layers\n",
46 | "from keras.models import Sequential\n",
47 | "from keras import backend\n",
48 | "# These imports raise errors here, so they are left commented out\n",
49 | "#from . import get_submodules_from_kwargs\n",
50 | "#from . import imagenet_utils\n",
51 | "#from .imagenet_utils import decode_predictions\n",
52 | "#from .imagenet_utils import _obtain_input_shape\n",
53 | "\n",
54 | "TF_WEIGHTS_PATH = (\n",
55 | " 'https://github.com/fchollet/deep-learning-models/'\n",
56 | " 'releases/download/v0.4/'\n",
57 | " 'xception_weights_tf_dim_ordering_tf_kernels.h5'\n",
58 | ")\n",
59 | "\n",
60 | "TF_WEIGHTS_PATH_NO_TOP = (\n",
61 | " 'https://github.com/fchollet/deep-learning-models/'\n",
62 | " 'releases/download/v0.4/'\n",
63 | " 'xception_weights_tf_dim_ordering_tf_kernels_notop.h5'\n",
64 | ")\n",
65 | "\n",
66 | "def Xception(include_top = True, weights = 'imagenet', input_tensor = None, \n",
67 | " input_shape = None, pooling = None, classes = 1000, **kwargs):\n",
68 | " \n",
69 | "    # The default input image size is 299 x 299\n",
70 | "    # include_top: whether to include the fully-connected layer at the top of the network\n",
71 | "    # weights: None for random init, 'imagenet' for ImageNet pre-training, or a path to a weights file\n",
72 | "    # input_tensor: optional Keras tensor to use as the model's image input\n",
73 | "    # input_shape: optional shape tuple, only usable when include_top is False\n",
74 | "    # pooling: optional pooling mode for feature extraction, only usable when include_top is False\n",
75 | "    #   None: the model output is a 4D tensor; 'avg': global average pooling, so the output is a 2D tensor\n",
76 | "    #   'max': global max pooling\n",
77 | "    # classes: optional number of classes, only used when include_top is True and no weights file is given\n",
78 | "\n",
79 | "    # Validate the weights argument\n",
80 | " if not (weights in {'imagenet', None} or os.path.exists(weights)):\n",
81 | " raise ValueError('The `weights` argument should be either '\n",
82 | " '`None` (random initialization), `imagenet` '\n",
83 | " '(pre-training on ImageNet), '\n",
84 | " 'or the path to the weights file to be loaded.')\n",
85 | " \n",
86 | "    # Using 'imagenet' weights with an incompatible configuration\n",
87 | " if weights == 'imagenet' and include_top and classes != 1000:\n",
88 | " raise ValueError('If using `weights` as `\"imagenet\"` with `include_top`'\n",
89 | " ' as true, `classes` should be 1000')\n",
90 | " \n",
91 | "    # Determine the proper input shape\n",
92 | " input_shape = _obtain_input_shape(input_shape, default_size = 299, min_size = 71,\n",
93 | " data_format = backend.image_data_format(),\n",
94 | " require_flatten = include_top, weights = weights)\n",
95 | " \n",
96 | " if input_tensor is None:\n",
97 | " img_input = layers.Input(shape = input_shape)\n",
98 | " else:\n",
99 | " if not backend.is_keras_tensor(input_tensor):\n",
100 | " img_input = layers.Input(tensor = input_tensor, shape = input_shape)\n",
101 | " else:\n",
102 | " img_input = input_tensor\n",
103 | "\n",
104 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n",
105 | "\n",
106 | "    # Entry Flow\n",
107 | "    # Input image stem\n",
108 | " x = layers.Conv2D(32, (3, 3), strides = (2, 2), use_bias = False,\n",
109 | " name = 'block1_conv1')(img_input)\n",
110 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block1_conv1_bn')(x)\n",
111 | " x = layers.Activation('relu', name = 'block1_conv1_act')(x)\n",
112 | "    x = layers.Conv2D(64, (3, 3), use_bias = False, name = 'block1_conv2')(x)\n",
113 | " x = layers.Activation('relu', name = 'block1_conv2_act')(x)\n",
114 | "\n",
115 | "    # First residual connection\n",
116 | "    residual = layers.Conv2D(128, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n",
117 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n",
118 | "\n",
119 | " x = layers.SeparableConv2D(128, (3, 3), padding = 'same', use_bias = False,\n",
120 | " name = 'block2_sepconv1')(x)\n",
121 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block2_sepconv1_bn')(x)\n",
122 | " x = layers.Activation('relu', name = 'block2_sepconv2_act')(x)\n",
123 | "    x = layers.SeparableConv2D(128, (3, 3), padding = 'same', use_bias = False, \n",
124 | " name = 'block2_sepconv2')(x)\n",
125 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block2_sepconv2_bn')(x)\n",
126 | "\n",
127 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', \n",
128 | " name = 'block2_pool')(x)\n",
129 | " x = layers.add([x, residual])\n",
130 | "\n",
131 | "    # Second residual connection\n",
132 | "    residual = layers.Conv2D(256, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n",
133 | "    residual = layers.BatchNormalization(axis = channel_axis)(residual)\n",
134 | "\n",
135 | " x = layers.Activation('relu', name = 'block3_conv1_act')(x)\n",
136 | "    x = layers.SeparableConv2D(256, (3, 3), padding = 'same', use_bias = False, \n",
137 | " name = 'block3_sepconv1')(x)\n",
138 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block3_sepconv1_bn')(x)\n",
139 | "\n",
140 | " x = layers.Activation('relu', name = 'block3_conv2_act')(x)\n",
141 | "    x = layers.SeparableConv2D(256, (3, 3), padding = 'same', use_bias = False, \n",
142 | " name = 'block3_sepconv2')(x)\n",
143 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block3_sepconv2_bn')(x)\n",
144 | "\n",
145 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', \n",
146 | " name = 'block3_pool')(x)\n",
147 | "\n",
148 | " x = layers.add([x, residual])\n",
149 | "\n",
150 | "    # Third residual connection\n",
151 | "    residual = layers.Conv2D(728, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n",
152 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n",
153 | "\n",
154 | " x = layers.Activation('relu', name = 'block4_conv1_act')(x)\n",
155 | "    x = layers.SeparableConv2D(728, (3, 3), padding = 'same', use_bias = False, \n",
156 | " name = 'block4_sepconv1')(x)\n",
157 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block4_sepconv1_bn')(x)\n",
158 | "\n",
159 | "    x = layers.Activation('relu', name = 'block4_conv2_act')(x)\n",
160 | "    x = layers.SeparableConv2D(728, (3, 3), padding = 'same', use_bias = False,\n",
161 | " name = 'block4_sepconv2')(x)\n",
162 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block4_sepconv2_bn')(x)\n",
163 | " \n",
164 | "    x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', name = 'block4_pool')(x)\n",
165 | "\n",
166 | " x = layers.add([x, residual])\n",
167 | "\n",
168 | " #Middle Flow\n",
169 | " for i in range(8):\n",
170 | " residual = x\n",
171 | "        prefix = 'block' + str(i + 5)  # automatically name the middle-flow blocks\n",
172 | " \n",
173 | " x = layers.Activation('relu', name = prefix + '_sepconv1_act')(x)\n",
174 | "        x = layers.SeparableConv2D(728, (3, 3), padding = 'same', use_bias = False, \n",
175 | " name = prefix + '_sepconv1')(x)\n",
176 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv1_bn')(x)\n",
177 | "\n",
178 | " x = layers.Activation('relu', name = prefix + '_sepconv2_act')(x)\n",
179 | "        x = layers.SeparableConv2D(728, (3, 3), padding = 'same', use_bias = False, \n",
180 | " name = prefix + '_sepconv2')(x)\n",
181 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv2_bn')(x)\n",
182 | "\n",
183 | " x = layers.Activation('relu', name = prefix + '_sepconv3_act')(x)\n",
184 | "        x = layers.SeparableConv2D(728, (3, 3), padding = 'same', use_bias = False, \n",
185 | " name = prefix + '_sepconv3')(x)\n",
186 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv3_bn')(x)\n",
187 | "\n",
188 | " x = layers.add([x, residual])\n",
189 | "\n",
190 | " #Exit Flow\n",
191 | "    residual = layers.Conv2D(1024, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n",
192 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n",
193 | "\n",
194 | " x = layers.Activation('relu', name = 'block13_sepconv1_act')(x)\n",
195 | "    x = layers.SeparableConv2D(728, (3, 3), padding = 'same', use_bias = False, \n",
196 | " name = 'block13_sepconv1')(x)\n",
197 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block13_sepconv1_bn')(x)\n",
198 | "\n",
199 | "    x = layers.Activation('relu', name = 'block13_sepconv2_act')(x)\n",
200 | "    x = layers.SeparableConv2D(1024, (3, 3), padding = 'same', use_bias = False, \n",
201 | " name = 'block13_sepconv2')(x)\n",
202 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block13_sepconv2_bn')(x)\n",
203 | "\n",
204 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', name = 'block13_pool')(x)\n",
205 | "\n",
206 | " x = layers.add([x, residual])\n",
207 | "\n",
208 | "    x = layers.SeparableConv2D(1536, (3, 3), padding = 'same', use_bias = False,\n",
209 | " name = 'block14_sepconv1')(x)\n",
210 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block14_sepconv1_bn')(x)\n",
211 | "    x = layers.Activation('relu', name = 'block14_sepconv1_act')(x)\n",
212 | "\n",
213 | "    x = layers.SeparableConv2D(2048, (3, 3), padding = 'same', use_bias = False,\n",
214 | " name = 'block14_sepconv2')(x)\n",
215 | "    x = layers.BatchNormalization(axis = channel_axis, name = 'block14_sepconv2_bn')(x)\n",
216 | " x = layers.Activation('relu', name = 'block14_sepconv2_act')(x)\n",
217 | "\n",
218 | " if include_top:\n",
219 | " x = layers.GlobalAveragePooling2D(name = 'avg_pool')(x)\n",
220 | " x = layers.Dense(classes, activation = 'softmax', name = 'predictions')(x)\n",
221 | " else:\n",
222 | " if pooling == 'avg':\n",
223 | " x = layers.GlobalAveragePooling2D()(x)\n",
224 | " elif pooling == 'max':\n",
225 | "            x = layers.GlobalMaxPooling2D()(x)\n",
226 | "\n",
227 | " if input_tensor is not None:\n",
228 | " inputs = keras_utils.get_source_inputs(input_tensor)\n",
229 | " else:\n",
230 | " inputs = img_input\n",
231 | "\n",
232 | "    # Create the model, then load weights if requested\n",
"    model = keras.models.Model(inputs, x, name = 'xception')\n",
"\n",
233 | "    if weights == 'imagenet':\n",
234 | " if include_top:\n",
235 | " weights_path = keras_utils.get_file(\n",
236 | " 'xception_weights_tf_dim_ordering_tf_kernels.h5',\n",
237 | " TF_WEIGHTS_PATH,\n",
238 | " cache_subdir='models',\n",
239 | " file_hash='0a58e3b7378bc2990ea3b43d5981f1f6'\n",
240 | " )\n",
241 | " else:\n",
242 | " weights_path = keras_utils.get_file(\n",
243 | " 'xception_weights_tf_dim_ordering_tf_kernels_notop.h5',\n",
244 | " TF_WEIGHTS_PATH_NO_TOP,\n",
245 | " cache_subdir='models',\n",
246 | " file_hash='b0042744bf5b25fce3cb969f33bebb97'\n",
247 | " )\n",
248 | " model.load_weights(weights_path)\n",
249 | " if backend.backend() == 'theano':\n",
250 | " keras_utils.convert_all_kernels_in_model(model)\n",
251 | " elif weights is not None:\n",
252 | " model.load_weights(weights)\n",
253 | "\n",
254 | " return model\n",
255 | "\n",
256 | "def preprocess_input(x, **kwargs):\n",
257 | "    # Preprocess a numpy array encoding a batch of images\n",
258 | "    return imagenet_utils.preprocess_input(x, mode = 'tf', **kwargs)\n",
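"\n",
"# A minimal usage sketch (not part of the original code, and it assumes the\n",
"# commented-out keras_applications helpers above are available in the session):\n",
"# build the network with random weights and inspect it.\n",
"# model = Xception(include_top = True, weights = None)\n",
"# model.summary()"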
259 | ],
260 | "metadata": {
261 | "id": "2OJ6-sJPoqxo"
262 | },
263 | "execution_count": null,
264 | "outputs": []
265 | }
266 | ]
267 | }
--------------------------------------------------------------------------------
/Computer Vision/README.md:
--------------------------------------------------------------------------------
1 | # Computer Vision Paper Implementation
2 |
3 | I read these Deep Learning papers and implemented them in code. 😉
4 | Some have not been implemented yet, but they will be added in the future. 😊
5 |
6 | |Paper Title|Paper or reference site Link|Paper Review|
7 | |---|---|---|
8 | |History of CNN|LeNet, AlexNet, VGGNet, GoogLeNet, ResNet, ResNeXt, Xception, MobileNet, DenseNet, EfficientNet, ConvNeXt|https://cartinoe5930.tistory.com/entry/CNN-network%EC%9D%98-%EC%97%AD%EC%82%AC|
9 | |ViT: An Image is Worth 16 x 16 Words: Transformers for Image Recognition at Scale|https://arxiv.org/abs/2010.11929|https://cartinoe5930.tistory.com/entry/ViT-An-Image-Worth-16-x-16-Words-Transformers-for-Image-Recognition-at-Scale|
10 | |Swin Transformer: Hierarchical Vision Transformer using Shifted Windows|https://arxiv.org/abs/2103.14030|https://cartinoe5930.tistory.com/entry/Swin-Transformer-Hierarchical-Vision-Transformer-using-Shifted-Windows-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
11 | |CLIP: Learning Transferable Visual Models From Natural Language Supervision|https://arxiv.org/abs/2103.00020|https://cartinoe5930.tistory.com/entry/CLIP-Learning-Transferable-Visual-Models-From-Natural-Language-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
12 |
--------------------------------------------------------------------------------
/Multimodal Models/FLAVA/README.md:
--------------------------------------------------------------------------------
1 | # Interacting with FLAVA
2 |
3 | Written with reference to https://github.com/apsdehal/flava-tutorials.
4 |
5 | paper review: https://cartinoe5930.tistory.com/entry/FLAVA-A-Foundational-Language-And-Vision-Alignment-Model-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0
6 |
--------------------------------------------------------------------------------
/Multimodal Models/README.md:
--------------------------------------------------------------------------------
1 | # Multimodal Models paper code implementation
2 |
3 | I read these Multimodal Model papers and implemented them in code (PyTorch, TensorFlow, etc.). 😉
4 | Some have not been implemented yet, but they will be added in the future. 😊
5 |
6 | ## Multi-modal Models
7 |
8 | |Paper Title|Paper or reference site Link|Paper Review|
9 | |---|---|---|
10 | |Let's learn about VLM(Visual-Language Model)|https://huggingface.co/blog/vision_language_pretraining#supporting-vision-language-models-in-%F0%9F%A4%97-transformers|https://cartinoe5930.tistory.com/entry/VLMVision-Language-Model%EC%97%90-%EB%8C%80%ED%95%B4-%EC%95%8C%EC%95%84%EB%B3%B4%EC%9E%90|
11 | |VisualBERT: A Simple and Performant Baseline for Vision and Language|https://arxiv.org/abs/1908.03557|https://cartinoe5930.tistory.com/entry/VisualBERT-A-Simple-and-Performant-Baseline-for-Vision-and-Language-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
12 | |ViLBERT: Pre-training Task-Agnostic Visiolinguistic Representations for Visual-and-Language Tasks|https://arxiv.org/abs/1908.02265|https://cartinoe5930.tistory.com/entry/ViLBERT-Pretraining-Task-Agnostic-Visiolinguistic-Representations-for-Visual-and-Language-Tasks|
13 | |LXMERT: Learning Cross-Modality Encoder Representations from Transformers|https://arxiv.org/abs/1908.07490|https://cartinoe5930.tistory.com/entry/LXMERT-Learning-Cross-Modality-Encoder-Representations-from-Transformers-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
14 | |VL-BERT: Pre-training of Generic Visual-Linguistic Representations|https://arxiv.org/abs/1908.08530|https://cartinoe5930.tistory.com/entry/VL-BERT-Pre-training-of-Generic-Visual-Linguistic-Representations-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
15 | |VLP: Unified Vision-Language Pre-Training for Image Captioning and VQA|https://arxiv.org/abs/1909.11059|https://cartinoe5930.tistory.com/entry/VLP-Unified-Vision-Language-Pre-Traning-for-Image-Captioning-and-VQA-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
16 | |Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks|https://arxiv.org/abs/2004.06165|https://cartinoe5930.tistory.com/entry/Oscar-Object-Semantics-Aligned-Pre-training-for-Vision-Language-Tasks-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
17 | |ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision|https://arxiv.org/abs/2102.03334|https://cartinoe5930.tistory.com/entry/ViLT-Vision-and-Language-Transformer-Without-Convolution-or-Region-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
18 | |ALIGN: Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision|https://arxiv.org/abs/2102.05918|https://cartinoe5930.tistory.com/entry/ALIGN-Scaling-up-Visual-and-Vision-Language-Representation-with-Noisy-Text-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
19 | |ALBEF: Vision and Language Representation Learning with Momentum Distillation|https://arxiv.org/abs/2107.07651|https://cartinoe5930.tistory.com/entry/ALBEF-Vision-and-Language-Representation-Learning-with-Momentum-Distillation-%EB%85%BC%EB%AC%B8|
20 | |SimVLM: Simple Visual Language Model Pretraining with Weak Supervision|https://arxiv.org/abs/2108.10904|https://cartinoe5930.tistory.com/entry/SimVLM-Simple-Visual-Language-Model-Pre-training-with-Weak-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
21 | |BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation|https://arxiv.org/abs/2201.12086|https://cartinoe5930.tistory.com/entry/BLIP-Bootstrapping-Language-Image-Pre-training-fro-Unified-Vision-Language-Understanding-and-Generation-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
22 | |FLAVA: A Foundational Language And Vision Alignment Model|https://arxiv.org/abs/2112.04482|https://cartinoe5930.tistory.com/entry/FLAVA-A-Foundational-Language-And-Vision-Alignment-Model-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
23 |
--------------------------------------------------------------------------------
/Natural Language Processing/ALBERT/README.md:
--------------------------------------------------------------------------------
1 | # ALBERT Implementation
2 |
3 | Written with reference to https://github.com/google-research/albert/blob/master/modeling.py.
4 |
5 | paper review: https://cartinoe5930.tistory.com/entry/ALBERT-A-Lite-BERT-for-Self-supervised-Learning-of-Language-Representations-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0
6 |
--------------------------------------------------------------------------------
/Natural Language Processing/BERT/BERT_구현_복습.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyNIeutm5STI86h0MtzDj0Xc",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "source": [
32 | "# BERT Implementation Review\n",
33 | "\n",
34 | "I have already implemented BERT once before; this time I implement it again with a more concrete example. The code follows [this tutorial](https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial).\n",
35 | "\n",
36 | "BERT is implemented in PyTorch, and the implementation is split into the following four sections.\n",
37 | "\n",
38 | "1. Preprocessing\n",
39 | "2. Modeling\n",
40 | "3. Loss & Optimization\n",
41 | "4. Training\n"
42 | ],
43 | "metadata": {
44 | "id": "Nrb7y3QLDj3t"
45 | }
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "source": [
50 | "### Preprocessing\n",
51 | "\n",
52 | "In the preprocessing step we build the data so that the neural network can consume it. Let's start from the raw text."
53 | ],
54 | "metadata": {
55 | "id": "b0Qw4c4uEPTv"
56 | }
57 | },
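{
"cell_type": "code",
"source": [
"# Imports assumed by the cells below (they are not declared elsewhere in this\n",
"# notebook): torch/nn/optim for the model, numpy for the attention scaling,\n",
"# and the random helpers used inside make_batch().\n",
"import re\n",
"import math\n",
"from random import randrange, shuffle, random, randint\n",
"\n",
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim"
],
"metadata": {},
"execution_count": null,
"outputs": []
},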
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {
62 | "id": "1fTURcMQDe59"
63 | },
64 | "outputs": [],
65 | "source": [
66 | "# raw text\n",
67 | "\n",
68 | "text = (\n",
69 | "    'Hello, how are you? I am Romeo.\\n'\n",
70 | "    'Hello, Romeo My name is Juliet. Nice to meet you.\\n'\n",
71 | "    'Nice meet you too. How are you today?\\n'\n",
72 | "    'Great. My baseball team won the competition.\\n'\n",
73 | "    'Oh Congratulations, Juliet\\n'\n",
74 | " 'Thanks you Romeo'\n",
75 | " )"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "source": [
81 | "Next, the data has to be cleaned up as follows.\n",
82 | "\n",
83 | "- Convert the sentences to lower case\n",
84 | "- Build the vocabulary. The **vocabulary** is the list of unique words in the document."
85 | ],
86 | "metadata": {
87 | "id": "vkL4zzsxElEn"
88 | }
89 | },
90 | {
91 | "cell_type": "code",
92 | "source": [
93 | "# '.', ',', '?', '!' filtering\n",
94 | "sentences = re.sub(\"[.,!?-]\", '', text.lower()).split('\\n')\n",
95 | "\n",
96 | "word_list = list(set(\" \".join(sentences).split()))"
97 | ],
98 | "metadata": {
99 | "id": "OXS-z3vEE1ir"
100 | },
101 | "execution_count": null,
102 | "outputs": []
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "source": [
107 | "Next, keep in mind the special tokens that BERT uses during training. Here is what each of them means.\n",
108 | "\n",
109 | "- [CLS]: the first token, always used for classification\n",
110 | "- [SEP]: separates two sentences\n",
111 | "- [END]: marks the end of a sentence\n",
112 | "- [PAD]: pads sentences to the same length\n",
113 | "- [MASK]: replaces an original word with a mask\n",
114 | "\n",
115 | "These tokens have to be in the word dictionary, where every token and word in the vocabulary is assigned an index number."
116 | ],
117 | "metadata": {
118 | "id": "xTaer5nqFH-r"
119 | }
120 | },
121 | {
122 | "cell_type": "code",
123 | "source": [
124 | "word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}\n",
125 | "for i, w in enumerate(word_list):\n",
126 | "    word_dict[w] = i + 4\n",
127 | "number_dict = {i: w for i, w in enumerate(word_dict)}\n",
128 | "vocab_size = len(word_dict)"
129 | ],
130 | "metadata": {
131 | "id": "Kq4dprH2F5OG"
132 | },
133 | "execution_count": null,
134 | "outputs": []
135 | },
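{
"cell_type": "code",
"source": [
"# Hyperparameters assumed by the following cells; they are never defined in\n",
"# this notebook, so this sketch uses the values from the referenced neptune.ai\n",
"# tutorial (adjust them as needed).\n",
"maxlen = 30        # maximum sequence length after padding\n",
"batch_size = 6\n",
"max_pred = 5       # maximum number of masked tokens per sequence\n",
"n_layers = 6       # number of encoder layers\n",
"n_heads = 12\n",
"d_model = 768      # embedding / hidden size\n",
"d_ff = 768 * 4     # feed-forward hidden size\n",
"d_k = d_v = 64     # per-head key/value size\n",
"n_segments = 2\n",
"\n",
"# token_list (sentences mapped to vocabulary indices) is required by\n",
"# make_batch() below but is never built in this notebook; this also follows\n",
"# the referenced tutorial.\n",
"token_list = []\n",
"for sentence in sentences:\n",
"    token_list.append([word_dict[w] for w in sentence.split()])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},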
136 | {
137 | "cell_type": "markdown",
138 | "source": [
139 | "Once this is done, we need a function that formats the input sequence into three types of embeddings.\n",
140 | "\n",
141 | "- **token embedding**\n",
142 | "- **segment embedding**\n",
143 | "- **position embedding**\n",
144 | "\n",
145 | "Let's look at each of them."
146 | ],
147 | "metadata": {
148 | "id": "NXSxlEK6GPji"
149 | }
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "source": [
154 | "**What is the token embedding?**\n",
155 | "\n",
156 | "For example, given the sentences \"The cat is walking. The dog is barking.\", the function should produce a sequence like this:\n",
157 | "\n",
158 | "\"[CLS] the cat is walking [SEP] the dog is barking\"\n",
159 | "\n",
160 | "After that, everything is converted to indices from the word dictionary, so the sentence above becomes something like:\n",
161 | "\n",
162 | "\"[1, 5, 7, 9, 10, 2, 5, 6, 9, 11]\"\n",
163 | "\n",
164 | "Here 1 and 2 stand for [CLS] and [SEP] respectively.\n",
165 | "\n",
166 | "**What is the segment embedding?**\n",
167 | "\n",
168 | "The segment embedding separates the two sentences; it is usually encoded as 0 and 1.\n",
169 | "\n",
170 | "**What is the position embedding?**\n",
171 | "\n",
172 | "The position embedding gives each token in the sequence its position.\n",
173 | "\n"
174 | ],
175 | "metadata": {
176 | "id": "urJ0SqDiGwwD"
177 | }
178 | },
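{
"cell_type": "code",
"source": [
"# A small illustration of the three index sequences described above, using the\n",
"# word_dict built earlier. The sentence pair is chosen from the toy corpus;\n",
"# the exact index values depend on the (unordered) word_list.\n",
"tokens_a = [word_dict[w] for w in 'hello how are you i am romeo'.split()]\n",
"tokens_b = [word_dict[w] for w in 'nice meet you too how are you today'.split()]\n",
"\n",
"token_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]\n",
"segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n",
"position_ids = list(range(len(token_ids)))\n",
"\n",
"print(token_ids)\n",
"print(segment_ids)\n",
"print(position_ids)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},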
179 | {
180 | "cell_type": "markdown",
181 | "source": [
182 | "The next step is to create the **masking**.\n",
183 | "\n",
184 | "According to the paper, BERT replaces 15% of the words in a sequence with the [MASK] token and adds padding. Padding makes every sentence the same length. For example, given the sentences\n",
185 | "\n",
186 | "\"The cat is walking. The dog is barking at the tree\"\n",
187 | "\n",
188 | "applying padding turns them into\n",
189 | "\n",
190 | "\"[CLS] The cat is walking [PAD] [PAD] [PAD]. [CLS] The dog is barking at the tree.\"\n",
191 | "\n",
192 | "so the first sentence ends up the same length as the second."
193 | ],
194 | "metadata": {
195 | "id": "86CK0zidJFh9"
196 | }
197 | },
198 | {
199 | "cell_type": "code",
200 | "source": [
201 | "def make_batch():\n",
202 | " batch = []\n",
203 | " positive = negative = 0\n",
204 | " while positive != batch_size / 2 or negative != batch_size / 2:\n",
205 | " tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))\n",
206 | "\n",
207 | " tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]\n",
208 | "\n",
209 | "        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]\n",
210 | " segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n",
211 | "\n",
212 | " # LM masking\n",
213 | "        n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # mask about 15% of the tokens in a sequence\n",
214 | "        cand_maked_pos = [i for i, token in enumerate(input_ids) if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]\n",
215 | "        shuffle(cand_maked_pos)\n",
216 | "        masked_tokens, masked_pos = [], []\n",
217 | "        for pos in cand_maked_pos[:n_pred]:\n",
218 | " masked_pos.append(pos)\n",
219 | " masked_tokens.append(input_ids[pos])\n",
220 | "            if random() < 0.8: # 80%: replace with [MASK]\n",
221 | "                input_ids[pos] = word_dict['[MASK]']\n",
222 | "            elif random() < 0.5: # 10%: replace with a random word from the vocabulary\n",
223 | " index = randint(0, vocab_size - 1)\n",
224 | " input_ids[pos] = word_dict[number_dict[index]]\n",
225 | " \n",
226 | " # Zero padding\n",
227 | " n_pad = maxlen - len(input_ids)\n",
228 | " input_ids.extend([0] * n_pad)\n",
229 | " segment_ids.extend([0] * n_pad)\n",
230 | "\n",
231 | " # Zero padding (100% - 15%) tokens\n",
232 | " if max_pred > n_pred:\n",
233 | " n_pad = max_pred - n_pred\n",
234 | " masked_tokens.extend([0] * n_pad)\n",
235 | " masked_pos.extend([0] * n_pad)\n",
236 | "\n",
237 | "        if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:\n",
238 | "            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext\n",
"            positive += 1\n",
239 | "        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:\n",
240 | "            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext\n",
241 | "            negative += 1\n",
242 | "\n",
243 | " return batch"
244 | ],
245 | "metadata": {
246 | "id": "mZitlUMPNrU-"
247 | },
248 | "execution_count": null,
249 | "outputs": []
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "source": [
254 | "Because we also handle next-sentence prediction, we have to create labels that say whether the second sentence actually follows the first. These are IsNext and NotNext. So every sentence that immediately precedes the next one gets True, and a conditional is used for this.\n",
255 | "\n",
256 | "For example, if two sentences sit next to each other in one document, they follow each other contextually. So if the first sentence is A, the next one should be A+1. Intuitively, when tokens_a_index + 1 == tokens_b_index, i.e. the second sentence comes right after the first in the same context, the label for this input is set to True.\n",
257 | "\n",
258 | "If instead tokens_a_index + 1 != tokens_b_index, the label for the input is set to False."
259 | ],
260 | "metadata": {
261 | "id": "3Ifq41KQQlD4"
262 | }
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "source": [
267 | "### Modeling\n",
268 | "\n",
269 | "BERT is a fairly intricate model, and it is easy to lose track of the logic if you take it in all at once, so it is best to describe it component by component.\n",
270 | "\n",
271 | "BERT has the following components.\n",
272 | "\n",
273 | "1. Embedding layer\n",
274 | "2. Attention mask\n",
275 | "3. Encoder layer\n",
276 | "    - Multi-head attention\n",
277 | "    - Scaled dot-product attention\n",
278 | "    - Position-wise feed-forward network\n",
279 | "4. BERT (putting all the components together)"
280 | ],
281 | "metadata": {
282 | "id": "G2QPTi8D1R5B"
283 | }
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "source": [
288 | "#### Embedding Layer\n",
289 | "\n",
290 | "The embedding is the first layer of BERT: it takes the input and builds a lookup table. The parameters of the embedding layer are trainable, which means that by the end of the training process, embeddings of similar words end up close to each other.\n",
291 | "\n",
292 | "The embedding layer preserves different relationships between words: semantic, syntactic, linear, and, because BERT is bidirectional, contextual relationships as well.\n",
293 | "\n",
294 | "In BERT's case, three embeddings are created.\n",
295 | "\n",
296 | "- Token\n",
297 | "- Segment\n",
298 | "- Position\n",
299 | "\n",
300 | "We have not yet defined a function that builds the position embedding, although the token and segment parts are already handled. So we can now take the input and generate a position for every word in the sequence, as shown below."
301 | ],
302 | "metadata": {
303 | "id": "k7zuhRtl2cwx"
304 | }
305 | },
306 | {
307 | "cell_type": "code",
308 | "source": [
309 | "print(torch.arange(30, dtype = torch.long).expand_as(input_ids))"
310 | ],
311 | "metadata": {
312 | "id": "idIqPc1H3v1q"
313 | },
314 | "execution_count": null,
315 | "outputs": []
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "source": [
320 | "In the forward function, all embeddings are summed and then normalized."
321 | ],
322 | "metadata": {
323 | "id": "bSHPzyJL38Pd"
324 | }
325 | },
326 | {
327 | "cell_type": "code",
328 | "source": [
329 | "class Embedding(nn.Module):\n",
330 | " def __init__(self):\n",
331 | "        super(Embedding, self).__init__()\n",
332 | " self.tok_embed = nn.Embedding(vocab_size, d_model) # token embedding\n",
333 | " self.pos_embed = nn.Embedding(maxlen, d_model) # position embedding\n",
334 | " self.seg_embed = nn.Embedding(n_segments, d_model) # segment embedding\n",
335 | " self.norm = nn.LayerNorm(d_model)\n",
336 | "\n",
337 | " def forward(self, x, seg):\n",
338 | " seq_len = x.size(1)\n",
339 | " pos = torch.arange(seq_len, dtype = torch.long)\n",
340 | " pos = pos.unsqueeze(0).expand_as(x) # (seq_len,) -> (batch_size, seq_len)\n",
341 | " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n",
342 | "\n",
343 | " return self.norm(embedding)"
344 | ],
345 | "metadata": {
346 | "id": "3Z5lR_DF4FBO"
347 | },
348 | "execution_count": null,
349 | "outputs": []
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "source": [
354 | "#### Creating the attention mask\n",
355 | "\n",
356 | "BERT also needs an attention mask, and it has to be in the proper format. The code below builds it: positions holding [PAD] are marked as masked and everything else is left unmasked."
357 | ],
358 | "metadata": {
359 | "id": "4jKULKiI5GKe"
360 | }
361 | },
362 | {
363 | "cell_type": "code",
364 | "source": [
365 | "def get_attn_pad_mask(seq_q, seq_k):\n",
366 | " batch_size, len_q = seq_q.size()\n",
367 | " batch_size, len_k = seq_k.size()\n",
368 | "    # eq(0) marks the [PAD] token positions\n",
369 | "    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # batch_size x 1 x len_k(=len_q), True where the position is masked\n",
370 | " return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k"
371 | ],
372 | "metadata": {
373 | "id": "6BErCR2k5Ype"
374 | },
375 | "execution_count": null,
376 | "outputs": []
377 | },
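{
"cell_type": "code",
"source": [
"# Quick check of the helper above on a toy batch (not from the original\n",
"# notebook): positions holding the [PAD] index (0) come out True and will be\n",
"# masked inside the attention.\n",
"toy_ids = torch.tensor([[1, 5, 7, 2, 0, 0]])   # hypothetical ids, 0 = [PAD]\n",
"mask = get_attn_pad_mask(toy_ids, toy_ids)\n",
"print(mask.shape)   # torch.Size([1, 6, 6])\n",
"print(mask[0, 0])   # tensor([False, False, False, False,  True,  True])"
],
"metadata": {},
"execution_count": null,
"outputs": []
},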
378 | {
379 | "cell_type": "markdown",
380 | "source": [
381 | "#### Encoder\n",
382 | "\n",
383 | "The encoder has two main components.\n",
384 | "\n",
385 | "- Multi-head attention\n",
386 | "- Position-wise feed-forward network\n",
387 | "\n",
388 | "The encoder's job is to find representations and patterns from the input and the attention mask."
389 | ],
390 | "metadata": {
391 | "id": "2VC7H8lr6gyu"
392 | }
393 | },
394 | {
395 | "cell_type": "code",
396 | "source": [
397 | "class EncoderLayer(nn.Module):\n",
398 | " def __init__(self):\n",
399 | " super(EncoderLayer, self).__init__()\n",
400 | " self.enc_self_attn = MultiHeadAttention()\n",
401 | " self.pos_ffn = PoswiseFeedForwardNet()\n",
402 | "\n",
403 | "    def forward(self, enc_inputs, enc_self_attn_mask):\n",
404 | "        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs serves as Q, K and V\n",
405 | " enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n",
406 | " return enc_outputs, attn"
407 | ],
408 | "metadata": {
409 | "id": "S1bboIq9606b"
410 | },
411 | "execution_count": null,
412 | "outputs": []
413 | },
414 | {
415 | "cell_type": "markdown",
416 | "source": [
417 | "#### Multi-head attention\n",
418 | "\n",
419 | "This is the first main component of the encoder.\n",
420 | "\n",
421 | "An attention model takes three inputs: **Query, Key and Value**.\n",
422 | "\n",
423 | "Multi-head attention takes four inputs: **Query, Key, Value and the attention mask**. The embeddings are passed in as Query, Key and Value, and the attention mask is passed to the attention-mask argument.\n",
424 | "\n",
425 | "A scaled dot-product operation is applied to these three inputs together with the attention mask. It produces a **context vector** and the **attention** weights; the context vector then goes through a linear layer to produce the final output."
426 | ],
427 | "metadata": {
428 | "id": "i5Ffo_Pu7h_4"
429 | }
430 | },
431 | {
432 | "cell_type": "code",
433 | "source": [
434 | "class MultiHeadAttention(nn.Module):\n",
435 | " def __init__(self):\n",
436 | " super(MultiHeadAttention, self).__init__()\n",
437 | " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n",
438 | " self.W_K = nn.Linear(d_model, d_k * n_heads)\n",
439 | " self.W_V = nn.Linear(d_model, d_v * n_heads)\n",
440 | "\n",
441 | " def forward(self, Q, K, V, attn_mask):\n",
442 | " # q: [batch_size x len_q x d_model]\n",
443 | " # k: [batch_size x len_k x d_model]\n",
444 | " # v: [batch_size x len_k x d_model]\n",
445 | " residual, batch_size = Q, Q.size(0)\n",
446 | " # (B, S, D) -proj- -> (B, S, D) -split- -> (B, S, H, W) -trans- -> (B, H, S, W)\n",
447 | " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2) # q_s: [batch_size x n_heads x len_q x d_k]\n",
448 | " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2) # k_s: [batch_size x n_heads x len_k x d_k]\n",
449 | " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2) # v_s: [batch_size x n_heads x len_k x d_v]\n",
450 | "\n",
451 | " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask: [batch_size x n_heads x len_q x len_k]\n",
452 | "\n",
453 | " # context: [batch_size x n_heads x len_q x d_v]\n",
454 | " # attn: [batch_size x n_heads x len_q x len_k]\n",
455 | "        context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)\n",
456 | " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n",
457 | " output = nn.Linear(n_heads * d_v, d_model)(context)\n",
458 | "\n",
459 | " return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]"
460 | ],
461 | "metadata": {
462 | "id": "hIE8aZIn80LD"
463 | },
464 | "execution_count": null,
465 | "outputs": []
466 | },
467 | {
468 | "cell_type": "markdown",
469 | "source": [
470 | "Now let's look at the scaled dot-product attention itself.\n",
471 | "\n",
472 | "- The scaled dot-product attention class takes four arguments: Query, Key, Value and the attention mask. Essentially, the first three arguments receive the word embeddings, and the attention-mask argument receives the attention mask.\n",
473 | "- Scaled dot-product attention then computes the scores by matrix-multiplying the **query** with the **key**.\n",
474 | "\n",
475 | "In our code this is scores.masked_fill_(attn_mask, -1e9): score elements where the attention mask is **True** are filled with -1e9, the remaining elements keep their attention scores, and everything is passed through a softmax that yields weights between 0 and 1. Finally, the context vector is obtained by matrix-multiplying the attention weights with the value."
476 | ],
477 | "metadata": {
478 | "id": "EcSZkO3u_Y6T"
479 | }
480 | },
481 | {
482 | "cell_type": "code",
483 | "source": [
484 | "class ScaledDotProductAttention(nn.Module):\n",
485 | " def __init__(self):\n",
486 | " super(ScaledDotProductAttention, self).__init__()\n",
487 | "\n",
488 | " def forward(self, Q, K, V, attn_mask):\n",
489 | "        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores: [batch_size x n_heads x len_q x len_k]\n",
490 | "        scores.masked_fill_(attn_mask, -1e9) # fill the masked positions with a very large negative value\n",
491 | "        attn = nn.Softmax(dim = -1)(scores)\n",
492 | "        context = torch.matmul(attn, V)\n",
493 | "        return context, attn\n",
494 | " "
495 | ],
496 | "metadata": {
497 | "id": "yHfeJSJKBmVo"
498 | },
499 | "execution_count": null,
500 | "outputs": []
501 | },
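{
"cell_type": "code",
"source": [
"# Toy demonstration of the masking trick described above (not from the\n",
"# original notebook): filling masked positions with -1e9 drives their softmax\n",
"# weight to (almost) zero.\n",
"toy_scores = torch.tensor([[2.0, 1.0, 0.5, 0.1]])\n",
"toy_mask = torch.tensor([[False, False, True, True]])\n",
"toy_scores.masked_fill_(toy_mask, -1e9)\n",
"print(nn.Softmax(dim = -1)(toy_scores))  # the weight concentrates on the first two positions"
],
"metadata": {},
"execution_count": null,
"outputs": []
},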
502 | {
503 | "cell_type": "markdown",
504 | "source": [
505 | "#### Position-wise Feed-Forward Network\n",
506 | "\n",
507 | "The output of the multi-head attention goes into the feed-forward network, which concludes the encoder part. A sketch of this network is added after this cell.\n",
508 | "\n",
509 | "#### Putting all the components together\n",
510 | "\n",
511 | "The encoder produces two outputs.\n",
512 | "\n",
513 | "- The output of the feed-forward layer\n",
514 | "- The attention weights\n",
515 | "\n",
516 | "The important point here is that BERT does not use a decoder. Instead, the encoder output and the attention mask are used to get the desired result.\n",
517 | "\n",
518 | "The decoder part of the Transformer is replaced by shallow networks used for classification, as in the code below. BERT therefore produces two outputs, one for the **classifier** and one for the **masked** tokens."
519 | ],
520 | "metadata": {
521 | "id": "6KxhEHWVCbci"
522 | }
523 | },
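{
"cell_type": "code",
"source": [
"# PoswiseFeedForwardNet is referenced by EncoderLayer above but never defined\n",
"# in this notebook; this sketch follows the referenced tutorial (two linear\n",
"# layers with a GELU in between). gelu itself is defined a few cells below.\n",
"class PoswiseFeedForwardNet(nn.Module):\n",
"    def __init__(self):\n",
"        super(PoswiseFeedForwardNet, self).__init__()\n",
"        self.fc1 = nn.Linear(d_model, d_ff)\n",
"        self.fc2 = nn.Linear(d_ff, d_model)\n",
"\n",
"    def forward(self, x):\n",
"        # [batch_size, len_q, d_model] -> [batch_size, len_q, d_ff] -> [batch_size, len_q, d_model]\n",
"        return self.fc2(gelu(self.fc1(x)))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},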
524 | {
525 | "cell_type": "code",
526 | "source": [
527 | "class BERT(nn.Module):\n",
528 | " def __init__(self):\n",
529 | " super(BERT, self).__init__()\n",
530 | " self.embedding = Embedding()\n",
531 | " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n",
532 | " self.fc = nn.Linear(d_model, d_model)\n",
533 | "        self.activ1 = nn.Tanh()\n",
534 | " self.linear = nn.Linear(d_model, d_model)\n",
535 | " self.activ2 = gelu\n",
536 | " self.norm = nn.LayerNorm(d_model)\n",
537 | " self.classifier = nn.Linear(d_model, 2)\n",
538 | "        # the decoder shares its weights with the embedding layer\n",
539 | " embed_weight = self.embedding.tok_embed.weight\n",
540 | " n_vocab, n_dim = embed_weight.size()\n",
541 | " self.decoder = nn.Linear(n_dim, n_vocab, bias = False)\n",
542 | " self.decoder.weight = embed_weight\n",
543 | " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n",
544 | "\n",
545 | " def forward(self, input_ids, segment_ids, masked_pos):\n",
546 | " output = self.embedding(input_ids, segment_ids)\n",
547 | "        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)\n",
548 | " for layer in self.layers:\n",
549 | " output, enc_self_attn = layer(output, enc_self_attn_mask)\n",
550 | " # output: [batch_size, len, d_model]\n",
551 | " # attn: [batch_size, n_heads, d_model, d_model]\n",
552 | "        # this is decided by the first token ([CLS])\n",
553 | "        h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]\n",
554 | "        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]\n",
555 | "\n",
556 | " masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]\n",
557 | "\n",
558 | "        # gather the masked positions from the final output of the transformer\n",
559 | " h_masked = torch.gather(output, 1, masked_pos) # masking position: [batch_size, max_pred, d_model]\n",
560 | " h_masked = self.norm(self.activ2(self.linear(h_masked)))\n",
561 | " logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]\n",
562 | "\n",
563 | " return logits_lm, logits_clsf"
564 | ],
565 | "metadata": {
566 | "id": "LU4v48ovDlvF"
567 | },
568 | "execution_count": null,
569 | "outputs": []
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "source": [
574 | "A few things to keep in mind:\n",
575 | "\n",
576 | "1. The number of encoder layers is configurable; in the paper it is 12 for the base model.\n",
577 | "2. BERT uses two activation functions: Tanh and GELU."
578 | ],
579 | "metadata": {
580 | "id": "OILDxtxtF_3F"
581 | }
582 | },
583 | {
584 | "cell_type": "code",
585 | "source": [
586 | "def gelu(x):\n",
587 | " return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))"
588 | ],
589 | "metadata": {
590 | "id": "xaRraoH5GPqJ"
591 | },
592 | "execution_count": null,
593 | "outputs": []
594 | },
595 | {
596 | "cell_type": "markdown",
597 | "source": [
598 | "### Loss & Optimization\n",
599 | "\n",
600 | "The paper computes a probability distribution over the whole vocabulary, which can also be computed with a softmax approximation. A cleaner way to get that distribution, however, is to use **cross-entropy**: the cross-entropy loss is the combination of *softmax* and *negative log-likelihood*.\n",
601 | "\n",
602 | "That way we do not need to include a softmax while building the model, and the feed-forward network can emit raw logits without softmax normalization.\n",
603 | "\n",
604 | "Moving on to optimization, BERT uses the Adam optimizer."
605 | ],
606 | "metadata": {
607 | "id": "9t7Z4xBFGW4s"
608 | }
609 | },
610 | {
611 | "cell_type": "code",
612 | "source": [
613 | "criterion = nn.CrossEntropyLoss()\n",
614 | "optimizer = optim.Adam(model.parameters(), lr = 0.001)"
615 | ],
616 | "metadata": {
617 | "id": "Vk5q2c4FHKB_"
618 | },
619 | "execution_count": null,
620 | "outputs": []
621 | },
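{
"cell_type": "code",
"source": [
"# Small check of the claim above (toy values, no BERT needed):\n",
"# CrossEntropyLoss on raw logits equals log-softmax followed by\n",
"# negative log-likelihood.\n",
"logits = torch.tensor([[2.0, 0.5, -1.0]])\n",
"target = torch.tensor([0])\n",
"print(nn.CrossEntropyLoss()(logits, target))\n",
"print(nn.NLLLoss()(torch.log_softmax(logits, dim = -1), target))  # same value"
],
"metadata": {},
"execution_count": null,
"outputs": []
},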
622 | {
623 | "cell_type": "markdown",
624 | "source": [
625 | "### Training\n",
626 | "\n",
627 | "Finally, let's train the model."
628 | ],
629 | "metadata": {
630 | "id": "qP9szqwBHWtM"
631 | }
632 | },
633 | {
634 | "cell_type": "code",
635 | "source": [
636 | "model = BERT()\n",
637 | "batch = make_batch()\n",
638 | "input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))\n",
639 | "\n",
640 | "for epoch in range(100):\n",
641 | "    optimizer.zero_grad()\n",
642 | "    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n",
643 | "    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for the masked LM\n",
644 | "    loss_lm = (loss_lm.float()).mean()\n",
645 | "    loss_clsf = criterion(logits_clsf, isNext) # for next sentence classification\n",
646 | "    loss = loss_lm + loss_clsf\n",
647 | "    if (epoch + 1) % 10 == 0:\n",
648 | "        print('Epoch:', '%04d' % (epoch + 1), 'cost = ', '{:.6f}'.format(loss))\n",
649 | "    loss.backward()\n",
650 | "    optimizer.step()\n",
651 | "\n",
652 | "# predict the masked tokens\n",
653 | "input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))\n",
654 | "print(text)\n",
655 | "print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])\n",
656 | "\n",
657 | "logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n",
658 | "logits_lm = logits_lm.data.max(2)[1][0].data.numpy()\n",
659 | "print('masked tokens list: ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])\n",
660 | "print('predict masked tokens list: ', [pos for pos in logits_lm if pos != 0])\n",
661 | "\n",
662 | "logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]\n",
663 | "print('isNext: ', True if isNext else False)\n",
664 | "print('predict isNext: ', True if logits_clsf else False)"
665 | ],
666 | "metadata": {
667 | "id": "Q9-I6oFuHV_c"
668 | },
669 | "execution_count": null,
670 | "outputs": []
671 | },
672 | {
673 | "cell_type": "code",
674 | "source": [
675 | "Output:\n",
676 | "\n",
677 | "Hello, how are you? I am Romeo.\n",
678 | "Hello, Romeo My name is Juliet. Nice to meet you.\n",
679 | "Nice meet you too. How are you today?\n",
680 | "Great. My baseball team won the competition.\n",
681 | "Oh Congratulations, Juliet\n",
682 | "Thanks you Romeo\n",
683 | "['[CLS]', 'nice', 'meet', 'you', 'too', 'how', 'are', 'you', 'today', '[SEP]', '[MASK]', 'congratulations', '[MASK]', '[SEP]']\n",
684 | "masked tokens list : [27, 22]\n",
685 | "predict masked tokens list : []\n",
686 | "isNext : False\n",
687 | "predict isNext : True"
688 | ],
689 | "metadata": {
690 | "id": "mRbWNVR5Jkx8"
691 | },
692 | "execution_count": null,
693 | "outputs": []
694 | },
695 | {
696 | "cell_type": "markdown",
697 | "source": [
698 | "With that, BERT is fully implemented. The same BERT model can be used on a larger corpus as well.\n",
699 | "\n",
700 | "1. Pre-training: use a corpus, but with the exact input representation format described above\n",
701 | "2. Fine-tuning: requires supervised data\n",
702 | "3. Use it as a feature extractor for various tasks or for topic modeling"
703 | ],
704 | "metadata": {
705 | "id": "a9ikGiiNJtmn"
706 | }
707 | }
708 | ]
709 | }
710 |
--------------------------------------------------------------------------------
/Natural Language Processing/BERT/README.md:
--------------------------------------------------------------------------------
1 | # BERT Implementation
2 |
3 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading2-BERT-Pre-training-of-Deep-Bidirectional-Transformers-for-Language-Understanding
4 |
--------------------------------------------------------------------------------
/Natural Language Processing/ELECTRA/README.md:
--------------------------------------------------------------------------------
1 | # ELECTRA Implementation
2 |
3 | Written with reference to https://github.com/google-research/electra/blob/master/model/modeling.py
4 |
5 | paper review: https://cartinoe5930.tistory.com/entry/ELECTRA-Pre-training-Text-Encoders-as-Discriminators-rather-than-Generators
6 |
--------------------------------------------------------------------------------
/Natural Language Processing/ELMo/ELMo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyNanKVFKMnCVZJm48NJvEOL",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "EAKmz65EfvqQ"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "from typing import List, Tuple\n",
38 | "import torch\n",
39 | "import torch.nn as nn\n",
40 | "from char_cnn import CharEmbedding\n",
41 | "\n",
42 | "class ELMo(nn.Module):\n",
43 | " def __init__(self, vocab_size, output_dim, emb_dim, hid_dim, prj_dim, kernel_sizes,\n",
44 | " seq_len, n_layers, dropout):\n",
45 | "        # Parameter notes (only a few)\n",
46 | "        # output_dim: size of the word vocabulary\n",
47 | "        # n_layers: number of LSTM layers, 2 by default\n",
48 | "\n",
49 | " super(ELMo, self).__init__()\n",
50 | "\n",
51 | " self.embedding = CharEmbedding(vocab_size, emb_dim, prj_dim, kernel_sizes, seq_len)\n",
52 | " self.bilms = BidirectionalLanguageModel(hid_dim, hid_dim, n_layers, dropout)\n",
53 | "\n",
54 | " self.predict = nn.Linear(hid_dim, output_dim)\n",
55 | "\n",
56 | " def forward(self, x):\n",
57 | "        # Parameter: x (sentence)\n",
58 | "        # Shape: x ([batch, seq_len])\n",
59 | " emb = self.embedding(x)\n",
60 | " _, last_output = self.bilms(emb)\n",
61 | " y = self.predict(last_output)\n",
62 | "\n",
63 | "        return y  # during training, only the output of the biLM's last LSTM is used\n",
64 | "\n",
65 | "    def get_embed_layer(self, x): # torch.Tensor --> List\n",
66 | "        # Same as the forward pass, but returns the embeddings of every layer\n",
67 | "        # Parameter: x (sentence made up of characters)\n",
68 | "        # Shape: x ([batch, seq_len])\n",
69 | " emb = self.embedding(x)\n",
70 | " first_output, last_output = self.bilms(emb)\n",
71 | "\n",
72 | " return emb, (first_output, last_output)\n",
73 | "\n",
74 | " def init_weights(self):\n",
75 | " for p in self.parameters():\n",
76 | " if p.dim() > 1:\n",
77 | " nn.init.xavier_uniform_(p)\n",
78 | "\n",
79 | " for lstm in self.bilms.lstms:\n",
80 | " for names in lstm._all_weights:\n",
81 | " for name in filter(lambda n: 'bias' in n, names):\n",
82 | " bias = getattr(lstm, name)\n",
83 | " n = bias.size(0)\n",
84 | " start, end = n // 4, n // 2\n",
85 | " bias.data[start:end].fill_(1.)\n",
86 | "\n",
87 | "class BidirectionalLanguageModel(nn.Module):\n",
88 | " def __init__(self, emb_dim, hid_dim, prj_emb, dropout):\n",
89 | "        # dropout is applied both before and after the LSTM layers\n",
90 | " super(BidirectionalLanguageModel, self).__init__()\n",
91 | " self.lstms = nn.ModuleList([nn.LSTM(emb_dim, hid_dim, bidirectional = True, dropout = dropout,\n",
92 | "                                    batch_first = True), nn.LSTM(prj_emb, hid_dim, bidirectional = True, dropout = dropout, batch_first = True)])\n",
93 | " self.projection_layer = nn.Linear(2 * hid_dim, prj_emb)\n",
94 | "\n",
95 | " def forward(self, x, hidden = None):\n",
96 | "        # Parameters: x (embedded sentence tensor), hidden (tuple of hidden and cell states)\n",
97 | "        # Shapes: x ([Batch, Seq_len, Emb_size]),\n",
98 | "        #         hidden ([num_layers * num_directions, batch, hidden_size], [num_layers * num_directions, batch, hidden_size])\n",
99 | "        \n",
100 | "        # add a residual connection between the LSTM layers\n",
101 | " first_output, (hidden, cell) = self.lstms[0](x, hidden)\n",
102 | "\n",
103 | " projected = self.projection_layer(first_output)\n",
104 | " second_output, (hidden, cell) = self.lstms[1](projected, (hidden, cell))\n",
105 | "\n",
106 | " second_output = second_output.view(second_output.size(0), second_output.size(1), 2, -1)\n",
107 | "\n",
108 | " second_output = second_output[:, :, 0, :] + second_output[:, :, 1, :]\n",
109 | "\n",
110 | " return first_output, second_output"
111 | ]
112 | }
113 | ]
114 | }
115 |
--------------------------------------------------------------------------------
/Natural Language Processing/ELMo/README.md:
--------------------------------------------------------------------------------
1 | # ELMo
2 |
3 | Written with reference to https://github.com/InhyeokYoo/NLP/blob/master/papers/4.ELMo
4 |
5 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading1-ELMo-Deep-contextualized-word-representations
6 |
--------------------------------------------------------------------------------
/Natural Language Processing/ELMo/char_cnn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyMpeFn+h3cVx7Sm4BlKoscT",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | },
17 | "accelerator": "GPU",
18 | "gpuClass": "standard"
19 | },
20 | "cells": [
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {
24 | "id": "view-in-github",
25 | "colab_type": "text"
26 | },
27 | "source": [
28 | "
"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {
35 | "id": "E414FoesNyVv"
36 | },
37 | "outputs": [],
38 | "source": [
39 | "#char_cnn\n",
40 | "import torch\n",
41 | "import torch.nn as nn\n",
42 | "from typing import List\n",
43 | "\n",
44 | "class CharEmbedding(nn.Module):\n",
45 | " def __init__(self, vocab_size, emb_dim, prj_dim, kernel_sizes, char_len, device):\n",
46 | " super().__init__()\n",
47 | " self.device = device\n",
48 | "        self.kernel_dim = sum([num_features for kernel_size, num_features in kernel_sizes]) # same as the word embedding dimension\n",
49 | "        self.charcnn = CharCNN(vocab_size, emb_dim, self.kernel_dim, kernel_sizes, char_len, device)\n",
50 | "        self.highway_net = HighwayNetwork(self.kernel_dim)\n",
51 | "        self.highway_net._init_bias()\n",
52 | "        self.projection_layer = nn.Linear(self.kernel_dim, prj_dim)\n",
53 | "\n",
54 | " def forward(self, x):\n",
55 | "        # Parameter: a sentence tensor made up of the characters of each word\n",
56 | "        # Shape: [Batch, Seq_len, Char_len]\n",
57 | " batch_size, seq_len, _ = x.size()\n",
58 | " y = torch.zeros(batch_size, seq_len, self.kernel_dim).to(self.device)\n",
59 | "\n",
60 | " for i in range(seq_len):\n",
61 | " char_emb = self.charcnn(x[:, i, :])\n",
62 | " highway_emb = self.highway_net(char_emb)\n",
63 | " y[:, i, :] = highway_emb.squeeze(1)\n",
64 | "\n",
65 | " emb = self.projection_layer(y)\n",
66 | " return emb\n",
67 | "\n",
68 | "class CharCNN(nn.Module):\n",
69 | " def __init__(self, vocab_size, char_emb_dim, word_emb_dim, kernel_sizes, char_len, device):\n",
70 | " super(CharCNN, self).__init__()\n",
71 | " self.device = device\n",
72 | " self.char_len = char_len\n",
73 | " self.word_emb_dim = word_emb_dim\n",
74 | " self.kernel_sizes = kernel_sizes\n",
75 | "\n",
76 | "        self.embedding = nn.Embedding(vocab_size, char_emb_dim)\n",
77 | " self.kernels = nn.ModuleList([nn.Conv1d(in_channels = char_emb_dim, out_channels = num_features,\n",
78 | " kernel_size = kernel_size) for kernel_size, num_features in kernel_sizes])\n",
79 | "\n",
80 | " def forward(self, word):\n",
81 | "        # Parameter: word (input tensor)\n",
82 | "        # Shapes\n",
83 | "        # input: word ([Batch, Emb_dim, Seq_len])\n",
84 | "        # output: y ([Batch, Kernel_dim])\n",
85 | " batch_size = word.size(0)\n",
86 | "        y = torch.zeros(batch_size, self.word_emb_dim).to(self.device)\n",
87 | "\n",
88 | "        cnt = 0  # column index into y\n",
89 | "\n",
90 | "        # filling a pre-allocated tensor is faster than torch.cat\n",
91 | " for kernel in self.kernels:\n",
92 | " emb = self.embedding(word)\n",
93 | " emb = emb.permute(0, 2, 1)\n",
94 | " temp = kernel(emb)\n",
95 | " pooled = torch.max(temp, dim = 2)[0]\n",
96 | "            y[:, cnt:cnt + pooled.size(1)] = pooled\n",
97 | "            cnt += pooled.size(1)\n",
98 | "\n",
99 | " return y\n",
100 | "\n",
101 | "class HighwayNetwork(nn.Module):\n",
102 | " def __init__(self, kernel_sizes):\n",
103 | " super(HighwayNetwork, self).__init__()\n",
104 | " self.h_gate = nn.Linear(kernel_sizes, kernel_sizes)\n",
105 | " self.t_gate = nn.Sequential(nn.Linear(kernel_sizes, kernel_sizes), nn.Sigmoid())\n",
106 | " self.relu = torch.nn.ReLU()\n",
107 | "\n",
108 | " def forward(self, x):\n",
109 | "        # Shape: x ([Batch, Kernel_dim])\n",
110 | " x = x.unsqueeze(1)\n",
111 | " h = self.relu(self.h_gate(x))\n",
112 | " t = self.t_gate(x)\n",
113 | " c = 1 - t\n",
114 | " return t * h + c * x\n",
115 | "\n",
116 | " def _init_bias(self):\n",
117 | "        self.t_gate[0].bias.data.fill_(-2)\n",
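"\n",
"# A hypothetical usage sketch (not in the original notebook): kernel_sizes is a\n",
"# list of (kernel_size, num_features) pairs, matching how CharCNN unpacks it.\n",
"# emb = CharEmbedding(vocab_size = 262, emb_dim = 16, prj_dim = 512,\n",
"#                     kernel_sizes = [(2, 32), (3, 64), (4, 128)],\n",
"#                     char_len = 50, device = 'cpu')\n",
"# chars = torch.zeros(2, 10, 50, dtype = torch.long)  # [batch, seq_len, char_len]\n",
"# print(emb(chars).shape)  # [2, 10, 512]"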
118 | ]
119 | }
120 | ]
121 | }
122 |
--------------------------------------------------------------------------------
/Natural Language Processing/ELMo/character_dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyNgGR09iOxTjA3Q3sX+iuzH",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "0b8fNKIbbE0p"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import torchtext\n",
38 | "from torchtext.data import NestedField\n",
39 | "import math\n",
40 | "\n",
41 | "class BPTTIterator(torchtext.data.BPTTIterator):\n",
42 | " def __iter__(self):\n",
43 | " text = self.dataset[0].text\n",
44 | " TEXT = self.dataset.fields['text']\n",
45 | " TEXT.eos_token = None\n",
46 | " text = text + ([TEXT.pad_token] * int(math.ceil(len(text) / self.batch_size) * self.batch_size - len(text)))\n",
47 | " data = TEXT.pad([text]) #new\n",
48 | " data = TEXT.numericalize(data, device = self.device)\n",
49 | "\n",
50 | " #new line start\n",
51 | " size = list(data.size())\n",
52 | " size[0] = self.batch_size\n",
53 | " size[1] = -1\n",
54 | "\n",
55 | " data = data.view(*size).transpose(0, 1).contiguous()\n",
56 | " dataset = torchtext.data.Dataset(examples = self.dataset.examples, fields = [('text', 'TEXT'), ('target', 'TEXT')])\n",
57 | "\n",
58 | " while True:\n",
59 | " for i in range(0, len(self) * self.bptt_len, self.bptt_len):\n",
60 | " self.ierations += 1\n",
61 | " seq_len = min(self.bptt_len, len(data) - i - 1)\n",
62 | " batch_text = data[i:i + seq_len]\n",
63 | " if TEXT.batch_first:\n",
64 | " batch_text = batch_text.transpose(0, 1).contiguous()\n",
65 | " batch_target = batch_target.transpose(0, 1).contiguous()\n",
66 | " yield torchtext.data.Batch.fromvars(\n",
67 | " dataset, self.batch_size, text = batch_text, target = batch_target\n",
68 | " )\n",
69 | " if not self.repeat:\n",
70 | " return\n",
71 | "\n",
72 | "def gen_bptt_iter(dataset, batch_size, bptt_len, device):\n",
73 | " #dataset: tuple of dataset\n",
74 | " for batch_word, batch_char in zip(\n",
75 | " BPTTIterator(dataset[0], batch_size, bptt_len, device = device),\n",
76 | " BPTTIterator(dataset[1], batch_size, bptt_len, device = device),\n",
77 | " ):\n",
78 | " yield batch_word.text, batch_char.text, batch_word.target, batch_char.target\n",
79 | "\n",
80 | "def gen_language_model_corpus(dataset_cls: torchtext.datasets.LanguageModelingDataset):\n",
81 | " field_char = NestedField(Field(pad_token = PAD_WORD, tokenize = list, init_token = SOS_WORD,\n",
82 | " eos_token = EOS_WORD, batch_first = True), pad_token = PAD_WORD,)\n",
83 | " \n",
84 | " field_word = Field(batch_first = True)\n",
85 | " dataset_char = dataset_cls.splits(field_char)\n",
86 | " dataset_word = dataset_cls.splits(dielf_word)\n",
87 | " field_char.build_vocab(dataset_char[0])\n",
88 | " field_word.build_vocab(dataset_char[0])\n",
89 | " return [_ for _ in zip(dataset_word, dataset_char)], field_word, field_char\n",
90 | "\n",
91 | "#How to use\n",
92 | "if __name__ == '__main__':\n",
93 | " from torchtext.dataset import WIkiText2\n",
94 | " from torchtext.data import Field\n",
95 | "\n",
96 | " #FINAL\n",
97 | " PAD_WORD = ''\n",
98 | " SOS_WORD = ''\n",
99 | " EOS_WORD = ''\n",
100 | "\n",
101 | " datasets, field_word, field_char = gen_language_model_corpus(WikiText2)\n",
102 | " train_data, valid_data, test_data = datasets"
103 | ]
104 | }
105 | ]
106 | }
107 |
--------------------------------------------------------------------------------
/Natural Language Processing/GPT-1/GPT-1 Implementation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyNMCURwdSd6LE/DF4oH8QYA",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "source": [
32 | "# GPT-1 Implementation\n",
33 | "\n",
34 | "GPT-1 구현 코드는 [GPT 구현하기](https://paul-hyun.github.io/gpt-01/)를 참고하여 작성되었다.\n",
35 | "\n",
36 | "우선 GPT를 구현하기 전에 GPT에 대해 간략하게 설명하면 GPT는 Transformer의 Decoder만을 사용한 Pre-trained LM이다.\n",
37 | "\n",
38 | "### 1. Config\n",
39 | "\n",
40 | "Transformer와 파라미터를 동일하게 설정하였다. GPT는 Transformer의 Decoder만을 사용하므로 Encoder 부분은 제거하고 사용하였다."
41 | ],
42 | "metadata": {
43 | "id": "hMbeV9Y6mqNK"
44 | }
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {
50 | "id": "XpRTodgTmhUR"
51 | },
52 | "outputs": [],
53 | "source": [
54 | "config = Config({\n",
55 | " 'n_dec_vocab': len(vocab),\n",
56 | " 'n_dec_seq': 256,\n",
57 | " 'n_layer': 6,\n",
58 | " 'd_hidn': 256,\n",
59 | " 'i_pad': 0,\n",
60 | " 'd_ff': 1024,\n",
61 | " 'n_head': 4,\n",
62 | " 'd_head': 64,\n",
63 | " 'dropout': 0.1,\n",
64 | " 'layer_norm_epsilon': 1e-12\n",
65 | "})\n",
66 | "print(config)"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "source": [
72 | "# 2. Decoder\n",
73 | "\n",
74 | "GPT는 Transformer의 Encoder는 사용하지 않고 Decoder만 사용하므로 Decoder에서 Encoder의 출력과 Attention을 하는 부분인 Encoder-Decoder-Multi-Head Attention 부분은 제거하고 사용하였다. 그 외에 나머지 부분은 Transformer와 동일하다."
75 | ],
76 | "metadata": {
77 | "id": "0u1Nu0LroUzG"
78 | }
79 | },
80 | {
81 | "cell_type": "code",
82 | "source": [
83 | "# Decoder Layer\n",
84 | "class DecoderLayer(nn.Module):\n",
85 | " def __init__(self, config):\n",
86 | " super().__init__()\n",
87 | " self.config = config\n",
88 | "\n",
89 | " self.self_attn = MultiHeadAttention(self.config)\n",
90 | " self.layer_norm1 = nn.LayerNorm(self.config.d_hidn, eps = self.config.layer_norm_epsilon)\n",
91 | " self.pos_ffn = PoswiseFeedForwardNet(self.config)\n",
92 | " self.layer_norm3 = nn.LayerNorm(self.config.d_hidn, eps = self.config.layer_norm_epsilon)\n",
93 | "\n",
94 | " def forward(self, dec_inputs, self_attn_mask):\n",
95 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_head, n_dec_seq, n_dec_seq)\n",
96 | " self_att_outputs, self_attn_prob = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)\n",
97 | " self_att_outputs = self.layer_norm1(dec_inputs + self_att_outputs)\n",
98 | " # (batch_size, n_dec_seq, d_hidn)\n",
99 | " ffn_outputs = self.po_ffn(self_att_outputs)\n",
100 | " ffn_outputs = self.layer_norm3(self_att_outputs + ffn_outputs)\n",
101 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_head, n_dec_seq, n_dec_seq), (batch_size, n_head, n_dec_seq, n_enc_seq)\n",
102 | " return ffn_outputs, self_attn_prob"
103 | ],
104 | "metadata": {
105 | "id": "ZbvA3ofcom9U"
106 | },
107 | "execution_count": null,
108 | "outputs": []
109 | },
110 | {
111 | "cell_type": "code",
112 | "source": [
113 | "# Decoder\n",
114 | "class Decoder(nn.Module):\n",
115 | " def __init__(self, config):\n",
116 | " super().__init__()\n",
117 | " self.config = config\n",
118 | "\n",
119 | " self.dec_emb = nn.Embedding(self.config.n_dec_vocab, self.config.d_hidn)\n",
120 | " sinusoid_table = torch.FloatTensor(det_sinusoid_encoding_table(self.config.n_dec_seq + 1, self.config.d_hidn))\n",
121 | " self.pos_emb = nn.Embedding.from_pretrained(sinusoid_table, freeze = True)\n",
122 | "\n",
123 | " self.layers = nn.ModuleList([DecoderLayer(self.config) for _ in range(self.config.n_layer)])\n",
124 | "\n",
125 | " def forward(self, dec_inputs):\n",
126 | " positions = torch.arange(dec_inputs.size(1), device = dec_inputs.device, dtype = dec_inputs.dtype).expand(dec_inputs.size(0), dec_inputs.size(1)).contiguous() + 1\n",
127 | " pos_mask = dec_inputs.eq(self.config.i_pad)\n",
128 | " positions.masked_fill_(pos_mask, 0)\n",
129 | "\n",
130 | " # (batch_size, n_dec_seq, d_hidn)\n",
131 | " dec_outputs = self.dec_emb(dec_inputs) + self.pos_emb(positions)\n",
132 | "\n",
133 | " # (batch_size, n_dec_seq, n_dec_seq)\n",
134 | " dec_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, self.config.i_pad)\n",
135 | " # (batch_size, n_dec_seq, n_dec_seq)\n",
136 | " dec_attn_decoder_mask = get_attn_decoder_mask(dec_inputs)\n",
137 | " # (batch_size, n_dec_seq, n_dec_seq)\n",
138 | " dec_self_attn_mask = torch.gt((dec_attn_mask + dec_attn_decoder_mask), 0)\n",
139 | "\n",
140 | " self_attn_probs = []\n",
141 | " for layer in self.layers:\n",
142 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_dec_seq, n_dec_seq)\n",
143 | " dec_outputs, self_attn_prob = layer(dec_outputs, dec_self_attn_mask)\n",
144 | " self_attn_probs.append(self_attn_prob)\n",
145 | " # (batch_size, n_dec_seq, d_hidn), [(batch_size, n_dec_seq, n_dec_seq)]\n",
146 | " return dec_outputs, self_attn_probs"
147 | ],
148 | "metadata": {
149 | "id": "Z89dmNpSqq9K"
150 | },
151 | "execution_count": null,
152 | "outputs": []
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "source": [
157 | "# 3. GPT\n",
158 | "\n",
159 | "GPT는 단순히 Transformer Decoder를 실행\n",
160 | "Pre-traing 모델을 저장하기 위한 save, 저장된 모델을 읽기 위한 load 함수가 추가로 정의의"
161 | ],
162 | "metadata": {
163 | "id": "x59jUSdgF2FU"
164 | }
165 | },
166 | {
167 | "cell_type": "code",
168 | "source": [
169 | "class GPT(nn.Module):\n",
170 | " def __init__(self, config):\n",
171 | " super().__init__()\n",
172 | " self.config = config\n",
173 | "\n",
174 | " self.decoder = Decoder(self.config)\n",
175 | "\n",
176 | " def forward(self, dec_inputs):\n",
177 | " # (batch_size, n_seq, d_hidn), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n",
178 | " dec_outputs, dec_self_attn_probs = self.decoder(dec_inputs)\n",
179 | " # (batch_size, n_dec_seq, n_dec_vocab), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n",
180 | " return dec_outputs, dec_self_attn_probs\n",
181 | "\n",
182 | " def save(self, epoch, loss, path):\n",
183 | " torch.save({\n",
184 | " 'epoch': epoch, \n",
185 | " 'loss': loss, \n",
186 | " 'state_dict': self.state_dict()\n",
187 | " }, path)\n",
188 | "\n",
189 | " def load(self, path):\n",
190 | " save = torch.load(path)\n",
191 | " self.load_state_dict(save['state_dict'])\n",
192 | " return save['epoch'], save['loss']"
193 | ],
194 | "metadata": {
195 | "id": "uEbD9YQ8GEd-"
196 | },
197 | "execution_count": null,
198 | "outputs": []
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "source": [
203 | "# 4. Pre-traing Model\n",
204 | "\n",
205 | "GPT를 pre-train 하기 위한 클래스. GPT pre-train 클래스의 목적은 입력 단어에 대한 다음 단어를 예측하는 것이다."
206 | ],
207 | "metadata": {
208 | "id": "T5X1B3EzHaQ6"
209 | }
210 | },
211 | {
212 | "cell_type": "code",
213 | "source": [
214 | "class GPTPretraing(nn.Module):\n",
215 | " def __init__(self, config):\n",
216 | " super().__init__()\n",
217 | " self.config = config\n",
218 | "\n",
219 | " self.gpt = GPT(self.config)\n",
220 | " # 단어를 예측하기 위한 projection_lm을 선언\n",
221 | " self.projection_lm = nn.Linear(self.config.d_hidn, self.config.n_dec_vocab, bias = False)\n",
222 | " # Decoder의 Embedding & weight를 공유\n",
223 | " self.projection_lm.weight = self.gpt.decoder.dec_emb.weight\n",
224 | "\n",
225 | " def forward(self, dec_inputs):\n",
226 | " # (batch_size, n_dec_seq, d_hidn), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n",
227 | " dec_outputs, dec_self_attn_probs = self.gpt(dec_inputs)\n",
228 | " # (batch_size, n_dec_seq, n_dec_vocab)\n",
229 | " # GPT 실행 결과를 입력으로 projection_lm을 실행해서 단어를 예측측\n",
230 | " logits_lm = self.projection_lm(dec_outputs)\n",
231 | " # (batch_size, n_dec_seq - 1, n_dec_vocab), (batch_size, n_output), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n",
232 | " # 결과의 마지막을 제외한 나머지를 리턴\n",
233 | " return logits_lm[:, :-1, :].contiguous(), dec_self_attn_probs"
234 | ],
235 | "metadata": {
236 | "id": "tdUtbgF7HuRi"
237 | },
238 | "execution_count": null,
239 | "outputs": []
240 | }
241 | ]
242 | }
243 |
--------------------------------------------------------------------------------
/Natural Language Processing/GPT-1/README.md:
--------------------------------------------------------------------------------
1 | # GPT-1 Implementation
2 |
3 | Written with reference to https://paul-hyun.github.io/gpt-01/
4 |
5 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading3-GPT-1-Improving-Language-Understanding-by-Generative-Pre-Training
6 |
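7 | ## Usage sketch
8 | 
9 | A minimal, hypothetical sketch (not part of the referenced implementation) of how the pre-training objective in `GPT-1 Implementation.ipynb` could be driven. `GPTPretraining` and `config` are the names defined in the notebook; `train_loader` is an assumed iterator over batches of token ids.
10 | 
11 | ```python
12 | import torch
13 | import torch.nn as nn
14 | 
15 | model = GPTPretraining(config)
16 | criterion = nn.CrossEntropyLoss(ignore_index=config.i_pad)
17 | optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
18 | 
19 | for dec_inputs in train_loader:                      # (batch_size, n_dec_seq)
20 |     logits_lm, _ = model(dec_inputs)                 # (batch_size, n_dec_seq - 1, n_dec_vocab)
21 |     labels = dec_inputs[:, 1:].contiguous()          # next-word targets
22 |     loss = criterion(logits_lm.view(-1, logits_lm.size(-1)), labels.view(-1))
23 |     optimizer.zero_grad()
24 |     loss.backward()
25 |     optimizer.step()
26 | ```
27 | 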
--------------------------------------------------------------------------------
/Natural Language Processing/RoBERTa/README.md:
--------------------------------------------------------------------------------
1 | # RoBERTa Implementation
2 |
3 | Written with reference to https://github.com/facebookresearch/fairseq/blob/main/fairseq/models/roberta/model.py
4 |
5 | paper review: https://cartinoe5930.tistory.com/entry/RoBERTa-A-Robustly-Optimized-BERT-Pretraining-Approach-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0
6 |
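7 | ## Dynamic masking sketch
8 | 
9 | One of RoBERTa's key changes over BERT is dynamic masking: a new MLM mask is sampled every time a sequence is fed to the model instead of being fixed once during preprocessing. Below is a minimal PyTorch sketch of that idea; it is an illustration only, not the fairseq implementation, and all names are hypothetical.
10 | 
11 | ```python
12 | import torch
13 | 
14 | def dynamic_mask(input_ids, mask_token_id, vocab_size, pad_token_id, mlm_prob=0.15):
15 |     # Sample a fresh MLM mask on every call (RoBERTa-style dynamic masking).
16 |     labels = input_ids.clone()
17 |     probs = torch.full(input_ids.shape, mlm_prob)
18 |     probs[input_ids == pad_token_id] = 0.0
19 |     masked = torch.bernoulli(probs).bool()
20 |     labels[~masked] = -100                            # ignore unmasked positions in the loss
21 | 
22 |     corrupted = input_ids.clone()
23 |     # 80% of masked positions -> [MASK], 10% -> random token, 10% -> unchanged
24 |     replace = torch.bernoulli(torch.full(input_ids.shape, 0.8)).bool() & masked
25 |     corrupted[replace] = mask_token_id
26 |     random = torch.bernoulli(torch.full(input_ids.shape, 0.5)).bool() & masked & ~replace
27 |     corrupted[random] = torch.randint(vocab_size, input_ids.shape)[random]
28 |     return corrupted, labels
29 | ```
30 | 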
--------------------------------------------------------------------------------
/Natural Language Processing/Transformer-XL/README.md:
--------------------------------------------------------------------------------
1 | # Transformer-XL Implementation
2 |
3 | Written with reference to https://github.com/kimiyoung/transformer-xl/blob/master/tf/model.py
4 | 
5 | Paper review: https://cartinoe5930.tistory.com/entry/Transformer-XL-Attentive-Language-Models-Beyond-a-Fixed-Length-Context-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0
6 |
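7 | ## Segment-level recurrence sketch
8 | 
9 | A small sketch of the segment-level recurrence idea that `_cache_mem` in the notebook implements: the hidden states of the previous segment are kept as a memory, prepended to the current segment, and detached so that gradients do not flow across segments. This is an illustration in PyTorch rather than the TensorFlow code used in the notebook.
10 | 
11 | ```python
12 | import torch
13 | 
14 | def cache_mem(curr_out, prev_mem, mem_len):
15 |     # Keep the last `mem_len` positions as memory for the next segment,
16 |     # detached so no gradients flow into previous segments.
17 |     if prev_mem is None:
18 |         new_mem = curr_out
19 |     else:
20 |         new_mem = torch.cat([prev_mem, curr_out], dim=0)
21 |     return new_mem[-mem_len:].detach()
22 | 
23 | # per layer, per segment: mems[i] = cache_mem(layer_output, mems[i], mem_len)
24 | ```
25 | 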
--------------------------------------------------------------------------------
/Natural Language Processing/Transformer-XL/Transformer_XL_구현_실습.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyOtb06YYh5iyXi4CRZAWjAR",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "70fsbBslZZ7I"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import tensorflow as tf\n",
38 | "\n",
39 | "def positional_embedding(pos_seq, inv_freq, bsz = None):\n",
40 | " sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)\n",
41 | " pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)\n",
42 | " if bsz is not None:\n",
43 | " return tf.tile(pos_emb[:, None, :], [1, bsz, 1])\n",
44 | " else:\n",
45 | " return pos_emb[:, None, :]\n",
46 | "\n",
47 | "def positionwise_FF(inp, d_model d_inner, dropout, kernel_initializer, scope = 'ff', is_training = True):\n",
48 | " output = inp\n",
49 | " with tf.variable_scope(scope):\n",
50 | " output = tf.layers.dense(inp, d_inner, activation = tf.nn.relu,\n",
51 | " kernel_initializer = kernel_initializer,\n",
52 | " name = 'layer_1')\n",
53 | " output = tf.layers.dropout(output, dropout, training = is_training, name = 'drop_1')\n",
54 | " output = tf.layers.dense(output, d_model, kernel_initializer = kernel_initializer,\n",
55 | " name = 'layer2')\n",
56 | " output = tf.layers.dropout(output, dropout, training = is_training, name = 'drop_2')\n",
57 | " output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis = -1)\n",
58 | " return output\n",
59 | "\n",
60 | "def rel_shift(x):\n",
61 | " x_size = tf.shape(x)\n",
62 | "\n",
63 | " x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])\n",
64 | " x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])\n",
65 | " x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])\n",
66 | " x = tf.reshape(x, x_size)\n",
67 | "\n",
68 | " return x\n",
69 | "\n",
70 | "def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,\n",
71 | " n_head, d_head, dropout, dropatt, is_training,\n",
72 | " kernel_initializer, scope = 'rel_attn'):\n",
73 | " scale = 1 / (d_head ** 0.5)\n",
74 | " with tf.variable_scope(scope):\n",
75 | " qlen = tf.shape(w)[0]\n",
76 | " rlen = tf.shape(r)[0]\n",
77 | " bsz = tf.shape(w)[1]\n",
78 | "\n",
79 | " cat = tf.concat([mems, w], 0) if mems is not None and mems.shape.ndims > 1 else w\n",
80 | " w_heads = tf.layers.dense(cat, 3 * n_head, d_head, use_bias = False, kernel_initializer = kernel_initializer,\n",
81 | " name = 'qkv')\n",
82 | " r_head_k = tf.layers.dense(r, n_head * d_head, use_bias = False, kernel_initializer = kernel_initializer,\n",
83 | " name = 'r')\n",
84 | " \n",
85 | " w_head_q, w_kead_k, w_head_v = tf.split(w_heads, 3, -1)\n",
86 | " w_head_q = w_head_q[-qlen:]\n",
87 | "\n",
88 | " klen = tf.shape(w_head_k)[0]\n",
89 | "\n",
90 | " w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])\n",
91 | " w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])\n",
92 | " w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])\n",
93 | "\n",
94 | " r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])\n",
95 | "\n",
96 | " rw_head_q = w_head_q + r_w_bias\n",
97 | " rr_head_q = w_head_q + r_r_bias\n",
98 | "\n",
99 | " AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)\n",
100 | " BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)\n",
101 | " BD = rel_shift(BD)\n",
102 | "\n",
103 | " attn_score = (AC + BD) * scale\n",
104 | " attn_mask_t = attn_mask[:, :, None, None]\n",
105 | " attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t\n",
106 | "\n",
107 | " attn_prob = tf.nn.softmax(attn_score, 1)\n",
108 | " attn_prob = tf.layers.dropout(attn_prob, dropatt, training = is_training)\n",
109 | "\n",
110 | " attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)\n",
111 | " size_t = tf.shape(attn_vec)\n",
112 | " attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])\n",
113 | "\n",
114 | " attn_out = tf.layers.dense(attn_vec, d_model, use_bias = False,\n",
115 | " kernel_initializer = kernel_initializer, name ='o')\n",
116 | " attn_out = tf.layers.dropout(attn_out, dropout, training = is_training)\n",
117 | "\n",
118 | " output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis = -1)\n",
119 | "\n",
120 | " return output\n",
121 | "\n",
122 | "def embedding_lookup(lookup_table, x, use_tpu = True):\n",
123 | " if use_tpu:\n",
124 | " n_token = tf.shape(lookup_table)[0]\n",
125 | " one_hot_idx = tf.one_hot(x, n_token)\n",
126 | " if one_hot_idx.shape.ndims == 2:\n",
127 | " return tf.einsum('nd,in->id', lookup_table, one_hot_idx)\n",
128 | " else:\n",
129 | " return tf.einsum('nb,ibn->ibd', lookup_table, one_hot_idx)\n",
130 | " else:\n",
131 | " return tf.nn.embedding_lookup(lookup_table, x)\n",
132 | "\n",
133 | "def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,\n",
134 | " proj_initializer, div_val = 1,\n",
135 | " proj_same_dim = True,\n",
136 | " scope = 'adaptive_embed', **kwargs):\n",
137 | " emb_scale = d_proj ** 0.5\n",
138 | " with tf.variable_scope(scope):\n",
139 | " if div_val == 1:\n",
140 | " lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], initializer = initializer)\n",
141 | " y = embedding_lookup(lookup_table, x, use_tpu = False)\n",
142 | " if d_proj != d_embed:\n",
143 | " proj_W = tf.get_variable('proj_W', [d_embed, d_proj], initializer = proj_initializer)\n",
144 | " y = tf.einsum('ibe,ed->ibd', y, proj_w)\n",
145 | " else:\n",
146 | " proj_w = None\n",
147 | " ret_params = [lookup_table, proj_W]\n",
148 | " else:\n",
149 | " tables, projs = [], []\n",
150 | " curoff_ends = [0] + cutoffs + [n_token]\n",
151 | " x_size = tf.shape(x)\n",
152 | " y = tf.zeros([x_size[0], x_size[1], d_proj])\n",
153 | " for i in range(len(cutoff_ends) - 1):\n",
154 | " with tf.variable_scope('cutoff_{}'.format(i)):\n",
155 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n",
156 | " mask = (x >= l_idx) & (x < r_idx)\n",
157 | " cur_x = tf.boolean_mask(x, mask) - l_idx\n",
158 | " cur_d_embed = d_embed // (div_val ** i)\n",
159 | " lookup_table = tf.get_variable('lookup_table', [r_idx - l_idx, cur_d_embed].\n",
160 | " initializer = initializer)\n",
161 | " cur_y = embedding_lookup(lookup_table, cur_x, use_tpu = False)\n",
162 | " if d_proj == cur_d_embed and not proj_same_dim:\n",
163 | " proj_W = None\n",
164 | " else:\n",
165 | " proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],\n",
166 | " initializer = proj_initializer)\n",
167 | " cur_y = tf.einsum('id,de->ie', cur_y, proj_W)\n",
168 | " mask_idx = tf.to_int64(tf.where(mask))\n",
169 | " y += tf.scatter_nd(mask_idx, cur_y, tf.to_int64(tf.shape(y)))\n",
170 | " tables.append(lookup_table)\n",
171 | " projs.append(proj_W)\n",
172 | " ret_params = [tables, projs]\n",
173 | " \n",
174 | " y *= emb_scale\n",
175 | " return y, ret_params\n",
176 | "\n",
177 | "def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,\n",
178 | " proj_initializer, div_val = 1, perms = None,\n",
179 | " proj_same_dim = True, scope = 'adaptive_embed'):\n",
180 | " #만약 perm이 None이라면\n",
181 | " #W = W1 X W2와 같이 각각 projection되고, 그 다음에 X x W (embedding lookup)을 계산\n",
182 | " #None이 아니라면\n",
183 | " #bin-based embedding lookup을 사용\n",
184 | "\n",
185 | " emb_scale = d_proj ** 0.5\n",
186 | " with tf.variable_scope(scope):\n",
187 | " if div_val == 1:\n",
188 | " lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], initializer = initializer)\n",
189 | " y = embedding_lookup(lookup_table, x)\n",
190 | " if d_proj != d_embed:\n",
191 | " proj_W = tf.get_variable('proj_W', [d_embed, d_proj], initializer = proj_initializer)\n",
192 | " y = tf.einsum('ibe,ed->ibd', y, proj_W)\n",
193 | " else:\n",
194 | " proj_W = None\n",
195 | " ret_params = [lookup_table, proj_W]\n",
196 | " else:\n",
197 | " tables, projs = [], []\n",
198 | " cutoff_ends = [0] + cutoffs + [n_token]\n",
199 | " x_size = tf.shape(x)\n",
200 | " if perms is None:\n",
201 | " cat_lookup = []\n",
202 | " else:\n",
203 | " cat_lookup = tf.zeros([x_size[0], x_size[1], d_proj])\n",
204 | " for i in range(len(cutoff_ends) - 1):\n",
205 | " with tf.variable_scope('cutoff_{}'.format(i)):\n",
206 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n",
207 | " cur_d_embed = d_embed // (div_val ** i)\n",
208 | " lookup_table = tf.get_variable('lookup_table',\n",
209 | " [r_idx - l_idx, cur_d_embed],\n",
210 | " initializer = initializer)\n",
211 | " if cur_d_embed == d_proj and not proj_same_dim:\n",
212 | " proj_W = None\n",
213 | " else:\n",
214 | " proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],\n",
215 | " initializer = proj_initializer)\n",
216 | " if perms is None:\n",
217 | " cat_lookup.append(tf.einsum('ie,ed->id', lookup_table, proj_W))\n",
218 | " else:\n",
219 | " if i == 0:\n",
220 | " cur_y = embedding_lookup(lookup_table, tf.minimum(x, r_idx - 1))\n",
221 | " if proj_W is not None:\n",
222 | " cur_y = tf.einsum('ibe,ed->ibd', cur_y, proj_W)\n",
223 | " cur_y *= perms[i][:, :, None]\n",
224 | " cat_lookup += cur_y\n",
225 | " else:\n",
226 | " cur_x = tf.einsum('ib,ibk->k', tf.to_float(x - l_idx), perms[i])\n",
227 | " cur_x = tf.to_int32(cur_x)\n",
228 | " cur_y = embedding_lookup(lookup_table, cur_x)\n",
229 | " if proj_W is not None:\n",
230 | " cur_y = tf.einsum('ke,ed->kd', cur_y, proj_W)\n",
231 | " cat_lookup += tf.einsum('kd,idk->ibd', cur_y, perms[i])\n",
232 | " tables.append(lookup_table)\n",
233 | " projs.append(proj_W)\n",
234 | " if perms is None:\n",
235 | " cat_lookup = tf.concat(cat_lookup, 0)\n",
236 | " y = embedding_lookup(cat_lookup, x)\n",
237 | " else:\n",
238 | " y = cat_lookup\n",
239 | " ret_params = [tables, projs]\n",
240 | " \n",
241 | " y *= emb_scale\n",
242 | " return y, ret_params\n",
243 | "\n",
244 | "def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs, params,\n",
245 | " tie_projs, initializer = None, proj_initializer = None,\n",
246 | " div_val = 1, scope = 'adaptive_softmax', proj_same_dim = True,\n",
247 | " return_mean = True, **kwargs):\n",
248 | " def _logit(x, W, b, proj):\n",
249 | " y = x\n",
250 | " if proj is not None:\n",
251 | " y = tf.einsum('ibd,ed->ibe', y, proj)\n",
252 | " return tf.einsum('ibd, nd->ibn', y, W) + b\n",
253 | "\n",
254 | " params_W, params_projs = params[0], params[1]\n",
255 | "\n",
256 | " def _gather_logprob(logprob, target):\n",
257 | " lp_size = tf.shape(logprob)\n",
258 | " r = tf.range(lp_size[0])\n",
259 | " idx = tf.stack([r, target], 1)\n",
260 | " return tf.gather_nd(logprob, idx)\n",
261 | "\n",
262 | " with tf.variable_scope(scope):\n",
263 | " if len(cutoffs) == 0:\n",
264 | " softmax_b = tf.get_variable('bias', [n_token],\n",
265 | " initializer = tf.zeros_initializer())\n",
266 | " output = _logit(hidden, prams_W, softmax_b, params_projs)\n",
267 | " nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = target, logits = output)\n",
268 | " else:\n",
269 | " cutoff_ends = [0] + cutoffs + [n_token]\n",
270 | " nll = tf.zeros_like(target, dtype = tf.float32)\n",
271 | " for i in range(len(cutoff_ends) - 1):\n",
272 | " with tf.variable_scope('cutoff_{}'.format(i)):\n",
273 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n",
274 | " mask = (target >= l_idx) & (target < r_idx)\n",
275 | " mask_idx = tf.where(mask)\n",
276 | " cur_target = tf.boolean_mask(target, mask) - l_idx\n",
277 | " cur_d_embed = d_embed // (div_val ** i)\n",
278 | "\n",
279 | " if div_val == 1:\n",
280 | " cur_W = params_W[l_idx: r_idx]\n",
281 | " else:\n",
282 | " cur_W = params_W[i]\n",
283 | " cur_b = tf.get_variable('b', [r_idx - l_idx], initializer = tf.zeros_initializer())\n",
284 | " if tie_projs[i]:\n",
285 | " if div_val == 1:\n",
286 | " cur_proj = params_projs\n",
287 | " else:\n",
288 | " cur_proj = params_projs[i]\n",
289 | " else:\n",
290 | " if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:\n",
291 | " cur_proj = None\n",
292 | " else:\n",
293 | " cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],\n",
294 | " initializer = proj_initializer)\n",
295 | " if i == 0:\n",
296 | " cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],\n",
297 | " initializer = tf.zeros_initializer())\n",
298 | " cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],\n",
299 | " initializer = tf.zeros_initializer())\n",
300 | " cur_W = tf.concat([cur_W, cluster_W], 0)\n",
301 | " cur_b = tf.concat([cur_b, cluster_b], 0)\n",
302 | "\n",
303 | " head_logit = _logit(hidden, cur_W, cur_b, cur_proj)\n",
304 | " head_logprob = tf.nn.log_softmax(head_logit)\n",
305 | " cur_head_logprob = tf.boolean_mask(head_logprob, mask)\n",
306 | " cur_logprob = _gather_logprob(cur_head_logprob, cur_target)\n",
307 | " else:\n",
308 | " cur_head_logprob = tf.boolean_mask(head_logprob, mask)\n",
309 | " cur_hidden = tf.boolean_mask(hidden_mask)\n",
310 | " tail_logit = tf.squeeze(_logit(cur_hidden[None], cur_W, cur_b, cur_proj), 0)\n",
311 | " tail_logprob = tf.nn.log_softmax(tail_logit)\n",
312 | " cur_logprob = (cur_head_logprob[:, cutoff_ends[1]+i-1] + _gather_logprob(tail_logprob, cur_target))\n",
313 | " nll += tf.scatter_nd(mask_idx, -cur_logprob, tf.to_int64(tf.shape(nll)))\n",
314 | "\n",
315 | " if return_mean:\n",
316 | " nll = tf.reduce_mean(nll)\n",
317 | " return nll\n",
318 | "\n",
319 | "def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,\n",
320 | " params, tie_projs,\n",
321 | " initializer=None, proj_initializer=None,\n",
322 | " div_val=1, perms=None, proj_same_dim=True,\n",
323 | " scope='adaptive_softmax',\n",
324 | " **kwargs):\n",
325 | " def _logit(x, W, b, proj):\n",
326 | " y = x\n",
327 | " if x.shape.ndims == 3:\n",
328 | " if proj is not None:\n",
329 | " y = tf.einsum('ibd,ed->ibe', y, proj)\n",
330 | " return tf.einsum('ibd,nd->ibn', y, W) + b\n",
331 | " else:\n",
332 | " if proj is not None:\n",
333 | " y = tf.einsum('id,ed->ie', y, proj)\n",
334 | " return tf.einsum('id,nd->in', y, W) + b\n",
335 | "\n",
336 | " params_W, params_projs = params[0], params[1]\n",
337 | "\n",
338 | " with tf.variable_scope(scope):\n",
339 | " if len(cutoffs) == 0:\n",
340 | " softmax_b = tf.get_variable('bias', [n_token], initializer = tf.zeros_initializer())\n",
341 | " output = _logit(hidden, params_W, softmax_b, params_projs)\n",
342 | " nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = target, logits = output)\n",
343 | " nll = tf.reduce_mean(nll)\n",
344 | " else:\n",
345 | " total_loss, total_cnt = 0, 0\n",
346 | " cutoff_ends = [0] + cutoffs + [n_token]\n",
347 | " for i in range(len(cutoff_ends) - 1):\n",
348 | " with tf.variable_scope('cutoff_{}'.format(i)):\n",
349 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n",
350 | "\n",
351 | " cur_d_embed = d_embed // (div_val ** i)\n",
352 | "\n",
353 | " if div_val == 1:\n",
354 | " cur_W = params_W[l_idx: r_idx]\n",
355 | " else:\n",
356 | " cur_W = params_W[i]\n",
357 | " cur_b = tf.get_variable('b', [r_idx - l_idx], initializer = tf.zeros_initializer())\n",
358 | "\n",
359 | " if tie_projs[i]:\n",
360 | " if div_val == 1:\n",
361 | " cur_proj = params_projs\n",
362 | " else:\n",
363 | " cur_proj = params_projs[i]\n",
364 | " else:\n",
365 | " if (div_val == 1 of not proj_same_dim) and d_proj == cur_d_embed:\n",
366 | " cur_proj = None\n",
367 | " else:\n",
368 | " cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj], initializer = tf.zeros_initializer())\n",
369 | "\n",
370 | " if i == 0:\n",
371 | " cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],\n",
372 | " initializer = tf.zeros_initializer())\n",
373 | " cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],\n",
374 | " initializer = tf.zeros_initializer())\n",
375 | " cur_W = tf.concat([cur_W, cluster_W], 0)\n",
376 | " cur_b = tf.concat([cur_b, cluster_b], 0)\n",
377 | "\n",
378 | " head_logit = _logit(hidden, cur_W, cur_b, cur_proj)\n",
379 | "\n",
380 | " head_target = kwargs.get('head_target')\n",
381 | " head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
382 | " labels = head_target,\n",
383 | " logits = head_logit\n",
384 | " )\n",
385 | "\n",
386 | " masked_loss = head_nll * perms[i]\n",
387 | " total_loss += tf.reduce_sum(masked_loss)\n",
388 | " total_cnt += tf.reduce_sum(perms[i])\n",
389 | "\n",
390 | " else:\n",
391 | " cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])\n",
392 | "\n",
393 | " cur_hidden_tf.einsum('ibd,ibk->kd', hidden, perms[i])\n",
394 | " tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)\n",
395 | "\n",
396 | " tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx), perms[i])\n",
397 | " tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
398 | " labels = tf.to_int43(tail_target), logits = tail_logit\n",
399 | " )\n",
400 | "\n",
401 | " sum_nll = cur_head_nll + tail_nll\n",
402 | " mask = tf.reduce_sum(perms[i], [0, 1])\n",
403 | "\n",
404 | " masked_loss = sum_nll * mask\n",
405 | " total_loss += tf.reduce_sum(masked_loss)\n",
406 | " total_cnt += tf.reduce_sum(mask)\n",
407 | "\n",
408 | " nll = total_loss / total_cnt\n",
409 | "\n",
410 | " return nll\n",
411 | "\n",
412 | "def _create_mask(qlen, mlen, same_length = False):\n",
413 | " attn_mask = tf.ones([qlen, qlen])\n",
414 | " mask_u = tf.matrix_band_part(attn_mask, 0, -1)\n",
415 | " mask_dia = tf.matrix_band_part(attn_mask, 0, 0)\n",
416 | " attn_mask_pad = tf.zeros([qlen, mlen])\n",
417 | " ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)\n",
418 | " if same_length:\n",
419 | " mask_l = tf.matrix_band_part(attn_mask, -1, 0)\n",
420 | " ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)\n",
421 | " return ret\n",
422 | "\n",
423 | "def _cache_mem(curr_out, prev_mem, mem_len = None):\n",
424 | " if mem_len is None or prev_mem is None:\n",
425 | " new_mem = curr_out\n",
426 | " elif mem_len == 0:\n",
427 | " return prev_mem\n",
428 | " else:\n",
429 | " new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:]\n",
430 | "\n",
431 | " return tf.stop_gradient(new_mem)\n",
432 | "\n",
433 | "def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,\n",
434 | " n_head, d_head, d_inner, dropout, dropatt,\n",
435 | " initializer, is_training, proj_initializer=None,\n",
436 | " mem_len=None, cutoffs=[], div_val=1, tie_projs=[],\n",
437 | " same_length=False, clamp_len=-1, use_tpu=True,\n",
438 | " input_perms=None, target_perms=None, head_target=None,\n",
439 | " untie_r=False, proj_same_dim=True,\n",
440 | " scope='transformer'):\n",
441 | " new_mems = []\n",
442 | " with tf.variable_scope(scope):\n",
443 | " if untie_r:\n",
444 | " r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],\n",
445 | " initializer = initializer)\n",
446 | " r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],\n",
447 | " initializer = initializer)\n",
448 | " else:\n",
449 | " r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],\n",
450 | " initializer = initializer)\n",
451 | " r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],\n",
452 | " initializer = initializer)\n",
453 | " \n",
454 | " qlen = tf.shape(dec_inp)[0]\n",
455 | " mlen = tf.shape(mems[0])[0] is mems is not None else 0\n",
456 | " klen = mlen + qlen\n",
457 | "\n",
458 | " if proj_initializer is None:\n",
459 | " proj_initializer = initializer\n",
460 | " lookup_fn = (mul_adaptive_embedding_lookup is use_tpu else\n",
461 | " mask_adaptive_embedding_lookup)\n",
462 | " embeddings, shared_params = lookup_fn(\n",
463 | " x=dec_inp,\n",
464 | " n_token=n_token,\n",
465 | " d_embed=d_embed,\n",
466 | " d_proj=d_model,\n",
467 | " cutoffs=cutoffs,\n",
468 | " initializer=initializer,\n",
469 | " proj_initializer=proj_initializer,\n",
470 | " div_val= div_val,\n",
471 | " perms=input_perms,\n",
472 | " proj_same_dim=proj_same_dim)\n",
473 | " \n",
474 | " attn_mask = _create_mask(qlen, mlen, same_length)\n",
475 | "\n",
476 | " pos_seq = tf.range(klen - 1, -1, -1.0)\n",
477 | " if clasm_len > 0:\n",
478 | " pos_seq = tf.minimum(pos_seq, clamp_len)\n",
479 | " inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))\n",
480 | " pos_emb = positional_embedding(pos_seq, inv_freq)\n",
481 | "\n",
482 | " output = tf.layers.dropout(embeddings, dropot, training = is_training)\n",
483 | " pos_emb = tf.layers.dropout(pos_emb, dropout, training = is_training)\n",
484 | "\n",
485 | " if mems is None:\n",
486 | " mems = [None] * n_layer\n",
487 | "\n",
488 | " for i in range(n_layer):\n",
489 | " new_mems.append(_cache_mem(output, mems[i], mem_len))\n",
490 | "\n",
491 | " with tf.variable_scope('layer_{}'.format(i)):\n",
492 | " output = rel_multihead_attn(\n",
493 | " w=output,\n",
494 | " r=pos_emb,\n",
495 | " r_w_bias=r_w_bias if not untie_r else r_w_bias[i],\n",
496 | " r_r_bias=r_r_bias if not untie_r else r_r_bias[i],\n",
497 | " attn_mask=attn_mask,\n",
498 | " mems=mems[i],\n",
499 | " d_model=d_model,\n",
500 | " n_head=n_head,\n",
501 | " d_head=d_head,\n",
502 | " dropout=dropout,\n",
503 | " dropatt=dropatt,\n",
504 | " is_training=is_training,\n",
505 | " kernel_initializer=initializer\n",
506 | " )\n",
507 | " output = positionwise_FF(\n",
508 | " inp=output,\n",
509 | " d_model=d_model,\n",
510 | " d_inner=d_inner,\n",
511 | " dropout=dropout,\n",
512 | " kernel_initializer=initializer,\n",
513 | " is_training=is_training\n",
514 | " )\n",
515 | "\n",
516 | " output = tf.layers.dropout(output, dropout, training = is_training)\n",
517 | "\n",
518 | " logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else\n",
519 | " mask_adaptive_logsoftmax)\n",
520 | " loss = logsoftmax_fn(\n",
521 | " hidden=output,\n",
522 | " target=target,\n",
523 | " n_token=n_token,\n",
524 | " d_embed=d_embed,\n",
525 | " d_proj=d_model,\n",
526 | " cutoffs=cutoffs,\n",
527 | " params=shared_params,\n",
528 | " tie_projs=tie_projs,\n",
529 | " initializer=initializer,\n",
530 | " proj_initializer=proj_initializer,\n",
531 | " div_val=div_val,\n",
532 | " perms=target_perms,\n",
533 | " head_target=head_target,\n",
534 | " proj_same_dim=proj_same_dim\n",
535 | " )\n",
536 | "\n",
537 | " return loss, new_mems"
538 | ]
539 | }
540 | ]
541 | }
542 |
--------------------------------------------------------------------------------
/Natural Language Processing/Transformer/README.md:
--------------------------------------------------------------------------------
1 | # Transformer Implementation
2 |
3 | Written with reference to https://github.com/tunz/transformer-pytorch/blob/e7266679f0b32fd99135ea617213f986ceede056/model/transformer.py#L201
4 |
5 | Transformer paper review: https://cartinoe5930.tistory.com/entry/Transformer-Attention-Is-All-You-Need-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0
6 |
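7 | ## Mask helpers (assumed)
8 | 
9 | The notebook imports `utils.create_pad_mask` and `utils.create_trg_self_mask` without showing them. The sketch below is an assumption about what these helpers look like, written to be consistent with how the masks are used in the notebook (`True` marks positions that are masked out); it is not the referenced repository's code.
10 | 
11 | ```python
12 | import torch
13 | 
14 | def create_pad_mask(t, pad_idx):
15 |     # (batch, 1, seq_len); True where the token is padding
16 |     return (t == pad_idx).unsqueeze(1)
17 | 
18 | def create_trg_self_mask(target_len, device=None):
19 |     # (1, target_len, target_len); True above the diagonal so a position
20 |     # cannot attend to future positions
21 |     ones = torch.ones(target_len, target_len, device=device)
22 |     return torch.triu(ones, diagonal=1).bool().unsqueeze(0)
23 | ```
24 | 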
--------------------------------------------------------------------------------
/Natural Language Processing/Transformer/Transformer_구현_실습.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyPEjZ5/XN13lrmM3kUVgIFW",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "nYoZgseydKyf"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import math\n",
38 | "\n",
39 | "import torch\n",
40 | "import torch.nn as nn\n",
41 | "import torch.nn.function as F\n",
42 | "\n",
43 | "from utils import utils\n",
44 | "\n",
45 | "def initialize_weight(x):\n",
46 | " nn.init.xavier_uniform_(x.weight)\n",
47 | " if x.bias is not None:\n",
48 | " nn.init.constant_(x.bias, 0)\n",
49 | "\n",
50 | "class FeedForwardNetwork(nn.Module):\n",
51 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n",
52 | " super(FeedForwardNetwork, self).__init__()\n",
53 | "\n",
54 | " self.layer1 = nn.Linear(hidden_size, filter_size)\n",
55 | " self.relu = nn.ReLU()\n",
56 | " self.dropout = nn.Dropout(dropout_rate)\n",
57 | " self.layer2 = nn.Linear(filter_size, hidden_size)\n",
58 | "\n",
59 | " initialize_weight(self.layer1)\n",
60 | " initialize_weight(self.layer2)\n",
61 | "\n",
62 | " def forward(self, x):\n",
63 | " x = self.layer1(x)\n",
64 | " x = self.relu(x)\n",
65 | " x = self.dropout(x)\n",
66 | " x = self.layer2(x)\n",
67 | " return x\n",
68 | "\n",
69 | "class MultiHeadAttention(nn.Moculde):\n",
70 | " def __init__(self, hidden_size, dropout_rate, head_size = 8):\n",
71 | " super(MultiHeadAttention, self).__init__()\n",
72 | "\n",
73 | " self.head_size = head_size\n",
74 | "\n",
75 | " self.att_size = att_size = hidden_size // head_size\n",
76 | " self.scale = arr_size ** -0.5\n",
77 | "\n",
78 | " self.linear_q = nn.Linear(hidden_size, head_size * att_size, bias = False)\n",
79 | " self.linear_k = nn.Linear(hidden_size, head_size * att_size, bias = False)\n",
80 | " self.linear_v = nn.Linear(hidden_size, head_size * att_size, bias = False)\n",
81 | " initialize_weight(self.linear_q)\n",
82 | " initialize_weight(self.linear_k)\n",
83 | " initialize_weight(self.linear_v)\n",
84 | "\n",
85 | " self.att_dropout = nn.Dropout(dropout_rate)\n",
86 | " \n",
87 | " self.output_layer = nn.Linear(head_size * att_size, hidden_size, bias = False)\n",
88 | " initialize_weight(self.output_layer)\n",
89 | "\n",
90 | " def forward(self, q, k, v, mask, cache = None):\n",
91 | " orig_q_size = q.size()\n",
92 | "\n",
93 | " d_k = self.att_size\n",
94 | " d_v = self.att_size\n",
95 | " batch_size = q.size(0)\n",
96 | "\n",
97 | " #head_i = Attention(Q(W^Q)_i, K(W^K)_i, V(W^V)_i)\n",
98 | " q = self.linear_q(q).view(batch_size, -1, self.head_size, d_k)\n",
99 | " if cache is not None and 'endec_k' in cache:\n",
100 | " k, v = cache['endec_k'], cahce['endec_v']\n",
101 | " else:\n",
102 | " k = self.linear_k(k).view(bacth_size, -1, self.head_size, d_k)\n",
103 | " v = self.linear_v(v).view(batch_size, -1, self.head_size, d_v)\n",
104 | "\n",
105 | " if cache is not None:\n",
106 | " cache['endec_k'], cache['endec_v'] = k, v\n",
107 | "\n",
108 | " q = q.transpose(1, 2) # [b, h, q_len, d_k]\n",
109 | " v = v.transpose(1, 2) # [b, h, v_len, d_v]\n",
110 | " k = k.transpose(1, 2).transpose(2, 3) # [b, h, d_k, k_len]\n",
111 | "\n",
112 | " #Scaled Dot-Product Attention\n",
113 | " #Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V\n",
114 | " q.mul_(self.scale)\n",
115 | " x = torch.matmul(q, k) # [b, h, q_len, k_len]\n",
116 | " x.masked_fill_(mask.unsqueeze(1), -1e9)\n",
117 | " x = torch.softmax(x, dim = 3)\n",
118 | " x = self.att_dropout(x)\n",
119 | " x = x.matmul(v) # [b, h, q_len, attn]\n",
120 | "\n",
121 | " x = x.transpose(1, 2).contiguous() # [b, q_len, h, attn]\n",
122 | " x = x.view(batch_size, -1, self.head_size * d_v)\n",
123 | "\n",
124 | " x = self.output_layer(x)\n",
125 | "\n",
126 | " assert x.size() == orig_q_size\n",
127 | " return x\n",
128 | "\n",
129 | "class EncoderLayer(nn.Module):\n",
130 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n",
131 | " super(EncoderLayer, self).__init__()\n",
132 | "\n",
133 | " self.self_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n",
134 | " self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)\n",
135 | " self.self_attention_dropout = nn.Dropout(dropout_rate)\n",
136 | "\n",
137 | " self.enc_dec_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n",
138 | " self.enc_dec_attention = MultiHeadAttention(hidden_size, dropout_rate)\n",
139 | " self.enc_dec_attention_dropout = nn.Dropout(dropout_rate)\n",
140 | "\n",
141 | " self.ffn_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n",
142 | " self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)\n",
143 | " self.ffn_dropout = nn.Dropout(dropout_rate)\n",
144 | "\n",
145 | " def forward(self, x, enc_output, self_mask, i_mask, cache):\n",
146 | " y = self.self_attention_norm(x)\n",
147 | " y = self.self_attention(y, y, y, self_mask) #(q, k, v, mask)\n",
148 | " y = self.self_attention_dropout(y)\n",
149 | " x = x + y #skip connection\n",
150 | "\n",
151 | " y = self.ffn_norm(x)\n",
152 | " y = ffn(y)\n",
153 | " y = self.ffn_dropout(y)\n",
154 | " x = x + y #skip connection\n",
155 | " return x\n",
156 | "\n",
157 | "class DecoderLayer(nn.Module):\n",
158 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n",
159 | " super(DecoderLayer, self).__init__()\n",
160 | "\n",
161 | " self.self_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n",
162 | " self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)\n",
163 | " self.self_attention_dropout = nn.Dropout(dropout_rate)\n",
164 | "\n",
165 | " self.enc_dec_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n",
166 | " self.enc_dec_attention = MultiHeadAttention(hidden_size, dropout_rate)\n",
167 | " self.enc_dec_attention_dropout = nn.Dropout(dropout_rate)\n",
168 | "\n",
169 | " self.ffn_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n",
170 | " self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)\n",
171 | " self.ffn_dropout = nn.Dropout(dropout_rate)\n",
172 | "\n",
173 | " def forward(self, x, enc_output, self_mask, i_mask, cache):\n",
174 | " y = self.self_attention_norm(x)\n",
175 | " y = self.self_attention(y, y, y, self_mask)\n",
176 | " y = self.self_attention_dropout(y)\n",
177 | " x = x + y\n",
178 | "\n",
179 | " if enc_output is not None:\n",
180 | " y = self.self_attention_norm(x)\n",
181 | " y = self.self_attention(y, enc_output, enc_output, i_mask, cache)\n",
182 | " y = self.enc_dec_attention_dropout(y)\n",
183 | " x = x + y\n",
184 | "\n",
185 | " y = self.ffn_norm(x)\n",
186 | " y = self.ffn(y)\n",
187 | " y = self.ffn_dropout(y)\n",
188 | " x = x + y\n",
189 | " return x\n",
190 | "\n",
191 | "class Encoder(nn.Module):\n",
192 | " def __init__(self, hidden_size, filter_size, dropout_rate, n_layers):\n",
193 | " super(Encoder, self).__init__()\n",
194 | "\n",
195 | " encoders = [EncoderLayer(hidden_size, filter_size, dropout_rate) for _ in range(n_layers)]\n",
196 | " self.layers = nn.ModuleList(encoders)\n",
197 | "\n",
198 | " self.last_norm = nn.LayerNorm(gidden_size, eps = 1e-6)\n",
199 | "\n",
200 | " def forward(self, inputs, mask):\n",
201 | " encoder_output = inputs\n",
202 | " for enc_layer in self.layers:\n",
203 | " encoder_output = enc_layer(encoder_output, mask)\n",
204 | " return self.last_norm(encoder_output)\n",
205 | "\n",
206 | "class Decoder(nn.Module):\n",
207 | " def __init__(self, hidden_size, filter_size, dropout_rate, n_layers):\n",
208 | " super(Decoder, self).__init__()\n",
209 | "\n",
210 | " decoders = [DecoderLayer(hidden_size, filter_size, dropout_rate) for _ in range(n_layers)]\n",
211 | " self.layers = nn.ModuleList(decoders)\n",
212 | "\n",
213 | " self.last_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n",
214 | "\n",
215 | " def forward(self, targets, enc_output, i_mask, t_self_mask, cache):\n",
216 | " decoder_output = targets\n",
217 | " for i, dec_layer in enumerate(self.layers):\n",
218 | " layer_cache = None\n",
219 | " if cache is not None:\n",
220 | " if i not in cache:\n",
221 | " cache[i] = {}\n",
222 | " layer_cache = cache[i]\n",
223 | " decoder_output = dec_layer(decoder_output, enc_output, t_self_mask, i_mask, layer_cache)\n",
224 | "\n",
225 | " return self.last_norm(decoder_output)\n",
226 | "\n",
227 | "class Transformer(nn.Module):\n",
228 | " def __init__(self, i_vocab_size, t_vocab_size, n_layers = 6, hidden_size = 512, \n",
229 | " filter_size = 2048, dropout_rate = 0.1, share_target_embedding = True,\n",
230 | " has_inputs = True, src_pad_idx = None, trg_pad_idx = None):\n",
231 | " super(Transformer, self).__init__()\n",
232 | "\n",
233 | " self.hidden_size = hidden_size\n",
234 | " self.emb_scale = hidden_size ** 0.5\n",
235 | " self.has_inputs = has_inputs\n",
236 | " self.src_pad_idx = src_pad_idx\n",
237 | " self.trg_pad_idx = trg_pad_idx\n",
238 | "\n",
239 | " self.t_vocab_embedding = nn.Embedding(t_vocab_size, hidden_size)\n",
240 | " nn.init.normal_(self.t_vocab_embedding.weight, mead = 0, std = hidden_size ** -0.5)\n",
241 | " self.t_emb_dropout = nn.Dropout(dropout_rate)\n",
242 | " self.decoder = Decoder(hidden_size, filter_size, dropout_rate, n_layers)\n",
243 | "\n",
244 | " if has_inputs:\n",
245 | " if not share_target_embedding:\n",
246 | " self.i_vocab_embedding = nn.Embedding(i_vocab_size, hidden_size)\n",
247 | " nn.init.normal_(self.i_vocab_embedding.weight, mean = 0, std = hidden_size ** -0.5)\n",
248 | " else:\n",
249 | " self.i_vocab_embedding = self.t_vocab_embedding\n",
250 | "\n",
251 | " self.i_emb_dropout = nn.Dropout(dropout_rate)\n",
252 | "\n",
253 | " self.encoder = Encoder(hidden_size, filter_size, dropout_rate, n_layers)\n",
254 | "\n",
255 | " #Positional Encoding\n",
256 | " num_timescales = self.hidden_size // 2\n",
257 | " max_timescale = 10000.0\n",
258 | " min_timescale = 1.0\n",
259 | " log_timescale_increment = (\n",
260 | " math.log(floast(max_timescale) / float(min_timescale)) / \n",
261 | " max(num_timescale - 1, 1))\n",
262 | " inv_timescales = min_timescale * torch.exp(\n",
263 | " torch.arange(num_timescales, dtype = torch.float32) * \n",
264 | " -log_timescale_increment)\n",
265 | " self.register_buffer('inv_timescales', inv_timescales)\n",
266 | "\n",
267 | " def forward(self, inputs, targets):\n",
268 | " enc_output, i_mask = None, None\n",
269 | " if self.has_inputs:\n",
270 | " i_mask = utils.create_pad_mask(inputs, self.src_pad_idx)\n",
271 | " enc_output = self.encode(inputs, i_mask)\n",
272 | "\n",
273 | " t_mask = utils.create_pad_mask(targets, self.trg_pad_idx)\n",
274 | " target_size = targets.size()[1]\n",
275 | " t_self_mask = utils.create_trg_self_mask(target_size, device = targets.device)\n",
276 | "\n",
277 | " return self.decode(targets, enc_output, i_mask, t_self_mask, t_mask)\n",
278 | "\n",
279 | " def encode(self, inputs, i_mask):\n",
280 | " #Input Embedding\n",
281 | " input_embedded = self.i_vocab_embedding(inputs)\n",
282 | " input_embedded.masked_fill_(i_mask.squeeze(1).unaqueeze(-1), 0)\n",
283 | " input_embedded *= self.emb_scale\n",
284 | " input_embedded += self.get_position_encoding(inputs)\n",
285 | " input_embedded = self.i_emb_dropout(input_embedded)\n",
286 | "\n",
287 | " return self.encoder(input_embedded, i_mask)\n",
288 | "\n",
289 | " def decoder(self, targets, enc_output, i_mask, t_self_mask, t_mask, cache = None):\n",
290 | " #target embedding\n",
291 | " target_embedded = self.t_vocab_embedding(targets)\n",
292 | " target_embedded.masked_fill(t_mask.squeeze(1).unsqueeze(-1), 0)\n",
293 | "\n",
294 | " #Shfting\n",
295 | " target_embedded = target_embedded[:, :-1]\n",
296 | " target_embedded = F.pad(target_embedded, (0, 0, 1, 0))\n",
297 | "\n",
298 | " target_embedded *= self.emb_scale\n",
299 | " target_embedded += self.get_position_encoding(targets)\n",
300 | " target_embedded = self.t_emb_dropout(target_embedded)\n",
301 | "\n",
302 | " #decoder\n",
303 | " decoder_output = self.decoder(target_embedded, enc_output, i_mask, t_self_mask, cache)\n",
304 | "\n",
305 | " #linear\n",
306 | " output = torch.matmul(decoder_output, self.t_vocab_embedding.weight.transpose(0, 1))\n",
307 | "\n",
308 | " return output\n",
309 | "\n",
310 | " def get_position_encoding(self, x):\n",
311 | " max_length = x.size()[1]\n",
312 | " position = torch.arange(max_length, dtype = torch.float32, device = x.device)\n",
313 | " scaled_time = position.unsqueeze(1) * self.inv_timescales.unsqueeze(0)\n",
314 | " signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim = 1)\n",
315 | " signal = F.pad(signal, (0, 0, 0, self.hidden_size % 2))\n",
316 | " signal = signal.view(1, max_length, self.hidden_size)\n",
317 | " return signal"
318 | ]
319 | }
320 | ]
321 | }
322 |
--------------------------------------------------------------------------------
/Natural Language Processing/XLNet/README.md:
--------------------------------------------------------------------------------
1 | # XLNet Implementation
2 | 
3 | Written with reference to https://github.com/graykode/xlnet-Pytorch/blob/master/xlnet.py
4 |
5 | paper review: https://cartinoe5930.tistory.com/entry/XLNet-Generalized-Autoregressive-Pretraining-for-Language-Understanding-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0
6 |
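7 | ## Usage sketch
8 | 
9 | A minimal, hypothetical sketch of how the wrapper classes defined in `XLNet.ipynb` fit together (TF 1.x style; `FLAGS`, `input_ids`, `seg_ids`, and `input_mask` are assumed to be defined elsewhere, and `modeling.py` from the original XLNet repository is assumed to be on the path). This is an illustration, not code from the referenced repository.
10 | 
11 | ```python
12 | xlnet_config = XLNetConfig(json_path='xlnet_config.json')
13 | run_config = create_run_config(is_training=True, is_finetune=True, FLAGS=FLAGS)
14 | 
15 | xlnet_model = XLNetModel(
16 |     xlnet_config=xlnet_config,
17 |     run_config=run_config,
18 |     input_ids=input_ids,      # (seq_len, batch_size) int32
19 |     seg_ids=seg_ids,          # (seq_len, batch_size) int32
20 |     input_mask=input_mask)    # (seq_len, batch_size) float32
21 | 
22 | summary = xlnet_model.get_pooled_out(summary_type='last')  # pooled output, e.g. for a classification head
23 | seq_out = xlnet_model.get_sequence_output()                # per-token hidden states
24 | ```
25 | 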
--------------------------------------------------------------------------------
/Natural Language Processing/XLNet/XLNet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyO7PtJswjOPtdrbUQcICrcp",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "GgYufJCgpn7Z"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "from __future__ import absolute_import\n",
38 | "from __future__ import division\n",
39 | "from __future__ import print_function\n",
40 | "\n",
41 | "import json\n",
42 | "import os\n",
43 | "import tensorflow as tf\n",
44 | "import modeling\n",
45 | "\n",
46 | "def _get_initializer(FLAGS):\n",
47 | " # 변수 초기화\n",
48 | " if FLAGS.init == 'uniform':\n",
49 | " initializer = tf.initializers.random_uniform(\n",
50 | " minval = -FLAGS.init_range,\n",
51 | " maxval = FLAGS.init_range,\n",
52 | " seed = None\n",
53 | " )\n",
54 | "\n",
55 | " elif FLAGS.init == 'normal':\n",
56 | " initializer = tf.initializers.random_normal(\n",
57 | " stddev = FLAGS.init_std,\n",
58 | " seed = None\n",
59 | " )\n",
60 | "\n",
61 | " else:\n",
62 | " raise ValueError('Initializer {} not supported'.format(FALGS.init))\n",
63 | " return initializer\n",
64 | "\n",
65 | "class XLNetConfig(object):\n",
66 | " ''' XLNetConfig는 model checkpoint에 특정된 하이퍼 파라미터를 포함하고 있음\n",
67 | " 이 하이퍼 파라미터들은 pre-training 시와 fine-tuning 시에 모두 같아야 함\n",
68 | "\n",
69 | " n_layer: 레이어의 수\n",
70 | " d_model: hidden size\n",
71 | " n_head: attention head의 수\n",
72 | " d_head: 각 attention head의 차원 크기\n",
73 | " d_inner: feed-forward layer에서 hidden size\n",
74 | " ff_activation: 'relu' 또는 'gelu'\n",
75 | " untie_r: attention에서 bias들을 untie할 지 말지 결정\n",
76 | " n_token: vocab_size\n",
77 | " '''\n",
78 | "\n",
79 | " def __init__(self, FLAGS = None, json_path = None):\n",
80 | " '''\n",
81 | " XLNetConfig 구조\n",
82 | " 하나의 FLAGS 또는 json_path는 제공되어야 한다.\n",
83 | " '''\n",
84 | "\n",
85 | " assert FLAGS is not None or json_path is not None\n",
86 | "\n",
87 | " self.keys = ['n_layer', 'd_model', 'n_head', 'd_head', 'd_inner', 'ff_activation', \n",
88 | " 'untie_r', 'n_token']\n",
89 | "\n",
90 | " if FLAGS is not None:\n",
91 | " self.init_from_flags(FLAGS)\n",
92 | "\n",
93 | " if json_path is not None:\n",
94 | " self.init_from_json(json_path)\n",
95 | "\n",
96 | " def init_from_flags(self, FLAGS):\n",
97 | " for key in self.keys:\n",
98 | " setattr(self, key, getattr(FLAGS, key))\n",
99 | "\n",
100 | " def init_from_json(self, FLAGS):\n",
101 | " with tf.gfile.Open(json_path) as f:\n",
102 | " json_data = json.load(f)\n",
103 | " for key in self.keys:\n",
104 | " setattr(self, key, json_data[key])\n",
105 | "\n",
106 | " def to_json(self, json_path):\n",
107 | " # XLNetConfig를 json 파일로 저장\n",
108 | " json_data = {}\n",
109 | " for key in self.keys:\n",
110 | " json_data[key] = getattr(self, key)\n",
111 | "\n",
112 | " json_dir = os.path.dirname(json_path)\n",
113 | " if not tf.gfile.Exists(json_dir):\n",
114 | " tf.gfile.MakeDirs(json_dir)\n",
115 | " with tf.gfile.Open(json_path, 'w') as f:\n",
116 | " json.dump(json_data, f, indent = 4, sort_keys = True)\n",
117 | "\n",
118 | "def create_run_config(is_training, is_finetune, FLAGS):\n",
119 | " kwargs = dict(\n",
120 | " is_training=is_training,\n",
121 | " use_tpu=FLAGS.use_tpu,\n",
122 | " use_bfloat16=FLAGS.use_bfloat16,\n",
123 | " dropout=FLAGS.dropout,\n",
124 | " dropatt=FLAGS.dropatt,\n",
125 | " init=FLAGS.init,\n",
126 | " init_range=FLAGS.init_range,\n",
127 | " init_std=FLAGS.init_std,\n",
128 | " clamp_len=FLAGS.clamp_len\n",
129 | " )\n",
130 | "\n",
131 | " if not is_finetune:\n",
132 | " kwargs.update(dict(\n",
133 | " mem_len=FLAGS.mem_len,\n",
134 | " reuse_len=FLAGS.reuse_len,\n",
135 | " bi_data=FLAGS.bi_data,\n",
136 | " clamp_len=FLAGS.clamp_len,\n",
137 | " same_length=FLAGS.same_length\n",
138 | " ))\n",
139 | "\n",
140 | " return RunConfig(**kwargs)\n",
141 | "\n",
142 | "class RunConfig(object):\n",
143 | " '''\n",
144 | " RunConfig는 pre-training과 fine-tuning에서 서로 다른 하이퍼 파라미터를 가져야 함.\n",
145 | " 이 하이퍼 파라미터들은 실행할 때마다 변경할 수 있다.\n",
146 | " '''\n",
147 | "\n",
148 | " def __init__(self, is_training, use_tpu, use_bfloat16, dropout, dropatt,\n",
149 | " init = 'normal', init_range = 0.1, init_std = 0.02, mem_len = None,\n",
150 | " reuse_len = None, bi_data = False, clamp_len = -1, same_length = False):\n",
151 | " '''\n",
152 | " is_training: 학습 모드인지 아닌지 확인\n",
153 | " use_tpu: TPU를 사용할 지 말 지 확인\n",
154 | " use_bfloat16: float32 대신에 bfloat16 사용\n",
155 | " dropout: dropout 비율\n",
156 | " dropatt: attention 확률에 dropout 비율\n",
157 | " init: 초기화 scheme. 'normal' 또는 'uniform' 둘 중 하나\n",
158 | " init_range: [-init_range, init_range]에서 균일한 분포를 사용해서 파라미터를 초기화\n",
159 | " init='uniform'일 때 가장 효과적임\n",
160 | " mem_len: 캐시해둘 토큰의 수\n",
161 | " reuse_len: 캐시되고 향후 재사용될 현재 배치의 토큰 수이다.\n",
162 | " bi_data: 양방향성 입력 파이프라인을 사용할 지 말 지 정함. \n",
163 | " pre-training 중에는 True를 사용, fine-tuning 중에는 False를 사용\n",
164 | " clamp_len: clamp_len보다 큰 모든 상대 거리를 고정한다다. -1은 클램핑이 없음을 의미한다.\n",
165 | " same_length: 각 토큰에 대해 똑같은 attention length를 사용할 지 말 지 결정\n",
166 | " '''\n",
167 | "\n",
168 | " self.init = init\n",
169 | " self.init_range = init_range\n",
170 | " self.init_std = init_std\n",
171 | " self.is_training = is_training\n",
172 | " self.dropout = dropout\n",
173 | " self.dropatt = dropatt\n",
174 | " self.use_tpu = use_tpu\n",
175 | " self.use_bfloat16 = use_bfloat16\n",
176 | " self.mem_len = mem_len\n",
177 | " self.reuse_len = reuse_len\n",
178 | " self.bi_data = bi_data\n",
179 | " self.clamp_len = clamp_len\n",
180 | " self.same_length = same_length\n",
181 | "\n",
182 | "class XLNetModel(object):\n",
183 | " # pre-training 및 fine-tuning 중에 사용되는 XLNet 모델의 wrapper이다.\n",
184 | "\n",
185 | " def __init__(self, xlnet_config, run_config, input_ids, seg_ids, input_mask,\n",
186 | " memes = None, perm_mask = None, target_mapping = None, inp_q = None,\n",
187 | " **kwargs):\n",
188 | " \n",
189 | " initializer = _get_initializer(run_config)\n",
190 | "\n",
191 | " tfm_args = dict(\n",
192 | " n_token=xlnet_config.n_token,\n",
193 | " initializer=initializer,\n",
194 | " attn_type=\"bi\",\n",
195 | " n_layer=xlnet_config.n_layer,\n",
196 | " d_model=xlnet_config.d_model,\n",
197 | " n_head=xlnet_config.n_head,\n",
198 | " d_head=xlnet_config.d_head,\n",
199 | " d_inner=xlnet_config.d_inner,\n",
200 | " ff_activation=xlnet_config.ff_activation,\n",
201 | " untie_r=xlnet_config.untie_r,\n",
202 | "\n",
203 | " is_training=run_config.is_training,\n",
204 | " use_bfloat16=run_config.use_bfloat16,\n",
205 | " use_tpu=run_config.use_tpu,\n",
206 | " dropout=run_config.dropout,\n",
207 | " dropatt=run_config.dropatt,\n",
208 | "\n",
209 | " mem_len=run_config.mem_len,\n",
210 | " reuse_len=run_config.reuse_len,\n",
211 | " bi_data=run_config.bi_data,\n",
212 | " clamp_len=run_config.clamp_len,\n",
213 | " same_length=run_config.same_length\n",
214 | " )\n",
215 | "\n",
216 | " input_args = dict(\n",
217 | " inp_k=input_ids,\n",
218 | " seg_id=seg_ids,\n",
219 | " input_mask=input_mask,\n",
220 | " mems=mems,\n",
221 | " perm_mask=perm_mask,\n",
222 | " target_mapping=target_mapping,\n",
223 | " inp_q=inp_q\n",
224 | " )\n",
225 | "\n",
226 | " with tf.variable_scope('model', reuse = tf.AUTO_REUSE):\n",
227 | " (self.output, self.new_mems, self.lookup_table) = modeling.transformer_xl(**tfm_args)\n",
228 | "\n",
229 | " self.input_mask = input_mask\n",
230 | " self.initializer = initializer\n",
231 | " self.clnet_config = clnet_config\n",
232 | " self.run_config = run_config\n",
233 | "\n",
234 | " def get_pooled_out(self, summary_type, use_summ_proj = True):\n",
235 | " xlnet_config = self.xlnet_config\n",
236 | " run_config = self.run_config\n",
237 | "\n",
238 | " with tf.variable_scope('model', reuse = tf.AUTO_REUSE):\n",
239 | " summary = modeling.summarize_sequence(\n",
240 | " summary_type=summary_type,\n",
241 | " hidden=self.output,\n",
242 | " d_model=xlnet_config.d_model,\n",
243 | " n_head=xlnet_config.n_head,\n",
244 | " d_head=xlnet_config.d_head,\n",
245 | " dropout=run_config.dropout,\n",
246 | " dropatt=run_config.dropatt,\n",
247 | " is_training=run_config.is_training,\n",
248 | " input_mask=self.input_mask,\n",
249 | " initializer=self.initializer,\n",
250 | " use_proj=use_summ_proj\n",
251 | " )\n",
252 | "\n",
253 | " return summary\n",
254 | "\n",
255 | " def get_sequence_output(self):\n",
256 | " # XLNet의 마지막 레이어의 hidden representation\n",
257 | " \n",
258 | " return self.output\n",
259 | "\n",
260 | " def get_new_memory(self):\n",
261 | " # 이전 메모리와 현재 input representation을 합친 new memory\n",
262 | " # list의 길이는 n_layer와 같음\n",
263 | " return self.new_mems\n",
264 | "\n",
265 | " def get_embedding_table(self):\n",
266 | " # embedding lookup table\n",
267 | " # input 레이어와 output 레이어 간의 embedding tie\n",
268 | " return self.lookup_table\n",
269 | "\n",
270 | " def get_initializer(self):\n",
271 | " # tf initilizer\n",
272 | " # XLNet의 top layer에서 변수들을 초기화하기 위해 사용\n",
273 | " return self.initializer"
274 | ]
275 | }
276 | ]
277 | }
278 |
--------------------------------------------------------------------------------