├── Computer Vision ├── CNN │ ├── DenseNet.ipynb │ ├── EfficientNet.ipynb │ ├── GoogLeNet.ipynb │ ├── MobileNet_구현_실습.ipynb │ ├── README.md │ ├── ResNet.ipynb │ └── Xception.ipynb └── README.md ├── Multimodal Models ├── FLAVA │ ├── Interacting with FLAVA.ipynb │ └── README.md └── README.md ├── Natural Language Processing ├── ALBERT │ ├── ALBERT.ipynb │ └── README.md ├── BERT │ ├── BERT_model.ipynb │ ├── BERT_구현_복습.ipynb │ └── README.md ├── ELECTRA │ ├── ELECTRA.ipynb │ └── README.md ├── ELMo │ ├── ELMo.ipynb │ ├── README.md │ ├── char_cnn.ipynb │ └── character_dataset.ipynb ├── GPT-1 │ ├── GPT-1 Implementation.ipynb │ └── README.md ├── README.md ├── RoBERTa │ ├── README.md │ └── RoBERTa.ipynb ├── Transformer-XL │ ├── README.md │ └── Transformer_XL_구현_실습.ipynb ├── Transformer │ ├── README.md │ ├── Transformer_구현_복습.ipynb │ └── Transformer_구현_실습.ipynb └── XLNet │ ├── README.md │ └── XLNet.ipynb └── README.md /Computer Vision/CNN/DenseNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyP5VtNzKVdcgFotI0cRex0h", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "sGveVsqEBvXg" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import re\n", 38 | "import torch\n", 39 | "import torch.nn as nn\n", 40 | "import torch.nn.functional as F\n", 41 | "import torch.utils.checkpoint as cp\n", 42 | "from collections import OrderedDict\n", 43 | "#from .utils import load_state_dict_from_url\n", 44 | "from torch import Tensor\n", 45 | "from torch.jit.annotations import List\n", 46 | "\n", 47 | "__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']\n", 48 | "\n", 49 | "model_urls = {\n", 50 | " 'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',\n", 51 | " 'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',\n", 52 | " 'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',\n", 53 | " 'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',\n", 54 | "}\n", 55 | "\n", 56 | "#Dense Layer\n", 57 | "class _DenseLayer(nn.Module):\n", 58 | " def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, memory_efficient = False):\n", 59 | " super(_DenseLayer, self).__init__()\n", 60 | " self.add_module('norm1', nn.BatchNorm2d(num_input_features)),\n", 61 | " self.add_module('relu1', nn.ReLU(inplace = True)),\n", 62 | " self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size = 1, \n", 63 | " stride = 1, bias = False)),\n", 64 | " self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),\n", 65 | " self.add_module('relu2', nn.ReLU(inplace = True)),\n", 66 | " self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size = 3,\n", 67 | " stride = 1, padding = 1, bias = False)),\n", 68 | " self.drop_rate = float(drop_rate)\n", 69 | " self.memory_efficient = memory_efficient\n", 70 | "\n", 71 | " #Bacth Normalization 하는 부분\n", 72 | " def bn_function(self, 
inputs):\n", 73 | " # type: List[tensor] -> tensor\n", 74 | " concated_features = torch.cat(inputs, 1)\n", 75 | " bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features)))\n", 76 | " return bottleneck_output\n", 77 | "\n", 78 | " def any_requires_grad(self, input):\n", 79 | " # type: List[tensor] -> bool\n", 80 | " for tensor in input:\n", 81 | " if tensor.requires_grad:\n", 82 | " return True\n", 83 | " return False\n", 84 | "\n", 85 | " @torch.jit.unused\n", 86 | " def call_checkpoint_bottleneck(self, input):\n", 87 | " # type: List[tensor] -> tensor\n", 88 | " def closure(*inputs):\n", 89 | " return self.bn_function(inputs)\n", 90 | "\n", 91 | " return cp.checkpoint(closure, *input)\n", 92 | "\n", 93 | " @torch.jit._overload_method\n", 94 | " def forward(self, input):\n", 95 | " # type: List[tensor] -> tensor\n", 96 | " pass\n", 97 | "\n", 98 | " @torch.jit._overload_method\n", 99 | " def forward(self, input):\n", 100 | " # type: Tensor -> Tensor\n", 101 | " pass\n", 102 | "\n", 103 | " #아직 torchscript는 *args를 지원하지 않기 때문에, List[Tensor] 또는 single tensor를\n", 104 | " #오버로드 하는 방법을 사용\n", 105 | " #순전파\n", 106 | " def forward(self, input):\n", 107 | " if isinstance(input, Tensor):\n", 108 | " prev_features = [input]\n", 109 | " else:\n", 110 | " prev_features = input\n", 111 | "\n", 112 | " if self.memory_efficient and self.any_requires_grad(prev_features):\n", 113 | " if torch.jit.is_scripting():\n", 114 | " raise Exception('Memory Efficient not supported in JIT')\n", 115 | "\n", 116 | " bottleneck_output = self.call_checkpoint_bottleneck(prev_features)\n", 117 | "\n", 118 | " else:\n", 119 | " bottleneck_output = self.bn_function(prev_features)\n", 120 | "\n", 121 | " new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))\n", 122 | " if self.drop_rate > 0:\n", 123 | " new_features = F.dropout(new_features, p = self.drop_rate, training = self.training)\n", 124 | " \n", 125 | " return new_features\n", 126 | "\n", 127 | "#DenseBlock layer\n", 128 | "class _DenseBlock(nn.ModuleDict):\n", 129 | " _version = 2\n", 130 | "\n", 131 | " def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate,\n", 132 | " memory_efficient = False):\n", 133 | " super(_DenseBlock, self).__init__()\n", 134 | " for i in range(num_layers):\n", 135 | " layer = _DenseLayer(\n", 136 | " num_input_features + i * growth_rate, growth_rate = growth_rate, bn_size = bn_size,\n", 137 | " drop_rate = drop_rate, memory_efficient = memory_efficient,\n", 138 | " )\n", 139 | " self.add_module('denselayer%d' % (i + 1), layer)\n", 140 | "\n", 141 | " def forward(self, init_features):\n", 142 | " features = [init_features]\n", 143 | " for name, layer in self.items():\n", 144 | " new_features = layer(features)\n", 145 | " features.append(new_features)\n", 146 | " return torch.cat(features, 1)\n", 147 | "\n", 148 | "#Transition layer\n", 149 | "class _Transition(nn.Sequential):\n", 150 | " def __init__(self, num_input_features, num_output_features):\n", 151 | " super(_Transition, self).__init__()\n", 152 | " self.add_module('norm', nn.BacthNorm2d(num_input_features))\n", 153 | " self.add_module('relu', nn.ReLU(inplace = True))\n", 154 | " self.add_module('conv', nn.Conv2d(num_input_features, num_output_features, kernel_size = 1,\n", 155 | " stride = 1, bias = False))\n", 156 | " self.add_module('pool', nn.AvgPool2d(kernel_size = 2, stride = 2))\n", 157 | "\n", 158 | "class DenseNet(nn.Module):\n", 159 | " #growth_rate: 각 레이어에 얼만큼의 필터를 추가할지 (논문에서는 'k'로 표현)\n", 160 | " #block_config: 각 
풀링 계층에서 얼마나 많은 레이어를 사용할지\n", 161 | " #num_init_features: 첫 합성곱 레이어에서 얼만큼의 필터를 배울지\n", 162 | " #bn_size: bottleneck layer의 숫자에 대한 factor\n", 163 | " #drop_rate: 각 dense layer 이후의 dropout rate\n", 164 | " #num_classes: 분류 클래스의 수\n", 165 | " #memort_efficient: True면 checkpoint 사용\n", 166 | "\n", 167 | " def __init__(self, growth_rate = 32, block_config = (6, 12, 24, 16),\n", 168 | " num_init_features = 64, bn_size = 4, drop_rate = 0, num_classes = 1000,\n", 169 | " memory_efficient = False):\n", 170 | " super(DenseNet, self).__init__()\n", 171 | "\n", 172 | " #첫 번째 convolution\n", 173 | " self.features = nn.Sequential(OrderedDict([\n", 174 | " ('conv0', nn.Conv2d(3, num_init_features, kernel_size = 7, stride = 2,\n", 175 | " padding = 3, bias = False)),\n", 176 | " ('norm0', nn.BactNorm2d(num_init_features)),\n", 177 | " ('relu0', nn.ReLU(inplace = True)),\n", 178 | " ('pool0', nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1)),\n", 179 | " ]))\n", 180 | "\n", 181 | " #각 dense block\n", 182 | " num_features = num_init_features\n", 183 | " for i, num_layers in enumerate(block_config):\n", 184 | " block = _DenseBlock(\n", 185 | " num_layers = num_layers,\n", 186 | " num_input_features = num_features,\n", 187 | " bn_size = bn_size,\n", 188 | " growth_rate = growth_rate,\n", 189 | " drop_rate = drop_rate,\n", 190 | " memory_efficient = memory_efficient\n", 191 | " )\n", 192 | " self.features.add_module('denseblock%d' % (i + 1), block)\n", 193 | " num_features = num_features + num_layers * growth_rate\n", 194 | " if i != len(block_config) - 1:\n", 195 | " trans = _Transition(num_input_featurs = num_features,\n", 196 | " num_output_features = num_features // 2)\n", 197 | " self.featrues.add_module('transition%d' % (i + 1), trans)\n", 198 | " num_features = num_features // 2\n", 199 | "\n", 200 | " #마지막 batch norm\n", 201 | " self.features.add_module('norm5', nn.BatchNorm2d(num_features))\n", 202 | "\n", 203 | " #Liunear Layer\n", 204 | " self.classifier = nn.Linear(num_features, num_classes)\n", 205 | "\n", 206 | " for m in self.modules():\n", 207 | " if siinstance(m, nn.Conv2d):\n", 208 | " nn.init.kaiming_normal_(m.weight)\n", 209 | " elif isinstance(m, nn.BatchNorm2d):\n", 210 | " nn.init.constant_(m.weight, 1)\n", 211 | " nn.init.constant_(m.bias, 0)\n", 212 | " elif isinstance(m, nn.Linear):\n", 213 | " nn.init.constant_(m.bias, 0)\n", 214 | "\n", 215 | " def forward(self, x):\n", 216 | " features = self.features(x)\n", 217 | " out = F.relu(features, inplace = True)\n", 218 | " out = F.adaptive_avg_pool2d(out, (1, 1))\n", 219 | " out = torch.flatten(out, 1)\n", 220 | " out = self.classifier(out)\n", 221 | " return out\n", 222 | "\n", 223 | "def _load_state_dict(model, model_url, progress):\n", 224 | " pattern = re.compile(\n", 225 | " r'^(.*denselayer\\d+\\.(?:norm|relu|conv))\\.((?:[12])\\.(?:weight|bias|running_mean|running_var))$')\n", 226 | "\n", 227 | " state_dict = load_state_dict_from_url(model_url, progress=progress)\n", 228 | " for key in list(state_dict.keys()):\n", 229 | " res = pattern.match(key)\n", 230 | " if res:\n", 231 | " new_key = res.group(1) + res.group(2)\n", 232 | " state_dict[new_key] = state_dict[key]\n", 233 | " del state_dict[key]\n", 234 | " model.load_state_dict(state_dict)\n", 235 | "\n", 236 | "def _densenet(arch, growth_rate, block_config, num_init_features, pretrained, progress,\n", 237 | " **kwargs):\n", 238 | " model = DenseNet(growth_rate, block_config, num_init_features, **kwargs)\n", 239 | " if pretrained:\n", 240 | " _load_state_dict(model, 
model_urls[arch], progress)\n", 241 | " return model\n", 242 | "\n", 243 | "def densenet121(pretrained = False, progress = True, **kwargs):\n", 244 | " return _densenet('densenet121', 32, (6, 12, 24, 16), 64, pretrained, progress, **kwargs)\n", 245 | "\n", 246 | "def densenet161(pretrained = False, progress = True, **kwargs):\n", 247 | " return _densenet('dnesenet161', 48, (6, 12, 36, 24), 96, pretrained, progress, **kwargs)\n", 248 | "\n", 249 | "def densenet169(pretrained = False, progress = True, **kwargs):\n", 250 | " return _densenet('densenet169', 32, (6, 12, 32, 32), 64, pretrained, progress, **kwargs)\n", 251 | "\n", 252 | "def densenet201(pretrained = False, progress = True, **kwargs):\n", 253 | " return _densenet('densenet201', 32, (6, 12, 48, 32), 64, pretrained, progress, **kwargs)" 254 | ] 255 | } 256 | ] 257 | } -------------------------------------------------------------------------------- /Computer Vision/CNN/EfficientNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMZpVwLqfasaEfsqwf3UBeE", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "ghfyI8deSjb_" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import torch\n", 38 | "from torch import nn\n", 39 | "from torch.nn import functional as F\n", 40 | "from .utils import (\n", 41 | " round_filters,\n", 42 | " round_repeats,\n", 43 | " drop_connect,\n", 44 | " get_same_padding_conv2d,\n", 45 | " get_model_params,\n", 46 | " efficientnet_params,\n", 47 | " load_pretrained_weights,\n", 48 | " Swish,\n", 49 | " MemoryEfficientSwish,\n", 50 | " calculate_output_image_size\n", 51 | ")\n", 52 | "\n", 53 | "VALID_MODELS = (\n", 54 | " 'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3',\n", 55 | " 'efficientnet-b4', 'efficientnet-b5', 'efficientnet-b6', 'efficientnet-b7',\n", 56 | " 'efficientnet-b8',\n", 57 | "\n", 58 | " # Support the construction of 'efficientnet-l2' without pretrained weights\n", 59 | " 'efficientnet-l2'\n", 60 | ")\n", 61 | "\n", 62 | "class MBConvBlock(nn.Module):\n", 63 | " #Mobile Inverted Residual Bottleneck Block\n", 64 | "\n", 65 | " def __init__(self, block_args, global_params, image_size = None):\n", 66 | " super().__init__()\n", 67 | " self.block_args = block_args\n", 68 | " self._bn_mom = 1 - global_aprams.batch_norm_momentum\n", 69 | " self._bn_eps = global_params.batch_norm_epsilon\n", 70 | " self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ration <= 1)\n", 71 | " self.id_skip = block_args.id_skip #use skip connection and drop connect\n", 72 | "\n", 73 | " #Expansion phase\n", 74 | " inp = self._block_args.input_filters #number of input channels\n", 75 | " oup = self._block_args.input_filters * self._block_args.expand_ratio #number of output channels\n", 76 | "\n", 77 | " if self._block_args.expand_ratio != 1:\n", 78 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 79 | " self._expand_conv = Conv2d(in_channels = inp, output_channels = oup, 
kernel_size = 1,\n", 80 | " bias = False)\n", 81 | " self._bn0 = nn.BatchNorm2d(num_features = oup, momentum = self._bn_mom, eps = self._bn_eps)\n", 82 | " #image_size = calculate_output_image_size(image_size, 1)\n", 83 | "\n", 84 | " #Depthwise convolution phase\n", 85 | " k = self._block_args.kernel_size\n", 86 | " s = self._block_args.stride\n", 87 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 88 | " self._depthwise_conv = Conv2d(\n", 89 | " in_channels = oup, out_channels = oup, groups = oup, #groups가 depthwise를 만듦\n", 90 | " kernel_size = k, strides = s, bias = False\n", 91 | " )\n", 92 | " self._bn1 = nn.BatchNorm2d(num_features = oup, momentum = self._bn_mom, \n", 93 | " eps = self._bn_eps)\n", 94 | " image_size = calculate_output_image_size(image_size, s)\n", 95 | "\n", 96 | " #Squeeze and Excitation layer\n", 97 | " if self.has_se:\n", 98 | " Conv2d = get_same_padding_conv2d(image_size = (1, 1))\n", 99 | " num_squeezed_channels = max(1, int(self.block_args.input_filters * \n", 100 | " self._block_args.se-ratio))\n", 101 | " self._se_reduce = Conv2d(in_channels = oup, out_channel = num_squeezed_channels,\n", 102 | " kernel_size = 1)\n", 103 | " self._se_expand = Conv2d(in_channels = num_squeezed_channels, out_channel = oup,\n", 104 | " kernel_size = 1)\n", 105 | " \n", 106 | " #Pointwise Convolution\n", 107 | " final_oup = self._block_args.output_filters\n", 108 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 109 | " self._project_conv = Conv2d(in_channels = oup, out_channels = final_oup, \n", 110 | " kernel_size = 1, bias = False)\n", 111 | " self._bn2 = nn.BatchNorm2d(num_features = final_oup, momentum = self._bn_mom,\n", 112 | " eps = self._bn_eps)\n", 113 | " self._swish = MemoryEfficientSwish()\n", 114 | "\n", 115 | " def forward(self, inputs, drop_connect_rate = None):\n", 116 | " #Expansion & Depthwise Convolution\n", 117 | " x = inputs\n", 118 | " if self._block_args.expand_ratio != 1:\n", 119 | " x = self.expand_conv(inputs)\n", 120 | " x = self._bn0(x)\n", 121 | " x = self._swish(x)\n", 122 | "\n", 123 | " x = self._depthwise_conv(x)\n", 124 | " x = self._bn1(x)\n", 125 | " x = self._swish(x)\n", 126 | "\n", 127 | " #Squeeze & Excitation\n", 128 | " if self.has_se:\n", 129 | " x_squeezed = F.adaptive_avg_pool2d(x, 1)\n", 130 | " x_squeezed = self._se_reduce(x_squeezed)\n", 131 | " x_squeezed = self._swish(x_squeezed)\n", 132 | " x_squeezed = self._se_expand(x_squeezed)\n", 133 | " x = torch.sigmoid(x_squeezed) * x\n", 134 | "\n", 135 | " #Pointwise Convolution\n", 136 | " x = self._project_conv(x)\n", 137 | " x = self._bn2(x)\n", 138 | "\n", 139 | " #Skip connection & drop connect\n", 140 | " input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters\n", 141 | "\n", 142 | " if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:\n", 143 | " #skip connection과 drop connect는 stochastic depth를 가져온다\n", 144 | " if drop_connect_rate:\n", 145 | " x = drop_connect(x, p = drop_connect_rate, training = self.training)\n", 146 | " x = x + inputs #skip connection\n", 147 | " \n", 148 | " return x\n", 149 | "\n", 150 | " def set_swish(self, memory_efficient = True):\n", 151 | " #memory efficient를 위한 swish 설정\n", 152 | "\n", 153 | " self._swish = MemoryEfficientSwish() if memory_efficient else Swish()\n", 154 | "\n", 155 | "class EfficientNet(nn.Module):\n", 156 | "\n", 157 | " def __init__(self, blocks_args = None, global_params = None):\n", 158 | " super().__init__()\n", 159 | " 
assert isinstance(block_args, list), 'blocks_args should be a list'\n", 160 | " assert len(block_args) > 0, 'block args must be greater than 0'\n", 161 | " self._global_params = global_params\n", 162 | " self._block_args = block_args\n", 163 | "\n", 164 | " #BatchNorm parameters\n", 165 | " bn_mom = 1 - self._global_params.batch_norm_momentum\n", 166 | " bn_eps = self._global_params.batch_norm_epsilon\n", 167 | "\n", 168 | " #이미지 크기에 따라서 정적 또는 동적 convolution을 함\n", 169 | " image_size = global_params.image_size\n", 170 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 171 | "\n", 172 | " #Stem\n", 173 | " in_channels = 3 #rgb\n", 174 | " out_channels = round_filters(32, self._global_params) #number of output channels\n", 175 | " self._conv_stem = Conv2d(in_channels, out_channels, kernel_size = 3, stride = 2,\n", 176 | " bias = False)\n", 177 | " self._bn0 = nn.BatchNorm2d(num_features = out_channels, momentum = bn_mom, eps = bn_eps)\n", 178 | " image_size = calculate_output_image_size(image_size, 2)\n", 179 | "\n", 180 | " #블록 쌓기\n", 181 | " self._blocks = nn.ModuleList([])\n", 182 | " for block_args in self._block_args:\n", 183 | " #depth multiplier에 따라 입력과 출력 필터 업데이트\n", 184 | " block_args = block_args._replace(\n", 185 | " input_filters = round_filters(block_args.input_filters, self._global_params),\n", 186 | " output_filter = round_filters(block_args.output_filters, self._global_params),\n", 187 | " num_repeat = round_filters(block_args.num_repeates, self._global_params)\n", 188 | " )\n", 189 | "\n", 190 | " #첫 번째 블록은 stride와 filter size 증가를 관리할 필요가 있음\n", 191 | " self._blocks.append(MBConvBlock(block_args, self._global_params, image_size = image_size))\n", 192 | " image_size = calculate_output_image_size(image_size, block_args.stride)\n", 193 | " if block_args.num_repeat > 1: #block_args를 조정해서 똑같은 output size 유지\n", 194 | " block_args = block_args._replace(input_filters = block_args.output_filters, stride = 1)\n", 195 | "\n", 196 | " for _ in range(block_args.num_repeat - 1):\n", 197 | " self._blocks.append(MBConvBlock(block_args, self._global_params, image_size = image_size))\n", 198 | "\n", 199 | " #Head\n", 200 | " in_channels = block_args.output_filters #output of final block\n", 201 | " out_channels = round_filters(1280, self._global_params)\n", 202 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 203 | " self._conv_head = Conv2d(in_channels, out_channels, kernel_size = 1, bias = False)\n", 204 | " self._bn1 = nn.BatchNorm2d(num_features = out_channels, momentum = bn_mom, eps = bn_eps)\n", 205 | "\n", 206 | " #Final Linear Layer\n", 207 | " self._avg_pooling = nn.AdaptiveAvgPool2d(1)\n", 208 | " self._dropout = nn.Dropout(self._global_params.dropout_rate)\n", 209 | " self._fc = nn.Linear(out_channels, self._global_params.num_classes)\n", 210 | " self._swish = MemoryEfficientSwish()\n", 211 | "\n", 212 | " def set_swish(self, memory_efficient = True):\n", 213 | " self._swish = MemoryEfficientSwish() if memory_efficient else Swish()\n", 214 | " for block in self._blocks:\n", 215 | " block.set_swish(memory_efficient)\n", 216 | "\n", 217 | " def extract_endpoints(self, inputs):\n", 218 | " #Convolution layer을 사용해서 feature을 extract\n", 219 | "\n", 220 | " endpoints = dict()\n", 221 | "\n", 222 | " #Stem\n", 223 | " x = self._swish(self._bn0(self._conv_stem(inputs)))\n", 224 | " prev_x = x\n", 225 | "\n", 226 | " #Blocks\n", 227 | " for idx, block in enumerate(self._blocks):\n", 228 | " drop_connect_rate = self._global_params.drop_connect_rate\n", 229 | 
" if drop_connect_rate:\n", 230 | " drop_connect_rate *= float(idx) / len(self._blocks) #scale drop connect_rate\n", 231 | " x = block(x, drop_connect_rate = drop_connect_rate)\n", 232 | " if prev_x.size(2) > x.size(2):\n", 233 | " endpoints[f'reduction_{len(endpoints)+1}'] = prev_x\n", 234 | " prev_x = x\n", 235 | "\n", 236 | " #Head\n", 237 | " x = self._swish(self._bn1(self._conv_head(x)))\n", 238 | " endpoints[f'reduction_{len(endpoints) + 1}'] = x\n", 239 | "\n", 240 | " return endpoints\n", 241 | "\n", 242 | " def extract_features(self, inputs):\n", 243 | " #Convolution layer을 사용해서 feature을 추출\n", 244 | "\n", 245 | " #Stem\n", 246 | " x = self._swish(self._bn0(self._conv_stem(inputs)))\n", 247 | "\n", 248 | " #Blocks\n", 249 | " for idx, block in enumerate(self._blocks):\n", 250 | " drop_connect_rate = self._global_params.drop_connect_rate\n", 251 | " if drop_connect_rate:\n", 252 | " drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect rate\n", 253 | " x = block(x, drop_connect_rate = drop_connect_rate)\n", 254 | "\n", 255 | " #Head\n", 256 | " x = self._swish(self._bn1(self._conv_head(x)))\n", 257 | "\n", 258 | " return x\n", 259 | "\n", 260 | " def forward(self, inputs):\n", 261 | " #EfficientNet의 순전파\n", 262 | "\n", 263 | " #Convolution Layers\n", 264 | " x = self.extract_features(inputs)\n", 265 | "\n", 266 | " #Pooling & final linear_layers\n", 267 | " x = self._avg_pooling(x)\n", 268 | " x = x.flatten(start_dim = 1)\n", 269 | " x = self._dropout(x)\n", 270 | " x = self._fc(x)\n", 271 | "\n", 272 | " return x\n", 273 | "\n", 274 | " @classmethod\n", 275 | " def from_name(cls, model_name, in_channels = 3, **override_params):\n", 276 | " #이름에 따라서 EfficientNet 생성\n", 277 | "\n", 278 | " cls._check_model_name_is_valid(model_name)\n", 279 | " blocks_args, clobal_params = get_model_params(model_name, override_params)\n", 280 | " model = cls(blocks_args, global_params)\n", 281 | " model._change_in_channels(in_channels)\n", 282 | " return model\n", 283 | "\n", 284 | " @classmethod\n", 285 | " def from_pretrained(cls, model_naem, weights_path = None, advprop = False,\n", 286 | " in_channels = 3, num_classes = 1000, **override_params):\n", 287 | " model = cls.from_name(model_name, num_classes = num_classes, **override_params)\n", 288 | " load_pretrained_weights(model, model_name, weights_path = weights_path, \n", 289 | " load_fc = (num_calss == 1000), advprop = advprop)\n", 290 | " model._change_in_channels(in_channels)\n", 291 | " return model\n", 292 | "\n", 293 | " @clasmethod\n", 294 | " def get_image_size(cls, model_name):\n", 295 | " #입력 이미지의 크기를 가져옴\n", 296 | "\n", 297 | " cls._check_model_name_is_valid(model_name)\n", 298 | " _, _, res, _ = efficientnet_params(model_name)\n", 299 | " return res\n", 300 | "\n", 301 | " @classmethod\n", 302 | " def _check_model_name_is_valid(cls, model_name):\n", 303 | " #model name check\n", 304 | "\n", 305 | " if model_name not in VALID_MODELS:\n", 306 | " raise ValueError('model_name should be one of: ' + ', '.join(VALID_MODELS))\n", 307 | "\n", 308 | " def _change_in_channels(self, in_channels):\n", 309 | " #첫 번째 합성곱 레이어에 사용되는 in_channels가 3이 아니라면, 조정\n", 310 | "\n", 311 | " if in_channels != 3:\n", 312 | " Conv2d = get_same_padding_conv2d(image_size = self._global_params.image_size)\n", 313 | " out_channels = round_filters(32, self._global_params)\n", 314 | " self._conv_stem = Conv2d(in_channels, out_channels, kernel_size = 3, stride = 2, bias = False)" 315 | ] 316 | } 317 | ] 318 | } 
-------------------------------------------------------------------------------- /Computer Vision/CNN/GoogLeNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMtfpaygJNZbBpUa0WxvN2p", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "1f7qWOK7AQC5" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import warnings\n", 38 | "from collections import namedtuple\n", 39 | "import torch\n", 40 | "import torch.nn as nn\n", 41 | "import torch.nn.functional as F\n", 42 | "from torch.jit.annotations import Optional, Tuple\n", 43 | "from torch import Tensor\n", 44 | "\n", 45 | "\n", 46 | "__all__ = ['GoogLeNet', 'googlenet', 'GoogLeNetOutputs', '_GoogLeNetOutputs']\n", 47 | "\n", 48 | "model_urls = {'googlenet': 'https://download.pytorch.org/models/googlenet-1378be20.pth'}\n", 49 | "\n", 50 | "GoogLeNetOutputs = namedtuple('GoogLeNetOutputs', ['logits', 'aux_logits2', 'aux_logits1'])\n", 51 | "GoogLeNetOutputs.__annotations__ = {'logits': Tensor, 'aux_logits2': Optional[Tensor], \n", 52 | " 'aux_logits1': Optional[Tensor]}\n", 53 | "\n", 54 | "#역전파를 위한 GoogLeNet outputs 설정\n", 55 | "_GoogLeNetOutputs = GoogLeNetOutputs\n", 56 | "\n", 57 | "\n", 58 | "def googlenet(pretrained = False, progress = True, **kwargs):\n", 59 | " #pretraind: True면 ImageNet으로 pretrained된 모델 반환\n", 60 | " #progress: True면 download bar 보여주기\n", 61 | " #aux_logits: True면 두 개의 추가적인 branch 더해줌 --> 성능 향상에 도움 됌\n", 62 | " #transform input: True면 입력을 preprocessing\n", 63 | "\n", 64 | " if pretrained:\n", 65 | " if 'transform_input' not in kwargs:\n", 66 | " kwargs['transform_input'] = True\n", 67 | " if 'aux_logits' not in kwargs:\n", 68 | " kwargs['aux_logits'] = False\n", 69 | " if kwargs['aux_logits']:\n", 70 | " warnings.warn('auxiliary heads in the pretrained googlenet model are NOT pretrained, ')\n", 71 | "\n", 72 | " original_aux_logits = kwargs['aux_logits']\n", 73 | " kwargs['aux_logits'] = True\n", 74 | " kwargs['init_weights'] = False\n", 75 | " model = GoogLeNet(**kwargs)\n", 76 | " state_dict = load_state_dict_from_url(model_urls['googlenet'], progress = progress)\n", 77 | " model.load_state_dict(state_dict)\n", 78 | " if not original_aux_logits:\n", 79 | " model.aux_logits = False\n", 80 | " model.aux1 = None\n", 81 | " model.aux2 = None\n", 82 | " return model\n", 83 | "\n", 84 | " return GoogLeNet(**kwargs)\n", 85 | "\n", 86 | "class GoogLeNet(nn.Module):\n", 87 | " __constants__ = ['aux_logits', 'transform_input']\n", 88 | "\n", 89 | " def __init__(self, num_classes = 1000, aux_logits = True, transform_input = False,\n", 90 | " init_weights = None, blocks = None):\n", 91 | " super(GoogLeNet, self).__init__()\n", 92 | " if blocks is None:\n", 93 | " blocks = [BasicConv2d, Inception, InceptionAux]\n", 94 | " if init_weights is None:\n", 95 | " warnings.warn('The default weight initialization of GoogLeNet will be changed in future releases of')\n", 96 | " init_weights = True\n", 97 | " assert len(blocks) == 3\n", 98 | " conv_block = blocks[0]\n", 
99 | " inception_block = blocks[1]\n", 100 | " inception_aux_block = blocks[2]\n", 101 | "\n", 102 | " self.aux_logits = aux_logits\n", 103 | " self.transform_input = transform_input\n", 104 | "\n", 105 | " self.conv1 = conv_block(3, 64, kernel_size = 7, stride = 2, padding = 3)\n", 106 | " self.maxpool1 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n", 107 | " self.conv2 = conv_block(64, 64, kernel_size = 1)\n", 108 | " self.conv3 = conv_block(64, 192, kernel_size = 3, padding = 1)\n", 109 | " self.maxpool2 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n", 110 | "\n", 111 | " self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32)\n", 112 | " self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64)\n", 113 | " self.maxpool3 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n", 114 | "\n", 115 | " self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64)\n", 116 | " self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64)\n", 117 | " self.inception4c = inception_block(512, 128, 127, 256, 24, 64, 64)\n", 118 | " self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64)\n", 119 | " self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128)\n", 120 | " self.maxpool4 = nn.MaxPool2d(2, stride = 2, ceil_mode = True)\n", 121 | "\n", 122 | " self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128)\n", 123 | " self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128)\n", 124 | "\n", 125 | " if aux_logits:\n", 126 | " self.aux1 = inception_aux_block(512, num_classes)\n", 127 | " self.aux2 = inception_aux_block(528, num_classes)\n", 128 | " else:\n", 129 | " self.aux1 = None\n", 130 | " self.aux2 = None\n", 131 | "\n", 132 | " self.avgpool = nn.AdaptiveAvgPool2d((1, 1))\n", 133 | " self.dropout = nn.Dropout(0.2)\n", 134 | " self.fc = nn.Linear(1024, num_classes)\n", 135 | "\n", 136 | " if init_weights:\n", 137 | " self._initialize_weights()\n", 138 | "\n", 139 | " def _initialize_weights(self):\n", 140 | " for m in self.modules():\n", 141 | " if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):\n", 142 | " import scipy.stats as stats\n", 143 | " X = stats.truncnorm(-2, 2, scale = 0.01)\n", 144 | " values = torch.as_tensor(X.rvs(m.weight.numel()), dtype = m.weight.dtype)\n", 145 | " values = values.view(m.weight.size())\n", 146 | " with torch.no_grad():\n", 147 | " m.weight.copy_(values)\n", 148 | " elif isinstance(m, nn.BatchNorm2d):\n", 149 | " nn.init.constant_(m.weight, 1)\n", 150 | " nn.init.constant_(m.bias, 0)\n", 151 | "\n", 152 | " def _transform_input(self, x):\n", 153 | " #(Tensor) --> Tensor\n", 154 | " if self.transform_input:\n", 155 | " x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5\n", 156 | " x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5\n", 157 | " x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5\n", 158 | " x = torch.cat((x_ch0, x_ch1, x_ch2), 1)\n", 159 | " return x\n", 160 | "\n", 161 | " def _forward(self, x):\n", 162 | " #type: (Tensor) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]\n", 163 | " #N x 3 x 224 x 224\n", 164 | " x = self.conv1(x)\n", 165 | "\n", 166 | " #N x 64 x 112 x 112\n", 167 | " x = self.maxpool1(x)\n", 168 | "\n", 169 | " #N x 64 x 56 x 56\n", 170 | " x = self.conv2(x)\n", 171 | "\n", 172 | " #N x 64 x 56 x 56\n", 173 | " x = self.conv3(x)\n", 174 | "\n", 175 | " #N x 192 x 56 x 56\n", 176 | " x = self.maxpool2(x)\n", 177 | "\n", 178 | " #N x 192 x 28 x 28\n", 179 | " x = 
self.inception3a(x)\n", 180 | "\n", 181 | " #N x 256 x 28 x 28\n", 182 | " x = self.inception3b(x)\n", 183 | "\n", 184 | " #N x 480 x 28 x 28\n", 185 | " x = self.maxpool3(x)\n", 186 | "\n", 187 | " #N x 480 x 14 x 14\n", 188 | " x = self.inception4a(x)\n", 189 | "\n", 190 | " # N x 512 x 14 x 14\n", 191 | " aux1 = torch.hit.annotate(Optional[Tensor], None)\n", 192 | " if self.aux1 is not None:\n", 193 | " if self.training:\n", 194 | " aux1 = self.aux1(x)\n", 195 | "\n", 196 | " x = self.inception4b(x)\n", 197 | "\n", 198 | " #N x 512 x 14 x 14\n", 199 | " x = self.inception4c(x)\n", 200 | "\n", 201 | " #N x 512 x 14 x 14\n", 202 | " x = self.inception4d(x)\n", 203 | "\n", 204 | " #N x 528 x 14 x 14\n", 205 | " x = self.inception4e(x)\n", 206 | "\n", 207 | " #N x 832 x 14 x 14\n", 208 | " x = self.maxpool4(x)\n", 209 | "\n", 210 | " #N x 832 x 7 x 7\n", 211 | " x = self.inception5a(x)\n", 212 | "\n", 213 | " #N x 832 x 7 x 7\n", 214 | " x = self.inception5b(x)\n", 215 | " #N x 1024 x 7 x 7\n", 216 | "\n", 217 | " x = self.avgpool(x)\n", 218 | " #N x 1024 x 1 x 1\n", 219 | "\n", 220 | " x = torch.flatten(x, 1)\n", 221 | " # N x 1024\n", 222 | "\n", 223 | " x = self.dropout(x)\n", 224 | " x = self.fc(x)\n", 225 | " #N x 1000 (num_classes)\n", 226 | " return x, aux2, aux1\n", 227 | " \n", 228 | " @torch.jit.unused\n", 229 | " def eager_outputs(self, x, aux2, aux1):\n", 230 | " # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> GoogLeNetOutputs\n", 231 | " if self.training and self.aux_logits:\n", 232 | " return _GoogLeNetOutputs(x, aux2, aux1)\n", 233 | " else:\n", 234 | " return x\n", 235 | "\n", 236 | " def forward(self, x):\n", 237 | " # type: (Tensor) -> GoogLeNetOutputs\n", 238 | " x = self._transform_input(x)\n", 239 | " x, aux1, aux2 = self._forward(x)\n", 240 | " aux_defined = self.training and self.aux_logits\n", 241 | " if torch.jit.is_scripting():\n", 242 | " if not aux_defined:\n", 243 | " warnings.warn('Scripted Googlenet alwatd returns GoogleNetOutputs Tuple')\n", 244 | " return GoogLeNetOutputs(x, aux2, aux1)\n", 245 | " else:\n", 246 | " return self.eager_outputs(x, aux2, aux1)\n", 247 | "\n", 248 | "class Inception(nn.Module):\n", 249 | "\n", 250 | " def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj,\n", 251 | " conv_block = None):\n", 252 | " super(Inception, self).__init__()\n", 253 | " if conv_block is None:\n", 254 | " conv_block = BasicConv2d\n", 255 | " self.branch1 = conv_block(in_channels, ch1x1, kernel_size = 1)\n", 256 | "\n", 257 | " self.branch2 = nn.Sequential(\n", 258 | " conv_block(in_channels, ch3x3red, kernel_size = 1),\n", 259 | " conv_block(ch3x3red, ch3x3, kernel_size = 3, padding = 1)\n", 260 | " )\n", 261 | "\n", 262 | " self.branch3 = nn.Sequential(\n", 263 | " conv_block(in_channels, ch5x5red, kernel_size = 1),\n", 264 | " conv_block(ch5x5red, ch5x5, kernel_size = 3, padding = 1)\n", 265 | " )\n", 266 | "\n", 267 | " self.branch4 = nn.Sequential(\n", 268 | " nn.MaxPool2d(kernel_size = 3, stride = 1, padding = 1, ceil_mode = True),\n", 269 | " conv_block(in_channels, pool_proj, kernel_size = 1)\n", 270 | " )\n", 271 | "\n", 272 | " def _forward(self, x):\n", 273 | " branch1 = self.branch1(x)\n", 274 | " branch2 = self.branch2(x)\n", 275 | " branch3 = self.branch3(x)\n", 276 | " branch4 = self.branch4(x)\n", 277 | "\n", 278 | " outputs = [branch1, branch2, branch3, branch4]\n", 279 | " return outputs\n", 280 | "\n", 281 | " def forward(self, x):\n", 282 | " outputs = self._forward(x)\n", 283 | " return 
torch.cat(outputs, 1)\n", 284 | "\n", 285 | "class InceptionAux(nn.Module):\n", 286 | "\n", 287 | " def __init__(self, in_channels, num_classes, conv_block = None):\n", 288 | " super(InceptionAux, self).__init__()\n", 289 | " if conv_block is None:\n", 290 | " conv_block = BasicConv2d\n", 291 | " self.conv = conv_block(in_channels, 128, kernel_size = 1)\n", 292 | "\n", 293 | " self.fc1 = nn.Linear(2048, 1024)\n", 294 | " self.fc2 = nn.Linear(1024, num_classes)\n", 295 | "\n", 296 | " def forward(self, x):\n", 297 | " #aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14\n", 298 | " x = F.adaptive_avg_pool2d(x, (4, 4))\n", 299 | " \n", 300 | " #aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4\n", 301 | " x = self.conv(x)\n", 302 | "\n", 303 | " #N x 128 x 4 x 4\n", 304 | " x = self.torch.flatten(x, 1)\n", 305 | "\n", 306 | " #N x 2048\n", 307 | " x = F.relu(self.fc1(x), inplace = True)\n", 308 | "\n", 309 | " #N x 1024\n", 310 | " x = F.dropout(x, 0.7, training = self.training)\n", 311 | "\n", 312 | " #N x 1024\n", 313 | " x = self.fc2(x)\n", 314 | "\n", 315 | " # N x 1000 (num_classes)\n", 316 | "\n", 317 | " return x\n", 318 | "\n", 319 | "class BasicConv2d(nn.Module):\n", 320 | "\n", 321 | " def __init__(self, in_channels, out_channels, **kwargs):\n", 322 | " super(BasicConv2d, self).__init__()\n", 323 | " self.conv = nn.Conv2d(in_channels, out_channels, bias = False, **kwargs)\n", 324 | " self.bn = nn.BatchNorm2d(out_channels, eps = 0.001)\n", 325 | "\n", 326 | " def forward(self, x):\n", 327 | " x = self.conv(x)\n", 328 | " x = self.bn(x)\n", 329 | " return F.relu(x, inplace = True)" 330 | ] 331 | } 332 | ] 333 | } -------------------------------------------------------------------------------- /Computer Vision/CNN/MobileNet_구현_실습.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyPvHJUiMPbRb/mjrrJcBBef", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "#MobileNet 설명\n", 33 | "#서로 다른 크기의 input layer와 width factor에서 사용 가능\n", 34 | "#서로 다른 width를 사용함으로써 cost를 줄일 수 있음\n", 35 | "#MobileNet은 32x32보다 큰 입력 이미지면 어떤 이미지든 가능\n", 36 | "#더 큰 크기의 이미지는 더욱 향상된 성능을 가져옴\n", 37 | "\n", 38 | "#파라미터 수와 multiply-adds는 alpha에 의해 결정됌\n", 39 | "#alpha는 각 레이어에서 필터의 수를 증감함\n", 40 | "\n", 41 | "from tensorflow.python.keras.layers.recurrent import layer_serialization\n", 42 | "from __future__ import absolute_import\n", 43 | "from __future__ import division\n", 44 | "from __future__ import print_function\n", 45 | "\n", 46 | "from tensorflow.python.keras import backend\n", 47 | "from tensorflow.python.keras.applications import imagenet_utils\n", 48 | "from tensorflow.python.keras.engine import training\n", 49 | "from tensorflow.python.keras.layers import VersionAwareLayers\n", 50 | "from tensorflow.python.keras.utils import data_utils\n", 51 | "from tensorflow.python.keras.utils import layer_utils\n", 52 | "from tensorflow.python.lib.io import file_io\n", 53 | "from tensorflow.python.platform import tf_logging as logging\n", 54 | "from tensorflow.python.util.tf_export import 
keras_export\n", 55 | "\n", 56 | "BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/')\n", 57 | "layers = None\n", 58 | "\n", 59 | "@keras_export('keras.applications.mobilenet.MobileNet', 'keras.applications.MobileNet')\n", 60 | "\n", 61 | "def MobileNet(input_shape = None, alpha = 1.0, depth_multiplier = 1, dropout = 1e-3, include_top = True, \n", 62 | " weights = 'imagenet', input_tensor = None, pooling = None, classes = 1000,\n", 63 | " classifier_activation = 'softmax', **kwargs):\n", 64 | " #input_shape: 옵션적 shape tuple\n", 65 | " #alpha: network의 width 조절 -> width multiplier\n", 66 | " #1.0이면, 각 레이어에서 비율적으로 필터의 수를 줄임\n", 67 | " #depth_multiplier: resolution multiplier\n", 68 | " #dropout: dropout rate 조정\n", 69 | " #include_top: network의 맨 위에서 fc-layer을 사용할지 결정\n", 70 | " #weights: 재량껏 weights를 사용 가능\n", 71 | " #input_tensor: 옵션적 keras tensor\n", 72 | " #pooling: 어떤 방식으로 풀링을 할 지 결정\n", 73 | " #classes: 분류해야 하는 class 수 결정\n", 74 | " \n", 75 | " global layer_s\n", 76 | " if 'layers' in kwargs:\n", 77 | " layers = kwargs.pop('layers')\n", 78 | " else:\n", 79 | " layers = VersionAwareLayers()\n", 80 | " if kwargs:\n", 81 | " raise ValueError('Unknown argument(s): %s' % (kwargs,))\n", 82 | " if not (weights in {'imagenet', None} or file_io.file_exists_v2(weights)):\n", 83 | " raise ValueError('The `weights` argument should be either '\n", 84 | " '`None` (random initialization), `imagenet` '\n", 85 | " '(pre-training on ImageNet), '\n", 86 | " 'or the path to the weights file to be loaded.')\n", 87 | " \n", 88 | " if weights == 'imagenet' and include_top and classes != 1000:\n", 89 | " raise ValueError('If using `weights` as `\"imagenet\"` with `include_top` '\n", 90 | " 'as true, `classes` should be 1000')\n", 91 | " \n", 92 | " #적절한 입력 shape과 기본 크기\n", 93 | " if input_shape is None:\n", 94 | " default_size = 224\n", 95 | " else:\n", 96 | " if backend.image_data_format() == 'channels_first':\n", 97 | " rows = input_shape[1]\n", 98 | " cols = input_shape[2]\n", 99 | " else:\n", 100 | " rows = input_shape[0]\n", 101 | " cols = input_shape[1]\n", 102 | "\n", 103 | " if rows == cols and rows in [128, 160, 192, 224]:\n", 104 | " default_size = rows\n", 105 | " else:\n", 106 | " default_size = 224\n", 107 | "\n", 108 | " input_shape = imagenet_utils.obtain_input_shape(input_shape, default_size = default_size,\n", 109 | " min_size = 32, data_format = backend.image_data_format(),\n", 110 | " require_flatten = include_top, weights = weights)\n", 111 | " \n", 112 | " if backend.image_data_format() == 'channels_last':\n", 113 | " row_axis, col_axis = (0, 1)\n", 114 | " else:\n", 115 | " row_axis, col_axis = (1, 2)\n", 116 | " rows = input_shape[row_axis]\n", 117 | " cols = input_shape[col_axis]\n", 118 | "\n", 119 | " if weights == 'imagenet':\n", 120 | " if depth_multiplier != 1:\n", 121 | " raise ValueError('If imagenet weights are being loaded, '\n", 122 | " 'depth multiplier must be 1')\n", 123 | " \n", 124 | " if alpha not in [0.25, 0.50, 0.75, 1.0]:\n", 125 | " raise ValueError('If imagenet weights are being loaded, '\n", 126 | " 'alpha can be one of'\n", 127 | " '`0.25`, `0.50`, `0.75` or `1.0` only.')\n", 128 | " \n", 129 | " if rows != cols or rows not in [128, 160, 192, 224]:\n", 130 | " rows = 224\n", 131 | " logging.warning('`input_shape` is undefined or non-square, '\n", 132 | " 'or `rows` is not in [128, 160, 192, 224]. 
'\n", 133 | " 'Weights for input shape (224, 224) will be'\n", 134 | " ' loaded as the default.')\n", 135 | " \n", 136 | " if input_tensor is None:\n", 137 | " img_input = layers.Input(shape = input_shape)\n", 138 | " else:\n", 139 | " if not backend.is_keras_tensor(input_tensor):\n", 140 | " img_input = layers.Input(tensor = input_tensor, shape = input_shape)\n", 141 | " else:\n", 142 | " img_input = input_tensor\n", 143 | "\n", 144 | " x = _conv_block(img_input, 32, alpha, stirdes = (2, 2))\n", 145 | " x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id = 1)\n", 146 | " x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, strides = (2, 2), block_id = 2)\n", 147 | "\n", 148 | " x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id = 3)\n", 149 | " x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, strides = (2, 2), block_id = 4)\n", 150 | "\n", 151 | " x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id = 5)\n", 152 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, strides = (2, 2), block_id = 6)\n", 153 | "\n", 154 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 7)\n", 155 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 8)\n", 156 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 9)\n", 157 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 10)\n", 158 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 11)\n", 159 | "\n", 160 | " x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, strides = (2, 2), block_id = 12)\n", 161 | " x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id = 13)\n", 162 | "\n", 163 | " if include_top:\n", 164 | " if backend.image_data_format() == 'channels_first':\n", 165 | " shape = (int(1024 * alpha), 1, 1)\n", 166 | " else:\n", 167 | " shape = (1, 1, int(1024 * alpha))\n", 168 | "\n", 169 | " x = layers.GlobalAvergarePooling2D()(x)\n", 170 | " x = layers.Reshape(shape, name = 'reshape_1')(x)\n", 171 | " x = layers.Dropout(dropout, name = 'dropout')(x)\n", 172 | " x = layers.Conv2D(classes, (1, 1), padding = 'same', name = 'conv_preds')(x)\n", 173 | " x = layers.Reshape((classes,), name = 'reshape_2')(x)\n", 174 | " imagenet_utils.validate_activation(classifier_activation, weights)\n", 175 | " x = layers.Activation(activation = classifier_activation, name = 'predictions')(x)\n", 176 | "\n", 177 | " else:\n", 178 | " if pooling == 'avg':\n", 179 | " x = layers.GlobalAveragePooling2D()(x)\n", 180 | " elif pooling == 'max':\n", 181 | " x = layers.GlobalMaxPooling2D()(x)\n", 182 | "\n", 183 | " if input_tensor is not None:\n", 184 | " inputs = layer_utils.get_source_inputs(input_tensor)\n", 185 | " else:\n", 186 | " inputs = img_input\n", 187 | "\n", 188 | " #모델 생성\n", 189 | " model = training.Model(inputs, x, name = 'mobilent_%0.2f_%s' % (alpha, rows))\n", 190 | "\n", 191 | " #가중치 불러오기\n", 192 | " if weights == 'imagenet':\n", 193 | " if alpha == 1.0:\n", 194 | " alpha_test = '1_0'\n", 195 | " elif alpha == 0.75:\n", 196 | " aplha_text = '7_5'\n", 197 | " elif alpha == 0.50:\n", 198 | " alpha_text = '5_0'\n", 199 | " else:\n", 200 | " alpha_text = '2_5'\n", 201 | "\n", 202 | " if include_top:\n", 203 | " model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows)\n", 204 | " weight_path = BASE_WEIGHT_PATH + model_name\n", 205 | " weights_path = data_utils.get_file(model_name, weight_path, cache_subdir = 'models')\n", 206 | " else:\n", 207 | " model_name = 
'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows)\n", 208 | " weight_path = BASE_WEIGHT_PATH + model_name\n", 209 | " weights_path = data_utils.get_file(model_name, weight_path, cache_subdir = 'models')\n", 210 | " model.load_weights(weights_path)\n", 211 | " elif weights is not None:\n", 212 | " model.load_weights(weights)\n", 213 | "\n", 214 | " return model\n", 215 | "\n", 216 | "def _conv_block(inputs, filters, alpha, kernel = (3, 3), strides = (1, 1)):\n", 217 | " #inputs: 'channels_last'면 (rows, cols, 3) / 'channels_first'면 (3, rows, cols) 식으로 입력 조정\n", 218 | " #filters: output space의 차원수\n", 219 | " #alpha: network의 width 조정. alpha가 1.0보다 작으면 각 레이어의 필터 수 줄어듬\n", 220 | " #반면에, alpha가 1.0 보다 크다면 각 레이어의 필터 수가 증가함\n", 221 | " #kernel: 합성곱 윈도우의 height와 width 조정\n", 222 | " #strides: stride 정의\n", 223 | " \n", 224 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n", 225 | " filters = int(filters * alpha)\n", 226 | " x = layers.Conv2D(filters, kernel, padding = 'same', use_bias = False,\n", 227 | " strides = strides, name = 'conv1')(inputs)\n", 228 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv1_bn')(x)\n", 229 | " return layers.ReLU(6., name = 'conv1_relu')(x)\n", 230 | "\n", 231 | "def _depthwise_conv_block(inputs, pointwise_conv_filters, alpha, depth_multiplier = 1,\n", 232 | " strides = (1, 1), block_id = 1):\n", 233 | " #input: 입력 텐서의 모양. 이전의 정의와 동일\n", 234 | " #pointwise_conv_filters: output space의 차원수\n", 235 | " #alpha: 이전의 정의와 동일\n", 236 | " #depth_multiplier: 각 입력 채널에 대한 depthwise convolution output channel의 수\n", 237 | " #strides: 이전의 정의와 동일\n", 238 | " #block_id: block의 수를 관리하기 위한 특별한 integer\n", 239 | "\n", 240 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n", 241 | " filters = int(filters * alpha)\n", 242 | " \n", 243 | " if strides == (1, 1):\n", 244 | " x = inputs\n", 245 | " else:\n", 246 | " x = layers.ZeroPadding2D(((0, 1), (0, 1)), name = 'conv_pad_%d' % block_id)(inputs)\n", 247 | "\n", 248 | " x = layers.DepthwiseConv2D((3, 3), padding = 'same' if strides == (1, 1) else 'valid',\n", 249 | " depth_multiplier = depth_multiplier, strides = strides,\n", 250 | " use_bias = False, name = 'conv_dw_%d' % block_id)(x)\n", 251 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv_dw_%d_bn' % block_id)(x)\n", 252 | " x = layers.ReLU(6., name = 'conv_dw_%d_relu' % block_id)(x)\n", 253 | " \n", 254 | " x = layers.Conv2D(pointwise_conv_filters, (1, 1), padding = 'same', use_bias = False, \n", 255 | " strides = (1, 1), name = 'conv_dw_%d' % block_id)(x)\n", 256 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv_dw_%d_bn' % block_id)(x)\n", 257 | " return layers.ReLU(6., name = 'conv_dw_%d_relu' % block_id)(x)\n", 258 | "\n", 259 | "@keras_export('keras.applications.mobilenet.preprocess_input')\n", 260 | "def preprocess_input(x, data_format = None):\n", 261 | " return imagenet_utils.preprocess_input(x, data_format = data_format, mode = 'tf')\n", 262 | "\n", 263 | "@keras_export('keras.applications.mobilenet.decode_predictions')\n", 264 | "def decode_predictions(preds, top = 5):\n", 265 | " return imagenet_utils.decode_predictions(preds, top = top)\n", 266 | "\n", 267 | "preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format(\n", 268 | " mode='',\n", 269 | " ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF,\n", 270 | " error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC)\n", 271 | "decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__" 272 | 
], 273 | "metadata": { 274 | "id": "bcnEZ5GDo6cq" 275 | }, 276 | "execution_count": null, 277 | "outputs": [] 278 | } 279 | ] 280 | } -------------------------------------------------------------------------------- /Computer Vision/CNN/README.md: -------------------------------------------------------------------------------- 1 | # Various CNN models implementation 2 | 3 | I implemented GoogLeNet, ResNet, DenseNet, EfficientNet, MobileNet. 4 | 5 | You can check my CNN models paper review here -> https://cartinoe5930.tistory.com/entry/CNN-network%EC%9D%98-%EC%97%AD%EC%82%AC 6 | -------------------------------------------------------------------------------- /Computer Vision/CNN/ResNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOi4qpO/A/t6wycK3+hwkbI", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "el5SGMXsyXow" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import torch\n", 38 | "import torch.nn as nn\n", 39 | "#from .utils import load_state_from_url\n", 40 | "\n", 41 | "#ResNet 모델 종류\n", 42 | "__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',\n", 43 | " 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',\n", 44 | " 'wide_resnet50_2', 'wide_resnet101_2']\n", 45 | "\n", 46 | "#ResNet 모델별 URL\n", 47 | "model_urls = {\n", 48 | " 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',\n", 49 | " 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',\n", 50 | " 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',\n", 51 | " 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',\n", 52 | " 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',\n", 53 | " 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',\n", 54 | " 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',\n", 55 | " 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',\n", 56 | " 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',\n", 57 | "}\n", 58 | "\n", 59 | "#3X3 conv layer 구현\n", 60 | "def conv3x3(in_planes, out_planes, stride = 1, groups = 1, dilation = 1):\n", 61 | " #padding과 함께 3x3 conv layer 구현\n", 62 | " return nn.Conv2d(in_planes, out_planes, kernel_size = 3, stride = stride, padding = dilation, groups = groups, bias = False, dilation = dilation)\n", 63 | "\n", 64 | "def conv1x1(in_planes, out_planes, stride = 1):\n", 65 | " #1X1 conv layer 구현\n", 66 | " return nn.Conv2d(in_planes, out_planes, kernel_size = 1, stride = stride, bias = True)\n", 67 | "\n", 68 | "class BasicBlock(nn.Module):\n", 69 | " expansion = 1\n", 70 | "\n", 71 | " def __init__(self, inplanes, planes, stride = 1, downsample = None, groups = 1,\n", 72 | " base_width = 64, dilation = 1, norm_layers = None):\n", 73 | " super(BasicBlock, self).__init__()\n", 74 | " if norm_layer is None:\n", 75 | " norm_layer = 
nn.BatchNorm2d\n", 76 | " if groups != 1 or base_width != 64:\n", 77 | " raise ValueError('BasicBlock only supports groups = 1 and base_width = 64')\n", 78 | " if dilation > 1:\n", 79 | " raise NotImplementedError('Dilation > 1 not supported in BasicBlock')\n", 80 | "\n", 81 | " #stride가 1일 때, self.conv layer와 self.downsample layer는 입력을 downsample함\n", 82 | " self.conv1 = conv3x3(inplanes, planes, stride)\n", 83 | " self.bn1 = norm_layer(planes)\n", 84 | " self.relu = nn.ReLU(inplace = True)\n", 85 | " self.conv2 = conv3x3(planes, planes)\n", 86 | " self.bn2 = norm_layer(planes)\n", 87 | " self.downsample = downsample\n", 88 | " self.stride = stride\n", 89 | "\n", 90 | " def forward(self, x):\n", 91 | " identity = x\n", 92 | "\n", 93 | " out = self.conv1(x)\n", 94 | " out = self.bn1(out)\n", 95 | " out = self.relu(out)\n", 96 | " \n", 97 | " out = self.conv2(out)\n", 98 | " out = self.bn2(out)\n", 99 | "\n", 100 | " if self.downsample is not None:\n", 101 | " identity = self.downsample(x)\n", 102 | "\n", 103 | " out += identity\n", 104 | " out = self.relu(out)\n", 105 | "\n", 106 | " return out\n", 107 | "\n", 108 | "\n", 109 | "class Bottleneck(nn.Module):\n", 110 | " expansion = 4\n", 111 | "\n", 112 | " def __init__(self, inplanes, planes, stride = 1, downsample = None, groups = 1, \n", 113 | " base_width = 64, dilation = 1, norm_layer = None):\n", 114 | " super(Bottleneck, self).__init__()\n", 115 | " if norm_layer is None:\n", 116 | " norm_layer = nn.BatchNorm2d\n", 117 | " width = int(planes * (base_width / 64.)) * groups\n", 118 | " self.conv1 = conv1x1(inplanes, width)\n", 119 | " self.bn1 = norm_layer(width)\n", 120 | " self.conv2 = conv3x3(width, width, stride, groups, dilation)\n", 121 | " self.bn2 = norm_layer(width)\n", 122 | " self.conv3 = conv1x1(width, planes * self.expansion)\n", 123 | " self.bn3 = norm_layer(planes * self.expansion)\n", 124 | " self.relu = nn.ReLU(inplace = True)\n", 125 | " self.downsample = downsample\n", 126 | " self.stride = stride\n", 127 | "\n", 128 | " def forward(self, x):\n", 129 | " identity = x\n", 130 | "\n", 131 | " out = self.conv1(x)\n", 132 | " out = self.bn1(out)\n", 133 | " out = self.relu(out)\n", 134 | "\n", 135 | " out = self.conv2(out)\n", 136 | " out = self.bn2(out)\n", 137 | " out = self.relu(out)\n", 138 | "\n", 139 | " out = self.conv3(out)\n", 140 | " out = self.bn3(out)\n", 141 | "\n", 142 | " if self.downsample is not None:\n", 143 | " identity = self.downsample(x)\n", 144 | "\n", 145 | " out += identity\n", 146 | " out = self.relu(out)\n", 147 | "\n", 148 | " return out\n", 149 | "\n", 150 | "class ResNet(nn.Module):\n", 151 | "\n", 152 | " def __init__(self, block, layers, num_classes = 1000, zero_init_residual = False,\n", 153 | " groups = 1, width_per_group = 64, replace_stride_width_dilation = None):\n", 154 | " super(ResNet, self).__init__()\n", 155 | " if norm_layer is None:\n", 156 | " norm_layer = nn.BacthNorm2d\n", 157 | " self.norm_layer = norm_layer\n", 158 | "\n", 159 | " self.inplanes = 64\n", 160 | " self.dilation = 1\n", 161 | " if replace_stride_width_dilation is None:\n", 162 | " replace_stride_width_dilation = [False, False, False]\n", 163 | " if len(replace_stride_width_dilation) != 3:\n", 164 | " raise ValueError(\"replace_stride_width_dilation should be None\"\n", 165 | " \"of a 3-element tuple, got {}\".format(replace_stride_width_dilation))\n", 166 | " self.groups = groups\n", 167 | " self.base_width = width_per_group\n", 168 | " self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size = 7, stride = 2, 
padding = 3, bias = True)\n", 169 | " self.bn1 = norm_layer(self.inplanes)\n", 170 | " self.relu = nn.ReLU(inplace = True)\n", 171 | " self.maxpool = nn.MaxPool2D(kernel_size = 3, stride = 2, padding = 1)\n", 172 | " self.layer1 = self._make_layer(block, 64, layers[0])\n", 173 | " self.layer2 = self._make_layer(block, 128, layers[1], stride = 2, \n", 174 | " dilate = replace_stride_width_dilation[0])\n", 175 | " self.layer3 = self._make_layer(block, 256, layers[2], stride = 2, \n", 176 | " dilate = replace_stride_width_dilation[1])\n", 177 | " self.layer4 = self._make_layer(block, 512, layers[3], stride = 2, \n", 178 | " dilate = replace_stride_width_dilation[2])\n", 179 | " self.avgpool = nn.AdaptiveAvgPool2d((1, 1))\n", 180 | " self.fc = nn.Linear(512 * block.expansion, num_classes)\n", 181 | "\n", 182 | " for m in self.modules():\n", 183 | " if isinstance(m, nn.Conv2d):\n", 184 | " nn.init.kaiming_normal_(m.weight, mode = 'fan_out', nonlinearity = 'relu')\n", 185 | " elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):\n", 186 | " nn.init.constant_(m.weight, 1)\n", 187 | " nn.init.constant_(m.bias, 0)\n", 188 | "\n", 189 | " if zero_init_residual:\n", 190 | " for m in self.modules():\n", 191 | " if isinstance(m, Bottleneck):\n", 192 | " nn.init.constant_(m.bn3.weight, 0)\n", 193 | " elif isinstance(m, BasicBlock):\n", 194 | " nn.init.constant_(m.bn2.weight, 0)\n", 195 | "\n", 196 | " def _make_layer(self, block, planes, blocks, stride = 1, dilate = False):\n", 197 | " norm_layer = self._norm_layer\n", 198 | " downsample = None\n", 199 | " previous_dilation = self.dilation\n", 200 | " if dilate:\n", 201 | " self.dilation *= stride\n", 202 | " stride = 1\n", 203 | " if stride != 1 or self.inplanes != planes * block.expansion:\n", 204 | " downsample = nn.Sequential(\n", 205 | " conv1x1(self.inplanes, planes * block.expansion, stride), \n", 206 | " norm_layer(planes * block.expansion),\n", 207 | " )\n", 208 | " \n", 209 | " layers = []\n", 210 | " layers.append(block(self.inplanes, planes, stride, downsample, self.groups,\n", 211 | " self.base_width, previous_dilation, norm_layer))\n", 212 | " self.inplanes = planes * block.expansion\n", 213 | "\n", 214 | " for _ in range(1, blocks):\n", 215 | " layers.append(block(self.inplanes, planes, groups = self.groups,\n", 216 | " base_width = self.base_width, dilation = self.dilation,\n", 217 | " norm_layer = norm_layer))\n", 218 | " \n", 219 | " return nn.Sequential(*layers)\n", 220 | "\n", 221 | " def _forward_impl(self, x):\n", 222 | " x = self.conv1(x)\n", 223 | " x = self.bn1(x)\n", 224 | " x = self.relu(x)\n", 225 | " x = self.maxpool(x)\n", 226 | "\n", 227 | " x = self.layer1(x)\n", 228 | " x = self.layer2(x)\n", 229 | " x = self.layer3(x)\n", 230 | " x = self.layer4(x)\n", 231 | "\n", 232 | " x = self.avgpool(x)\n", 233 | " x = torch.flatten(x, 1)\n", 234 | " x = self.fc(x)\n", 235 | "\n", 236 | " return x\n", 237 | "\n", 238 | " def forward(self, x):\n", 239 | " return self._forward_impl(x)\n", 240 | "\n", 241 | "def _resnet(arch, block, layers, pretrained, progress, **kwargs):\n", 242 | " model = ResNet(block, layers, **kwargs)\n", 243 | " if pretrained:\n", 244 | " state_dict = load_state_dict_from_url(model_urls[arch], progress = progress)\n", 245 | " model.load_state_dict(state_dict)\n", 246 | " return model\n", 247 | "\n", 248 | "def resnext50_32x4d(pretrained = False, progress = True, **kwargs):\n", 249 | " kwargs['groups'] = 32\n", 250 | " kwargs['width_per_group'] = 4\n", 251 | " return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 
6, 3],\n", 252 | " pretrained, progress, **kwargs)\n", 253 | " \n", 254 | "def resnext101_32x8d(pretrained = False, progress = True, **kwargs):\n", 255 | " kwargs['groups'] = 32\n", 256 | " kwargs['width_pre_group'] = 8\n", 257 | " return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],\n", 258 | " pretrained, progress, **kwargs)" 259 | ] 260 | } 261 | ] 262 | } -------------------------------------------------------------------------------- /Computer Vision/CNN/Xception.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOnDX2S9zG8B6oYDzT8Z794", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "#Xception 모델 설명\n", 33 | "#당시, ImageNet에 대해서 SoTA를 차지함\n", 34 | "#VGG16과 ResNet의 입력 이미지 크기(224x224)와 다르게 (299x299)를 사용함\n", 35 | "#전처리 방식도 다름(Inception V3와 동일)\n", 36 | "\n", 37 | "from __future__ import absolute_import\n", 38 | "from __future__ import division\n", 39 | "from __future__ import print_function\n", 40 | "\n", 41 | "import os\n", 42 | "import warnings\n", 43 | "\n", 44 | "import keras\n", 45 | "from keras import layers\n", 46 | "from keras.models import Sequential\n", 47 | "from keras import backend\n", 48 | "#얘네는 오류 발생\n", 49 | "#from . import get_submodules_from_kwargs\n", 50 | "#from . import imagenet_utils\n", 51 | "#from .imagenet_utils import decode_predictions\n", 52 | "#from .imagenet_utils import _obtain_input_shape\n", 53 | "\n", 54 | "TF_WEIGHTS_PATH = (\n", 55 | " 'https://github.com/fchollet/deep-learning-models/'\n", 56 | " 'releases/download/v0.4/'\n", 57 | " 'xception_weights_tf_dim_ordering_tf_kernels.h5'\n", 58 | ")\n", 59 | "\n", 60 | "TF_WEIGHTS_PATH_NO_TOP = (\n", 61 | " 'https://github.com/fchollet/deep-learning-models/'\n", 62 | " 'releases/download/v0.4/'\n", 63 | " 'xception_weights_tf_dim_ordering_tf_kernels_notop.h5'\n", 64 | ")\n", 65 | "\n", 66 | "def Xception(include_top = True, weights = 'imagenet', input_tensor = None, \n", 67 | " input_shape = None, pooling = None, classes = 1000, **kwargs):\n", 68 | " \n", 69 | " #기본 입력 이미지의 크기는 299 x 299\n", 70 | " #include_top: network의 맨 위에서 fc-layer을 포함할 지\n", 71 | " #weights: 'None'은 무작위, 'imagenet'은 Imagenet에서 pre-training, 또는 업로드할 파일 경로\n", 72 | " #input_tensor: 모델의 입력 이미지에 대해 사용할 추가적인 keras tensor\n", 73 | " #input_shape: 옵션적 tuple 모양, 'include_top'이 False일 때만 사용 가능\n", 74 | " #pooling: feature extraction을 위한 옵션적 pooling mode, 'include_top'이 False일 때만 사용 가능\n", 75 | " #'None': 모델 출력이 4D tensor, 'avg': global average pooling이고 output은 2D tensor\n", 76 | " #'max': global max pooling\n", 77 | " #classes: 옵션적 class 수. 
'include_top'이 True일 때와 'weights'가 명시되지 않았을 때 사용 가능\n", 78 | "\n", 79 | " #weights에 아무런 값이 없을 때\n", 80 | " if not (weights in {'imagenet', None} or os.path.exists(weights)):\n", 81 | " raise ValueError('The `weights` argument should be either '\n", 82 | " '`None` (random initialization), `imagenet` '\n", 83 | " '(pre-training on ImageNet), '\n", 84 | " 'or the path to the weights file to be loaded.')\n", 85 | " \n", 86 | " #imagenet을 weights로 사용하는데 조건이 맞지 않을 때\n", 87 | " if weights == 'imagenet' and include_top and classes != 1000:\n", 88 | " raise ValueError('If using `weights` as `\"imagenet\"` with `include_top`'\n", 89 | " ' as true, `classes` should be 1000')\n", 90 | " \n", 91 | " #적절한 입력 모양 결정\n", 92 | " input_shape = _obtain_input_shape(input_shape, default_size = 299, min_size = 71,\n", 93 | " data_format = backend.image_data_format(),\n", 94 | " require_flatten = include_top, weights = weights)\n", 95 | " \n", 96 | " if input_tensor is None:\n", 97 | " img_input = layers.Input(shape = input_shape)\n", 98 | " else:\n", 99 | " if not backend.is_keras_tensor(input_tensor):\n", 100 | " img_input = layers.Input(tensor = input_tensor, shape = input_shape)\n", 101 | " else:\n", 102 | " img_input = input_tensor\n", 103 | "\n", 104 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n", 105 | "\n", 106 | " #Entry Flow\n", 107 | " #입력 이미지 단계\n", 108 | " x = layers.Conv2D(32, (3, 3), strides = (2, 2), use_bias = False,\n", 109 | " name = 'block1_conv1')(img_input)\n", 110 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block1_conv1_bn')(x)\n", 111 | " x = layers.Activation('relu', name = 'block1_conv1_act')(x)\n", 112 | " x = layers.Conv2D(64, (3, 3), use_bias = False, name = 'block1_conv2_bn')(x)\n", 113 | " x = layers.Activation('relu', name = 'block1_conv2_act')(x)\n", 114 | "\n", 115 | " #첫 번째 residual network\n", 116 | " residual = layers.Conv2d(128, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n", 117 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n", 118 | "\n", 119 | " x = layers.SeparableConv2D(128, (3, 3), padding = 'same', use_bias = False,\n", 120 | " name = 'block2_sepconv1')(x)\n", 121 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block2_sepconv1_bn')(x)\n", 122 | " x = layers.Activation('relu', name = 'block2_sepconv2_act')(x)\n", 123 | " x = layers.SeparableConv2D(128, (3, 3), padding = 'same', use_bias = 'same', \n", 124 | " name = 'block2_sepconv2')(x)\n", 125 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block2_sepconv2_bn')(x)\n", 126 | "\n", 127 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', \n", 128 | " name = 'block2_pool')(x)\n", 129 | " x = layers.add([x, residual])\n", 130 | "\n", 131 | " #두 번째 residual network\n", 132 | " residual = layers.Conv2d(256, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n", 133 | " residual = layers.BatchNormalization(sxis = channel_axis)(residual)\n", 134 | "\n", 135 | " x = layers.Activation('relu', name = 'block3_conv1_act')(x)\n", 136 | " x = layers.SeparableConv2D(256, (3, 3), strides = 'same', use_bias = False, \n", 137 | " name = 'block3_sepconv1')(x)\n", 138 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block3_sepconv1_bn')(x)\n", 139 | "\n", 140 | " x = layers.Activation('relu', name = 'block3_conv2_act')(x)\n", 141 | " x = layers.SeparableConv2D(256, (3, 3), strides = 'same', use_bias = False, \n", 142 | " name = 'block3_sepconv2')(x)\n", 143 | " x 
= layers.BatchNormalization(axis = channel_axis, name = 'block3_sepconv2_bn')(x)\n", 144 | "\n", 145 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', \n", 146 | " name = 'block3_pool')(x)\n", 147 | "\n", 148 | " x = layers.add([x, residual])\n", 149 | "\n", 150 | " #세 번째 residual network\n", 151 | " residual = layers.Conv2d(728, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n", 152 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n", 153 | "\n", 154 | " x = layers.Activation('relu', name = 'block4_conv1_act')(x)\n", 155 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 156 | " name = 'block4_sepconv1')(x)\n", 157 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block4_sepconv1_bn')(x)\n", 158 | "\n", 159 | " x = layer.Activation('relu', name = 'block4_conv2_act')(x)\n", 160 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False,\n", 161 | " name = 'block4_sepconv2')(x)\n", 162 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block4_sepconv2_bn')(x)\n", 163 | " \n", 164 | " x = MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', name = 'block4_pool')(x)\n", 165 | "\n", 166 | " x = layers.add([x, residual])\n", 167 | "\n", 168 | " #Middle Flow\n", 169 | " for i in range(8):\n", 170 | " residual = x\n", 171 | " prefix = 'block' + str(i + 5) #블록 이름 지정 자동화\n", 172 | " \n", 173 | " x = layers.Activation('relu', name = prefix + '_sepconv1_act')(x)\n", 174 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 175 | " name = prefix + '_sepconv1')(x)\n", 176 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv1_bn')(x)\n", 177 | "\n", 178 | " x = layers.Activation('relu', name = prefix + '_sepconv2_act')(x)\n", 179 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 180 | " name = prefix + '_sepconv2')(x)\n", 181 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv2_bn')(x)\n", 182 | "\n", 183 | " x = layers.Activation('relu', name = prefix + '_sepconv3_act')(x)\n", 184 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 185 | " name = prefix + '_sepconv3')(x)\n", 186 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv3_bn')(x)\n", 187 | "\n", 188 | " x = layers.add([x, residual])\n", 189 | "\n", 190 | " #Exit Flow\n", 191 | " residual = layers.Conv2d(1024, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n", 192 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n", 193 | "\n", 194 | " x = layers.Activation('relu', name = 'block13_sepconv1_act')(x)\n", 195 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 196 | " name = 'block13_sepconv1')(x)\n", 197 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block13_sepconv1_bn')(x)\n", 198 | "\n", 199 | " x = layers.Activation('relu', name = 'block13_speconv2_act')(x)\n", 200 | " x = layers.SeparableConv2D(1024, (3, 3), strides = 'same', use_bias = False, \n", 201 | " name = 'block13_sepconv2')(x)\n", 202 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block13_sepconv2_bn')(x)\n", 203 | "\n", 204 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', name = 'block13_pool')(x)\n", 205 | "\n", 206 | " x = layers.add([x, residual])\n", 207 | "\n", 208 | " x = layers.SeparableConv2D(1536, (3, 3), strides = 
'same', use_biad = False,\n", 209 | " name = 'block14_sepconv1')(x)\n", 210 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block14_sepconv1_bn')(x)\n", 211 | " x = layers.Activation('relu', name = 'block14_sepconv_act')(x)\n", 212 | "\n", 213 | " x = layers.SeparableConv2D(2048, (3, 3), strides = 'same', use_bias = False,\n", 214 | " name = 'block14_sepconv2')(x)\n", 215 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block14_speconv2_bn')(x)\n", 216 | " x = layers.Activation('relu', name = 'block14_sepconv2_act')(x)\n", 217 | "\n", 218 | " if include_top:\n", 219 | " x = layers.GlobalAveragePooling2D(name = 'avg_pool')(x)\n", 220 | " x = layers.Dense(classes, activation = 'softmax', name = 'predictions')(x)\n", 221 | " else:\n", 222 | " if pooling == 'avg':\n", 223 | " x = layers.GlobalAveragePooling2D()(x)\n", 224 | " elif pooling == 'max':\n", 225 | " x = layers.MaxPooling2D()(x)\n", 226 | "\n", 227 | " if input_tensor is not None:\n", 228 | " inputs = keras_utils.get_source_inputs(input_tensor)\n", 229 | " else:\n", 230 | " inputs = img_input\n", 231 | "\n", 232 | " #모델 생성\n", 233 | " if weights == 'imagenet':\n", 234 | " if include_top:\n", 235 | " weights_path = keras_utils.get_file(\n", 236 | " 'xception_weights_tf_dim_ordering_tf_kernels.h5',\n", 237 | " TF_WEIGHTS_PATH,\n", 238 | " cache_subdir='models',\n", 239 | " file_hash='0a58e3b7378bc2990ea3b43d5981f1f6'\n", 240 | " )\n", 241 | " else:\n", 242 | " weights_path = keras_utils.get_file(\n", 243 | " 'xception_weights_tf_dim_ordering_tf_kernels_notop.h5',\n", 244 | " TF_WEIGHTS_PATH_NO_TOP,\n", 245 | " cache_subdir='models',\n", 246 | " file_hash='b0042744bf5b25fce3cb969f33bebb97'\n", 247 | " )\n", 248 | " model.load_weights(weights_path)\n", 249 | " if backend.backend() == 'theano':\n", 250 | " keras_utils.convert_all_kernels_in_model(model)\n", 251 | " elif weights is not None:\n", 252 | " model.load_weights(weights)\n", 253 | "\n", 254 | " return model\n", 255 | "\n", 256 | "def preprocess_input(x, **kwargs):\n", 257 | " #Numpy 배열을 이미지 배치로 전처리\n", 258 | " return imagenet_utils.preprocess_input(x, mode = 'tf', **kwargs)" 259 | ], 260 | "metadata": { 261 | "id": "2OJ6-sJPoqxo" 262 | }, 263 | "execution_count": null, 264 | "outputs": [] 265 | } 266 | ] 267 | } -------------------------------------------------------------------------------- /Computer Vision/README.md: -------------------------------------------------------------------------------- 1 | # Computer Vision Paper Implementation 2 | 3 | I read those Deep Learning papers and implemented them by coding. 😉 4 | There are some that have not yet been implemented, but will be implemented additionally in the future. 
😊
5 | 
6 | |Paper Title|Paper or reference site Link|Paper Review|
7 | |---|---|---|
8 | |history of CNN|LeNet, AlexNet, VGGNet, GoogLeNet, ResNet, ResNeXt, Xception, MobileNet, DenseNet, EfficientNet, ConvNeXt|https://cartinoe5930.tistory.com/entry/CNN-network%EC%9D%98-%EC%97%AD%EC%82%AC|
9 | |ViT: An Image is Worth 16 x 16 Words: Transformers for Image Recognition at Scale|https://arxiv.org/abs/2010.11929|https://cartinoe5930.tistory.com/entry/ViT-An-Image-Worth-16-x-16-Words-Transformers-for-Image-Recognition-at-Scale|
10 | |Swin Transformer: Hierarchical Vision Transformer using Shifted Windows|https://arxiv.org/abs/2103.14030|https://cartinoe5930.tistory.com/entry/Swin-Transformer-Hierarchical-Vision-Transformer-using-Shifted-Windows-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
11 | |CLIP: Learning Transferable Visual Models From Natural Language Supervision|https://arxiv.org/abs/2103.00020|https://cartinoe5930.tistory.com/entry/CLIP-Learning-Transferable-Visual-Models-From-Natural-Language-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
12 | 
-------------------------------------------------------------------------------- /Multimodal Models/FLAVA/README.md: --------------------------------------------------------------------------------
1 | # Interacting with FLAVA
2 | 
3 | Written with reference to https://github.com/apsdehal/flava-tutorials
4 | 
5 | paper review: https://cartinoe5930.tistory.com/entry/FLAVA-A-Foundational-Language-And-Vision-Alignment-Model-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0
6 | 
-------------------------------------------------------------------------------- /Multimodal Models/README.md: --------------------------------------------------------------------------------
1 | # Multimodal Models paper code implementation
2 | 
3 | I read these Multimodal Models papers and implemented them in code (PyTorch, TensorFlow, etc.). 😉
4 | There are some that have not yet been implemented, but they will be added in the future. 
😊 5 | 6 | ## Multi-modal Models 7 | 8 | |Paper Title|Paper or reference site Link|Paper Review| 9 | |---|---|---| 10 | |Let's learn about VLM(Visual-Language Model)|https://huggingface.co/blog/vision_language_pretraining#supporting-vision-language-models-in-%F0%9F%A4%97-transformers|https://cartinoe5930.tistory.com/entry/VLMVision-Language-Model%EC%97%90-%EB%8C%80%ED%95%B4-%EC%95%8C%EC%95%84%EB%B3%B4%EC%9E%90| 11 | |VisualBERT: A simple and Performant Baseline for Vision and Language |https://arxiv.org/abs/1908.03557|https://cartinoe5930.tistory.com/entry/VisualBERT-A-Simple-and-Performant-Baseline-for-Vision-and-Language-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 12 | |ViLBERT: Pre-training Task-Agnostic Visiolinguistic Representations for Visual-and-Language Tasks|https://arxiv.org/abs/1908.02265|https://cartinoe5930.tistory.com/entry/ViLBERT-Pretraining-Task-Agnostic-Visiolinguistic-Representations-for-Visual-and-Language-Tasks| 13 | |LXMERT: Learning Cross-Modality Encoder Representations from Transformers|https://arxiv.org/abs/1908.07490|https://cartinoe5930.tistory.com/entry/LXMERT-Learning-Cross-Modality-Encoder-Representations-from-Transformers-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 14 | |VL-BERT: Pre-training of Generic Visual-Linguistic Representations|https://arxiv.org/abs/1908.08530|https://cartinoe5930.tistory.com/entry/VL-BERT-Pre-training-of-Generic-Visual-Linguistic-Representations-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 15 | |VLP: Unified Vision-Language Pre-Training for Image Captioning and VQA|https://arxiv.org/abs/1909.11059|https://cartinoe5930.tistory.com/entry/VLP-Unified-Vision-Language-Pre-Traning-for-Image-Captioning-and-VQA-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 16 | |Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks|https://arxiv.org/abs/2004.06165|https://cartinoe5930.tistory.com/entry/Oscar-Object-Semantics-Aligned-Pre-training-for-Vision-Language-Tasks-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 17 | |ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision|https://arxiv.org/abs/2102.03334|https://cartinoe5930.tistory.com/entry/ViLT-Vision-and-Language-Transformer-Without-Convolution-or-Region-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 18 | |ALIGN: Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision|https://arxiv.org/abs/2102.05918|https://cartinoe5930.tistory.com/entry/ALIGN-Scaling-up-Visual-and-Vision-Language-Representation-with-Noisy-Text-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 19 | |ALBEF: Vision and Language Representation Learning with Momentum Distillation|https://arxiv.org/abs/2107.07651|https://cartinoe5930.tistory.com/entry/ALBEF-Vision-and-Language-Representation-Learning-with-Momentum-Distillation-%EB%85%BC%EB%AC%B8| 20 | |SimVLM: Simple Visual Language Model Pretraining with Weak Supervision|https://arxiv.org/abs/2108.10904|https://cartinoe5930.tistory.com/entry/SimVLM-Simple-Visual-Language-Model-Pre-training-with-Weak-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 21 | |BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation|https://arxiv.org/abs/2201.12086|https://cartinoe5930.tistory.com/entry/BLIP-Bootstrapping-Language-Image-Pre-training-fro-Unified-Vision-Language-Understanding-and-Generation-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 22 | |FLAVA: A Foundational Language And Vision Alignment 
Model|https://arxiv.org/abs/2112.04482|https://cartinoe5930.tistory.com/entry/FLAVA-A-Foundational-Language-And-Vision-Alignment-Model-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 23 | -------------------------------------------------------------------------------- /Natural Language Processing/ALBERT/README.md: -------------------------------------------------------------------------------- 1 | # ALBERT Implementation 2 | 3 | https://github.com/google-research/albert/blob/master/modeling.py 참고하여 작성됌. 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/ALBERT-A-Lite-BERT-for-Self-supervised-Learning-of-Language-Representations-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/BERT/BERT_구현_복습.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNIeutm5STI86h0MtzDj0Xc", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "# BERT 구현 복습\n", 33 | "\n", 34 | "이미 한 번 BERT를 구현했던 적이 있는데, 이번에는 좀 더 구체적인 example을 사용하여 직접 구현해보도록 하겠다. 이 코드는 [여기](https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial)를 참고하여 작성되었다.\n", 35 | "\n", 36 | "BERT를 PyTorch를 이용하여 구현하였고, BERT를 구현하는 과정을 다음과 같이 4개의 섹션으로 나눴다.\n", 37 | "\n", 38 | "1. 전처리\n", 39 | "2. 모델링\n", 40 | "3. Loss & Optimization\n", 41 | "4. 훈련\n" 42 | ], 43 | "metadata": { 44 | "id": "Nrb7y3QLDj3t" 45 | } 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "source": [ 50 | "### 전처리\n", 51 | "\n", 52 | "전처리 과정에서는 신경망이 데이터를 처리할 수 있도록 다음과 같이 data를 구축한다. 일단 raw text부터 시작해보도록 하자." 53 | ], 54 | "metadata": { 55 | "id": "b0Qw4c4uEPTv" 56 | } 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "id": "1fTURcMQDe59" 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# raw text\n", 67 | "\n", 68 | "text = (\n", 69 | " 'Hello, how are you? I am Romeo.n'\n", 70 | " 'Hello, Romeo My name is Juliet. Nice to meet you.n'\n", 71 | " 'Nice meet you too. How are you today?n'\n", 72 | " 'Great. My baseball team won the competition.n'\n", 73 | " 'Oh Congratulations, Julietn'\n", 74 | " 'Thanks you Romeo'\n", 75 | " )" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "source": [ 81 | "그 다음에 데이터를 다음과 같이 정리해야 한다.\n", 82 | "\n", 83 | "- 문장을 소문자로 변환\n", 84 | "- vocabulary를 만듦. **Vocabulary**는 문서 내의 독특한 단어의 list임." 85 | ], 86 | "metadata": { 87 | "id": "vkL4zzsxElEn" 88 | } 89 | }, 90 | { 91 | "cell_type": "code", 92 | "source": [ 93 | "# '.', ',', '?', '!' filtering\n", 94 | "sentences = re.sub(\"[.,!?-]\", '', text.lower()).split('n')\n", 95 | "\n", 96 | "word_list = list(set(\" \".join(sentences).split()))" 97 | ], 98 | "metadata": { 99 | "id": "OXS-z3vEE1ir" 100 | }, 101 | "execution_count": null, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "source": [ 107 | "다음으로, BERT의 학습 도중에 사용되는 special token을 잘 기억해야 한다. 
다음은 이 다양한 토큰들에 대한 설명이다.\n", 108 | "\n", 109 | "- [CLS]: 첫 번째 토큰은 항상 classification\n", 110 | "- [SEP]: 두 개의 문장을 분리\n", 111 | "- [END]: 문장을 끝내기\n", 112 | "- [PAD]: 문장을 똑같은 길이로 줄이기\n", 113 | "- [MASK]: 기존의 단어를 mask로 대체\n", 114 | "\n", 115 | "이러한 토큰들은 word dictionary에 들어가 있어야 하는데, 여기서 vocabulary에 들어가 있는는 각각의 토큰과 단어는 index number가 할당된다." 116 | ], 117 | "metadata": { 118 | "id": "xTaer5nqFH-r" 119 | } 120 | }, 121 | { 122 | "cell_type": "code", 123 | "source": [ 124 | "word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}\n", 125 | "for i, w in enumerate(word_list):\n", 126 | " word_dict[w] = i + 4\n", 127 | " number_dict = {i: w for i, w in enumerate(word_dict)}\n", 128 | " vocab_size = len(word_dict)" 129 | ], 130 | "metadata": { 131 | "id": "Kq4dprH2F5OG" 132 | }, 133 | "execution_count": null, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "source": [ 139 | "이 과정이 완료되면, input sequence를 3개의 유형의 embedding으로 포맷하는 함수를 생성해야 한다.\n", 140 | "\n", 141 | "- **token embedding**\n", 142 | "- **segment embedding**\n", 143 | "- **position embedding**\n", 144 | "\n", 145 | "이제 각각에 대해 알아보도록 하자." 146 | ], 147 | "metadata": { 148 | "id": "NXSxlEK6GPji" 149 | } 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "source": [ 154 | "**token embedding이 무엇일까?**\n", 155 | "\n", 156 | "예를 들어, 문장 \"The cat is walking. The dog is barking.\"이 주어졌을 때, 함수는 다음의 방식대로 sequence를 생성해야 한다.\n", 157 | "\n", 158 | "\"[CLS] the cat is walking [SEP] the dog is barking\"\n", 159 | "\n", 160 | "그 후에, 모든 것들은 word dictionary의 index로 바꿔야 한다. 따라서 이전의 문장은 다음과 같은 형태를 가지게 된다.\n", 161 | "\n", 162 | "\"[1, 5, 7, 9, 10, 2, 5, 6, 9, 11]\"\n", 163 | "\n", 164 | "여기서 1과 2는 각각 [CLS]와 [SEP]를 의미한다.\n", 165 | "\n", 166 | "**segment embedding이 무엇일까?**\n", 167 | "\n", 168 | "segment embedding은 두 개의 문장을 분리하는 역할을 한다. 보통 0과 1로 정의된다.\n", 169 | "\n", 170 | "**position embedding이 무엇일까?**\n", 171 | "\n", 172 | "position embedding은 sequence에서 각 embedding에게 position을 준다.\n", 173 | "\n" 174 | ], 175 | "metadata": { 176 | "id": "urJ0SqDiGwwD" 177 | } 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "source": [ 182 | "이제 다음 단계는 **masking**을 생성하는 것이다.\n", 183 | "\n", 184 | "논문에 의하면, BERT는 sequence의 15% word를 [MASK] 토큰으로 대체하고, padding을 추가하였다. Padding은 모든 문장의 길이를 똑같은 길이로 만들어준다. 예를 들어, 다음과 같은 문장을 받았다고 하였을 때,\n", 185 | "\n", 186 | "\"The cat is walking. The shog is barking at the tree\"\n", 187 | "\n", 188 | "이 문장에 padding을 적용하면 다음과 같이 바뀐다.\n", 189 | "\n", 190 | "\"[CLS] The cat is walking [PAD] [PAD] [PAD]. [CLS] The dog is barking at the tree.\"\n", 191 | "\n", 192 | "첫 번째 문장의 길이가 두 번째 문장의 길이와 같아진다." 
193 | ], 194 | "metadata": { 195 | "id": "86CK0zidJFh9" 196 | } 197 | }, 198 | { 199 | "cell_type": "code", 200 | "source": [ 201 | "def make_batch():\n", 202 | " batch = []\n", 203 | " positive = negative = 0\n", 204 | " while positive != batch_size / 2 or negative != batch_size / 2:\n", 205 | " tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))\n", 206 | "\n", 207 | " tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]\n", 208 | "\n", 209 | " input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP']]\n", 210 | " segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n", 211 | "\n", 212 | " # LM masking\n", 213 | " n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 한 문장의 15% 정도의 토큰\n", 214 | " cand_maked_pos = [1 for i, token in enumerate(input_ids) if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]\n", 215 | " shuffle(cand_maked_pos)\n", 216 | " masked_tokens, masked_pos = [], []\n", 217 | " for pos in cand_makes_pos[:n_pred]:\n", 218 | " masked_pos.append(pos)\n", 219 | " masked_tokens.append(input_ids[pos])\n", 220 | " if random() < 0.8: # 80%는 masking\n", 221 | " input_ids[pos] = word_dict['[MASK]']\n", 222 | " elif random() < 0.5: # 10%는 vocabulary에서 random indexing\n", 223 | " index = randint(0, vocab_size - 1)\n", 224 | " input_ids[pos] = word_dict[number_dict[index]]\n", 225 | " \n", 226 | " # Zero padding\n", 227 | " n_pad = maxlen - len(input_ids)\n", 228 | " input_ids.extend([0] * n_pad)\n", 229 | " segment_ids.extend([0] * n_pad)\n", 230 | "\n", 231 | " # Zero padding (100% - 15%) tokens\n", 232 | " if max_pred > n_pred:\n", 233 | " n_pad = max_pred - n_pred\n", 234 | " masked_tokens.extend([0] * n_pad)\n", 235 | " masked_pos.extend([0] * n_pad)\n", 236 | "\n", 237 | " if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:\n", 238 | " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext\n", 239 | " elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:\n", 240 | " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext\n", 241 | " negative += 1\n", 242 | "\n", 243 | " return batch" 244 | ], 245 | "metadata": { 246 | "id": "mZitlUMPNrU-" 247 | }, 248 | "execution_count": null, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "source": [ 254 | "next-word prediction을 다루기 때문에, 문장이 이어진 문장인지 아닌지를 예측하는 label을 생성해야 한다. 이것이 바로 IsNext와 NotNext이다. 그래서 다음 문장 앞에 오는 모든 문장에 True를 할당하고 이를 위해 조건문을 사용하였다.\n", 255 | "\n", 256 | "예를 들어, 두 개의 문장이 하나의 document에 있으면, 이 둘은 서로를 문맥적으로 따른다. 따라서서 첫 번째 문장이 A이면 다음 문장은 A+1이어야 한다. 직관적으로 첫 번째 문장의 위치 즉, tokens_a_index + 1 == tokens_b_index, 즉 동일한 context의 두 번째 문장인 경우 이 입력에 대한 label을을 True로 설정할 수 있도록 코드를 작성해야 한다.\n", 257 | "\n", 258 | "만약 위 조건이 tokens_a_index + 1 != tokens_b_index라면 input에 대한 label을 False로 지정해야 한다." 259 | ], 260 | "metadata": { 261 | "id": "3Ifq41KQQlD4" 262 | } 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "source": [ 267 | "### 모델링\n", 268 | "\n", 269 | "BERT는 매우 정교한 모델이라서 느리게 감지되면 논리를 잃게 된다. 그래서 BERT는 component와 함수에 의해 component를 설명하는 것이 가능하다.\n", 270 | "\n", 271 | "BERT는 다음의 component들을 가진다.\n", 272 | "\n", 273 | "1. Embedding layer\n", 274 | "2. Attention Mask\n", 275 | "3. Encoder layer\n", 276 | " - Multi-head attention\n", 277 | " - Scaled dot product attention\n", 278 | " - Position-wise feed-forward network\n", 279 | "4. 
BERT(모든 component를 합침)" 280 | ], 281 | "metadata": { 282 | "id": "G2QPTi8D1R5B" 283 | } 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "source": [ 288 | "#### Embedding Layer\n", 289 | "\n", 290 | "embedding은 BERT의 첫 번째 레이어로 input을 받아서 lookup table을 생성한다. embedding layer의 파라미터는 학습 가능하고, 이는 학습 스포레스가 끝날 때, embedding은 비슷한 단어들끼리 모여있을 거라는 것이다.\n", 291 | "\n", 292 | "embedding layer는 단어 간의 서로 다른 관계를 보존한다. 여기에는 semantic, syntactic, linear, 그리고 BERT가 양방향성이기 때문에, contextual relationship을 잘 보존한다.\n", 293 | "\n", 294 | "BERT의 경우에, 다음 3개의 embedding을 생성한다.\n", 295 | "\n", 296 | "- Token\n", 297 | "- Segments\n", 298 | "- Position\n", 299 | "\n", 300 | "아까 전에 position embedding을 생성하는 함수를 정의해두지는 않았지만, token과 segment를 생성하는 함수는 이미 정의해두었다. 그래서 이제 input을 받아서 sequence에서 각 단어에 대한 position을 생성할 수 있다. 그리고 이는 다음과 같다." 301 | ], 302 | "metadata": { 303 | "id": "k7zuhRtl2cwx" 304 | } 305 | }, 306 | { 307 | "cell_type": "code", 308 | "source": [ 309 | "print(torch.arange(30, dtype = torch.long).expand_as(input_ids))" 310 | ], 311 | "metadata": { 312 | "id": "idIqPc1H3v1q" 313 | }, 314 | "execution_count": null, 315 | "outputs": [] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "source": [ 320 | "forward function에서, 모든 embedding을 합하고 정규화하였다." 321 | ], 322 | "metadata": { 323 | "id": "bSHPzyJL38Pd" 324 | } 325 | }, 326 | { 327 | "cell_type": "code", 328 | "source": [ 329 | "class Embedding(nn.Module):\n", 330 | " def __init__(self):\n", 331 | " super(EMbedding, self).__init__()\n", 332 | " self.tok_embed = nn.Embedding(vocab_size, d_model) # token embedding\n", 333 | " self.pos_embed = nn.Embedding(maxlen, d_model) # position embedding\n", 334 | " self.seg_embed = nn.Embedding(n_segments, d_model) # segment embedding\n", 335 | " self.norm = nn.LayerNorm(d_model)\n", 336 | "\n", 337 | " def forward(self, x, seg):\n", 338 | " seq_len = x.size(1)\n", 339 | " pos = torch.arange(seq_len, dtype = torch.long)\n", 340 | " pos = pos.unsqueeze(0).expand_as(x) # (seq_len,) -> (batch_size, seq_len)\n", 341 | " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n", 342 | "\n", 343 | " return self.norm(embedding)" 344 | ], 345 | "metadata": { 346 | "id": "3Z5lR_DF4FBO" 347 | }, 348 | "execution_count": null, 349 | "outputs": [] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "source": [ 354 | "#### attention mask 생성\n", 355 | "\n", 356 | "BERT는 attention mask 또한 필요로 한다. 그리고 이것은 적절한 형식이 되어야 한다. 다음의 코드가 attention mask를 생성하는 코드이다. 아래 코드에서 [PAD]는 1로 변환되고, 다른 것들은 0으로 변환된다." 357 | ], 358 | "metadata": { 359 | "id": "4jKULKiI5GKe" 360 | } 361 | }, 362 | { 363 | "cell_type": "code", 364 | "source": [ 365 | "def get_attn_pad_mask(seq_q, seq_k):\n", 366 | " batch_size, len_q = seq_q.size()\n", 367 | " batch_size, len_k = seq_k.size()\n", 368 | " # eq(0)은 PAD token이다.\n", 369 | " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # batch_size x 1 x len_k(=len_q), 하나가 마스킹된다.\n", 370 | " return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k" 371 | ], 372 | "metadata": { 373 | "id": "6BErCR2k5Ype" 374 | }, 375 | "execution_count": null, 376 | "outputs": [] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "source": [ 381 | "#### Encoder\n", 382 | "\n", 383 | "Encdoer는 다음의 두 개의 주된 component를 가지고 있다.\n", 384 | "\n", 385 | "- Multi-head Attention\n", 386 | "- Position-wise feed-forward network\n", 387 | "\n", 388 | "encoder의 작업은 representation과 pattern을 input과 attention mask로부터 찾는 것이다." 
389 | ], 390 | "metadata": { 391 | "id": "2VC7H8lr6gyu" 392 | } 393 | }, 394 | { 395 | "cell_type": "code", 396 | "source": [ 397 | "class EncoderLayer(nn.Module):\n", 398 | " def __init__(self):\n", 399 | " super(EncoderLayer, self).__init__()\n", 400 | " self.enc_self_attn = MultiHeadAttention()\n", 401 | " self.pos_ffn = PoswiseFeedForwardNet()\n", 402 | "\n", 403 | " def forward(self, en_inputs, enc_self_attn_mask):\n", 404 | " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs는 Q, K, V와 같음\n", 405 | " enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n", 406 | " return enc_outputs, attn" 407 | ], 408 | "metadata": { 409 | "id": "S1bboIq9606b" 410 | }, 411 | "execution_count": null, 412 | "outputs": [] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "source": [ 417 | "#### Multi-head attention\n", 418 | "\n", 419 | "이것이 encoder의 첫 번째 주된 component이다.\n", 420 | "\n", 421 | "attention model은 3개의 입력값 **Query, Key, Value**를 받는다.\n", 422 | "\n", 423 | "Multi-head attention은 4개의 입력값 **Query, Key, Value, Attention mask**를 받는다. embedding은 Query, Key, Value에 입력으로 주어지고, attention mask는 attention mask 인자에 입력으로 주어진다.\n", 424 | "\n", 425 | "이러한 3개의 입력과 attention mask에 대해 dot-product 연산을 수행한다. 이 dot-product 연산은 **context vector**와 **attention**을 산출한다. context vector는 선형 레이어를 지나서 최종적으로 output을 출력한다." 426 | ], 427 | "metadata": { 428 | "id": "i5Ffo_Pu7h_4" 429 | } 430 | }, 431 | { 432 | "cell_type": "code", 433 | "source": [ 434 | "class MultiHeadAttention(nn.Module):\n", 435 | " def __init__(self):\n", 436 | " super(MultiHeadAttention, self).__init__()\n", 437 | " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n", 438 | " self.W_K = nn.Linear(d_model, d_k * n_heads)\n", 439 | " self.W_V = nn.Linear(d_model, d_v * n_heads)\n", 440 | "\n", 441 | " def forward(self, Q, K, V, attn_mask):\n", 442 | " # q: [batch_size x len_q x d_model]\n", 443 | " # k: [batch_size x len_k x d_model]\n", 444 | " # v: [batch_size x len_k x d_model]\n", 445 | " residual, batch_size = Q, Q.size(0)\n", 446 | " # (B, S, D) -proj- -> (B, S, D) -split- -> (B, S, H, W) -trans- -> (B, H, S, W)\n", 447 | " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2) # q_s: [batch_size x n_heads x len_q x d_k]\n", 448 | " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2) # k_s: [batch_size x n_heads x len_k x d_k]\n", 449 | " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2) # v_s: [batch_size x n_heads x len_k x d_v]\n", 450 | "\n", 451 | " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask: [batch_size x n_heads x len_q x len_k]\n", 452 | "\n", 453 | " # context: [batch_size x n_heads x len_q x d_v]\n", 454 | " # attn: [batch_size x n_heads x len_q x len_k]\n", 455 | " context, attn = ScaleDotProductAttention()(q_s, k_s, v_s, attn_mask)\n", 456 | " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n", 457 | " output = nn.Linear(n_heads * d_v, d_model)(context)\n", 458 | "\n", 459 | " return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]" 460 | ], 461 | "metadata": { 462 | "id": "hIE8aZIn80LD" 463 | }, 464 | "execution_count": null, 465 | "outputs": [] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "source": [ 470 | "이제 이 Scaled Dot-Product attention에 대해 알아보도록 하자.\n", 471 | "\n", 472 | "- scaled dot-product attention 클래스는 4개의 인자 Query, Key, 
Value, Attention mask를 받는다. 본질적으로, 앞에 3개의 인자들은 word embedding과 함께 주어지고, attention mask 인자는 attention mask embedding과 함께 주어진다.\n", 473 | "- 그리고 scaled dot-product attention은 **query**와 **key**간에 행렬곱을 해서 점수를 얻는다.\n", 474 | "\n", 475 | "우리 코드에서는 scores.masked_fill_(attn_mask, -1e9)를 사용한다. 이 속성은 attention mask가 **True**인 -1e9로 score 요소를 채우고 나머지 요소는 attention score를 얻은 다음 0과 1 사이의 score를 제공하는 softmax 함수를 통해 전달된다.마지막으로, attention 과 value 간에 행렬곱을 수행함으로써 context vector을 얻었다." 476 | ], 477 | "metadata": { 478 | "id": "EcSZkO3u_Y6T" 479 | } 480 | }, 481 | { 482 | "cell_type": "code", 483 | "source": [ 484 | "class ScaledDotProductAttention(nn.Module):\n", 485 | " def __init__(self):\n", 486 | " super(ScaledDotProductAttention, self).__init__()\n", 487 | "\n", 488 | " def forward(self, Q, K, V, attn_mask):\n", 489 | " scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores: [batch_size x n_heads x len_q x len_k]\n", 490 | " scores.masked_fill_(attn_mask, -1e9) # mask가 하나인 self tensor의 요소를 value로 채운다.\n", 491 | " attn = nn.Softmax(dim = -1)(scores)\n", 492 | " context = torch.matmul(attn, V)\n", 493 | " return score, context, attn\n", 494 | " " 495 | ], 496 | "metadata": { 497 | "id": "yHfeJSJKBmVo" 498 | }, 499 | "execution_count": null, 500 | "outputs": [] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "source": [ 505 | "#### Position-Wise Feed Forward Network\n", 506 | "\n", 507 | "multi-head attention의 출력값은 feed-forward network로 가고 이는 encoder part를 결론 짓는다.\n", 508 | "\n", 509 | "#### 모든 component를 합치기\n", 510 | "\n", 511 | "encoder는 다음의 2개의 출력값을 내놓는다.\n", 512 | "\n", 513 | "- feed-forward layer의 출력값\n", 514 | "- Attention mask\n", 515 | "\n", 516 | "여기서 중요한 것은 BERT는 decoder를 사용하지 않는다는 것이다. 대시넹, output과 attention mask를 사용해서 원하는 결과를 얻는다.\n", 517 | "\n", 518 | "transformer의 decoder 부분은 아래 코드처럼 분류하는데 사용되는 얕은 네트워크로 대체된다. BERT 또한 **classifier**와 **masked** 2개의 출력값을 내놓는다." 
519 | ], 520 | "metadata": { 521 | "id": "6KxhEHWVCbci" 522 | } 523 | }, 524 | { 525 | "cell_type": "code", 526 | "source": [ 527 | "class BERT(nn.Module):\n", 528 | " def __init__(self):\n", 529 | " super(BERT, self).__init__()\n", 530 | " self.embedding = Embedding()\n", 531 | " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n", 532 | " self.fc = nn.Linear(d_model, d_model)\n", 533 | " self.activ1 = nn.Train()\n", 534 | " self.linear = nn.Linear(d_model, d_model)\n", 535 | " self.activ2 = gelu\n", 536 | " self.norm = nn.LayerNorm(d_model)\n", 537 | " self.classifier = nn.Linear(d_model, 2)\n", 538 | " # decoder는 embedding layer와 공유됌\n", 539 | " embed_weight = self.embedding.tok_embed.weight\n", 540 | " n_vocab, n_dim = embed_weight.size()\n", 541 | " self.decoder = nn.Linear(n_dim, n_vocab, bias = False)\n", 542 | " self.decoder.weight = embed_weight\n", 543 | " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n", 544 | "\n", 545 | " def forward(self, input_ids, segment_ids, masked_pos):\n", 546 | " output = self.embedding(input_ids, segment_ids)\n", 547 | " enc_self_attn_mask = deg_attn_pad_mask(input_ids, input_ids)\n", 548 | " for layer in self.layers:\n", 549 | " output, enc_self_attn = layer(output, enc_self_attn_mask)\n", 550 | " # output: [batch_size, len, d_model]\n", 551 | " # attn: [batch_size, n_heads, d_model, d_model]\n", 552 | " # 이는 첫 번째 토큰 (CLS)에 의해 결정됌\n", 553 | " h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]\n", 554 | " logits_clsf = self.classification(h_pooled) # [batch_size, 2]\n", 555 | "\n", 556 | " masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]\n", 557 | "\n", 558 | " # transformer의 최종 출력으로부터 masked position을 얻음\n", 559 | " h_masked = torch.gather(output, 1, masked_pos) # masking position: [batch_size, max_pred, d_model]\n", 560 | " h_masked = self.norm(self.activ2(self.linear(h_masked)))\n", 561 | " logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]\n", 562 | "\n", 563 | " return logits_lm, logits_clsf" 564 | ], 565 | "metadata": { 566 | "id": "LU4v48ovDlvF" 567 | }, 568 | "execution_count": null, 569 | "outputs": [] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "source": [ 574 | "몇 가지 기억해두어야 할 사항이 있다.\n", 575 | "\n", 576 | "1. encoder의 수를 지정할 수 있다. 논문에서는 base model의 경우 12개였다.\n", 577 | "2. BERT에는 2개의 활성화 함수가 있는데, Tanh와 GELU이다." 578 | ], 579 | "metadata": { 580 | "id": "OILDxtxtF_3F" 581 | } 582 | }, 583 | { 584 | "cell_type": "code", 585 | "source": [ 586 | "def gelu(x):\n", 587 | " return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))" 588 | ], 589 | "metadata": { 590 | "id": "xaRraoH5GPqJ" 591 | }, 592 | "execution_count": null, 593 | "outputs": [] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "source": [ 598 | "### Loss & Optimization\n", 599 | "\n", 600 | "논문에서는 모든 vocabulary에 대해 확률 분포를 계산하였지만, softmax 근사치를 이용해서 계산이 가능하다. 하지만, 확률 분포를 구하는 깔끔한 방법은 **cross-entropy**를 사용하는 것이다. cross-entropy loss는 *softmax*와 *negative log-likelihood*의 조합이다.\n", 601 | "\n", 602 | "그래서 모델을 구축하는 동안 softmax를 포함할 필요 없이 softmax 정규화 없이 feed-forward network에서 깔끔한 출력을 얻을 수 있다. \n", 603 | "\n", 604 | "optimization으로 넘어가서 BERT에서는 Adam optimizer를 사용하였다." 
605 | ], 606 | "metadata": { 607 | "id": "9t7Z4xBFGW4s" 608 | } 609 | }, 610 | { 611 | "cell_type": "code", 612 | "source": [ 613 | "criterion = nn.CrossEntropyLoss()\n", 614 | "optimizer = optim.Adam(model.parameters(), lr = 0.001)" 615 | ], 616 | "metadata": { 617 | "id": "Vk5q2c4FHKB_" 618 | }, 619 | "execution_count": null, 620 | "outputs": [] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "source": [ 625 | "### 훈련\n", 626 | "\n", 627 | "마지막으로 모델 훈련을 해보도록 하자." 628 | ], 629 | "metadata": { 630 | "id": "qP9szqwBHWtM" 631 | } 632 | }, 633 | { 634 | "cell_type": "code", 635 | "source": [ 636 | "model = BERT()\n", 637 | "batch = make_batch()\n", 638 | "input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))\n", 639 | "\n", 640 | " for epoch in range(100):\n", 641 | " optimizer.zero_grad()\n", 642 | " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n", 643 | " loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # masked LM을 위해\n", 644 | " loss_lm = (loss_lm.float()).mean()\n", 645 | " loss_clsf = criterion(logits_clsf, isNext) # sentence classification을 위해\n", 646 | " loss = loss_lm + loss_clsf\n", 647 | " if (epoch + 1) % 10 == 0:\n", 648 | " print('Epoch:', '%04d' % (epoch + 1), 'cost = ', '{:.6f}'.format(loss))\n", 649 | " loss.backward()\n", 650 | " optimizer.step()\n", 651 | "\n", 652 | " # mask token 예측하기\n", 653 | " input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))\n", 654 | " print(text)\n", 655 | " print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])\n", 656 | "\n", 657 | " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n", 658 | " logits_lm = logits_lm.data.mix(2)[1][0].data.numpy()\n", 659 | " print('masked tokens list: ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])\n", 660 | " print('predict masked tokens list: ', [pos for pos in logits_lm if pos != 0])\n", 661 | "\n", 662 | " logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]\n", 663 | " print('isNext: ', True if isNext else False)\n", 664 | " print('predict isNext: ', True is logits_clsf else False)" 665 | ], 666 | "metadata": { 667 | "id": "Q9-I6oFuHV_c" 668 | }, 669 | "execution_count": null, 670 | "outputs": [] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "source": [ 675 | "Output:\n", 676 | "\n", 677 | "Hello, how are you? I am Romeo.\n", 678 | "Hello, Romeo My name is Juliet. Nice to meet you.\n", 679 | "Nice meet you too. How are you today?\n", 680 | "Great. My baseball team won the competition.\n", 681 | "Oh Congratulations, Juliet\n", 682 | "Thanks you Romeo\n", 683 | "['[CLS]', 'nice', 'meet', 'you', 'too', 'how', 'are', 'you', 'today', '[SEP]', '[MASK]', 'congratulations', '[MASK]', '[SEP]']\n", 684 | "masked tokens list : [27, 22]\n", 685 | "predict masked tokens list : []\n", 686 | "isNext : False\n", 687 | "predict isNext : True" 688 | ], 689 | "metadata": { 690 | "id": "mRbWNVR5Jkx8" 691 | }, 692 | "execution_count": null, 693 | "outputs": [] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "source": [ 698 | "이렇게 해서 BERT를 모두 구현하였다. 좀 더 큰 corpus에 대해서도 똑같은 BERT 모델을 사용할 수 있다.\n", 699 | "\n", 700 | "1. Pre-training: corpus를 사용하지만 앞서 언급한 input representation의 정확한 형식을 사용\n", 701 | "2. FIne-tuning: 지도학습 데이터를 사용해야 한다.\n", 702 | "3. 
다양한 task 또는 topic modeling을 위한 feature extractor가 있어야 함" 703 | ], 704 | "metadata": { 705 | "id": "a9ikGiiNJtmn" 706 | } 707 | } 708 | ] 709 | } 710 | -------------------------------------------------------------------------------- /Natural Language Processing/BERT/README.md: -------------------------------------------------------------------------------- 1 | # BERT Implementation 2 | 3 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading2-BERT-Pre-training-of-Deep-Bidirectional-Transformers-for-Language-Understanding 4 | -------------------------------------------------------------------------------- /Natural Language Processing/ELECTRA/README.md: -------------------------------------------------------------------------------- 1 | # ELECTRA Implementation 2 | 3 | https://github.com/google-research/electra/blob/master/model/modeling.py 참고하여 작성됨 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/ELECTRA-Pre-training-Text-Encoders-as-Discriminators-rather-than-Generators 6 | -------------------------------------------------------------------------------- /Natural Language Processing/ELMo/ELMo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNanKVFKMnCVZJm48NJvEOL", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "EAKmz65EfvqQ" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from typing import LIst, Tuple\n", 38 | "import torch\n", 39 | "import torch.nn as nn\n", 40 | "from char_cnn import CharEmbedding\n", 41 | "\n", 42 | "class ELMo(nn.Module):\n", 43 | " def __init__(self, vocab_size, output_dim, emb_dim, hid_dim, prj_dim, kernel_sizes,\n", 44 | " seq_len, n_layers, dropout):\n", 45 | " #파라미터 설명(몇 개만)\n", 46 | " #output_dim: word vocaulary 크기\n", 47 | " #n_layers: LSTM의 레이어 수. 
기본값은 2\n", 48 | "\n", 49 | " super(ELMo, self).__init__()\n", 50 | "\n", 51 | " self.embedding = CharEmbedding(vocab_size, emb_dim, prj_dim, kernel_sizes, seq_len)\n", 52 | " self.bilms = BidirectionalLanguageModel(hid_dim, hid_dim, n_layers, dropout)\n", 53 | "\n", 54 | " self.predict = nn.Linear(hid_dim, output_dim)\n", 55 | "\n", 56 | " def forward(self, x):\n", 57 | " #파라미터: x(Sentence)\n", 58 | " #차원: x([batch, seq_len])\n", 59 | " emb = self.embedding(x)\n", 60 | " _, last_output = self.bilms(emb)\n", 61 | " y = self.predict(last_output)\n", 62 | "\n", 63 | " return y #훈련 단계에서는 오직 biLM의 마지막 LSTM의 output만을 사용하여라\n", 64 | "\n", 65 | " def get_embed_layer(self, x): #torch.Tensor --> List\n", 66 | " #순전파와 똑같지만, 모든 레이어의 임베딩을 반환함\n", 67 | " #파라미터: x(character로 이루어진 sentence)\n", 68 | " #차원: x([batch, seq_len])\n", 69 | " emb = self.embedding(x)\n", 70 | " first_output, last_output = self.bilms(emb)\n", 71 | "\n", 72 | " return emb, (first_output, last_output)\n", 73 | "\n", 74 | " def init_weights(self):\n", 75 | " for p in self.parameters():\n", 76 | " if p.dim() > 1:\n", 77 | " nn.init.xavier_uniform_(p)\n", 78 | "\n", 79 | " for lstm in self.bilms.lstms:\n", 80 | " for names in lstm._all_weights:\n", 81 | " for name in filter(lambda n: 'bias' in n, names):\n", 82 | " bias = getattr(lstm, name)\n", 83 | " n = bias.size(0)\n", 84 | " start, end = n // 4, n // 2\n", 85 | " bias.data[start:end].fill_(1.)\n", 86 | "\n", 87 | "class BidirectionalLanguageModel(nn.Module):\n", 88 | " def __init__(self, emb_dim, hid_dim, prj_emb, dropout):\n", 89 | " #LSTM 레이어의 이전과 이후 모두에 dropout 사용\n", 90 | " super(BidirectionalLanguageModel, self).__init__()\n", 91 | " self.lstms = nn.ModuleList([nn.LSTM(emb_dim, hid_dim, bidirectional = True, dropout = dropout,\n", 92 | " batch_first = True), nn.LSTM(prj_emb, hid_dim, bidirectional = True, dropout = dropout, bacth_first = True)])\n", 93 | " self.projection_layer = nn.Linear(2 * hid_dim, prj_emb)\n", 94 | "\n", 95 | " def forward(self, x, hidden = None):\n", 96 | " #파라미터: x(임베딩된 sentence tensor), hidden(hidden과 cell의 tuple)\n", 97 | " #차원: x([Batch, Seq_len, Emb_size]),\n", 98 | " #hidden([num_layers * num_directions, batch, hidden_size], [num_layers * num_directions, batch, hidden_size])\n", 99 | " \n", 100 | " #LSTM 레이어 사이에 residual connection 추가\n", 101 | " first_output, (hidden, cell) = self.lstms[0](x, hidden)\n", 102 | "\n", 103 | " projected = self.projection_layer(first_output)\n", 104 | " second_output, (hidden, cell) = self.lstms[1](projected, (hidden, cell))\n", 105 | "\n", 106 | " second_output = second_output.view(second_output.size(0), second_output.size(1), 2, -1)\n", 107 | "\n", 108 | " second_output = second_output[:, :, 0, :] + second_output[:, :, 1, :]\n", 109 | "\n", 110 | " return first_output, second_output" 111 | ] 112 | } 113 | ] 114 | } 115 | -------------------------------------------------------------------------------- /Natural Language Processing/ELMo/README.md: -------------------------------------------------------------------------------- 1 | # ELMo 2 | 3 | https://github.com/InhyeokYoo/NLP/blob/master/papers/4.ELMo 참고하여 작성 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading1-ELMo-Deep-contextualized-word-representations 6 | -------------------------------------------------------------------------------- /Natural Language Processing/ELMo/char_cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | 
"nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMpeFn+h3cVx7Sm4BlKoscT", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU", 18 | "gpuClass": "standard" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "id": "E414FoesNyVv" 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "#char_cnn\n", 40 | "import torch\n", 41 | "import torch.nn as nn\n", 42 | "from typing import List\n", 43 | "\n", 44 | "class CharEmbedding(nn.Module):\n", 45 | " def __init__(self, vocab_size, emb_dim, prj_dim, kernel_sizes, char_len, device):\n", 46 | " super().__init__()\n", 47 | " self.device = device\n", 48 | " self.kernel_dim = sum([kernel_size for num_features, kernel_size in kernel_sizes]) #embedding dimenstion과 같음\n", 49 | " self.charcnn = CharCNN(vocab_size, emb_dim, self.kernel_dim, kernel_sizes, char_len, device)\n", 50 | " self.highway_net = HighWayNetwork(self.kernel_dim)\n", 51 | " self.highwat_net._init_bias()\n", 52 | " self.projection_layer = nn.Linear(self.kernel_dim, prj_dim)\n", 53 | "\n", 54 | " def forward(self, x):\n", 55 | " #파라미터: 문장의 캐릭터로 이루어져 있는 문장 벡터\n", 56 | " #차원: [Batch, Seq_len, Char_len]\n", 57 | " batch_size, seq_len, _ = x.size()\n", 58 | " y = torch.zeros(batch_size, seq_len, self.kernel_dim).to(self.device)\n", 59 | "\n", 60 | " for i in range(seq_len):\n", 61 | " char_emb = self.charcnn(x[:, i, :])\n", 62 | " highway_emb = self.highway_net(char_emb)\n", 63 | " y[:, i, :] = highway_emb.squeeze(1)\n", 64 | "\n", 65 | " emb = self.projection_layer(y)\n", 66 | " return emb\n", 67 | "\n", 68 | "class CharCNN(nn.Module):\n", 69 | " def __init__(self, vocab_size, char_emb_dim, word_emb_dim, kernel_sizes, char_len, device):\n", 70 | " super(CharCNN, self).__init__()\n", 71 | " self.device = device\n", 72 | " self.char_len = char_len\n", 73 | " self.word_emb_dim = word_emb_dim\n", 74 | " self.kernel_sizes = kernel_sizes\n", 75 | "\n", 76 | " self.embedding = nn.Embedding(vocab_size, char_meb_dim)\n", 77 | " self.kernels = nn.ModuleList([nn.Conv1d(in_channels = char_emb_dim, out_channels = num_features,\n", 78 | " kernel_size = kernel_size) for kernel_size, num_features in kernel_sizes])\n", 79 | "\n", 80 | " def forward(self, word):\n", 81 | " #파라미터: word(입력 텐서)\n", 82 | " #차원\n", 83 | " #입력: 단어([Batch, Emb_dim, Seq_len])\n", 84 | " #출력: y([Batch, Kernel_dim])\n", 85 | " batch_size = word.size(0)\n", 86 | " y = torch.zeros(batch_size, self.word_meb_dim).to(self.device)\n", 87 | "\n", 88 | " cnt = 0 #indec for y\n", 89 | "\n", 90 | " #torch.cat보다 비어있는 텐서를 채우는 것이 더 빠름\n", 91 | " for kernel in self.kernels:\n", 92 | " emb = self.embedding(word)\n", 93 | " emb = emb.permute(0, 2, 1)\n", 94 | " temp = kernel(emb)\n", 95 | " pooled = torch.max(temp, dim = 2)[0]\n", 96 | " y[:, cnt] = pooled\n", 97 | " cnt += pooled_size(1)\n", 98 | "\n", 99 | " return y\n", 100 | "\n", 101 | "class HighwayNetwork(nn.Module):\n", 102 | " def __init__(self, kernel_sizes):\n", 103 | " super(HighwayNetwork, self).__init__()\n", 104 | " self.h_gate = nn.Linear(kernel_sizes, kernel_sizes)\n", 105 | " self.t_gate = nn.Sequential(nn.Linear(kernel_sizes, kernel_sizes), nn.Sigmoid())\n", 106 | " 
self.relu = torch.nn.ReLU()\n", 107 | "\n", 108 | " def forward(self, x):\n", 109 | " #차원: x(Batch, Kernel_dim)\n", 110 | " x = x.unsqueeze(1)\n", 111 | " h = self.relu(self.h_gate(x))\n", 112 | " t = self.t_gate(x)\n", 113 | " c = 1 - t\n", 114 | " return t * h + c * x\n", 115 | "\n", 116 | " def _init_bias(self):\n", 117 | " self.t_gate[0].bias.data.fill_(-2)" 118 | ] 119 | } 120 | ] 121 | } 122 | -------------------------------------------------------------------------------- /Natural Language Processing/ELMo/character_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNgGR09iOxTjA3Q3sX+iuzH", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "0b8fNKIbbE0p" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import torchtext\n", 38 | "from torchtext.data import NestedField\n", 39 | "import math\n", 40 | "\n", 41 | "class BPTTIterator(torchtext.data.BPTTIterator):\n", 42 | " def __iter__(self):\n", 43 | " text = self.dataset[0].text\n", 44 | " TEXT = self.dataset.fields['text']\n", 45 | " TEXT.eos_token = None\n", 46 | " text = text + ([TEXT.pad_token] * int(math.ceil(len(text) / self.batch_size) * self.batch_size - len(text)))\n", 47 | " data = TEXT.pad([text]) #new\n", 48 | " data = TEXT.numericalize(data, device = self.device)\n", 49 | "\n", 50 | " #new line start\n", 51 | " size = list(data.size())\n", 52 | " size[0] = self.batch_size\n", 53 | " size[1] = -1\n", 54 | "\n", 55 | " data = data.view(*size).transpose(0, 1).contiguous()\n", 56 | " dataset = torchtext.data.Dataset(examples = self.dataset.examples, fields = [('text', 'TEXT'), ('target', 'TEXT')])\n", 57 | "\n", 58 | " while True:\n", 59 | " for i in range(0, len(self) * self.bptt_len, self.bptt_len):\n", 60 | " self.ierations += 1\n", 61 | " seq_len = min(self.bptt_len, len(data) - i - 1)\n", 62 | " batch_text = data[i:i + seq_len]\n", 63 | " if TEXT.batch_first:\n", 64 | " batch_text = batch_text.transpose(0, 1).contiguous()\n", 65 | " batch_target = batch_target.transpose(0, 1).contiguous()\n", 66 | " yield torchtext.data.Batch.fromvars(\n", 67 | " dataset, self.batch_size, text = batch_text, target = batch_target\n", 68 | " )\n", 69 | " if not self.repeat:\n", 70 | " return\n", 71 | "\n", 72 | "def gen_bptt_iter(dataset, batch_size, bptt_len, device):\n", 73 | " #dataset: tuple of dataset\n", 74 | " for batch_word, batch_char in zip(\n", 75 | " BPTTIterator(dataset[0], batch_size, bptt_len, device = device),\n", 76 | " BPTTIterator(dataset[1], batch_size, bptt_len, device = device),\n", 77 | " ):\n", 78 | " yield batch_word.text, batch_char.text, batch_word.target, batch_char.target\n", 79 | "\n", 80 | "def gen_language_model_corpus(dataset_cls: torchtext.datasets.LanguageModelingDataset):\n", 81 | " field_char = NestedField(Field(pad_token = PAD_WORD, tokenize = list, init_token = SOS_WORD,\n", 82 | " eos_token = EOS_WORD, batch_first = True), pad_token = PAD_WORD,)\n", 83 | " \n", 84 | " field_word = Field(batch_first = True)\n", 85 | " 
dataset_char = dataset_cls.splits(field_char)\n", 86 | " dataset_word = dataset_cls.splits(dielf_word)\n", 87 | " field_char.build_vocab(dataset_char[0])\n", 88 | " field_word.build_vocab(dataset_char[0])\n", 89 | " return [_ for _ in zip(dataset_word, dataset_char)], field_word, field_char\n", 90 | "\n", 91 | "#How to use\n", 92 | "if __name__ == '__main__':\n", 93 | " from torchtext.dataset import WIkiText2\n", 94 | " from torchtext.data import Field\n", 95 | "\n", 96 | " #FINAL\n", 97 | " PAD_WORD = ''\n", 98 | " SOS_WORD = ''\n", 99 | " EOS_WORD = ''\n", 100 | "\n", 101 | " datasets, field_word, field_char = gen_language_model_corpus(WikiText2)\n", 102 | " train_data, valid_data, test_data = datasets" 103 | ] 104 | } 105 | ] 106 | } 107 | -------------------------------------------------------------------------------- /Natural Language Processing/GPT-1/GPT-1 Implementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNMCURwdSd6LE/DF4oH8QYA", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "# GPT-1 Implementation\n", 33 | "\n", 34 | "GPT-1 구현 코드는 [GPT 구현하기](https://paul-hyun.github.io/gpt-01/)를 참고하여 작성되었다.\n", 35 | "\n", 36 | "우선 GPT를 구현하기 전에 GPT에 대해 간략하게 설명하면 GPT는 Transformer의 Decoder만을 사용한 Pre-trained LM이다.\n", 37 | "\n", 38 | "### 1. Config\n", 39 | "\n", 40 | "Transformer와 파라미터를 동일하게 설정하였다. GPT는 Transformer의 Decoder만을 사용하므로 Encoder 부분은 제거하고 사용하였다." 41 | ], 42 | "metadata": { 43 | "id": "hMbeV9Y6mqNK" 44 | } 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "id": "XpRTodgTmhUR" 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "config = Config({\n", 55 | " 'n_dec_vocab': len(vocab),\n", 56 | " 'n_dec_seq': 256,\n", 57 | " 'n_layer': 6,\n", 58 | " 'd_hidn': 256,\n", 59 | " 'i_pad': 0,\n", 60 | " 'd_ff': 1024,\n", 61 | " 'n_head': 4,\n", 62 | " 'd_head': 64,\n", 63 | " 'dropout': 0.1,\n", 64 | " 'layer_norm_epsilon': 1e-12\n", 65 | "})\n", 66 | "print(config)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "source": [ 72 | "# 2. Decoder\n", 73 | "\n", 74 | "GPT는 Transformer의 Encoder는 사용하지 않고 Decoder만 사용하므로 Decoder에서 Encoder의 출력과 Attention을 하는 부분인 Encoder-Decoder-Multi-Head Attention 부분은 제거하고 사용하였다. 그 외에 나머지 부분은 Transformer와 동일하다." 
75 | ], 76 | "metadata": { 77 | "id": "0u1Nu0LroUzG" 78 | } 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "# Decoder Layer\n", 84 | "class DecoderLayer(nn.Module):\n", 85 | " def __init__(self, config):\n", 86 | " super().__init__()\n", 87 | " self.config = config\n", 88 | "\n", 89 | " self.self_attn = MultiHeadAttention(self.config)\n", 90 | " self.layer_norm1 = nn.LayerNorm(self.config.d_hidn, eps = self.config.layer_norm_epsilon)\n", 91 | " self.pos_ffn = PoswiseFeedForwardNet(self.config)\n", 92 | " self.layer_norm3 = nn.LayerNorm(self.config.d_hidn, eps = self.config.layer_norm_epsilon)\n", 93 | "\n", 94 | " def forward(self, dec_inputs, self_attn_mask):\n", 95 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_head, n_dec_seq, n_dec_seq)\n", 96 | " self_att_outputs, self_attn_prob = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)\n", 97 | " self_att_outputs = self.layer_norm1(dec_inputs + self_att_outputs)\n", 98 | " # (batch_size, n_dec_seq, d_hidn)\n", 99 | " ffn_outputs = self.po_ffn(self_att_outputs)\n", 100 | " ffn_outputs = self.layer_norm3(self_att_outputs + ffn_outputs)\n", 101 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_head, n_dec_seq, n_dec_seq), (batch_size, n_head, n_dec_seq, n_enc_seq)\n", 102 | " return ffn_outputs, self_attn_prob" 103 | ], 104 | "metadata": { 105 | "id": "ZbvA3ofcom9U" 106 | }, 107 | "execution_count": null, 108 | "outputs": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "source": [ 113 | "# Decoder\n", 114 | "class Decoder(nn.Module):\n", 115 | " def __init__(self, config):\n", 116 | " super().__init__()\n", 117 | " self.config = config\n", 118 | "\n", 119 | " self.dec_emb = nn.Embedding(self.config.n_dec_vocab, self.config.d_hidn)\n", 120 | " sinusoid_table = torch.FloatTensor(det_sinusoid_encoding_table(self.config.n_dec_seq + 1, self.config.d_hidn))\n", 121 | " self.pos_emb = nn.Embedding.from_pretrained(sinusoid_table, freeze = True)\n", 122 | "\n", 123 | " self.layers = nn.ModuleList([DecoderLayer(self.config) for _ in range(self.config.n_layer)])\n", 124 | "\n", 125 | " def forward(self, dec_inputs):\n", 126 | " positions = torch.arange(dec_inputs.size(1), device = dec_inputs.device, dtype = dec_inputs.dtype).expand(dec_inputs.size(0), dec_inputs.size(1)).contiguous() + 1\n", 127 | " pos_mask = dec_inputs.eq(self.config.i_pad)\n", 128 | " positions.masked_fill_(pos_mask, 0)\n", 129 | "\n", 130 | " # (batch_size, n_dec_seq, d_hidn)\n", 131 | " dec_outputs = self.dec_emb(dec_inputs) + self.pos_emb(positions)\n", 132 | "\n", 133 | " # (batch_size, n_dec_seq, n_dec_seq)\n", 134 | " dec_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, self.config.i_pad)\n", 135 | " # (batch_size, n_dec_seq, n_dec_seq)\n", 136 | " dec_attn_decoder_mask = get_attn_decoder_mask(dec_inputs)\n", 137 | " # (batch_size, n_dec_seq, n_dec_seq)\n", 138 | " dec_self_attn_mask = torch.gt((dec_attn_mask + dec_attn_decoder_mask), 0)\n", 139 | "\n", 140 | " self_attn_probs = []\n", 141 | " for layer in self.layers:\n", 142 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_dec_seq, n_dec_seq)\n", 143 | " dec_outputs, self_attn_prob = layer(dec_outputs, dec_self_attn_mask)\n", 144 | " self_attn_probs.append(self_attn_prob)\n", 145 | " # (batch_size, n_dec_seq, d_hidn), [(batch_size, n_dec_seq, n_dec_seq)]\n", 146 | " return dec_outputs, self_attn_probs" 147 | ], 148 | "metadata": { 149 | "id": "Z89dmNpSqq9K" 150 | }, 151 | "execution_count": null, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "markdown", 
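(The DecoderLayer and Decoder cells above call `get_sinusoid_encoding_table` (written `det_sinusoid_encoding_table` in the cell), `get_attn_pad_mask`, and `get_attn_decoder_mask`, none of which are defined in this notebook; they are assumed from the accompanying Transformer implementation. A minimal sketch of what these helpers conventionally compute, following the standard Transformer recipe rather than the referenced source:

import numpy as np
import torch

def get_sinusoid_encoding_table(n_seq, d_hidn):
    # sinusoidal position table: even dimensions use sin, odd dimensions use cos
    pos = np.arange(n_seq)[:, None]
    i = np.arange(d_hidn)[None, :]
    angle = pos / np.power(10000, 2 * (i // 2) / d_hidn)
    table = np.zeros((n_seq, d_hidn))
    table[:, 0::2] = np.sin(angle[:, 0::2])
    table[:, 1::2] = np.cos(angle[:, 1::2])
    return table

def get_attn_pad_mask(seq_q, seq_k, i_pad):
    # (batch_size, len_q, len_k) mask that is True where the key position is padding
    return seq_k.eq(i_pad).unsqueeze(1).expand(seq_q.size(0), seq_q.size(1), seq_k.size(1))

def get_attn_decoder_mask(seq):
    # causal (look-ahead) mask: True above the diagonal so future tokens are hidden
    n = seq.size(1)
    return torch.ones(seq.size(0), n, n, device=seq.device).triu(diagonal=1)

The Decoder's forward pass adds the padding mask and the causal mask and keeps any position where either is set, which is what the `torch.gt(..., 0)` line above expresses.)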
156 | "source": [ 157 | "# 3. GPT\n", 158 | "\n", 159 | "GPT는 단순히 Transformer Decoder를 실행\n", 160 | "Pre-traing 모델을 저장하기 위한 save, 저장된 모델을 읽기 위한 load 함수가 추가로 정의의" 161 | ], 162 | "metadata": { 163 | "id": "x59jUSdgF2FU" 164 | } 165 | }, 166 | { 167 | "cell_type": "code", 168 | "source": [ 169 | "class GPT(nn.Module):\n", 170 | " def __init__(self, config):\n", 171 | " super().__init__()\n", 172 | " self.config = config\n", 173 | "\n", 174 | " self.decoder = Decoder(self.config)\n", 175 | "\n", 176 | " def forward(self, dec_inputs):\n", 177 | " # (batch_size, n_seq, d_hidn), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n", 178 | " dec_outputs, dec_self_attn_probs = self.decoder(dec_inputs)\n", 179 | " # (batch_size, n_dec_seq, n_dec_vocab), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n", 180 | " return dec_outputs, dec_self_attn_probs\n", 181 | "\n", 182 | " def save(self, epoch, loss, path):\n", 183 | " torch.save({\n", 184 | " 'epoch': epoch, \n", 185 | " 'loss': loss, \n", 186 | " 'state_dict': self.state_dict()\n", 187 | " }, path)\n", 188 | "\n", 189 | " def load(self, path):\n", 190 | " save = torch.load(path)\n", 191 | " self.load_state_dict(save['state_dict'])\n", 192 | " return save['epoch'], save['loss']" 193 | ], 194 | "metadata": { 195 | "id": "uEbD9YQ8GEd-" 196 | }, 197 | "execution_count": null, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "source": [ 203 | "# 4. Pre-traing Model\n", 204 | "\n", 205 | "GPT를 pre-train 하기 위한 클래스. GPT pre-train 클래스의 목적은 입력 단어에 대한 다음 단어를 예측하는 것이다." 206 | ], 207 | "metadata": { 208 | "id": "T5X1B3EzHaQ6" 209 | } 210 | }, 211 | { 212 | "cell_type": "code", 213 | "source": [ 214 | "class GPTPretraing(nn.Module):\n", 215 | " def __init__(self, config):\n", 216 | " super().__init__()\n", 217 | " self.config = config\n", 218 | "\n", 219 | " self.gpt = GPT(self.config)\n", 220 | " # 단어를 예측하기 위한 projection_lm을 선언\n", 221 | " self.projection_lm = nn.Linear(self.config.d_hidn, self.config.n_dec_vocab, bias = False)\n", 222 | " # Decoder의 Embedding & weight를 공유\n", 223 | " self.projection_lm.weight = self.gpt.decoder.dec_emb.weight\n", 224 | "\n", 225 | " def forward(self, dec_inputs):\n", 226 | " # (batch_size, n_dec_seq, d_hidn), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n", 227 | " dec_outputs, dec_self_attn_probs = self.gpt(dec_inputs)\n", 228 | " # (batch_size, n_dec_seq, n_dec_vocab)\n", 229 | " # GPT 실행 결과를 입력으로 projection_lm을 실행해서 단어를 예측측\n", 230 | " logits_lm = self.projection_lm(dec_outputs)\n", 231 | " # (batch_size, n_dec_seq - 1, n_dec_vocab), (batch_size, n_output), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n", 232 | " # 결과의 마지막을 제외한 나머지를 리턴\n", 233 | " return logits_lm[:, :-1, :].contiguous(), dec_self_attn_probs" 234 | ], 235 | "metadata": { 236 | "id": "tdUtbgF7HuRi" 237 | }, 238 | "execution_count": null, 239 | "outputs": [] 240 | } 241 | ] 242 | } 243 | -------------------------------------------------------------------------------- /Natural Language Processing/GPT-1/README.md: -------------------------------------------------------------------------------- 1 | # GPT-1 Implementation 2 | 3 | https://paul-hyun.github.io/gpt-01/ 참고하여 작성됌 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading3-GPT-1-Improving-Language-Understanding-by-Generative-Pre-Training 6 | -------------------------------------------------------------------------------- /Natural Language Processing/RoBERTa/README.md: 
-------------------------------------------------------------------------------- 1 | # RoBERTa Implementation 2 | 3 | https://github.com/facebookresearch/fairseq/blob/main/fairseq/models/roberta/model.py 참고하여 작성됌 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/RoBERTa-A-Robustly-Optimized-BERT-Pretraining-Approach-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/Transformer-XL/README.md: -------------------------------------------------------------------------------- 1 | # Transformer-XL Implementation 2 | 3 | https://github.com/kimiyoung/transformer-xl/blob/master/tf/model.py 참고하여 작성 4 | 5 | paper review is here!! https://cartinoe5930.tistory.com/entry/Transformer-XL-Attentive-Language-Models-Beyond-a-Fixed-Length-Context-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/Transformer-XL/Transformer_XL_구현_실습.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOtb06YYh5iyXi4CRZAWjAR", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "70fsbBslZZ7I" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import tensorflow as tf\n", 38 | "\n", 39 | "def positional_embedding(pos_seq, inv_freq, bsz = None):\n", 40 | " sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)\n", 41 | " pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)\n", 42 | " if bsz is not None:\n", 43 | " return tf.tile(pos_emb[:, None, :], [1, bsz, 1])\n", 44 | " else:\n", 45 | " return pos_emb[:, None, :]\n", 46 | "\n", 47 | "def positionwise_FF(inp, d_model d_inner, dropout, kernel_initializer, scope = 'ff', is_training = True):\n", 48 | " output = inp\n", 49 | " with tf.variable_scope(scope):\n", 50 | " output = tf.layers.dense(inp, d_inner, activation = tf.nn.relu,\n", 51 | " kernel_initializer = kernel_initializer,\n", 52 | " name = 'layer_1')\n", 53 | " output = tf.layers.dropout(output, dropout, training = is_training, name = 'drop_1')\n", 54 | " output = tf.layers.dense(output, d_model, kernel_initializer = kernel_initializer,\n", 55 | " name = 'layer2')\n", 56 | " output = tf.layers.dropout(output, dropout, training = is_training, name = 'drop_2')\n", 57 | " output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis = -1)\n", 58 | " return output\n", 59 | "\n", 60 | "def rel_shift(x):\n", 61 | " x_size = tf.shape(x)\n", 62 | "\n", 63 | " x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])\n", 64 | " x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])\n", 65 | " x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])\n", 66 | " x = tf.reshape(x, x_size)\n", 67 | "\n", 68 | " return x\n", 69 | "\n", 70 | "def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,\n", 71 | " n_head, d_head, dropout, dropatt, is_training,\n", 72 | " kernel_initializer, scope = 'rel_attn'):\n", 73 | " scale = 1 / 
(d_head ** 0.5)\n", 74 | " with tf.variable_scope(scope):\n", 75 | " qlen = tf.shape(w)[0]\n", 76 | " rlen = tf.shape(r)[0]\n", 77 | " bsz = tf.shape(w)[1]\n", 78 | "\n", 79 | " cat = tf.concat([mems, w], 0) if mems is not None and mems.shape.ndims > 1 else w\n", 80 | " w_heads = tf.layers.dense(cat, 3 * n_head, d_head, use_bias = False, kernel_initializer = kernel_initializer,\n", 81 | " name = 'qkv')\n", 82 | " r_head_k = tf.layers.dense(r, n_head * d_head, use_bias = False, kernel_initializer = kernel_initializer,\n", 83 | " name = 'r')\n", 84 | " \n", 85 | " w_head_q, w_kead_k, w_head_v = tf.split(w_heads, 3, -1)\n", 86 | " w_head_q = w_head_q[-qlen:]\n", 87 | "\n", 88 | " klen = tf.shape(w_head_k)[0]\n", 89 | "\n", 90 | " w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])\n", 91 | " w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])\n", 92 | " w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])\n", 93 | "\n", 94 | " r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])\n", 95 | "\n", 96 | " rw_head_q = w_head_q + r_w_bias\n", 97 | " rr_head_q = w_head_q + r_r_bias\n", 98 | "\n", 99 | " AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)\n", 100 | " BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)\n", 101 | " BD = rel_shift(BD)\n", 102 | "\n", 103 | " attn_score = (AC + BD) * scale\n", 104 | " attn_mask_t = attn_mask[:, :, None, None]\n", 105 | " attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t\n", 106 | "\n", 107 | " attn_prob = tf.nn.softmax(attn_score, 1)\n", 108 | " attn_prob = tf.layers.dropout(attn_prob, dropatt, training = is_training)\n", 109 | "\n", 110 | " attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)\n", 111 | " size_t = tf.shape(attn_vec)\n", 112 | " attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])\n", 113 | "\n", 114 | " attn_out = tf.layers.dense(attn_vec, d_model, use_bias = False,\n", 115 | " kernel_initializer = kernel_initializer, name ='o')\n", 116 | " attn_out = tf.layers.dropout(attn_out, dropout, training = is_training)\n", 117 | "\n", 118 | " output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis = -1)\n", 119 | "\n", 120 | " return output\n", 121 | "\n", 122 | "def embedding_lookup(lookup_table, x, use_tpu = True):\n", 123 | " if use_tpu:\n", 124 | " n_token = tf.shape(lookup_table)[0]\n", 125 | " one_hot_idx = tf.one_hot(x, n_token)\n", 126 | " if one_hot_idx.shape.ndims == 2:\n", 127 | " return tf.einsum('nd,in->id', lookup_table, one_hot_idx)\n", 128 | " else:\n", 129 | " return tf.einsum('nb,ibn->ibd', lookup_table, one_hot_idx)\n", 130 | " else:\n", 131 | " return tf.nn.embedding_lookup(lookup_table, x)\n", 132 | "\n", 133 | "def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,\n", 134 | " proj_initializer, div_val = 1,\n", 135 | " proj_same_dim = True,\n", 136 | " scope = 'adaptive_embed', **kwargs):\n", 137 | " emb_scale = d_proj ** 0.5\n", 138 | " with tf.variable_scope(scope):\n", 139 | " if div_val == 1:\n", 140 | " lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], initializer = initializer)\n", 141 | " y = embedding_lookup(lookup_table, x, use_tpu = False)\n", 142 | " if d_proj != d_embed:\n", 143 | " proj_W = tf.get_variable('proj_W', [d_embed, d_proj], initializer = proj_initializer)\n", 144 | " y = tf.einsum('ibe,ed->ibd', y, proj_w)\n", 145 | " else:\n", 146 | " proj_w = None\n", 147 | " ret_params = [lookup_table, proj_W]\n", 148 | " else:\n", 149 | " tables, projs = [], []\n", 150 | " 
curoff_ends = [0] + cutoffs + [n_token]\n", 151 | " x_size = tf.shape(x)\n", 152 | " y = tf.zeros([x_size[0], x_size[1], d_proj])\n", 153 | " for i in range(len(cutoff_ends) - 1):\n", 154 | " with tf.variable_scope('cutoff_{}'.format(i)):\n", 155 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n", 156 | " mask = (x >= l_idx) & (x < r_idx)\n", 157 | " cur_x = tf.boolean_mask(x, mask) - l_idx\n", 158 | " cur_d_embed = d_embed // (div_val ** i)\n", 159 | " lookup_table = tf.get_variable('lookup_table', [r_idx - l_idx, cur_d_embed].\n", 160 | " initializer = initializer)\n", 161 | " cur_y = embedding_lookup(lookup_table, cur_x, use_tpu = False)\n", 162 | " if d_proj == cur_d_embed and not proj_same_dim:\n", 163 | " proj_W = None\n", 164 | " else:\n", 165 | " proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],\n", 166 | " initializer = proj_initializer)\n", 167 | " cur_y = tf.einsum('id,de->ie', cur_y, proj_W)\n", 168 | " mask_idx = tf.to_int64(tf.where(mask))\n", 169 | " y += tf.scatter_nd(mask_idx, cur_y, tf.to_int64(tf.shape(y)))\n", 170 | " tables.append(lookup_table)\n", 171 | " projs.append(proj_W)\n", 172 | " ret_params = [tables, projs]\n", 173 | " \n", 174 | " y *= emb_scale\n", 175 | " return y, ret_params\n", 176 | "\n", 177 | "def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,\n", 178 | " proj_initializer, div_val = 1, perms = None,\n", 179 | " proj_same_dim = True, scope = 'adaptive_embed'):\n", 180 | " #만약 perm이 None이라면\n", 181 | " #W = W1 X W2와 같이 각각 projection되고, 그 다음에 X x W (embedding lookup)을 계산\n", 182 | " #None이 아니라면\n", 183 | " #bin-based embedding lookup을 사용\n", 184 | "\n", 185 | " emb_scale = d_proj ** 0.5\n", 186 | " with tf.variable_scope(scope):\n", 187 | " if div_val == 1:\n", 188 | " lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], initializer = initializer)\n", 189 | " y = embedding_lookup(lookup_table, x)\n", 190 | " if d_proj != d_embed:\n", 191 | " proj_W = tf.get_variable('proj_W', [d_embed, d_proj], initializer = proj_initializer)\n", 192 | " y = tf.einsum('ibe,ed->ibd', y, proj_W)\n", 193 | " else:\n", 194 | " proj_W = None\n", 195 | " ret_params = [lookup_table, proj_W]\n", 196 | " else:\n", 197 | " tables, projs = [], []\n", 198 | " cutoff_ends = [0] + cutoffs + [n_token]\n", 199 | " x_size = tf.shape(x)\n", 200 | " if perms is None:\n", 201 | " cat_lookup = []\n", 202 | " else:\n", 203 | " cat_lookup = tf.zeros([x_size[0], x_size[1], d_proj])\n", 204 | " for i in range(len(cutoff_ends) - 1):\n", 205 | " with tf.variable_scope('cutoff_{}'.format(i)):\n", 206 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n", 207 | " cur_d_embed = d_embed // (div_val ** i)\n", 208 | " lookup_table = tf.get_variable('lookup_table',\n", 209 | " [r_idx - l_idx, cur_d_embed],\n", 210 | " initializer = initializer)\n", 211 | " if cur_d_embed == d_proj and not proj_same_dim:\n", 212 | " proj_W = None\n", 213 | " else:\n", 214 | " proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],\n", 215 | " initializer = proj_initializer)\n", 216 | " if perms is None:\n", 217 | " cat_lookup.append(tf.einsum('ie,ed->id', lookup_table, proj_W))\n", 218 | " else:\n", 219 | " if i == 0:\n", 220 | " cur_y = embedding_lookup(lookup_table, tf.minimum(x, r_idx - 1))\n", 221 | " if proj_W is not None:\n", 222 | " cur_y = tf.einsum('ibe,ed->ibd', cur_y, proj_W)\n", 223 | " cur_y *= perms[i][:, :, None]\n", 224 | " cat_lookup += cur_y\n", 225 | " else:\n", 226 | " cur_x = tf.einsum('ib,ibk->k', tf.to_float(x - l_idx), perms[i])\n", 227 
| " cur_x = tf.to_int32(cur_x)\n", 228 | " cur_y = embedding_lookup(lookup_table, cur_x)\n", 229 | " if proj_W is not None:\n", 230 | " cur_y = tf.einsum('ke,ed->kd', cur_y, proj_W)\n", 231 | " cat_lookup += tf.einsum('kd,idk->ibd', cur_y, perms[i])\n", 232 | " tables.append(lookup_table)\n", 233 | " projs.append(proj_W)\n", 234 | " if perms is None:\n", 235 | " cat_lookup = tf.concat(cat_lookup, 0)\n", 236 | " y = embedding_lookup(cat_lookup, x)\n", 237 | " else:\n", 238 | " y = cat_lookup\n", 239 | " ret_params = [tables, projs]\n", 240 | " \n", 241 | " y *= emb_scale\n", 242 | " return y, ret_params\n", 243 | "\n", 244 | "def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs, params,\n", 245 | " tie_projs, initializer = None, proj_initializer = None,\n", 246 | " div_val = 1, scope = 'adaptive_softmax', proj_same_dim = True,\n", 247 | " return_mean = True, **kwargs):\n", 248 | " def _logit(x, W, b, proj):\n", 249 | " y = x\n", 250 | " if proj is not None:\n", 251 | " y = tf.einsum('ibd,ed->ibe', y, proj)\n", 252 | " return tf.einsum('ibd, nd->ibn', y, W) + b\n", 253 | "\n", 254 | " params_W, params_projs = params[0], params[1]\n", 255 | "\n", 256 | " def _gather_logprob(logprob, target):\n", 257 | " lp_size = tf.shape(logprob)\n", 258 | " r = tf.range(lp_size[0])\n", 259 | " idx = tf.stack([r, target], 1)\n", 260 | " return tf.gather_nd(logprob, idx)\n", 261 | "\n", 262 | " with tf.variable_scope(scope):\n", 263 | " if len(cutoffs) == 0:\n", 264 | " softmax_b = tf.get_variable('bias', [n_token],\n", 265 | " initializer = tf.zeros_initializer())\n", 266 | " output = _logit(hidden, prams_W, softmax_b, params_projs)\n", 267 | " nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = target, logits = output)\n", 268 | " else:\n", 269 | " cutoff_ends = [0] + cutoffs + [n_token]\n", 270 | " nll = tf.zeros_like(target, dtype = tf.float32)\n", 271 | " for i in range(len(cutoff_ends) - 1):\n", 272 | " with tf.variable_scope('cutoff_{}'.format(i)):\n", 273 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n", 274 | " mask = (target >= l_idx) & (target < r_idx)\n", 275 | " mask_idx = tf.where(mask)\n", 276 | " cur_target = tf.boolean_mask(target, mask) - l_idx\n", 277 | " cur_d_embed = d_embed // (div_val ** i)\n", 278 | "\n", 279 | " if div_val == 1:\n", 280 | " cur_W = params_W[l_idx: r_idx]\n", 281 | " else:\n", 282 | " cur_W = params_W[i]\n", 283 | " cur_b = tf.get_variable('b', [r_idx - l_idx], initializer = tf.zeros_initializer())\n", 284 | " if tie_projs[i]:\n", 285 | " if div_val == 1:\n", 286 | " cur_proj = params_projs\n", 287 | " else:\n", 288 | " cur_proj = params_projs[i]\n", 289 | " else:\n", 290 | " if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:\n", 291 | " cur_proj = None\n", 292 | " else:\n", 293 | " cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],\n", 294 | " initializer = proj_initializer)\n", 295 | " if i == 0:\n", 296 | " cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],\n", 297 | " initializer = tf.zeros_initializer())\n", 298 | " cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],\n", 299 | " initializer = tf.zeros_initializer())\n", 300 | " cur_W = tf.concat([cur_W, cluster_W], 0)\n", 301 | " cur_b = tf.concat([cur_b, cluster_b], 0)\n", 302 | "\n", 303 | " head_logit = _logit(hidden, cur_W, cur_b, cur_proj)\n", 304 | " head_logprob = tf.nn.log_softmax(head_logit)\n", 305 | " cur_head_logprob = tf.boolean_mask(head_logprob, mask)\n", 306 | " cur_logprob = _gather_logprob(cur_head_logprob, 
cur_target)\n", 307 | " else:\n", 308 | " cur_head_logprob = tf.boolean_mask(head_logprob, mask)\n", 309 | " cur_hidden = tf.boolean_mask(hidden_mask)\n", 310 | " tail_logit = tf.squeeze(_logit(cur_hidden[None], cur_W, cur_b, cur_proj), 0)\n", 311 | " tail_logprob = tf.nn.log_softmax(tail_logit)\n", 312 | " cur_logprob = (cur_head_logprob[:, cutoff_ends[1]+i-1] + _gather_logprob(tail_logprob, cur_target))\n", 313 | " nll += tf.scatter_nd(mask_idx, -cur_logprob, tf.to_int64(tf.shape(nll)))\n", 314 | "\n", 315 | " if return_mean:\n", 316 | " nll = tf.reduce_mean(nll)\n", 317 | " return nll\n", 318 | "\n", 319 | "def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,\n", 320 | " params, tie_projs,\n", 321 | " initializer=None, proj_initializer=None,\n", 322 | " div_val=1, perms=None, proj_same_dim=True,\n", 323 | " scope='adaptive_softmax',\n", 324 | " **kwargs):\n", 325 | " def _logit(x, W, b, proj):\n", 326 | " y = x\n", 327 | " if x.shape.ndims == 3:\n", 328 | " if proj is not None:\n", 329 | " y = tf.einsum('ibd,ed->ibe', y, proj)\n", 330 | " return tf.einsum('ibd,nd->ibn', y, W) + b\n", 331 | " else:\n", 332 | " if proj is not None:\n", 333 | " y = tf.einsum('id,ed->ie', y, proj)\n", 334 | " return tf.einsum('id,nd->in', y, W) + b\n", 335 | "\n", 336 | " params_W, params_projs = params[0], params[1]\n", 337 | "\n", 338 | " with tf.variable_scope(scope):\n", 339 | " if len(cutoffs) == 0:\n", 340 | " softmax_b = tf.get_variable('bias', [n_token], initializer = tf.zeros_initializer())\n", 341 | " output = _logit(hidden, params_W, softmax_b, params_projs)\n", 342 | " nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = target, logits = output)\n", 343 | " nll = tf.reduce_mean(nll)\n", 344 | " else:\n", 345 | " total_loss, total_cnt = 0, 0\n", 346 | " cutoff_ends = [0] + cutoffs + [n_token]\n", 347 | " for i in range(len(cutoff_ends) - 1):\n", 348 | " with tf.variable_scope('cutoff_{}'.format(i)):\n", 349 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n", 350 | "\n", 351 | " cur_d_embed = d_embed // (div_val ** i)\n", 352 | "\n", 353 | " if div_val == 1:\n", 354 | " cur_W = params_W[l_idx: r_idx]\n", 355 | " else:\n", 356 | " cur_W = params_W[i]\n", 357 | " cur_b = tf.get_variable('b', [r_idx - l_idx], initializer = tf.zeros_initializer())\n", 358 | "\n", 359 | " if tie_projs[i]:\n", 360 | " if div_val == 1:\n", 361 | " cur_proj = params_projs\n", 362 | " else:\n", 363 | " cur_proj = params_projs[i]\n", 364 | " else:\n", 365 | " if (div_val == 1 of not proj_same_dim) and d_proj == cur_d_embed:\n", 366 | " cur_proj = None\n", 367 | " else:\n", 368 | " cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj], initializer = tf.zeros_initializer())\n", 369 | "\n", 370 | " if i == 0:\n", 371 | " cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],\n", 372 | " initializer = tf.zeros_initializer())\n", 373 | " cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],\n", 374 | " initializer = tf.zeros_initializer())\n", 375 | " cur_W = tf.concat([cur_W, cluster_W], 0)\n", 376 | " cur_b = tf.concat([cur_b, cluster_b], 0)\n", 377 | "\n", 378 | " head_logit = _logit(hidden, cur_W, cur_b, cur_proj)\n", 379 | "\n", 380 | " head_target = kwargs.get('head_target')\n", 381 | " head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(\n", 382 | " labels = head_target,\n", 383 | " logits = head_logit\n", 384 | " )\n", 385 | "\n", 386 | " masked_loss = head_nll * perms[i]\n", 387 | " total_loss += tf.reduce_sum(masked_loss)\n", 388 | " total_cnt += 
tf.reduce_sum(perms[i])\n", 389 | "\n", 390 | " else:\n", 391 | " cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])\n", 392 | "\n", 393 | " cur_hidden_tf.einsum('ibd,ibk->kd', hidden, perms[i])\n", 394 | " tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)\n", 395 | "\n", 396 | " tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx), perms[i])\n", 397 | " tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(\n", 398 | " labels = tf.to_int43(tail_target), logits = tail_logit\n", 399 | " )\n", 400 | "\n", 401 | " sum_nll = cur_head_nll + tail_nll\n", 402 | " mask = tf.reduce_sum(perms[i], [0, 1])\n", 403 | "\n", 404 | " masked_loss = sum_nll * mask\n", 405 | " total_loss += tf.reduce_sum(masked_loss)\n", 406 | " total_cnt += tf.reduce_sum(mask)\n", 407 | "\n", 408 | " nll = total_loss / total_cnt\n", 409 | "\n", 410 | " return nll\n", 411 | "\n", 412 | "def _create_mask(qlen, mlen, same_length = False):\n", 413 | " attn_mask = tf.ones([qlen, qlen])\n", 414 | " mask_u = tf.matrix_band_part(attn_mask, 0, -1)\n", 415 | " mask_dia = tf.matrix_band_part(attn_mask, 0, 0)\n", 416 | " attn_mask_pad = tf.zeros([qlen, mlen])\n", 417 | " ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)\n", 418 | " if same_length:\n", 419 | " mask_l = tf.matrix_band_part(attn_mask, -1, 0)\n", 420 | " ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)\n", 421 | " return ret\n", 422 | "\n", 423 | "def _cache_mem(curr_out, prev_mem, mem_len = None):\n", 424 | " if mem_len is None or prev_mem is None:\n", 425 | " new_mem = curr_out\n", 426 | " elif mem_len == 0:\n", 427 | " return prev_mem\n", 428 | " else:\n", 429 | " new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:]\n", 430 | "\n", 431 | " return tf.stop_gradient(new_mem)\n", 432 | "\n", 433 | "def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,\n", 434 | " n_head, d_head, d_inner, dropout, dropatt,\n", 435 | " initializer, is_training, proj_initializer=None,\n", 436 | " mem_len=None, cutoffs=[], div_val=1, tie_projs=[],\n", 437 | " same_length=False, clamp_len=-1, use_tpu=True,\n", 438 | " input_perms=None, target_perms=None, head_target=None,\n", 439 | " untie_r=False, proj_same_dim=True,\n", 440 | " scope='transformer'):\n", 441 | " new_mems = []\n", 442 | " with tf.variable_scope(scope):\n", 443 | " if untie_r:\n", 444 | " r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],\n", 445 | " initializer = initializer)\n", 446 | " r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],\n", 447 | " initializer = initializer)\n", 448 | " else:\n", 449 | " r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],\n", 450 | " initializer = initializer)\n", 451 | " r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],\n", 452 | " initializer = initializer)\n", 453 | " \n", 454 | " qlen = tf.shape(dec_inp)[0]\n", 455 | " mlen = tf.shape(mems[0])[0] is mems is not None else 0\n", 456 | " klen = mlen + qlen\n", 457 | "\n", 458 | " if proj_initializer is None:\n", 459 | " proj_initializer = initializer\n", 460 | " lookup_fn = (mul_adaptive_embedding_lookup is use_tpu else\n", 461 | " mask_adaptive_embedding_lookup)\n", 462 | " embeddings, shared_params = lookup_fn(\n", 463 | " x=dec_inp,\n", 464 | " n_token=n_token,\n", 465 | " d_embed=d_embed,\n", 466 | " d_proj=d_model,\n", 467 | " cutoffs=cutoffs,\n", 468 | " initializer=initializer,\n", 469 | " proj_initializer=proj_initializer,\n", 470 | " div_val= div_val,\n", 471 | " perms=input_perms,\n", 472 | " 
proj_same_dim=proj_same_dim)\n", 473 | " \n", 474 | " attn_mask = _create_mask(qlen, mlen, same_length)\n", 475 | "\n", 476 | " pos_seq = tf.range(klen - 1, -1, -1.0)\n", 477 | " if clasm_len > 0:\n", 478 | " pos_seq = tf.minimum(pos_seq, clamp_len)\n", 479 | " inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))\n", 480 | " pos_emb = positional_embedding(pos_seq, inv_freq)\n", 481 | "\n", 482 | " output = tf.layers.dropout(embeddings, dropot, training = is_training)\n", 483 | " pos_emb = tf.layers.dropout(pos_emb, dropout, training = is_training)\n", 484 | "\n", 485 | " if mems is None:\n", 486 | " mems = [None] * n_layer\n", 487 | "\n", 488 | " for i in range(n_layer):\n", 489 | " new_mems.append(_cache_mem(output, mems[i], mem_len))\n", 490 | "\n", 491 | " with tf.variable_scope('layer_{}'.format(i)):\n", 492 | " output = rel_multihead_attn(\n", 493 | " w=output,\n", 494 | " r=pos_emb,\n", 495 | " r_w_bias=r_w_bias if not untie_r else r_w_bias[i],\n", 496 | " r_r_bias=r_r_bias if not untie_r else r_r_bias[i],\n", 497 | " attn_mask=attn_mask,\n", 498 | " mems=mems[i],\n", 499 | " d_model=d_model,\n", 500 | " n_head=n_head,\n", 501 | " d_head=d_head,\n", 502 | " dropout=dropout,\n", 503 | " dropatt=dropatt,\n", 504 | " is_training=is_training,\n", 505 | " kernel_initializer=initializer\n", 506 | " )\n", 507 | " output = positionwise_FF(\n", 508 | " inp=output,\n", 509 | " d_model=d_model,\n", 510 | " d_inner=d_inner,\n", 511 | " dropout=dropout,\n", 512 | " kernel_initializer=initializer,\n", 513 | " is_training=is_training\n", 514 | " )\n", 515 | "\n", 516 | " output = tf.layers.dropout(output, dropout, training = is_training)\n", 517 | "\n", 518 | " logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else\n", 519 | " mask_adaptive_logsoftmax)\n", 520 | " loss = logsoftmax_fn(\n", 521 | " hidden=output,\n", 522 | " target=target,\n", 523 | " n_token=n_token,\n", 524 | " d_embed=d_embed,\n", 525 | " d_proj=d_model,\n", 526 | " cutoffs=cutoffs,\n", 527 | " params=shared_params,\n", 528 | " tie_projs=tie_projs,\n", 529 | " initializer=initializer,\n", 530 | " proj_initializer=proj_initializer,\n", 531 | " div_val=div_val,\n", 532 | " perms=target_perms,\n", 533 | " head_target=head_target,\n", 534 | " proj_same_dim=proj_same_dim\n", 535 | " )\n", 536 | "\n", 537 | " return loss, new_mems" 538 | ] 539 | } 540 | ] 541 | } 542 | -------------------------------------------------------------------------------- /Natural Language Processing/Transformer/README.md: -------------------------------------------------------------------------------- 1 | # Transformer Implementation 2 | 3 | https://github.com/tunz/transformer-pytorch/blob/e7266679f0b32fd99135ea617213f986ceede056/model/transformer.py#L201 참고하여 작성 4 | 5 | Transformer paper review: https://cartinoe5930.tistory.com/entry/Transformer-Attention-Is-All-You-Need-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/Transformer/Transformer_구현_실습.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyPEjZ5/XN13lrmM3kUVgIFW", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 
22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "nYoZgseydKyf" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import math\n", 38 | "\n", 39 | "import torch\n", 40 | "import torch.nn as nn\n", 41 | "import torch.nn.function as F\n", 42 | "\n", 43 | "from utils import utils\n", 44 | "\n", 45 | "def initialize_weight(x):\n", 46 | " nn.init.xavier_uniform_(x.weight)\n", 47 | " if x.bias is not None:\n", 48 | " nn.init.constant_(x.bias, 0)\n", 49 | "\n", 50 | "class FeedForwardNetwork(nn.Module):\n", 51 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n", 52 | " super(FeedForwardNetwork, self).__init__()\n", 53 | "\n", 54 | " self.layer1 = nn.Linear(hidden_size, filter_size)\n", 55 | " self.relu = nn.ReLU()\n", 56 | " self.dropout = nn.Dropout(dropout_rate)\n", 57 | " self.layer2 = nn.Linear(filter_size, hidden_size)\n", 58 | "\n", 59 | " initialize_weight(self.layer1)\n", 60 | " initialize_weight(self.layer2)\n", 61 | "\n", 62 | " def forward(self, x):\n", 63 | " x = self.layer1(x)\n", 64 | " x = self.relu(x)\n", 65 | " x = self.dropout(x)\n", 66 | " x = self.layer2(x)\n", 67 | " return x\n", 68 | "\n", 69 | "class MultiHeadAttention(nn.Moculde):\n", 70 | " def __init__(self, hidden_size, dropout_rate, head_size = 8):\n", 71 | " super(MultiHeadAttention, self).__init__()\n", 72 | "\n", 73 | " self.head_size = head_size\n", 74 | "\n", 75 | " self.att_size = att_size = hidden_size // head_size\n", 76 | " self.scale = arr_size ** -0.5\n", 77 | "\n", 78 | " self.linear_q = nn.Linear(hidden_size, head_size * att_size, bias = False)\n", 79 | " self.linear_k = nn.Linear(hidden_size, head_size * att_size, bias = False)\n", 80 | " self.linear_v = nn.Linear(hidden_size, head_size * att_size, bias = False)\n", 81 | " initialize_weight(self.linear_q)\n", 82 | " initialize_weight(self.linear_k)\n", 83 | " initialize_weight(self.linear_v)\n", 84 | "\n", 85 | " self.att_dropout = nn.Dropout(dropout_rate)\n", 86 | " \n", 87 | " self.output_layer = nn.Linear(head_size * att_size, hidden_size, bias = False)\n", 88 | " initialize_weight(self.output_layer)\n", 89 | "\n", 90 | " def forward(self, q, k, v, mask, cache = None):\n", 91 | " orig_q_size = q.size()\n", 92 | "\n", 93 | " d_k = self.att_size\n", 94 | " d_v = self.att_size\n", 95 | " batch_size = q.size(0)\n", 96 | "\n", 97 | " #head_i = Attention(Q(W^Q)_i, K(W^K)_i, V(W^V)_i)\n", 98 | " q = self.linear_q(q).view(batch_size, -1, self.head_size, d_k)\n", 99 | " if cache is not None and 'endec_k' in cache:\n", 100 | " k, v = cache['endec_k'], cahce['endec_v']\n", 101 | " else:\n", 102 | " k = self.linear_k(k).view(bacth_size, -1, self.head_size, d_k)\n", 103 | " v = self.linear_v(v).view(batch_size, -1, self.head_size, d_v)\n", 104 | "\n", 105 | " if cache is not None:\n", 106 | " cache['endec_k'], cache['endec_v'] = k, v\n", 107 | "\n", 108 | " q = q.transpose(1, 2) # [b, h, q_len, d_k]\n", 109 | " v = v.transpose(1, 2) # [b, h, v_len, d_v]\n", 110 | " k = k.transpose(1, 2).transpose(2, 3) # [b, h, d_k, k_len]\n", 111 | "\n", 112 | " #Scaled Dot-Product Attention\n", 113 | " #Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V\n", 114 | " q.mul_(self.scale)\n", 115 | " x = torch.matmul(q, k) # [b, h, q_len, k_len]\n", 116 | " x.masked_fill_(mask.unsqueeze(1), -1e9)\n", 117 | " x = torch.softmax(x, dim = 3)\n", 118 | " x = self.att_dropout(x)\n", 119 | " x = x.matmul(v) # [b, h, q_len, 
attn]\n", 120 | "\n", 121 | " x = x.transpose(1, 2).contiguous() # [b, q_len, h, attn]\n", 122 | " x = x.view(batch_size, -1, self.head_size * d_v)\n", 123 | "\n", 124 | " x = self.output_layer(x)\n", 125 | "\n", 126 | " assert x.size() == orig_q_size\n", 127 | " return x\n", 128 | "\n", 129 | "class EncoderLayer(nn.Module):\n", 130 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n", 131 | " super(EncoderLayer, self).__init__()\n", 132 | "\n", 133 | " self.self_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 134 | " self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)\n", 135 | " self.self_attention_dropout = nn.Dropout(dropout_rate)\n", 136 | "\n", 137 | " self.enc_dec_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 138 | " self.enc_dec_attention = MultiHeadAttention(hidden_size, dropout_rate)\n", 139 | " self.enc_dec_attention_dropout = nn.Dropout(dropout_rate)\n", 140 | "\n", 141 | " self.ffn_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 142 | " self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)\n", 143 | " self.ffn_dropout = nn.Dropout(dropout_rate)\n", 144 | "\n", 145 | " def forward(self, x, enc_output, self_mask, i_mask, cache):\n", 146 | " y = self.self_attention_norm(x)\n", 147 | " y = self.self_attention(y, y, y, self_mask) #(q, k, v, mask)\n", 148 | " y = self.self_attention_dropout(y)\n", 149 | " x = x + y #skip connection\n", 150 | "\n", 151 | " y = self.ffn_norm(x)\n", 152 | " y = ffn(y)\n", 153 | " y = self.ffn_dropout(y)\n", 154 | " x = x + y #skip connection\n", 155 | " return x\n", 156 | "\n", 157 | "class DecoderLayer(nn.Module):\n", 158 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n", 159 | " super(DecoderLayer, self).__init__()\n", 160 | "\n", 161 | " self.self_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 162 | " self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)\n", 163 | " self.self_attention_dropout = nn.Dropout(dropout_rate)\n", 164 | "\n", 165 | " self.enc_dec_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 166 | " self.enc_dec_attention = MultiHeadAttention(hidden_size, dropout_rate)\n", 167 | " self.enc_dec_attention_dropout = nn.Dropout(dropout_rate)\n", 168 | "\n", 169 | " self.ffn_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 170 | " self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)\n", 171 | " self.ffn_dropout = nn.Dropout(dropout_rate)\n", 172 | "\n", 173 | " def forward(self, x, enc_output, self_mask, i_mask, cache):\n", 174 | " y = self.self_attention_norm(x)\n", 175 | " y = self.self_attention(y, y, y, self_mask)\n", 176 | " y = self.self_attention_dropout(y)\n", 177 | " x = x + y\n", 178 | "\n", 179 | " if enc_output is not None:\n", 180 | " y = self.self_attention_norm(x)\n", 181 | " y = self.self_attention(y, enc_output, enc_output, i_mask, cache)\n", 182 | " y = self.enc_dec_attention_dropout(y)\n", 183 | " x = x + y\n", 184 | "\n", 185 | " y = self.ffn_norm(x)\n", 186 | " y = self.ffn(y)\n", 187 | " y = self.ffn_dropout(y)\n", 188 | " x = x + y\n", 189 | " return x\n", 190 | "\n", 191 | "class Encoder(nn.Module):\n", 192 | " def __init__(self, hidden_size, filter_size, dropout_rate, n_layers):\n", 193 | " super(Encoder, self).__init__()\n", 194 | "\n", 195 | " encoders = [EncoderLayer(hidden_size, filter_size, dropout_rate) for _ in range(n_layers)]\n", 196 | " self.layers = nn.ModuleList(encoders)\n", 197 | "\n", 198 | " self.last_norm = nn.LayerNorm(gidden_size, eps = 1e-6)\n", 199 | "\n", 
200 | " def forward(self, inputs, mask):\n", 201 | " encoder_output = inputs\n", 202 | " for enc_layer in self.layers:\n", 203 | " encoder_output = enc_layer(encoder_output, mask)\n", 204 | " return self.last_norm(encoder_output)\n", 205 | "\n", 206 | "class Decoder(nn.Module):\n", 207 | " def __init__(self, hidden_size, filter_size, dropout_rate, n_layers):\n", 208 | " super(Decoder, self).__init__()\n", 209 | "\n", 210 | " decoders = [DecoderLayer(hidden_size, filter_size, dropout_rate) for _ in range(n_layers)]\n", 211 | " self.layers = nn.ModuleList(decoders)\n", 212 | "\n", 213 | " self.last_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 214 | "\n", 215 | " def forward(self, targets, enc_output, i_mask, t_self_mask, cache):\n", 216 | " decoder_output = targets\n", 217 | " for i, dec_layer in enumerate(self.layers):\n", 218 | " layer_cache = None\n", 219 | " if cache is not None:\n", 220 | " if i not in cache:\n", 221 | " cache[i] = {}\n", 222 | " layer_cache = cache[i]\n", 223 | " decoder_output = dec_layer(decoder_output, enc_output, t_self_mask, i_mask, layer_cache)\n", 224 | "\n", 225 | " return self.last_norm(decoder_output)\n", 226 | "\n", 227 | "class Transformer(nn.Module):\n", 228 | " def __init__(self, i_vocab_size, t_vocab_size, n_layers = 6, hidden_size = 512, \n", 229 | " filter_size = 2048, dropout_rate = 0.1, share_target_embedding = True,\n", 230 | " has_inputs = True, src_pad_idx = None, trg_pad_idx = None):\n", 231 | " super(Transformer, self).__init__()\n", 232 | "\n", 233 | " self.hidden_size = hidden_size\n", 234 | " self.emb_scale = hidden_size ** 0.5\n", 235 | " self.has_inputs = has_inputs\n", 236 | " self.src_pad_idx = src_pad_idx\n", 237 | " self.trg_pad_idx = trg_pad_idx\n", 238 | "\n", 239 | " self.t_vocab_embedding = nn.Embedding(t_vocab_size, hidden_size)\n", 240 | " nn.init.normal_(self.t_vocab_embedding.weight, mead = 0, std = hidden_size ** -0.5)\n", 241 | " self.t_emb_dropout = nn.Dropout(dropout_rate)\n", 242 | " self.decoder = Decoder(hidden_size, filter_size, dropout_rate, n_layers)\n", 243 | "\n", 244 | " if has_inputs:\n", 245 | " if not share_target_embedding:\n", 246 | " self.i_vocab_embedding = nn.Embedding(i_vocab_size, hidden_size)\n", 247 | " nn.init.normal_(self.i_vocab_embedding.weight, mean = 0, std = hidden_size ** -0.5)\n", 248 | " else:\n", 249 | " self.i_vocab_embedding = self.t_vocab_embedding\n", 250 | "\n", 251 | " self.i_emb_dropout = nn.Dropout(dropout_rate)\n", 252 | "\n", 253 | " self.encoder = Encoder(hidden_size, filter_size, dropout_rate, n_layers)\n", 254 | "\n", 255 | " #Positional Encoding\n", 256 | " num_timescales = self.hidden_size // 2\n", 257 | " max_timescale = 10000.0\n", 258 | " min_timescale = 1.0\n", 259 | " log_timescale_increment = (\n", 260 | " math.log(floast(max_timescale) / float(min_timescale)) / \n", 261 | " max(num_timescale - 1, 1))\n", 262 | " inv_timescales = min_timescale * torch.exp(\n", 263 | " torch.arange(num_timescales, dtype = torch.float32) * \n", 264 | " -log_timescale_increment)\n", 265 | " self.register_buffer('inv_timescales', inv_timescales)\n", 266 | "\n", 267 | " def forward(self, inputs, targets):\n", 268 | " enc_output, i_mask = None, None\n", 269 | " if self.has_inputs:\n", 270 | " i_mask = utils.create_pad_mask(inputs, self.src_pad_idx)\n", 271 | " enc_output = self.encode(inputs, i_mask)\n", 272 | "\n", 273 | " t_mask = utils.create_pad_mask(targets, self.trg_pad_idx)\n", 274 | " target_size = targets.size()[1]\n", 275 | " t_self_mask = utils.create_trg_self_mask(target_size, 
device = targets.device)\n", 276 | "\n", 277 | " return self.decode(targets, enc_output, i_mask, t_self_mask, t_mask)\n", 278 | "\n", 279 | " def encode(self, inputs, i_mask):\n", 280 | " #Input Embedding\n", 281 | " input_embedded = self.i_vocab_embedding(inputs)\n", 282 | " input_embedded.masked_fill_(i_mask.squeeze(1).unaqueeze(-1), 0)\n", 283 | " input_embedded *= self.emb_scale\n", 284 | " input_embedded += self.get_position_encoding(inputs)\n", 285 | " input_embedded = self.i_emb_dropout(input_embedded)\n", 286 | "\n", 287 | " return self.encoder(input_embedded, i_mask)\n", 288 | "\n", 289 | " def decoder(self, targets, enc_output, i_mask, t_self_mask, t_mask, cache = None):\n", 290 | " #target embedding\n", 291 | " target_embedded = self.t_vocab_embedding(targets)\n", 292 | " target_embedded.masked_fill(t_mask.squeeze(1).unsqueeze(-1), 0)\n", 293 | "\n", 294 | " #Shfting\n", 295 | " target_embedded = target_embedded[:, :-1]\n", 296 | " target_embedded = F.pad(target_embedded, (0, 0, 1, 0))\n", 297 | "\n", 298 | " target_embedded *= self.emb_scale\n", 299 | " target_embedded += self.get_position_encoding(targets)\n", 300 | " target_embedded = self.t_emb_dropout(target_embedded)\n", 301 | "\n", 302 | " #decoder\n", 303 | " decoder_output = self.decoder(target_embedded, enc_output, i_mask, t_self_mask, cache)\n", 304 | "\n", 305 | " #linear\n", 306 | " output = torch.matmul(decoder_output, self.t_vocab_embedding.weight.transpose(0, 1))\n", 307 | "\n", 308 | " return output\n", 309 | "\n", 310 | " def get_position_encoding(self, x):\n", 311 | " max_length = x.size()[1]\n", 312 | " position = torch.arange(max_length, dtype = torch.float32, device = x.device)\n", 313 | " scaled_time = position.unsqueeze(1) * self.inv_timescales.unsqueeze(0)\n", 314 | " signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim = 1)\n", 315 | " signal = F.pad(signal, (0, 0, 0, self.hidden_size % 2))\n", 316 | " signal = signal.view(1, max_length, self.hidden_size)\n", 317 | " return signal" 318 | ] 319 | } 320 | ] 321 | } 322 | -------------------------------------------------------------------------------- /Natural Language Processing/XLNet/README.md: -------------------------------------------------------------------------------- 1 | # XLNet Implementaion 2 | 3 | https://github.com/graykode/xlnet-Pytorch/blob/master/xlnet.py 참고하여 작성됌 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/XLNet-Generalized-Autoregressive-Pretraining-for-Language-Understanding-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/XLNet/XLNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyO7PtJswjOPtdrbUQcICrcp", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "GgYufJCgpn7Z" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from __future__ import absolute_import\n", 38 | "from __future__ import division\n", 39 | "from __future__ 
import print_function\n", 40 | "\n", 41 | "import json\n", 42 | "import os\n", 43 | "import tensorflow as tf\n", 44 | "import modeling\n", 45 | "\n", 46 | "def _get_initializer(FLAGS):\n", 47 | " # 변수 초기화\n", 48 | " if FLAGS.init == 'uniform':\n", 49 | " initializer = tf.initializers.random_uniform(\n", 50 | " minval = -FLAGS.init_range,\n", 51 | " maxval = FLAGS.init_range,\n", 52 | " seed = None\n", 53 | " )\n", 54 | "\n", 55 | " elif FLAGS.init == 'normal':\n", 56 | " initializer = tf.initializers.random_normal(\n", 57 | " stddev = FLAGS.init_std,\n", 58 | " seed = None\n", 59 | " )\n", 60 | "\n", 61 | " else:\n", 62 | " raise ValueError('Initializer {} not supported'.format(FALGS.init))\n", 63 | " return initializer\n", 64 | "\n", 65 | "class XLNetConfig(object):\n", 66 | " ''' XLNetConfig는 model checkpoint에 특정된 하이퍼 파라미터를 포함하고 있음\n", 67 | " 이 하이퍼 파라미터들은 pre-training 시와 fine-tuning 시에 모두 같아야 함\n", 68 | "\n", 69 | " n_layer: 레이어의 수\n", 70 | " d_model: hidden size\n", 71 | " n_head: attention head의 수\n", 72 | " d_head: 각 attention head의 차원 크기\n", 73 | " d_inner: feed-forward layer에서 hidden size\n", 74 | " ff_activation: 'relu' 또는 'gelu'\n", 75 | " untie_r: attention에서 bias들을 untie할 지 말지 결정\n", 76 | " n_token: vocab_size\n", 77 | " '''\n", 78 | "\n", 79 | " def __init__(self, FLAGS = None, json_path = None):\n", 80 | " '''\n", 81 | " XLNetConfig 구조\n", 82 | " 하나의 FLAGS 또는 json_path는 제공되어야 한다.\n", 83 | " '''\n", 84 | "\n", 85 | " assert FLAGS is not None or json_path is not None\n", 86 | "\n", 87 | " self.keys = ['n_layer', 'd_model', 'n_head', 'd_head', 'd_inner', 'ff_activation', \n", 88 | " 'untie_r', 'n_token']\n", 89 | "\n", 90 | " if FLAGS is not None:\n", 91 | " self.init_from_flags(FLAGS)\n", 92 | "\n", 93 | " if json_path is not None:\n", 94 | " self.init_from_json(json_path)\n", 95 | "\n", 96 | " def init_from_flags(self, FLAGS):\n", 97 | " for key in self.keys:\n", 98 | " setattr(self, key, getattr(FLAGS, key))\n", 99 | "\n", 100 | " def init_from_json(self, FLAGS):\n", 101 | " with tf.gfile.Open(json_path) as f:\n", 102 | " json_data = json.load(f)\n", 103 | " for key in self.keys:\n", 104 | " setattr(self, key, json_data[key])\n", 105 | "\n", 106 | " def to_json(self, json_path):\n", 107 | " # XLNetConfig를 json 파일로 저장\n", 108 | " json_data = {}\n", 109 | " for key in self.keys:\n", 110 | " json_data[key] = getattr(self, key)\n", 111 | "\n", 112 | " json_dir = os.path.dirname(json_path)\n", 113 | " if not tf.gfile.Exists(json_dir):\n", 114 | " tf.gfile.MakeDirs(json_dir)\n", 115 | " with tf.gfile.Open(json_path, 'w') as f:\n", 116 | " json.dump(json_data, f, indent = 4, sort_keys = True)\n", 117 | "\n", 118 | "def create_run_config(is_training, is_finetune, FLAGS):\n", 119 | " kwargs = dict(\n", 120 | " is_training=is_training,\n", 121 | " use_tpu=FLAGS.use_tpu,\n", 122 | " use_bfloat16=FLAGS.use_bfloat16,\n", 123 | " dropout=FLAGS.dropout,\n", 124 | " dropatt=FLAGS.dropatt,\n", 125 | " init=FLAGS.init,\n", 126 | " init_range=FLAGS.init_range,\n", 127 | " init_std=FLAGS.init_std,\n", 128 | " clamp_len=FLAGS.clamp_len\n", 129 | " )\n", 130 | "\n", 131 | " if not is_finetune:\n", 132 | " kwargs.update(dict(\n", 133 | " mem_len=FLAGS.mem_len,\n", 134 | " reuse_len=FLAGS.reuse_len,\n", 135 | " bi_data=FLAGS.bi_data,\n", 136 | " clamp_len=FLAGS.clamp_len,\n", 137 | " same_length=FLAGS.same_length\n", 138 | " ))\n", 139 | "\n", 140 | " return RunConfig(**kwargs)\n", 141 | "\n", 142 | "class RunConfig(object):\n", 143 | " '''\n", 144 | " RunConfig는 pre-training과 fine-tuning에서 서로 다른 하이퍼 
파라미터를 가져야 함.\n", 145 | " 이 하이퍼 파라미터들은 실행할 때마다 변경할 수 있다.\n", 146 | " '''\n", 147 | "\n", 148 | " def __init__(self, is_training, use_tpu, use_bfloat16, dropout, dropatt,\n", 149 | " init = 'normal', init_range = 0.1, init_std = 0.02, mem_len = None,\n", 150 | " reuse_len = None, bi_data = False, clamp_len = -1, same_length = False):\n", 151 | " '''\n", 152 | " is_training: 학습 모드인지 아닌지 확인\n", 153 | " use_tpu: TPU를 사용할 지 말 지 확인\n", 154 | " use_bfloat16: float32 대신에 bfloat16 사용\n", 155 | " dropout: dropout 비율\n", 156 | " dropatt: attention 확률에 dropout 비율\n", 157 | " init: 초기화 scheme. 'normal' 또는 'uniform' 둘 중 하나\n", 158 | " init_range: [-init_range, init_range]에서 균일한 분포를 사용해서 파라미터를 초기화\n", 159 | " init='uniform'일 때 가장 효과적임\n", 160 | " mem_len: 캐시해둘 토큰의 수\n", 161 | " reuse_len: 캐시되고 향후 재사용될 현재 배치의 토큰 수이다.\n", 162 | " bi_data: 양방향성 입력 파이프라인을 사용할 지 말 지 정함. \n", 163 | " pre-training 중에는 True를 사용, fine-tuning 중에는 False를 사용\n", 164 | " clamp_len: clamp_len보다 큰 모든 상대 거리를 고정한다다. -1은 클램핑이 없음을 의미한다.\n", 165 | " same_length: 각 토큰에 대해 똑같은 attention length를 사용할 지 말 지 결정\n", 166 | " '''\n", 167 | "\n", 168 | " self.init = init\n", 169 | " self.init_range = init_range\n", 170 | " self.init_std = init_std\n", 171 | " self.is_training = is_training\n", 172 | " self.dropout = dropout\n", 173 | " self.dropatt = dropatt\n", 174 | " self.use_tpu = use_tpu\n", 175 | " self.use_bfloat16 = use_bfloat16\n", 176 | " self.mem_len = mem_len\n", 177 | " self.reuse_len = reuse_len\n", 178 | " self.bi_data = bi_data\n", 179 | " self.clamp_len = clamp_len\n", 180 | " self.same_length = same_length\n", 181 | "\n", 182 | "class XLNetModel(object):\n", 183 | " # pre-training 및 fine-tuning 중에 사용되는 XLNet 모델의 wrapper이다.\n", 184 | "\n", 185 | " def __init__(self, xlnet_config, run_config, input_ids, seg_ids, input_mask,\n", 186 | " memes = None, perm_mask = None, target_mapping = None, inp_q = None,\n", 187 | " **kwargs):\n", 188 | " \n", 189 | " initializer = _get_initializer(run_config)\n", 190 | "\n", 191 | " tfm_args = dict(\n", 192 | " n_token=xlnet_config.n_token,\n", 193 | " initializer=initializer,\n", 194 | " attn_type=\"bi\",\n", 195 | " n_layer=xlnet_config.n_layer,\n", 196 | " d_model=xlnet_config.d_model,\n", 197 | " n_head=xlnet_config.n_head,\n", 198 | " d_head=xlnet_config.d_head,\n", 199 | " d_inner=xlnet_config.d_inner,\n", 200 | " ff_activation=xlnet_config.ff_activation,\n", 201 | " untie_r=xlnet_config.untie_r,\n", 202 | "\n", 203 | " is_training=run_config.is_training,\n", 204 | " use_bfloat16=run_config.use_bfloat16,\n", 205 | " use_tpu=run_config.use_tpu,\n", 206 | " dropout=run_config.dropout,\n", 207 | " dropatt=run_config.dropatt,\n", 208 | "\n", 209 | " mem_len=run_config.mem_len,\n", 210 | " reuse_len=run_config.reuse_len,\n", 211 | " bi_data=run_config.bi_data,\n", 212 | " clamp_len=run_config.clamp_len,\n", 213 | " same_length=run_config.same_length\n", 214 | " )\n", 215 | "\n", 216 | " input_args = dict(\n", 217 | " inp_k=input_ids,\n", 218 | " seg_id=seg_ids,\n", 219 | " input_mask=input_mask,\n", 220 | " mems=mems,\n", 221 | " perm_mask=perm_mask,\n", 222 | " target_mapping=target_mapping,\n", 223 | " inp_q=inp_q\n", 224 | " )\n", 225 | "\n", 226 | " with tf.variable_scope('model', reuse = tf.AUTO_REUSE):\n", 227 | " (self.output, self.new_mems, self.lookup_table) = modeling.transformer_xl(**tfm_args)\n", 228 | "\n", 229 | " self.input_mask = input_mask\n", 230 | " self.initializer = initializer\n", 231 | " self.clnet_config = clnet_config\n", 232 | " self.run_config = run_config\n", 233 | "\n", 
234 | " def get_pooled_out(self, summary_type, use_summ_proj = True):\n", 235 | " xlnet_config = self.xlnet_config\n", 236 | " run_config = self.run_config\n", 237 | "\n", 238 | " with tf.variable_scope('model', reuse = tf.AUTO_REUSE):\n", 239 | " summary = modeling.summarize_sequence(\n", 240 | " summary_type=summary_type,\n", 241 | " hidden=self.output,\n", 242 | " d_model=xlnet_config.d_model,\n", 243 | " n_head=xlnet_config.n_head,\n", 244 | " d_head=xlnet_config.d_head,\n", 245 | " dropout=run_config.dropout,\n", 246 | " dropatt=run_config.dropatt,\n", 247 | " is_training=run_config.is_training,\n", 248 | " input_mask=self.input_mask,\n", 249 | " initializer=self.initializer,\n", 250 | " use_proj=use_summ_proj\n", 251 | " )\n", 252 | "\n", 253 | " return summary\n", 254 | "\n", 255 | " def get_sequence_output(self):\n", 256 | " # XLNet의 마지막 레이어의 hidden representation\n", 257 | " \n", 258 | " return self.output\n", 259 | "\n", 260 | " def get_new_memory(self):\n", 261 | " # 이전 메모리와 현재 input representation을 합친 new memory\n", 262 | " # list의 길이는 n_layer와 같음\n", 263 | " return self.new_mems\n", 264 | "\n", 265 | " def get_embedding_table(self):\n", 266 | " # embedding lookup table\n", 267 | " # input 레이어와 output 레이어 간의 embedding tie\n", 268 | " return self.lookup_table\n", 269 | "\n", 270 | " def get_initializer(self):\n", 271 | " # tf initilizer\n", 272 | " # XLNet의 top layer에서 변수들을 초기화하기 위해 사용\n", 273 | " return self.initializer" 274 | ] 275 | } 276 | ] 277 | } 278 | --------------------------------------------------------------------------------
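(A hedged usage sketch, not part of the original notebook, of how the XLNetModel wrapper above is typically driven for sentence-level fine-tuning. `FLAGS`, `max_seq_len`, `batch_size`, and `n_class` are illustrative placeholders, and the tensors are assumed time-major, i.e. [seq_len, batch_size], because the wrapper passes them straight to modeling.transformer_xl:

import tensorflow as tf

# hyperparameters fixed at pre-training time are read from the saved json
xlnet_config = XLNetConfig(json_path='xlnet_config.json')
# run-time hyperparameters (dropout, TPU usage, ...) come from FLAGS
run_config = create_run_config(is_training=True, is_finetune=True, FLAGS=FLAGS)

input_ids  = tf.placeholder(tf.int32,   [max_seq_len, batch_size])
seg_ids    = tf.placeholder(tf.int32,   [max_seq_len, batch_size])
input_mask = tf.placeholder(tf.float32, [max_seq_len, batch_size])

model = XLNetModel(xlnet_config=xlnet_config, run_config=run_config,
                   input_ids=input_ids, seg_ids=seg_ids, input_mask=input_mask)

# (batch_size, d_model) summary of the sequence, then a task-specific head on top
summary = model.get_pooled_out(summary_type='last', use_summ_proj=True)
logits = tf.layers.dense(summary, n_class, kernel_initializer=model.get_initializer())
)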