├── Computer Vision ├── CNN │ ├── DenseNet.ipynb │ ├── EfficientNet.ipynb │ ├── GoogLeNet.ipynb │ ├── MobileNet_구현_실습.ipynb │ ├── README.md │ ├── ResNet.ipynb │ └── Xception.ipynb └── README.md ├── Multimodal Models ├── FLAVA │ ├── Interacting with FLAVA.ipynb │ └── README.md └── README.md ├── Natural Language Processing ├── ALBERT │ ├── ALBERT.ipynb │ └── README.md ├── BERT │ ├── BERT_model.ipynb │ ├── BERT_구현_복습.ipynb │ └── README.md ├── ELECTRA │ ├── ELECTRA.ipynb │ └── README.md ├── ELMo │ ├── ELMo.ipynb │ ├── README.md │ ├── char_cnn.ipynb │ └── character_dataset.ipynb ├── GPT-1 │ ├── GPT-1 Implementation.ipynb │ └── README.md ├── README.md ├── RoBERTa │ ├── README.md │ └── RoBERTa.ipynb ├── Transformer-XL │ ├── README.md │ └── Transformer_XL_구현_실습.ipynb ├── Transformer │ ├── README.md │ ├── Transformer_구현_복습.ipynb │ └── Transformer_구현_실습.ipynb └── XLNet │ ├── README.md │ └── XLNet.ipynb └── README.md /Computer Vision/CNN/DenseNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyP5VtNzKVdcgFotI0cRex0h", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "sGveVsqEBvXg" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import re\n", 38 | "import torch\n", 39 | "import torch.nn as nn\n", 40 | "import torch.nn.functional as F\n", 41 | "import torch.utils.checkpoint as cp\n", 42 | "from collections import OrderedDict\n", 43 | "#from .utils import load_state_dict_from_url\n", 44 | "from torch import Tensor\n", 45 | "from torch.jit.annotations import List\n", 46 | "\n", 47 | "__all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet161']\n", 48 | "\n", 49 | "model_urls = {\n", 50 | " 'densenet121': 'https://download.pytorch.org/models/densenet121-a639ec97.pth',\n", 51 | " 'densenet169': 'https://download.pytorch.org/models/densenet169-b2777c0a.pth',\n", 52 | " 'densenet201': 'https://download.pytorch.org/models/densenet201-c1103571.pth',\n", 53 | " 'densenet161': 'https://download.pytorch.org/models/densenet161-8d451a50.pth',\n", 54 | "}\n", 55 | "\n", 56 | "#Dense Layer\n", 57 | "class _DenseLayer(nn.Module):\n", 58 | " def __init__(self, num_input_features, growth_rate, bn_size, drop_rate, memory_efficient = False):\n", 59 | " super(_DenseLayer, self).__init__()\n", 60 | " self.add_module('norm1', nn.BatchNorm2d(num_input_features)),\n", 61 | " self.add_module('relu1', nn.ReLU(inplace = True)),\n", 62 | " self.add_module('conv1', nn.Conv2d(num_input_features, bn_size * growth_rate, kernel_size = 1, \n", 63 | " stride = 1, bias = False)),\n", 64 | " self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),\n", 65 | " self.add_module('relu2', nn.ReLU(inplace = True)),\n", 66 | " self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size = 3,\n", 67 | " stride = 1, padding = 1, bias = False)),\n", 68 | " self.drop_rate = float(drop_rate)\n", 69 | " self.memory_efficient = memory_efficient\n", 70 | "\n", 71 | " #Bacth Normalization 하는 부분\n", 72 | " def bn_function(self, 
inputs):\n", 73 | " # type: List[tensor] -> tensor\n", 74 | " concated_features = torch.cat(inputs, 1)\n", 75 | " bottleneck_output = self.conv1(self.relu1(self.norm1(concated_features)))\n", 76 | " return bottleneck_output\n", 77 | "\n", 78 | " def any_requires_grad(self, input):\n", 79 | " # type: List[tensor] -> bool\n", 80 | " for tensor in input:\n", 81 | " if tensor.requires_grad:\n", 82 | " return True\n", 83 | " return False\n", 84 | "\n", 85 | " @torch.jit.unused\n", 86 | " def call_checkpoint_bottleneck(self, input):\n", 87 | " # type: List[tensor] -> tensor\n", 88 | " def closure(*inputs):\n", 89 | " return self.bn_function(inputs)\n", 90 | "\n", 91 | " return cp.checkpoint(closure, *input)\n", 92 | "\n", 93 | " @torch.jit._overload_method\n", 94 | " def forward(self, input):\n", 95 | " # type: List[tensor] -> tensor\n", 96 | " pass\n", 97 | "\n", 98 | " @torch.jit._overload_method\n", 99 | " def forward(self, input):\n", 100 | " # type: Tensor -> Tensor\n", 101 | " pass\n", 102 | "\n", 103 | " #아직 torchscript는 *args를 지원하지 않기 때문에, List[Tensor] 또는 single tensor를\n", 104 | " #오버로드 하는 방법을 사용\n", 105 | " #순전파\n", 106 | " def forward(self, input):\n", 107 | " if isinstance(input, Tensor):\n", 108 | " prev_features = [input]\n", 109 | " else:\n", 110 | " prev_features = input\n", 111 | "\n", 112 | " if self.memory_efficient and self.any_requires_grad(prev_features):\n", 113 | " if torch.jit.is_scripting():\n", 114 | " raise Exception('Memory Efficient not supported in JIT')\n", 115 | "\n", 116 | " bottleneck_output = self.call_checkpoint_bottleneck(prev_features)\n", 117 | "\n", 118 | " else:\n", 119 | " bottleneck_output = self.bn_function(prev_features)\n", 120 | "\n", 121 | " new_features = self.conv2(self.relu2(self.norm2(bottleneck_output)))\n", 122 | " if self.drop_rate > 0:\n", 123 | " new_features = F.dropout(new_features, p = self.drop_rate, training = self.training)\n", 124 | " \n", 125 | " return new_features\n", 126 | "\n", 127 | "#DenseBlock layer\n", 128 | "class _DenseBlock(nn.ModuleDict):\n", 129 | " _version = 2\n", 130 | "\n", 131 | " def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate,\n", 132 | " memory_efficient = False):\n", 133 | " super(_DenseBlock, self).__init__()\n", 134 | " for i in range(num_layers):\n", 135 | " layer = _DenseLayer(\n", 136 | " num_input_features + i * growth_rate, growth_rate = growth_rate, bn_size = bn_size,\n", 137 | " drop_rate = drop_rate, memory_efficient = memory_efficient,\n", 138 | " )\n", 139 | " self.add_module('denselayer%d' % (i + 1), layer)\n", 140 | "\n", 141 | " def forward(self, init_features):\n", 142 | " features = [init_features]\n", 143 | " for name, layer in self.items():\n", 144 | " new_features = layer(features)\n", 145 | " features.append(new_features)\n", 146 | " return torch.cat(features, 1)\n", 147 | "\n", 148 | "#Transition layer\n", 149 | "class _Transition(nn.Sequential):\n", 150 | " def __init__(self, num_input_features, num_output_features):\n", 151 | " super(_Transition, self).__init__()\n", 152 | " self.add_module('norm', nn.BacthNorm2d(num_input_features))\n", 153 | " self.add_module('relu', nn.ReLU(inplace = True))\n", 154 | " self.add_module('conv', nn.Conv2d(num_input_features, num_output_features, kernel_size = 1,\n", 155 | " stride = 1, bias = False))\n", 156 | " self.add_module('pool', nn.AvgPool2d(kernel_size = 2, stride = 2))\n", 157 | "\n", 158 | "class DenseNet(nn.Module):\n", 159 | " #growth_rate: 각 레이어에 얼만큼의 필터를 추가할지 (논문에서는 'k'로 표현)\n", 160 | " #block_config: 각 
풀링 계층에서 얼마나 많은 레이어를 사용할지\n", 161 | " #num_init_features: 첫 합성곱 레이어에서 얼만큼의 필터를 배울지\n", 162 | " #bn_size: bottleneck layer의 숫자에 대한 factor\n", 163 | " #drop_rate: 각 dense layer 이후의 dropout rate\n", 164 | " #num_classes: 분류 클래스의 수\n", 165 | " #memort_efficient: True면 checkpoint 사용\n", 166 | "\n", 167 | " def __init__(self, growth_rate = 32, block_config = (6, 12, 24, 16),\n", 168 | " num_init_features = 64, bn_size = 4, drop_rate = 0, num_classes = 1000,\n", 169 | " memory_efficient = False):\n", 170 | " super(DenseNet, self).__init__()\n", 171 | "\n", 172 | " #첫 번째 convolution\n", 173 | " self.features = nn.Sequential(OrderedDict([\n", 174 | " ('conv0', nn.Conv2d(3, num_init_features, kernel_size = 7, stride = 2,\n", 175 | " padding = 3, bias = False)),\n", 176 | " ('norm0', nn.BactNorm2d(num_init_features)),\n", 177 | " ('relu0', nn.ReLU(inplace = True)),\n", 178 | " ('pool0', nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1)),\n", 179 | " ]))\n", 180 | "\n", 181 | " #각 dense block\n", 182 | " num_features = num_init_features\n", 183 | " for i, num_layers in enumerate(block_config):\n", 184 | " block = _DenseBlock(\n", 185 | " num_layers = num_layers,\n", 186 | " num_input_features = num_features,\n", 187 | " bn_size = bn_size,\n", 188 | " growth_rate = growth_rate,\n", 189 | " drop_rate = drop_rate,\n", 190 | " memory_efficient = memory_efficient\n", 191 | " )\n", 192 | " self.features.add_module('denseblock%d' % (i + 1), block)\n", 193 | " num_features = num_features + num_layers * growth_rate\n", 194 | " if i != len(block_config) - 1:\n", 195 | " trans = _Transition(num_input_featurs = num_features,\n", 196 | " num_output_features = num_features // 2)\n", 197 | " self.featrues.add_module('transition%d' % (i + 1), trans)\n", 198 | " num_features = num_features // 2\n", 199 | "\n", 200 | " #마지막 batch norm\n", 201 | " self.features.add_module('norm5', nn.BatchNorm2d(num_features))\n", 202 | "\n", 203 | " #Liunear Layer\n", 204 | " self.classifier = nn.Linear(num_features, num_classes)\n", 205 | "\n", 206 | " for m in self.modules():\n", 207 | " if siinstance(m, nn.Conv2d):\n", 208 | " nn.init.kaiming_normal_(m.weight)\n", 209 | " elif isinstance(m, nn.BatchNorm2d):\n", 210 | " nn.init.constant_(m.weight, 1)\n", 211 | " nn.init.constant_(m.bias, 0)\n", 212 | " elif isinstance(m, nn.Linear):\n", 213 | " nn.init.constant_(m.bias, 0)\n", 214 | "\n", 215 | " def forward(self, x):\n", 216 | " features = self.features(x)\n", 217 | " out = F.relu(features, inplace = True)\n", 218 | " out = F.adaptive_avg_pool2d(out, (1, 1))\n", 219 | " out = torch.flatten(out, 1)\n", 220 | " out = self.classifier(out)\n", 221 | " return out\n", 222 | "\n", 223 | "def _load_state_dict(model, model_url, progress):\n", 224 | " pattern = re.compile(\n", 225 | " r'^(.*denselayer\\d+\\.(?:norm|relu|conv))\\.((?:[12])\\.(?:weight|bias|running_mean|running_var))$')\n", 226 | "\n", 227 | " state_dict = load_state_dict_from_url(model_url, progress=progress)\n", 228 | " for key in list(state_dict.keys()):\n", 229 | " res = pattern.match(key)\n", 230 | " if res:\n", 231 | " new_key = res.group(1) + res.group(2)\n", 232 | " state_dict[new_key] = state_dict[key]\n", 233 | " del state_dict[key]\n", 234 | " model.load_state_dict(state_dict)\n", 235 | "\n", 236 | "def _densenet(arch, growth_rate, block_config, num_init_features, pretrained, progress,\n", 237 | " **kwargs):\n", 238 | " model = DenseNet(growth_rate, block_config, num_init_features, **kwargs)\n", 239 | " if pretrained:\n", 240 | " _load_state_dict(model, 
model_urls[arch], progress)\n", 241 | " return model\n", 242 | "\n", 243 | "def densenet121(pretrained = False, progress = True, **kwargs):\n", 244 | " return _densenet('densenet121', 32, (6, 12, 24, 16), 64, pretrained, progress, **kwargs)\n", 245 | "\n", 246 | "def densenet161(pretrained = False, progress = True, **kwargs):\n", 247 | " return _densenet('dnesenet161', 48, (6, 12, 36, 24), 96, pretrained, progress, **kwargs)\n", 248 | "\n", 249 | "def densenet169(pretrained = False, progress = True, **kwargs):\n", 250 | " return _densenet('densenet169', 32, (6, 12, 32, 32), 64, pretrained, progress, **kwargs)\n", 251 | "\n", 252 | "def densenet201(pretrained = False, progress = True, **kwargs):\n", 253 | " return _densenet('densenet201', 32, (6, 12, 48, 32), 64, pretrained, progress, **kwargs)" 254 | ] 255 | } 256 | ] 257 | } -------------------------------------------------------------------------------- /Computer Vision/CNN/EfficientNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMZpVwLqfasaEfsqwf3UBeE", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "ghfyI8deSjb_" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import torch\n", 38 | "from torch import nn\n", 39 | "from torch.nn import functional as F\n", 40 | "from .utils import (\n", 41 | " round_filters,\n", 42 | " round_repeats,\n", 43 | " drop_connect,\n", 44 | " get_same_padding_conv2d,\n", 45 | " get_model_params,\n", 46 | " efficientnet_params,\n", 47 | " load_pretrained_weights,\n", 48 | " Swish,\n", 49 | " MemoryEfficientSwish,\n", 50 | " calculate_output_image_size\n", 51 | ")\n", 52 | "\n", 53 | "VALID_MODELS = (\n", 54 | " 'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3',\n", 55 | " 'efficientnet-b4', 'efficientnet-b5', 'efficientnet-b6', 'efficientnet-b7',\n", 56 | " 'efficientnet-b8',\n", 57 | "\n", 58 | " # Support the construction of 'efficientnet-l2' without pretrained weights\n", 59 | " 'efficientnet-l2'\n", 60 | ")\n", 61 | "\n", 62 | "class MBConvBlock(nn.Module):\n", 63 | " #Mobile Inverted Residual Bottleneck Block\n", 64 | "\n", 65 | " def __init__(self, block_args, global_params, image_size = None):\n", 66 | " super().__init__()\n", 67 | " self.block_args = block_args\n", 68 | " self._bn_mom = 1 - global_aprams.batch_norm_momentum\n", 69 | " self._bn_eps = global_params.batch_norm_epsilon\n", 70 | " self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ration <= 1)\n", 71 | " self.id_skip = block_args.id_skip #use skip connection and drop connect\n", 72 | "\n", 73 | " #Expansion phase\n", 74 | " inp = self._block_args.input_filters #number of input channels\n", 75 | " oup = self._block_args.input_filters * self._block_args.expand_ratio #number of output channels\n", 76 | "\n", 77 | " if self._block_args.expand_ratio != 1:\n", 78 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 79 | " self._expand_conv = Conv2d(in_channels = inp, output_channels = oup, 
kernel_size = 1,\n", 80 | " bias = False)\n", 81 | " self._bn0 = nn.BatchNorm2d(num_features = oup, momentum = self._bn_mom, eps = self._bn_eps)\n", 82 | " #image_size = calculate_output_image_size(image_size, 1)\n", 83 | "\n", 84 | " #Depthwise convolution phase\n", 85 | " k = self._block_args.kernel_size\n", 86 | " s = self._block_args.stride\n", 87 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 88 | " self._depthwise_conv = Conv2d(\n", 89 | " in_channels = oup, out_channels = oup, groups = oup, #groups가 depthwise를 만듦\n", 90 | " kernel_size = k, strides = s, bias = False\n", 91 | " )\n", 92 | " self._bn1 = nn.BatchNorm2d(num_features = oup, momentum = self._bn_mom, \n", 93 | " eps = self._bn_eps)\n", 94 | " image_size = calculate_output_image_size(image_size, s)\n", 95 | "\n", 96 | " #Squeeze and Excitation layer\n", 97 | " if self.has_se:\n", 98 | " Conv2d = get_same_padding_conv2d(image_size = (1, 1))\n", 99 | " num_squeezed_channels = max(1, int(self.block_args.input_filters * \n", 100 | " self._block_args.se-ratio))\n", 101 | " self._se_reduce = Conv2d(in_channels = oup, out_channel = num_squeezed_channels,\n", 102 | " kernel_size = 1)\n", 103 | " self._se_expand = Conv2d(in_channels = num_squeezed_channels, out_channel = oup,\n", 104 | " kernel_size = 1)\n", 105 | " \n", 106 | " #Pointwise Convolution\n", 107 | " final_oup = self._block_args.output_filters\n", 108 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 109 | " self._project_conv = Conv2d(in_channels = oup, out_channels = final_oup, \n", 110 | " kernel_size = 1, bias = False)\n", 111 | " self._bn2 = nn.BatchNorm2d(num_features = final_oup, momentum = self._bn_mom,\n", 112 | " eps = self._bn_eps)\n", 113 | " self._swish = MemoryEfficientSwish()\n", 114 | "\n", 115 | " def forward(self, inputs, drop_connect_rate = None):\n", 116 | " #Expansion & Depthwise Convolution\n", 117 | " x = inputs\n", 118 | " if self._block_args.expand_ratio != 1:\n", 119 | " x = self.expand_conv(inputs)\n", 120 | " x = self._bn0(x)\n", 121 | " x = self._swish(x)\n", 122 | "\n", 123 | " x = self._depthwise_conv(x)\n", 124 | " x = self._bn1(x)\n", 125 | " x = self._swish(x)\n", 126 | "\n", 127 | " #Squeeze & Excitation\n", 128 | " if self.has_se:\n", 129 | " x_squeezed = F.adaptive_avg_pool2d(x, 1)\n", 130 | " x_squeezed = self._se_reduce(x_squeezed)\n", 131 | " x_squeezed = self._swish(x_squeezed)\n", 132 | " x_squeezed = self._se_expand(x_squeezed)\n", 133 | " x = torch.sigmoid(x_squeezed) * x\n", 134 | "\n", 135 | " #Pointwise Convolution\n", 136 | " x = self._project_conv(x)\n", 137 | " x = self._bn2(x)\n", 138 | "\n", 139 | " #Skip connection & drop connect\n", 140 | " input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters\n", 141 | "\n", 142 | " if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:\n", 143 | " #skip connection과 drop connect는 stochastic depth를 가져온다\n", 144 | " if drop_connect_rate:\n", 145 | " x = drop_connect(x, p = drop_connect_rate, training = self.training)\n", 146 | " x = x + inputs #skip connection\n", 147 | " \n", 148 | " return x\n", 149 | "\n", 150 | " def set_swish(self, memory_efficient = True):\n", 151 | " #memory efficient를 위한 swish 설정\n", 152 | "\n", 153 | " self._swish = MemoryEfficientSwish() if memory_efficient else Swish()\n", 154 | "\n", 155 | "class EfficientNet(nn.Module):\n", 156 | "\n", 157 | " def __init__(self, blocks_args = None, global_params = None):\n", 158 | " super().__init__()\n", 159 | " 
assert isinstance(block_args, list), 'blocks_args should be a list'\n", 160 | " assert len(block_args) > 0, 'block args must be greater than 0'\n", 161 | " self._global_params = global_params\n", 162 | " self._block_args = block_args\n", 163 | "\n", 164 | " #BatchNorm parameters\n", 165 | " bn_mom = 1 - self._global_params.batch_norm_momentum\n", 166 | " bn_eps = self._global_params.batch_norm_epsilon\n", 167 | "\n", 168 | " #이미지 크기에 따라서 정적 또는 동적 convolution을 함\n", 169 | " image_size = global_params.image_size\n", 170 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 171 | "\n", 172 | " #Stem\n", 173 | " in_channels = 3 #rgb\n", 174 | " out_channels = round_filters(32, self._global_params) #number of output channels\n", 175 | " self._conv_stem = Conv2d(in_channels, out_channels, kernel_size = 3, stride = 2,\n", 176 | " bias = False)\n", 177 | " self._bn0 = nn.BatchNorm2d(num_features = out_channels, momentum = bn_mom, eps = bn_eps)\n", 178 | " image_size = calculate_output_image_size(image_size, 2)\n", 179 | "\n", 180 | " #블록 쌓기\n", 181 | " self._blocks = nn.ModuleList([])\n", 182 | " for block_args in self._block_args:\n", 183 | " #depth multiplier에 따라 입력과 출력 필터 업데이트\n", 184 | " block_args = block_args._replace(\n", 185 | " input_filters = round_filters(block_args.input_filters, self._global_params),\n", 186 | " output_filter = round_filters(block_args.output_filters, self._global_params),\n", 187 | " num_repeat = round_filters(block_args.num_repeates, self._global_params)\n", 188 | " )\n", 189 | "\n", 190 | " #첫 번째 블록은 stride와 filter size 증가를 관리할 필요가 있음\n", 191 | " self._blocks.append(MBConvBlock(block_args, self._global_params, image_size = image_size))\n", 192 | " image_size = calculate_output_image_size(image_size, block_args.stride)\n", 193 | " if block_args.num_repeat > 1: #block_args를 조정해서 똑같은 output size 유지\n", 194 | " block_args = block_args._replace(input_filters = block_args.output_filters, stride = 1)\n", 195 | "\n", 196 | " for _ in range(block_args.num_repeat - 1):\n", 197 | " self._blocks.append(MBConvBlock(block_args, self._global_params, image_size = image_size))\n", 198 | "\n", 199 | " #Head\n", 200 | " in_channels = block_args.output_filters #output of final block\n", 201 | " out_channels = round_filters(1280, self._global_params)\n", 202 | " Conv2d = get_same_padding_conv2d(image_size = image_size)\n", 203 | " self._conv_head = Conv2d(in_channels, out_channels, kernel_size = 1, bias = False)\n", 204 | " self._bn1 = nn.BatchNorm2d(num_features = out_channels, momentum = bn_mom, eps = bn_eps)\n", 205 | "\n", 206 | " #Final Linear Layer\n", 207 | " self._avg_pooling = nn.AdaptiveAvgPool2d(1)\n", 208 | " self._dropout = nn.Dropout(self._global_params.dropout_rate)\n", 209 | " self._fc = nn.Linear(out_channels, self._global_params.num_classes)\n", 210 | " self._swish = MemoryEfficientSwish()\n", 211 | "\n", 212 | " def set_swish(self, memory_efficient = True):\n", 213 | " self._swish = MemoryEfficientSwish() if memory_efficient else Swish()\n", 214 | " for block in self._blocks:\n", 215 | " block.set_swish(memory_efficient)\n", 216 | "\n", 217 | " def extract_endpoints(self, inputs):\n", 218 | " #Convolution layer을 사용해서 feature을 extract\n", 219 | "\n", 220 | " endpoints = dict()\n", 221 | "\n", 222 | " #Stem\n", 223 | " x = self._swish(self._bn0(self._conv_stem(inputs)))\n", 224 | " prev_x = x\n", 225 | "\n", 226 | " #Blocks\n", 227 | " for idx, block in enumerate(self._blocks):\n", 228 | " drop_connect_rate = self._global_params.drop_connect_rate\n", 229 | 
" if drop_connect_rate:\n", 230 | " drop_connect_rate *= float(idx) / len(self._blocks) #scale drop connect_rate\n", 231 | " x = block(x, drop_connect_rate = drop_connect_rate)\n", 232 | " if prev_x.size(2) > x.size(2):\n", 233 | " endpoints[f'reduction_{len(endpoints)+1}'] = prev_x\n", 234 | " prev_x = x\n", 235 | "\n", 236 | " #Head\n", 237 | " x = self._swish(self._bn1(self._conv_head(x)))\n", 238 | " endpoints[f'reduction_{len(endpoints) + 1}'] = x\n", 239 | "\n", 240 | " return endpoints\n", 241 | "\n", 242 | " def extract_features(self, inputs):\n", 243 | " #Convolution layer을 사용해서 feature을 추출\n", 244 | "\n", 245 | " #Stem\n", 246 | " x = self._swish(self._bn0(self._conv_stem(inputs)))\n", 247 | "\n", 248 | " #Blocks\n", 249 | " for idx, block in enumerate(self._blocks):\n", 250 | " drop_connect_rate = self._global_params.drop_connect_rate\n", 251 | " if drop_connect_rate:\n", 252 | " drop_connect_rate *= float(idx) / len(self._blocks) # scale drop connect rate\n", 253 | " x = block(x, drop_connect_rate = drop_connect_rate)\n", 254 | "\n", 255 | " #Head\n", 256 | " x = self._swish(self._bn1(self._conv_head(x)))\n", 257 | "\n", 258 | " return x\n", 259 | "\n", 260 | " def forward(self, inputs):\n", 261 | " #EfficientNet의 순전파\n", 262 | "\n", 263 | " #Convolution Layers\n", 264 | " x = self.extract_features(inputs)\n", 265 | "\n", 266 | " #Pooling & final linear_layers\n", 267 | " x = self._avg_pooling(x)\n", 268 | " x = x.flatten(start_dim = 1)\n", 269 | " x = self._dropout(x)\n", 270 | " x = self._fc(x)\n", 271 | "\n", 272 | " return x\n", 273 | "\n", 274 | " @classmethod\n", 275 | " def from_name(cls, model_name, in_channels = 3, **override_params):\n", 276 | " #이름에 따라서 EfficientNet 생성\n", 277 | "\n", 278 | " cls._check_model_name_is_valid(model_name)\n", 279 | " blocks_args, clobal_params = get_model_params(model_name, override_params)\n", 280 | " model = cls(blocks_args, global_params)\n", 281 | " model._change_in_channels(in_channels)\n", 282 | " return model\n", 283 | "\n", 284 | " @classmethod\n", 285 | " def from_pretrained(cls, model_naem, weights_path = None, advprop = False,\n", 286 | " in_channels = 3, num_classes = 1000, **override_params):\n", 287 | " model = cls.from_name(model_name, num_classes = num_classes, **override_params)\n", 288 | " load_pretrained_weights(model, model_name, weights_path = weights_path, \n", 289 | " load_fc = (num_calss == 1000), advprop = advprop)\n", 290 | " model._change_in_channels(in_channels)\n", 291 | " return model\n", 292 | "\n", 293 | " @clasmethod\n", 294 | " def get_image_size(cls, model_name):\n", 295 | " #입력 이미지의 크기를 가져옴\n", 296 | "\n", 297 | " cls._check_model_name_is_valid(model_name)\n", 298 | " _, _, res, _ = efficientnet_params(model_name)\n", 299 | " return res\n", 300 | "\n", 301 | " @classmethod\n", 302 | " def _check_model_name_is_valid(cls, model_name):\n", 303 | " #model name check\n", 304 | "\n", 305 | " if model_name not in VALID_MODELS:\n", 306 | " raise ValueError('model_name should be one of: ' + ', '.join(VALID_MODELS))\n", 307 | "\n", 308 | " def _change_in_channels(self, in_channels):\n", 309 | " #첫 번째 합성곱 레이어에 사용되는 in_channels가 3이 아니라면, 조정\n", 310 | "\n", 311 | " if in_channels != 3:\n", 312 | " Conv2d = get_same_padding_conv2d(image_size = self._global_params.image_size)\n", 313 | " out_channels = round_filters(32, self._global_params)\n", 314 | " self._conv_stem = Conv2d(in_channels, out_channels, kernel_size = 3, stride = 2, bias = False)" 315 | ] 316 | } 317 | ] 318 | } 
-------------------------------------------------------------------------------- /Computer Vision/CNN/GoogLeNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMtfpaygJNZbBpUa0WxvN2p", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "1f7qWOK7AQC5" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import warnings\n", 38 | "from collections import namedtuple\n", 39 | "import torch\n", 40 | "import torch.nn as nn\n", 41 | "import torch.nn.functional as F\n", 42 | "from torch.jit.annotations import Optional, Tuple\n", 43 | "from torch import Tensor\n", 44 | "\n", 45 | "\n", 46 | "__all__ = ['GoogLeNet', 'googlenet', 'GoogLeNetOutputs', '_GoogLeNetOutputs']\n", 47 | "\n", 48 | "model_urls = {'googlenet': 'https://download.pytorch.org/models/googlenet-1378be20.pth'}\n", 49 | "\n", 50 | "GoogLeNetOutputs = namedtuple('GoogLeNetOutputs', ['logits', 'aux_logits2', 'aux_logits1'])\n", 51 | "GoogLeNetOutputs.__annotations__ = {'logits': Tensor, 'aux_logits2': Optional[Tensor], \n", 52 | " 'aux_logits1': Optional[Tensor]}\n", 53 | "\n", 54 | "#역전파를 위한 GoogLeNet outputs 설정\n", 55 | "_GoogLeNetOutputs = GoogLeNetOutputs\n", 56 | "\n", 57 | "\n", 58 | "def googlenet(pretrained = False, progress = True, **kwargs):\n", 59 | " #pretraind: True면 ImageNet으로 pretrained된 모델 반환\n", 60 | " #progress: True면 download bar 보여주기\n", 61 | " #aux_logits: True면 두 개의 추가적인 branch 더해줌 --> 성능 향상에 도움 됌\n", 62 | " #transform input: True면 입력을 preprocessing\n", 63 | "\n", 64 | " if pretrained:\n", 65 | " if 'transform_input' not in kwargs:\n", 66 | " kwargs['transform_input'] = True\n", 67 | " if 'aux_logits' not in kwargs:\n", 68 | " kwargs['aux_logits'] = False\n", 69 | " if kwargs['aux_logits']:\n", 70 | " warnings.warn('auxiliary heads in the pretrained googlenet model are NOT pretrained, ')\n", 71 | "\n", 72 | " original_aux_logits = kwargs['aux_logits']\n", 73 | " kwargs['aux_logits'] = True\n", 74 | " kwargs['init_weights'] = False\n", 75 | " model = GoogLeNet(**kwargs)\n", 76 | " state_dict = load_state_dict_from_url(model_urls['googlenet'], progress = progress)\n", 77 | " model.load_state_dict(state_dict)\n", 78 | " if not original_aux_logits:\n", 79 | " model.aux_logits = False\n", 80 | " model.aux1 = None\n", 81 | " model.aux2 = None\n", 82 | " return model\n", 83 | "\n", 84 | " return GoogLeNet(**kwargs)\n", 85 | "\n", 86 | "class GoogLeNet(nn.Module):\n", 87 | " __constants__ = ['aux_logits', 'transform_input']\n", 88 | "\n", 89 | " def __init__(self, num_classes = 1000, aux_logits = True, transform_input = False,\n", 90 | " init_weights = None, blocks = None):\n", 91 | " super(GoogLeNet, self).__init__()\n", 92 | " if blocks is None:\n", 93 | " blocks = [BasicConv2d, Inception, InceptionAux]\n", 94 | " if init_weights is None:\n", 95 | " warnings.warn('The default weight initialization of GoogLeNet will be changed in future releases of')\n", 96 | " init_weights = True\n", 97 | " assert len(blocks) == 3\n", 98 | " conv_block = blocks[0]\n", 
99 | " inception_block = blocks[1]\n", 100 | " inception_aux_block = blocks[2]\n", 101 | "\n", 102 | " self.aux_logits = aux_logits\n", 103 | " self.transform_input = transform_input\n", 104 | "\n", 105 | " self.conv1 = conv_block(3, 64, kernel_size = 7, stride = 2, padding = 3)\n", 106 | " self.maxpool1 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n", 107 | " self.conv2 = conv_block(64, 64, kernel_size = 1)\n", 108 | " self.conv3 = conv_block(64, 192, kernel_size = 3, padding = 1)\n", 109 | " self.maxpool2 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n", 110 | "\n", 111 | " self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32)\n", 112 | " self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64)\n", 113 | " self.maxpool3 = nn.MaxPool2d(3, stride = 2, ceil_mode = True)\n", 114 | "\n", 115 | " self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64)\n", 116 | " self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64)\n", 117 | " self.inception4c = inception_block(512, 128, 127, 256, 24, 64, 64)\n", 118 | " self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64)\n", 119 | " self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128)\n", 120 | " self.maxpool4 = nn.MaxPool2d(2, stride = 2, ceil_mode = True)\n", 121 | "\n", 122 | " self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128)\n", 123 | " self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128)\n", 124 | "\n", 125 | " if aux_logits:\n", 126 | " self.aux1 = inception_aux_block(512, num_classes)\n", 127 | " self.aux2 = inception_aux_block(528, num_classes)\n", 128 | " else:\n", 129 | " self.aux1 = None\n", 130 | " self.aux2 = None\n", 131 | "\n", 132 | " self.avgpool = nn.AdaptiveAvgPool2d((1, 1))\n", 133 | " self.dropout = nn.Dropout(0.2)\n", 134 | " self.fc = nn.Linear(1024, num_classes)\n", 135 | "\n", 136 | " if init_weights:\n", 137 | " self._initialize_weights()\n", 138 | "\n", 139 | " def _initialize_weights(self):\n", 140 | " for m in self.modules():\n", 141 | " if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):\n", 142 | " import scipy.stats as stats\n", 143 | " X = stats.truncnorm(-2, 2, scale = 0.01)\n", 144 | " values = torch.as_tensor(X.rvs(m.weight.numel()), dtype = m.weight.dtype)\n", 145 | " values = values.view(m.weight.size())\n", 146 | " with torch.no_grad():\n", 147 | " m.weight.copy_(values)\n", 148 | " elif isinstance(m, nn.BatchNorm2d):\n", 149 | " nn.init.constant_(m.weight, 1)\n", 150 | " nn.init.constant_(m.bias, 0)\n", 151 | "\n", 152 | " def _transform_input(self, x):\n", 153 | " #(Tensor) --> Tensor\n", 154 | " if self.transform_input:\n", 155 | " x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5\n", 156 | " x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5\n", 157 | " x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5\n", 158 | " x = torch.cat((x_ch0, x_ch1, x_ch2), 1)\n", 159 | " return x\n", 160 | "\n", 161 | " def _forward(self, x):\n", 162 | " #type: (Tensor) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]\n", 163 | " #N x 3 x 224 x 224\n", 164 | " x = self.conv1(x)\n", 165 | "\n", 166 | " #N x 64 x 112 x 112\n", 167 | " x = self.maxpool1(x)\n", 168 | "\n", 169 | " #N x 64 x 56 x 56\n", 170 | " x = self.conv2(x)\n", 171 | "\n", 172 | " #N x 64 x 56 x 56\n", 173 | " x = self.conv3(x)\n", 174 | "\n", 175 | " #N x 192 x 56 x 56\n", 176 | " x = self.maxpool2(x)\n", 177 | "\n", 178 | " #N x 192 x 28 x 28\n", 179 | " x = 
self.inception3a(x)\n", 180 | "\n", 181 | " #N x 256 x 28 x 28\n", 182 | " x = self.inception3b(x)\n", 183 | "\n", 184 | " #N x 480 x 28 x 28\n", 185 | " x = self.maxpool3(x)\n", 186 | "\n", 187 | " #N x 480 x 14 x 14\n", 188 | " x = self.inception4a(x)\n", 189 | "\n", 190 | " # N x 512 x 14 x 14\n", 191 | " aux1 = torch.hit.annotate(Optional[Tensor], None)\n", 192 | " if self.aux1 is not None:\n", 193 | " if self.training:\n", 194 | " aux1 = self.aux1(x)\n", 195 | "\n", 196 | " x = self.inception4b(x)\n", 197 | "\n", 198 | " #N x 512 x 14 x 14\n", 199 | " x = self.inception4c(x)\n", 200 | "\n", 201 | " #N x 512 x 14 x 14\n", 202 | " x = self.inception4d(x)\n", 203 | "\n", 204 | " #N x 528 x 14 x 14\n", 205 | " x = self.inception4e(x)\n", 206 | "\n", 207 | " #N x 832 x 14 x 14\n", 208 | " x = self.maxpool4(x)\n", 209 | "\n", 210 | " #N x 832 x 7 x 7\n", 211 | " x = self.inception5a(x)\n", 212 | "\n", 213 | " #N x 832 x 7 x 7\n", 214 | " x = self.inception5b(x)\n", 215 | " #N x 1024 x 7 x 7\n", 216 | "\n", 217 | " x = self.avgpool(x)\n", 218 | " #N x 1024 x 1 x 1\n", 219 | "\n", 220 | " x = torch.flatten(x, 1)\n", 221 | " # N x 1024\n", 222 | "\n", 223 | " x = self.dropout(x)\n", 224 | " x = self.fc(x)\n", 225 | " #N x 1000 (num_classes)\n", 226 | " return x, aux2, aux1\n", 227 | " \n", 228 | " @torch.jit.unused\n", 229 | " def eager_outputs(self, x, aux2, aux1):\n", 230 | " # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> GoogLeNetOutputs\n", 231 | " if self.training and self.aux_logits:\n", 232 | " return _GoogLeNetOutputs(x, aux2, aux1)\n", 233 | " else:\n", 234 | " return x\n", 235 | "\n", 236 | " def forward(self, x):\n", 237 | " # type: (Tensor) -> GoogLeNetOutputs\n", 238 | " x = self._transform_input(x)\n", 239 | " x, aux1, aux2 = self._forward(x)\n", 240 | " aux_defined = self.training and self.aux_logits\n", 241 | " if torch.jit.is_scripting():\n", 242 | " if not aux_defined:\n", 243 | " warnings.warn('Scripted Googlenet alwatd returns GoogleNetOutputs Tuple')\n", 244 | " return GoogLeNetOutputs(x, aux2, aux1)\n", 245 | " else:\n", 246 | " return self.eager_outputs(x, aux2, aux1)\n", 247 | "\n", 248 | "class Inception(nn.Module):\n", 249 | "\n", 250 | " def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj,\n", 251 | " conv_block = None):\n", 252 | " super(Inception, self).__init__()\n", 253 | " if conv_block is None:\n", 254 | " conv_block = BasicConv2d\n", 255 | " self.branch1 = conv_block(in_channels, ch1x1, kernel_size = 1)\n", 256 | "\n", 257 | " self.branch2 = nn.Sequential(\n", 258 | " conv_block(in_channels, ch3x3red, kernel_size = 1),\n", 259 | " conv_block(ch3x3red, ch3x3, kernel_size = 3, padding = 1)\n", 260 | " )\n", 261 | "\n", 262 | " self.branch3 = nn.Sequential(\n", 263 | " conv_block(in_channels, ch5x5red, kernel_size = 1),\n", 264 | " conv_block(ch5x5red, ch5x5, kernel_size = 3, padding = 1)\n", 265 | " )\n", 266 | "\n", 267 | " self.branch4 = nn.Sequential(\n", 268 | " nn.MaxPool2d(kernel_size = 3, stride = 1, padding = 1, ceil_mode = True),\n", 269 | " conv_block(in_channels, pool_proj, kernel_size = 1)\n", 270 | " )\n", 271 | "\n", 272 | " def _forward(self, x):\n", 273 | " branch1 = self.branch1(x)\n", 274 | " branch2 = self.branch2(x)\n", 275 | " branch3 = self.branch3(x)\n", 276 | " branch4 = self.branch4(x)\n", 277 | "\n", 278 | " outputs = [branch1, branch2, branch3, branch4]\n", 279 | " return outputs\n", 280 | "\n", 281 | " def forward(self, x):\n", 282 | " outputs = self._forward(x)\n", 283 | " return 
torch.cat(outputs, 1)\n", 284 | "\n", 285 | "class InceptionAux(nn.Module):\n", 286 | "\n", 287 | " def __init__(self, in_channels, num_classes, conv_block = None):\n", 288 | " super(InceptionAux, self).__init__()\n", 289 | " if conv_block is None:\n", 290 | " conv_block = BasicConv2d\n", 291 | " self.conv = conv_block(in_channels, 128, kernel_size = 1)\n", 292 | "\n", 293 | " self.fc1 = nn.Linear(2048, 1024)\n", 294 | " self.fc2 = nn.Linear(1024, num_classes)\n", 295 | "\n", 296 | " def forward(self, x):\n", 297 | " #aux1: N x 512 x 14 x 14, aux2: N x 528 x 14 x 14\n", 298 | " x = F.adaptive_avg_pool2d(x, (4, 4))\n", 299 | " \n", 300 | " #aux1: N x 512 x 4 x 4, aux2: N x 528 x 4 x 4\n", 301 | " x = self.conv(x)\n", 302 | "\n", 303 | " #N x 128 x 4 x 4\n", 304 | " x = self.torch.flatten(x, 1)\n", 305 | "\n", 306 | " #N x 2048\n", 307 | " x = F.relu(self.fc1(x), inplace = True)\n", 308 | "\n", 309 | " #N x 1024\n", 310 | " x = F.dropout(x, 0.7, training = self.training)\n", 311 | "\n", 312 | " #N x 1024\n", 313 | " x = self.fc2(x)\n", 314 | "\n", 315 | " # N x 1000 (num_classes)\n", 316 | "\n", 317 | " return x\n", 318 | "\n", 319 | "class BasicConv2d(nn.Module):\n", 320 | "\n", 321 | " def __init__(self, in_channels, out_channels, **kwargs):\n", 322 | " super(BasicConv2d, self).__init__()\n", 323 | " self.conv = nn.Conv2d(in_channels, out_channels, bias = False, **kwargs)\n", 324 | " self.bn = nn.BatchNorm2d(out_channels, eps = 0.001)\n", 325 | "\n", 326 | " def forward(self, x):\n", 327 | " x = self.conv(x)\n", 328 | " x = self.bn(x)\n", 329 | " return F.relu(x, inplace = True)" 330 | ] 331 | } 332 | ] 333 | } -------------------------------------------------------------------------------- /Computer Vision/CNN/MobileNet_구현_실습.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyPvHJUiMPbRb/mjrrJcBBef", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "#MobileNet 설명\n", 33 | "#서로 다른 크기의 input layer와 width factor에서 사용 가능\n", 34 | "#서로 다른 width를 사용함으로써 cost를 줄일 수 있음\n", 35 | "#MobileNet은 32x32보다 큰 입력 이미지면 어떤 이미지든 가능\n", 36 | "#더 큰 크기의 이미지는 더욱 향상된 성능을 가져옴\n", 37 | "\n", 38 | "#파라미터 수와 multiply-adds는 alpha에 의해 결정됌\n", 39 | "#alpha는 각 레이어에서 필터의 수를 증감함\n", 40 | "\n", 41 | "from tensorflow.python.keras.layers.recurrent import layer_serialization\n", 42 | "from __future__ import absolute_import\n", 43 | "from __future__ import division\n", 44 | "from __future__ import print_function\n", 45 | "\n", 46 | "from tensorflow.python.keras import backend\n", 47 | "from tensorflow.python.keras.applications import imagenet_utils\n", 48 | "from tensorflow.python.keras.engine import training\n", 49 | "from tensorflow.python.keras.layers import VersionAwareLayers\n", 50 | "from tensorflow.python.keras.utils import data_utils\n", 51 | "from tensorflow.python.keras.utils import layer_utils\n", 52 | "from tensorflow.python.lib.io import file_io\n", 53 | "from tensorflow.python.platform import tf_logging as logging\n", 54 | "from tensorflow.python.util.tf_export import 
keras_export\n", 55 | "\n", 56 | "BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/')\n", 57 | "layers = None\n", 58 | "\n", 59 | "@keras_export('keras.applications.mobilenet.MobileNet', 'keras.applications.MobileNet')\n", 60 | "\n", 61 | "def MobileNet(input_shape = None, alpha = 1.0, depth_multiplier = 1, dropout = 1e-3, include_top = True, \n", 62 | " weights = 'imagenet', input_tensor = None, pooling = None, classes = 1000,\n", 63 | " classifier_activation = 'softmax', **kwargs):\n", 64 | " #input_shape: 옵션적 shape tuple\n", 65 | " #alpha: network의 width 조절 -> width multiplier\n", 66 | " #1.0이면, 각 레이어에서 비율적으로 필터의 수를 줄임\n", 67 | " #depth_multiplier: resolution multiplier\n", 68 | " #dropout: dropout rate 조정\n", 69 | " #include_top: network의 맨 위에서 fc-layer을 사용할지 결정\n", 70 | " #weights: 재량껏 weights를 사용 가능\n", 71 | " #input_tensor: 옵션적 keras tensor\n", 72 | " #pooling: 어떤 방식으로 풀링을 할 지 결정\n", 73 | " #classes: 분류해야 하는 class 수 결정\n", 74 | " \n", 75 | " global layer_s\n", 76 | " if 'layers' in kwargs:\n", 77 | " layers = kwargs.pop('layers')\n", 78 | " else:\n", 79 | " layers = VersionAwareLayers()\n", 80 | " if kwargs:\n", 81 | " raise ValueError('Unknown argument(s): %s' % (kwargs,))\n", 82 | " if not (weights in {'imagenet', None} or file_io.file_exists_v2(weights)):\n", 83 | " raise ValueError('The `weights` argument should be either '\n", 84 | " '`None` (random initialization), `imagenet` '\n", 85 | " '(pre-training on ImageNet), '\n", 86 | " 'or the path to the weights file to be loaded.')\n", 87 | " \n", 88 | " if weights == 'imagenet' and include_top and classes != 1000:\n", 89 | " raise ValueError('If using `weights` as `\"imagenet\"` with `include_top` '\n", 90 | " 'as true, `classes` should be 1000')\n", 91 | " \n", 92 | " #적절한 입력 shape과 기본 크기\n", 93 | " if input_shape is None:\n", 94 | " default_size = 224\n", 95 | " else:\n", 96 | " if backend.image_data_format() == 'channels_first':\n", 97 | " rows = input_shape[1]\n", 98 | " cols = input_shape[2]\n", 99 | " else:\n", 100 | " rows = input_shape[0]\n", 101 | " cols = input_shape[1]\n", 102 | "\n", 103 | " if rows == cols and rows in [128, 160, 192, 224]:\n", 104 | " default_size = rows\n", 105 | " else:\n", 106 | " default_size = 224\n", 107 | "\n", 108 | " input_shape = imagenet_utils.obtain_input_shape(input_shape, default_size = default_size,\n", 109 | " min_size = 32, data_format = backend.image_data_format(),\n", 110 | " require_flatten = include_top, weights = weights)\n", 111 | " \n", 112 | " if backend.image_data_format() == 'channels_last':\n", 113 | " row_axis, col_axis = (0, 1)\n", 114 | " else:\n", 115 | " row_axis, col_axis = (1, 2)\n", 116 | " rows = input_shape[row_axis]\n", 117 | " cols = input_shape[col_axis]\n", 118 | "\n", 119 | " if weights == 'imagenet':\n", 120 | " if depth_multiplier != 1:\n", 121 | " raise ValueError('If imagenet weights are being loaded, '\n", 122 | " 'depth multiplier must be 1')\n", 123 | " \n", 124 | " if alpha not in [0.25, 0.50, 0.75, 1.0]:\n", 125 | " raise ValueError('If imagenet weights are being loaded, '\n", 126 | " 'alpha can be one of'\n", 127 | " '`0.25`, `0.50`, `0.75` or `1.0` only.')\n", 128 | " \n", 129 | " if rows != cols or rows not in [128, 160, 192, 224]:\n", 130 | " rows = 224\n", 131 | " logging.warning('`input_shape` is undefined or non-square, '\n", 132 | " 'or `rows` is not in [128, 160, 192, 224]. 
'\n", 133 | " 'Weights for input shape (224, 224) will be'\n", 134 | " ' loaded as the default.')\n", 135 | " \n", 136 | " if input_tensor is None:\n", 137 | " img_input = layers.Input(shape = input_shape)\n", 138 | " else:\n", 139 | " if not backend.is_keras_tensor(input_tensor):\n", 140 | " img_input = layers.Input(tensor = input_tensor, shape = input_shape)\n", 141 | " else:\n", 142 | " img_input = input_tensor\n", 143 | "\n", 144 | " x = _conv_block(img_input, 32, alpha, stirdes = (2, 2))\n", 145 | " x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id = 1)\n", 146 | " x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, strides = (2, 2), block_id = 2)\n", 147 | "\n", 148 | " x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id = 3)\n", 149 | " x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, strides = (2, 2), block_id = 4)\n", 150 | "\n", 151 | " x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id = 5)\n", 152 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, strides = (2, 2), block_id = 6)\n", 153 | "\n", 154 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 7)\n", 155 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 8)\n", 156 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 9)\n", 157 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 10)\n", 158 | " x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id = 11)\n", 159 | "\n", 160 | " x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, strides = (2, 2), block_id = 12)\n", 161 | " x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id = 13)\n", 162 | "\n", 163 | " if include_top:\n", 164 | " if backend.image_data_format() == 'channels_first':\n", 165 | " shape = (int(1024 * alpha), 1, 1)\n", 166 | " else:\n", 167 | " shape = (1, 1, int(1024 * alpha))\n", 168 | "\n", 169 | " x = layers.GlobalAvergarePooling2D()(x)\n", 170 | " x = layers.Reshape(shape, name = 'reshape_1')(x)\n", 171 | " x = layers.Dropout(dropout, name = 'dropout')(x)\n", 172 | " x = layers.Conv2D(classes, (1, 1), padding = 'same', name = 'conv_preds')(x)\n", 173 | " x = layers.Reshape((classes,), name = 'reshape_2')(x)\n", 174 | " imagenet_utils.validate_activation(classifier_activation, weights)\n", 175 | " x = layers.Activation(activation = classifier_activation, name = 'predictions')(x)\n", 176 | "\n", 177 | " else:\n", 178 | " if pooling == 'avg':\n", 179 | " x = layers.GlobalAveragePooling2D()(x)\n", 180 | " elif pooling == 'max':\n", 181 | " x = layers.GlobalMaxPooling2D()(x)\n", 182 | "\n", 183 | " if input_tensor is not None:\n", 184 | " inputs = layer_utils.get_source_inputs(input_tensor)\n", 185 | " else:\n", 186 | " inputs = img_input\n", 187 | "\n", 188 | " #모델 생성\n", 189 | " model = training.Model(inputs, x, name = 'mobilent_%0.2f_%s' % (alpha, rows))\n", 190 | "\n", 191 | " #가중치 불러오기\n", 192 | " if weights == 'imagenet':\n", 193 | " if alpha == 1.0:\n", 194 | " alpha_test = '1_0'\n", 195 | " elif alpha == 0.75:\n", 196 | " aplha_text = '7_5'\n", 197 | " elif alpha == 0.50:\n", 198 | " alpha_text = '5_0'\n", 199 | " else:\n", 200 | " alpha_text = '2_5'\n", 201 | "\n", 202 | " if include_top:\n", 203 | " model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows)\n", 204 | " weight_path = BASE_WEIGHT_PATH + model_name\n", 205 | " weights_path = data_utils.get_file(model_name, weight_path, cache_subdir = 'models')\n", 206 | " else:\n", 207 | " model_name = 
'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows)\n", 208 | " weight_path = BASE_WEIGHT_PATH + model_name\n", 209 | " weights_path = data_utils.get_file(model_name, weight_path, cache_subdir = 'models')\n", 210 | " model.load_weights(weights_path)\n", 211 | " elif weights is not None:\n", 212 | " model.load_weights(weights)\n", 213 | "\n", 214 | " return model\n", 215 | "\n", 216 | "def _conv_block(inputs, filters, alpha, kernel = (3, 3), strides = (1, 1)):\n", 217 | " #inputs: 'channels_last'면 (rows, cols, 3) / 'channels_first'면 (3, rows, cols) 식으로 입력 조정\n", 218 | " #filters: output space의 차원수\n", 219 | " #alpha: network의 width 조정. alpha가 1.0보다 작으면 각 레이어의 필터 수 줄어듬\n", 220 | " #반면에, alpha가 1.0 보다 크다면 각 레이어의 필터 수가 증가함\n", 221 | " #kernel: 합성곱 윈도우의 height와 width 조정\n", 222 | " #strides: stride 정의\n", 223 | " \n", 224 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n", 225 | " filters = int(filters * alpha)\n", 226 | " x = layers.Conv2D(filters, kernel, padding = 'same', use_bias = False,\n", 227 | " strides = strides, name = 'conv1')(inputs)\n", 228 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv1_bn')(x)\n", 229 | " return layers.ReLU(6., name = 'conv1_relu')(x)\n", 230 | "\n", 231 | "def _depthwise_conv_block(inputs, pointwise_conv_filters, alpha, depth_multiplier = 1,\n", 232 | " strides = (1, 1), block_id = 1):\n", 233 | " #input: 입력 텐서의 모양. 이전의 정의와 동일\n", 234 | " #pointwise_conv_filters: output space의 차원수\n", 235 | " #alpha: 이전의 정의와 동일\n", 236 | " #depth_multiplier: 각 입력 채널에 대한 depthwise convolution output channel의 수\n", 237 | " #strides: 이전의 정의와 동일\n", 238 | " #block_id: block의 수를 관리하기 위한 특별한 integer\n", 239 | "\n", 240 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n", 241 | " filters = int(filters * alpha)\n", 242 | " \n", 243 | " if strides == (1, 1):\n", 244 | " x = inputs\n", 245 | " else:\n", 246 | " x = layers.ZeroPadding2D(((0, 1), (0, 1)), name = 'conv_pad_%d' % block_id)(inputs)\n", 247 | "\n", 248 | " x = layers.DepthwiseConv2D((3, 3), padding = 'same' if strides == (1, 1) else 'valid',\n", 249 | " depth_multiplier = depth_multiplier, strides = strides,\n", 250 | " use_bias = False, name = 'conv_dw_%d' % block_id)(x)\n", 251 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv_dw_%d_bn' % block_id)(x)\n", 252 | " x = layers.ReLU(6., name = 'conv_dw_%d_relu' % block_id)(x)\n", 253 | " \n", 254 | " x = layers.Conv2D(pointwise_conv_filters, (1, 1), padding = 'same', use_bias = False, \n", 255 | " strides = (1, 1), name = 'conv_dw_%d' % block_id)(x)\n", 256 | " x = layers.BatchNormalization(axis = channel_axis, name = 'conv_dw_%d_bn' % block_id)(x)\n", 257 | " return layers.ReLU(6., name = 'conv_dw_%d_relu' % block_id)(x)\n", 258 | "\n", 259 | "@keras_export('keras.applications.mobilenet.preprocess_input')\n", 260 | "def preprocess_input(x, data_format = None):\n", 261 | " return imagenet_utils.preprocess_input(x, data_format = data_format, mode = 'tf')\n", 262 | "\n", 263 | "@keras_export('keras.applications.mobilenet.decode_predictions')\n", 264 | "def decode_predictions(preds, top = 5):\n", 265 | " return imagenet_utils.decode_predictions(preds, top = top)\n", 266 | "\n", 267 | "preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format(\n", 268 | " mode='',\n", 269 | " ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF,\n", 270 | " error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC)\n", 271 | "decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__" 272 | 
], 273 | "metadata": { 274 | "id": "bcnEZ5GDo6cq" 275 | }, 276 | "execution_count": null, 277 | "outputs": [] 278 | } 279 | ] 280 | } -------------------------------------------------------------------------------- /Computer Vision/CNN/README.md: -------------------------------------------------------------------------------- 1 | # Various CNN models implementation 2 | 3 | I implemented GoogLeNet, ResNet, DenseNet, EfficientNet, MobileNet. 4 | 5 | You can check my CNN models paper review here -> https://cartinoe5930.tistory.com/entry/CNN-network%EC%9D%98-%EC%97%AD%EC%82%AC 6 | -------------------------------------------------------------------------------- /Computer Vision/CNN/ResNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOi4qpO/A/t6wycK3+hwkbI", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "el5SGMXsyXow" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import torch\n", 38 | "import torch.nn as nn\n", 39 | "#from .utils import load_state_from_url\n", 40 | "\n", 41 | "#ResNet 모델 종류\n", 42 | "__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',\n", 43 | " 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',\n", 44 | " 'wide_resnet50_2', 'wide_resnet101_2']\n", 45 | "\n", 46 | "#ResNet 모델별 URL\n", 47 | "model_urls = {\n", 48 | " 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',\n", 49 | " 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',\n", 50 | " 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',\n", 51 | " 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',\n", 52 | " 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',\n", 53 | " 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',\n", 54 | " 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',\n", 55 | " 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',\n", 56 | " 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',\n", 57 | "}\n", 58 | "\n", 59 | "#3X3 conv layer 구현\n", 60 | "def conv3x3(in_planes, out_planes, stride = 1, groups = 1, dilation = 1):\n", 61 | " #padding과 함께 3x3 conv layer 구현\n", 62 | " return nn.Conv2d(in_planes, out_planes, kernel_size = 3, stride = stride, padding = dilation, groups = groups, bias = False, dilation = dilation)\n", 63 | "\n", 64 | "def conv1x1(in_planes, out_planes, stride = 1):\n", 65 | " #1X1 conv layer 구현\n", 66 | " return nn.Conv2d(in_planes, out_planes, kernel_size = 1, stride = stride, bias = True)\n", 67 | "\n", 68 | "class BasicBlock(nn.Module):\n", 69 | " expansion = 1\n", 70 | "\n", 71 | " def __init__(self, inplanes, planes, stride = 1, downsample = None, groups = 1,\n", 72 | " base_width = 64, dilation = 1, norm_layers = None):\n", 73 | " super(BasicBlock, self).__init__()\n", 74 | " if norm_layer is None:\n", 75 | " norm_layer = 
nn.BatchNorm2d\n", 76 | " if groups != 1 or base_width != 64:\n", 77 | " raise ValueError('BasicBlock only supports groups = 1 and base_width = 64')\n", 78 | " if dilation > 1:\n", 79 | " raise NotImplementedError('Dilation > 1 not supported in BasicBlock')\n", 80 | "\n", 81 | " #stride가 1일 때, self.conv layer와 self.downsample layer는 입력을 downsample함\n", 82 | " self.conv1 = conv3x3(inplanes, planes, stride)\n", 83 | " self.bn1 = norm_layer(planes)\n", 84 | " self.relu = nn.ReLU(inplace = True)\n", 85 | " self.conv2 = conv3x3(planes, planes)\n", 86 | " self.bn2 = norm_layer(planes)\n", 87 | " self.downsample = downsample\n", 88 | " self.stride = stride\n", 89 | "\n", 90 | " def forward(self, x):\n", 91 | " identity = x\n", 92 | "\n", 93 | " out = self.conv1(x)\n", 94 | " out = self.bn1(out)\n", 95 | " out = self.relu(out)\n", 96 | " \n", 97 | " out = self.conv2(out)\n", 98 | " out = self.bn2(out)\n", 99 | "\n", 100 | " if self.downsample is not None:\n", 101 | " identity = self.downsample(x)\n", 102 | "\n", 103 | " out += identity\n", 104 | " out = self.relu(out)\n", 105 | "\n", 106 | " return out\n", 107 | "\n", 108 | "\n", 109 | "class Bottleneck(nn.Module):\n", 110 | " expansion = 4\n", 111 | "\n", 112 | " def __init__(self, inplanes, planes, stride = 1, downsample = None, groups = 1, \n", 113 | " base_width = 64, dilation = 1, norm_layer = None):\n", 114 | " super(Bottleneck, self).__init__()\n", 115 | " if norm_layer is None:\n", 116 | " norm_layer = nn.BatchNorm2d\n", 117 | " width = int(planes * (base_width / 64.)) * groups\n", 118 | " self.conv1 = conv1x1(inplanes, width)\n", 119 | " self.bn1 = norm_layer(width)\n", 120 | " self.conv2 = conv3x3(width, width, stride, groups, dilation)\n", 121 | " self.bn2 = norm_layer(width)\n", 122 | " self.conv3 = conv1x1(width, planes * self.expansion)\n", 123 | " self.bn3 = norm_layer(planes * self.expansion)\n", 124 | " self.relu = nn.ReLU(inplace = True)\n", 125 | " self.downsample = downsample\n", 126 | " self.stride = stride\n", 127 | "\n", 128 | " def forward(self, x):\n", 129 | " identity = x\n", 130 | "\n", 131 | " out = self.conv1(x)\n", 132 | " out = self.bn1(out)\n", 133 | " out = self.relu(out)\n", 134 | "\n", 135 | " out = self.conv2(out)\n", 136 | " out = self.bn2(out)\n", 137 | " out = self.relu(out)\n", 138 | "\n", 139 | " out = self.conv3(out)\n", 140 | " out = self.bn3(out)\n", 141 | "\n", 142 | " if self.downsample is not None:\n", 143 | " identity = self.downsample(x)\n", 144 | "\n", 145 | " out += identity\n", 146 | " out = self.relu(out)\n", 147 | "\n", 148 | " return out\n", 149 | "\n", 150 | "class ResNet(nn.Module):\n", 151 | "\n", 152 | " def __init__(self, block, layers, num_classes = 1000, zero_init_residual = False,\n", 153 | " groups = 1, width_per_group = 64, replace_stride_width_dilation = None):\n", 154 | " super(ResNet, self).__init__()\n", 155 | " if norm_layer is None:\n", 156 | " norm_layer = nn.BacthNorm2d\n", 157 | " self.norm_layer = norm_layer\n", 158 | "\n", 159 | " self.inplanes = 64\n", 160 | " self.dilation = 1\n", 161 | " if replace_stride_width_dilation is None:\n", 162 | " replace_stride_width_dilation = [False, False, False]\n", 163 | " if len(replace_stride_width_dilation) != 3:\n", 164 | " raise ValueError(\"replace_stride_width_dilation should be None\"\n", 165 | " \"of a 3-element tuple, got {}\".format(replace_stride_width_dilation))\n", 166 | " self.groups = groups\n", 167 | " self.base_width = width_per_group\n", 168 | " self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size = 7, stride = 2, 
padding = 3, bias = True)\n", 169 | " self.bn1 = norm_layer(self.inplanes)\n", 170 | " self.relu = nn.ReLU(inplace = True)\n", 171 | " self.maxpool = nn.MaxPool2D(kernel_size = 3, stride = 2, padding = 1)\n", 172 | " self.layer1 = self._make_layer(block, 64, layers[0])\n", 173 | " self.layer2 = self._make_layer(block, 128, layers[1], stride = 2, \n", 174 | " dilate = replace_stride_width_dilation[0])\n", 175 | " self.layer3 = self._make_layer(block, 256, layers[2], stride = 2, \n", 176 | " dilate = replace_stride_width_dilation[1])\n", 177 | " self.layer4 = self._make_layer(block, 512, layers[3], stride = 2, \n", 178 | " dilate = replace_stride_width_dilation[2])\n", 179 | " self.avgpool = nn.AdaptiveAvgPool2d((1, 1))\n", 180 | " self.fc = nn.Linear(512 * block.expansion, num_classes)\n", 181 | "\n", 182 | " for m in self.modules():\n", 183 | " if isinstance(m, nn.Conv2d):\n", 184 | " nn.init.kaiming_normal_(m.weight, mode = 'fan_out', nonlinearity = 'relu')\n", 185 | " elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):\n", 186 | " nn.init.constant_(m.weight, 1)\n", 187 | " nn.init.constant_(m.bias, 0)\n", 188 | "\n", 189 | " if zero_init_residual:\n", 190 | " for m in self.modules():\n", 191 | " if isinstance(m, Bottleneck):\n", 192 | " nn.init.constant_(m.bn3.weight, 0)\n", 193 | " elif isinstance(m, BasicBlock):\n", 194 | " nn.init.constant_(m.bn2.weight, 0)\n", 195 | "\n", 196 | " def _make_layer(self, block, planes, blocks, stride = 1, dilate = False):\n", 197 | " norm_layer = self._norm_layer\n", 198 | " downsample = None\n", 199 | " previous_dilation = self.dilation\n", 200 | " if dilate:\n", 201 | " self.dilation *= stride\n", 202 | " stride = 1\n", 203 | " if stride != 1 or self.inplanes != planes * block.expansion:\n", 204 | " downsample = nn.Sequential(\n", 205 | " conv1x1(self.inplanes, planes * block.expansion, stride), \n", 206 | " norm_layer(planes * block.expansion),\n", 207 | " )\n", 208 | " \n", 209 | " layers = []\n", 210 | " layers.append(block(self.inplanes, planes, stride, downsample, self.groups,\n", 211 | " self.base_width, previous_dilation, norm_layer))\n", 212 | " self.inplanes = planes * block.expansion\n", 213 | "\n", 214 | " for _ in range(1, blocks):\n", 215 | " layers.append(block(self.inplanes, planes, groups = self.groups,\n", 216 | " base_width = self.base_width, dilation = self.dilation,\n", 217 | " norm_layer = norm_layer))\n", 218 | " \n", 219 | " return nn.Sequential(*layers)\n", 220 | "\n", 221 | " def _forward_impl(self, x):\n", 222 | " x = self.conv1(x)\n", 223 | " x = self.bn1(x)\n", 224 | " x = self.relu(x)\n", 225 | " x = self.maxpool(x)\n", 226 | "\n", 227 | " x = self.layer1(x)\n", 228 | " x = self.layer2(x)\n", 229 | " x = self.layer3(x)\n", 230 | " x = self.layer4(x)\n", 231 | "\n", 232 | " x = self.avgpool(x)\n", 233 | " x = torch.flatten(x, 1)\n", 234 | " x = self.fc(x)\n", 235 | "\n", 236 | " return x\n", 237 | "\n", 238 | " def forward(self, x):\n", 239 | " return self._forward_impl(x)\n", 240 | "\n", 241 | "def _resnet(arch, block, layers, pretrained, progress, **kwargs):\n", 242 | " model = ResNet(block, layers, **kwargs)\n", 243 | " if pretrained:\n", 244 | " state_dict = load_state_dict_from_url(model_urls[arch], progress = progress)\n", 245 | " model.load_state_dict(state_dict)\n", 246 | " return model\n", 247 | "\n", 248 | "def resnext50_32x4d(pretrained = False, progress = True, **kwargs):\n", 249 | " kwargs['groups'] = 32\n", 250 | " kwargs['width_per_group'] = 4\n", 251 | " return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 
6, 3],\n", 252 | " pretrained, progress, **kwargs)\n", 253 | " \n", 254 | "def resnext101_32x8d(pretrained = False, progress = True, **kwargs):\n", 255 | " kwargs['groups'] = 32\n", 256 | " kwargs['width_pre_group'] = 8\n", 257 | " return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],\n", 258 | " pretrained, progress, **kwargs)" 259 | ] 260 | } 261 | ] 262 | } -------------------------------------------------------------------------------- /Computer Vision/CNN/Xception.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOnDX2S9zG8B6oYDzT8Z794", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "#Xception 모델 설명\n", 33 | "#당시, ImageNet에 대해서 SoTA를 차지함\n", 34 | "#VGG16과 ResNet의 입력 이미지 크기(224x224)와 다르게 (299x299)를 사용함\n", 35 | "#전처리 방식도 다름(Inception V3와 동일)\n", 36 | "\n", 37 | "from __future__ import absolute_import\n", 38 | "from __future__ import division\n", 39 | "from __future__ import print_function\n", 40 | "\n", 41 | "import os\n", 42 | "import warnings\n", 43 | "\n", 44 | "import keras\n", 45 | "from keras import layers\n", 46 | "from keras.models import Sequential\n", 47 | "from keras import backend\n", 48 | "#얘네는 오류 발생\n", 49 | "#from . import get_submodules_from_kwargs\n", 50 | "#from . import imagenet_utils\n", 51 | "#from .imagenet_utils import decode_predictions\n", 52 | "#from .imagenet_utils import _obtain_input_shape\n", 53 | "\n", 54 | "TF_WEIGHTS_PATH = (\n", 55 | " 'https://github.com/fchollet/deep-learning-models/'\n", 56 | " 'releases/download/v0.4/'\n", 57 | " 'xception_weights_tf_dim_ordering_tf_kernels.h5'\n", 58 | ")\n", 59 | "\n", 60 | "TF_WEIGHTS_PATH_NO_TOP = (\n", 61 | " 'https://github.com/fchollet/deep-learning-models/'\n", 62 | " 'releases/download/v0.4/'\n", 63 | " 'xception_weights_tf_dim_ordering_tf_kernels_notop.h5'\n", 64 | ")\n", 65 | "\n", 66 | "def Xception(include_top = True, weights = 'imagenet', input_tensor = None, \n", 67 | " input_shape = None, pooling = None, classes = 1000, **kwargs):\n", 68 | " \n", 69 | " #기본 입력 이미지의 크기는 299 x 299\n", 70 | " #include_top: network의 맨 위에서 fc-layer을 포함할 지\n", 71 | " #weights: 'None'은 무작위, 'imagenet'은 Imagenet에서 pre-training, 또는 업로드할 파일 경로\n", 72 | " #input_tensor: 모델의 입력 이미지에 대해 사용할 추가적인 keras tensor\n", 73 | " #input_shape: 옵션적 tuple 모양, 'include_top'이 False일 때만 사용 가능\n", 74 | " #pooling: feature extraction을 위한 옵션적 pooling mode, 'include_top'이 False일 때만 사용 가능\n", 75 | " #'None': 모델 출력이 4D tensor, 'avg': global average pooling이고 output은 2D tensor\n", 76 | " #'max': global max pooling\n", 77 | " #classes: 옵션적 class 수. 
'include_top'이 True일 때와 'weights'가 명시되지 않았을 때 사용 가능\n", 78 | "\n", 79 | " #weights에 아무런 값이 없을 때\n", 80 | " if not (weights in {'imagenet', None} or os.path.exists(weights)):\n", 81 | " raise ValueError('The `weights` argument should be either '\n", 82 | " '`None` (random initialization), `imagenet` '\n", 83 | " '(pre-training on ImageNet), '\n", 84 | " 'or the path to the weights file to be loaded.')\n", 85 | " \n", 86 | " #imagenet을 weights로 사용하는데 조건이 맞지 않을 때\n", 87 | " if weights == 'imagenet' and include_top and classes != 1000:\n", 88 | " raise ValueError('If using `weights` as `\"imagenet\"` with `include_top`'\n", 89 | " ' as true, `classes` should be 1000')\n", 90 | " \n", 91 | " #적절한 입력 모양 결정\n", 92 | " input_shape = _obtain_input_shape(input_shape, default_size = 299, min_size = 71,\n", 93 | " data_format = backend.image_data_format(),\n", 94 | " require_flatten = include_top, weights = weights)\n", 95 | " \n", 96 | " if input_tensor is None:\n", 97 | " img_input = layers.Input(shape = input_shape)\n", 98 | " else:\n", 99 | " if not backend.is_keras_tensor(input_tensor):\n", 100 | " img_input = layers.Input(tensor = input_tensor, shape = input_shape)\n", 101 | " else:\n", 102 | " img_input = input_tensor\n", 103 | "\n", 104 | " channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1\n", 105 | "\n", 106 | " #Entry Flow\n", 107 | " #입력 이미지 단계\n", 108 | " x = layers.Conv2D(32, (3, 3), strides = (2, 2), use_bias = False,\n", 109 | " name = 'block1_conv1')(img_input)\n", 110 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block1_conv1_bn')(x)\n", 111 | " x = layers.Activation('relu', name = 'block1_conv1_act')(x)\n", 112 | " x = layers.Conv2D(64, (3, 3), use_bias = False, name = 'block1_conv2_bn')(x)\n", 113 | " x = layers.Activation('relu', name = 'block1_conv2_act')(x)\n", 114 | "\n", 115 | " #첫 번째 residual network\n", 116 | " residual = layers.Conv2d(128, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n", 117 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n", 118 | "\n", 119 | " x = layers.SeparableConv2D(128, (3, 3), padding = 'same', use_bias = False,\n", 120 | " name = 'block2_sepconv1')(x)\n", 121 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block2_sepconv1_bn')(x)\n", 122 | " x = layers.Activation('relu', name = 'block2_sepconv2_act')(x)\n", 123 | " x = layers.SeparableConv2D(128, (3, 3), padding = 'same', use_bias = 'same', \n", 124 | " name = 'block2_sepconv2')(x)\n", 125 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block2_sepconv2_bn')(x)\n", 126 | "\n", 127 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', \n", 128 | " name = 'block2_pool')(x)\n", 129 | " x = layers.add([x, residual])\n", 130 | "\n", 131 | " #두 번째 residual network\n", 132 | " residual = layers.Conv2d(256, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n", 133 | " residual = layers.BatchNormalization(sxis = channel_axis)(residual)\n", 134 | "\n", 135 | " x = layers.Activation('relu', name = 'block3_conv1_act')(x)\n", 136 | " x = layers.SeparableConv2D(256, (3, 3), strides = 'same', use_bias = False, \n", 137 | " name = 'block3_sepconv1')(x)\n", 138 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block3_sepconv1_bn')(x)\n", 139 | "\n", 140 | " x = layers.Activation('relu', name = 'block3_conv2_act')(x)\n", 141 | " x = layers.SeparableConv2D(256, (3, 3), strides = 'same', use_bias = False, \n", 142 | " name = 'block3_sepconv2')(x)\n", 143 | " x 
= layers.BatchNormalization(axis = channel_axis, name = 'block3_sepconv2_bn')(x)\n", 144 | "\n", 145 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', \n", 146 | " name = 'block3_pool')(x)\n", 147 | "\n", 148 | " x = layers.add([x, residual])\n", 149 | "\n", 150 | " #세 번째 residual network\n", 151 | " residual = layers.Conv2d(728, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n", 152 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n", 153 | "\n", 154 | " x = layers.Activation('relu', name = 'block4_conv1_act')(x)\n", 155 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 156 | " name = 'block4_sepconv1')(x)\n", 157 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block4_sepconv1_bn')(x)\n", 158 | "\n", 159 | " x = layer.Activation('relu', name = 'block4_conv2_act')(x)\n", 160 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False,\n", 161 | " name = 'block4_sepconv2')(x)\n", 162 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block4_sepconv2_bn')(x)\n", 163 | " \n", 164 | " x = MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', name = 'block4_pool')(x)\n", 165 | "\n", 166 | " x = layers.add([x, residual])\n", 167 | "\n", 168 | " #Middle Flow\n", 169 | " for i in range(8):\n", 170 | " residual = x\n", 171 | " prefix = 'block' + str(i + 5) #블록 이름 지정 자동화\n", 172 | " \n", 173 | " x = layers.Activation('relu', name = prefix + '_sepconv1_act')(x)\n", 174 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 175 | " name = prefix + '_sepconv1')(x)\n", 176 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv1_bn')(x)\n", 177 | "\n", 178 | " x = layers.Activation('relu', name = prefix + '_sepconv2_act')(x)\n", 179 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 180 | " name = prefix + '_sepconv2')(x)\n", 181 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv2_bn')(x)\n", 182 | "\n", 183 | " x = layers.Activation('relu', name = prefix + '_sepconv3_act')(x)\n", 184 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 185 | " name = prefix + '_sepconv3')(x)\n", 186 | " x = layers.BatchNormalization(axis = channel_axis, name = prefix + '_sepconv3_bn')(x)\n", 187 | "\n", 188 | " x = layers.add([x, residual])\n", 189 | "\n", 190 | " #Exit Flow\n", 191 | " residual = layers.Conv2d(1024, (1, 1), strides = (2, 2), padding = 'same', use_bias = False)(x)\n", 192 | " residual = layers.BatchNormalization(axis = channel_axis)(residual)\n", 193 | "\n", 194 | " x = layers.Activation('relu', name = 'block13_sepconv1_act')(x)\n", 195 | " x = layers.SeparableConv2D(728, (3, 3), strides = 'same', use_bias = False, \n", 196 | " name = 'block13_sepconv1')(x)\n", 197 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block13_sepconv1_bn')(x)\n", 198 | "\n", 199 | " x = layers.Activation('relu', name = 'block13_speconv2_act')(x)\n", 200 | " x = layers.SeparableConv2D(1024, (3, 3), strides = 'same', use_bias = False, \n", 201 | " name = 'block13_sepconv2')(x)\n", 202 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block13_sepconv2_bn')(x)\n", 203 | "\n", 204 | " x = layers.MaxPooling2D((3, 3), strides = (2, 2), padding = 'same', name = 'block13_pool')(x)\n", 205 | "\n", 206 | " x = layers.add([x, residual])\n", 207 | "\n", 208 | " x = layers.SeparableConv2D(1536, (3, 3), strides = 
'same', use_biad = False,\n", 209 | " name = 'block14_sepconv1')(x)\n", 210 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block14_sepconv1_bn')(x)\n", 211 | " x = layers.Activation('relu', name = 'block14_sepconv_act')(x)\n", 212 | "\n", 213 | " x = layers.SeparableConv2D(2048, (3, 3), strides = 'same', use_bias = False,\n", 214 | " name = 'block14_sepconv2')(x)\n", 215 | " x = layers.BatchNormalization(axis = channel_axis, name = 'block14_speconv2_bn')(x)\n", 216 | " x = layers.Activation('relu', name = 'block14_sepconv2_act')(x)\n", 217 | "\n", 218 | " if include_top:\n", 219 | " x = layers.GlobalAveragePooling2D(name = 'avg_pool')(x)\n", 220 | " x = layers.Dense(classes, activation = 'softmax', name = 'predictions')(x)\n", 221 | " else:\n", 222 | " if pooling == 'avg':\n", 223 | " x = layers.GlobalAveragePooling2D()(x)\n", 224 | " elif pooling == 'max':\n", 225 | " x = layers.MaxPooling2D()(x)\n", 226 | "\n", 227 | " if input_tensor is not None:\n", 228 | " inputs = keras_utils.get_source_inputs(input_tensor)\n", 229 | " else:\n", 230 | " inputs = img_input\n", 231 | "\n", 232 | " #모델 생성\n", 233 | " if weights == 'imagenet':\n", 234 | " if include_top:\n", 235 | " weights_path = keras_utils.get_file(\n", 236 | " 'xception_weights_tf_dim_ordering_tf_kernels.h5',\n", 237 | " TF_WEIGHTS_PATH,\n", 238 | " cache_subdir='models',\n", 239 | " file_hash='0a58e3b7378bc2990ea3b43d5981f1f6'\n", 240 | " )\n", 241 | " else:\n", 242 | " weights_path = keras_utils.get_file(\n", 243 | " 'xception_weights_tf_dim_ordering_tf_kernels_notop.h5',\n", 244 | " TF_WEIGHTS_PATH_NO_TOP,\n", 245 | " cache_subdir='models',\n", 246 | " file_hash='b0042744bf5b25fce3cb969f33bebb97'\n", 247 | " )\n", 248 | " model.load_weights(weights_path)\n", 249 | " if backend.backend() == 'theano':\n", 250 | " keras_utils.convert_all_kernels_in_model(model)\n", 251 | " elif weights is not None:\n", 252 | " model.load_weights(weights)\n", 253 | "\n", 254 | " return model\n", 255 | "\n", 256 | "def preprocess_input(x, **kwargs):\n", 257 | " #Numpy 배열을 이미지 배치로 전처리\n", 258 | " return imagenet_utils.preprocess_input(x, mode = 'tf', **kwargs)" 259 | ], 260 | "metadata": { 261 | "id": "2OJ6-sJPoqxo" 262 | }, 263 | "execution_count": null, 264 | "outputs": [] 265 | } 266 | ] 267 | } -------------------------------------------------------------------------------- /Computer Vision/README.md: -------------------------------------------------------------------------------- 1 | # Computer Vision Paper Implementation 2 | 3 | I read those Deep Learning papers and implemented them by coding. 😉 4 | There are some that have not yet been implemented, but will be implemented additionally in the future. 
😊
5 | 
6 | |Paper Title|Paper or reference site Link|Paper Review|
7 | |---|---|---|
8 | |history of CNN|LeNet, AlexNet, VGGNet, GoogLeNet, ResNet, ResNeXt, Xception, MobileNet, DenseNet, EfficientNet, ConvNeXt|https://cartinoe5930.tistory.com/entry/CNN-network%EC%9D%98-%EC%97%AD%EC%82%AC|
9 | |ViT: An Image is Worth 16 x 16 Words: Transformers for Image Recognition at Scale|https://arxiv.org/abs/2010.11929|https://cartinoe5930.tistory.com/entry/ViT-An-Image-Worth-16-x-16-Words-Transformers-for-Image-Recognition-at-Scale|
10 | |Swin Transformer: Hierarchical Vision Transformer using Shifted Windows|https://arxiv.org/abs/2103.14030|https://cartinoe5930.tistory.com/entry/Swin-Transformer-Hierarchical-Vision-Transformer-using-Shifted-Windows-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
11 | |CLIP: Learning Transferable Visual Models From Natural Language Supervision|https://arxiv.org/abs/2103.00020|https://cartinoe5930.tistory.com/entry/CLIP-Learning-Transferable-Visual-Models-From-Natural-Language-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0|
12 | 
-------------------------------------------------------------------------------- /Multimodal Models/FLAVA/README.md: --------------------------------------------------------------------------------
1 | # Interacting with FLAVA
2 | 
3 | Written with reference to https://github.com/apsdehal/flava-tutorials
4 | 
5 | paper review: https://cartinoe5930.tistory.com/entry/FLAVA-A-Foundational-Language-And-Vision-Alignment-Model-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0
6 | 
-------------------------------------------------------------------------------- /Multimodal Models/README.md: --------------------------------------------------------------------------------
1 | # Multimodal Models paper code implementation
2 | 
3 | I read these Multimodal Models papers and implemented them in code (PyTorch, TensorFlow, etc.). 😉
4 | There are some that have not yet been implemented, but they will be added in the future. 
😊 5 | 6 | ## Multi-modal Models 7 | 8 | |Paper Title|Paper or reference site Link|Paper Review| 9 | |---|---|---| 10 | |Let's learn about VLM(Visual-Language Model)|https://huggingface.co/blog/vision_language_pretraining#supporting-vision-language-models-in-%F0%9F%A4%97-transformers|https://cartinoe5930.tistory.com/entry/VLMVision-Language-Model%EC%97%90-%EB%8C%80%ED%95%B4-%EC%95%8C%EC%95%84%EB%B3%B4%EC%9E%90| 11 | |VisualBERT: A simple and Performant Baseline for Vision and Language |https://arxiv.org/abs/1908.03557|https://cartinoe5930.tistory.com/entry/VisualBERT-A-Simple-and-Performant-Baseline-for-Vision-and-Language-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 12 | |ViLBERT: Pre-training Task-Agnostic Visiolinguistic Representations for Visual-and-Language Tasks|https://arxiv.org/abs/1908.02265|https://cartinoe5930.tistory.com/entry/ViLBERT-Pretraining-Task-Agnostic-Visiolinguistic-Representations-for-Visual-and-Language-Tasks| 13 | |LXMERT: Learning Cross-Modality Encoder Representations from Transformers|https://arxiv.org/abs/1908.07490|https://cartinoe5930.tistory.com/entry/LXMERT-Learning-Cross-Modality-Encoder-Representations-from-Transformers-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 14 | |VL-BERT: Pre-training of Generic Visual-Linguistic Representations|https://arxiv.org/abs/1908.08530|https://cartinoe5930.tistory.com/entry/VL-BERT-Pre-training-of-Generic-Visual-Linguistic-Representations-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 15 | |VLP: Unified Vision-Language Pre-Training for Image Captioning and VQA|https://arxiv.org/abs/1909.11059|https://cartinoe5930.tistory.com/entry/VLP-Unified-Vision-Language-Pre-Traning-for-Image-Captioning-and-VQA-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 16 | |Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks|https://arxiv.org/abs/2004.06165|https://cartinoe5930.tistory.com/entry/Oscar-Object-Semantics-Aligned-Pre-training-for-Vision-Language-Tasks-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 17 | |ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision|https://arxiv.org/abs/2102.03334|https://cartinoe5930.tistory.com/entry/ViLT-Vision-and-Language-Transformer-Without-Convolution-or-Region-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 18 | |ALIGN: Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision|https://arxiv.org/abs/2102.05918|https://cartinoe5930.tistory.com/entry/ALIGN-Scaling-up-Visual-and-Vision-Language-Representation-with-Noisy-Text-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 19 | |ALBEF: Vision and Language Representation Learning with Momentum Distillation|https://arxiv.org/abs/2107.07651|https://cartinoe5930.tistory.com/entry/ALBEF-Vision-and-Language-Representation-Learning-with-Momentum-Distillation-%EB%85%BC%EB%AC%B8| 20 | |SimVLM: Simple Visual Language Model Pretraining with Weak Supervision|https://arxiv.org/abs/2108.10904|https://cartinoe5930.tistory.com/entry/SimVLM-Simple-Visual-Language-Model-Pre-training-with-Weak-Supervision-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 21 | |BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation|https://arxiv.org/abs/2201.12086|https://cartinoe5930.tistory.com/entry/BLIP-Bootstrapping-Language-Image-Pre-training-fro-Unified-Vision-Language-Understanding-and-Generation-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 22 | |FLAVA: A Foundational Language And Vision Alignment 
Model|https://arxiv.org/abs/2112.04482|https://cartinoe5930.tistory.com/entry/FLAVA-A-Foundational-Language-And-Vision-Alignment-Model-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0| 23 | -------------------------------------------------------------------------------- /Natural Language Processing/ALBERT/README.md: -------------------------------------------------------------------------------- 1 | # ALBERT Implementation 2 | 3 | https://github.com/google-research/albert/blob/master/modeling.py 참고하여 작성됌. 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/ALBERT-A-Lite-BERT-for-Self-supervised-Learning-of-Language-Representations-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/BERT/BERT_구현_복습.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNIeutm5STI86h0MtzDj0Xc", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "# BERT 구현 복습\n", 33 | "\n", 34 | "이미 한 번 BERT를 구현했던 적이 있는데, 이번에는 좀 더 구체적인 example을 사용하여 직접 구현해보도록 하겠다. 이 코드는 [여기](https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial)를 참고하여 작성되었다.\n", 35 | "\n", 36 | "BERT를 PyTorch를 이용하여 구현하였고, BERT를 구현하는 과정을 다음과 같이 4개의 섹션으로 나눴다.\n", 37 | "\n", 38 | "1. 전처리\n", 39 | "2. 모델링\n", 40 | "3. Loss & Optimization\n", 41 | "4. 훈련\n" 42 | ], 43 | "metadata": { 44 | "id": "Nrb7y3QLDj3t" 45 | } 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "source": [ 50 | "### 전처리\n", 51 | "\n", 52 | "전처리 과정에서는 신경망이 데이터를 처리할 수 있도록 다음과 같이 data를 구축한다. 일단 raw text부터 시작해보도록 하자." 53 | ], 54 | "metadata": { 55 | "id": "b0Qw4c4uEPTv" 56 | } 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "id": "1fTURcMQDe59" 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "# raw text\n", 67 | "\n", 68 | "text = (\n", 69 | " 'Hello, how are you? I am Romeo.n'\n", 70 | " 'Hello, Romeo My name is Juliet. Nice to meet you.n'\n", 71 | " 'Nice meet you too. How are you today?n'\n", 72 | " 'Great. My baseball team won the competition.n'\n", 73 | " 'Oh Congratulations, Julietn'\n", 74 | " 'Thanks you Romeo'\n", 75 | " )" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "source": [ 81 | "그 다음에 데이터를 다음과 같이 정리해야 한다.\n", 82 | "\n", 83 | "- 문장을 소문자로 변환\n", 84 | "- vocabulary를 만듦. **Vocabulary**는 문서 내의 독특한 단어의 list임." 85 | ], 86 | "metadata": { 87 | "id": "vkL4zzsxElEn" 88 | } 89 | }, 90 | { 91 | "cell_type": "code", 92 | "source": [ 93 | "# '.', ',', '?', '!' filtering\n", 94 | "sentences = re.sub(\"[.,!?-]\", '', text.lower()).split('n')\n", 95 | "\n", 96 | "word_list = list(set(\" \".join(sentences).split()))" 97 | ], 98 | "metadata": { 99 | "id": "OXS-z3vEE1ir" 100 | }, 101 | "execution_count": null, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "source": [ 107 | "다음으로, BERT의 학습 도중에 사용되는 special token을 잘 기억해야 한다. 
다음은 이 다양한 토큰들에 대한 설명이다.\n", 108 | "\n", 109 | "- [CLS]: 첫 번째 토큰은 항상 classification\n", 110 | "- [SEP]: 두 개의 문장을 분리\n", 111 | "- [END]: 문장을 끝내기\n", 112 | "- [PAD]: 문장을 똑같은 길이로 줄이기\n", 113 | "- [MASK]: 기존의 단어를 mask로 대체\n", 114 | "\n", 115 | "이러한 토큰들은 word dictionary에 들어가 있어야 하는데, 여기서 vocabulary에 들어가 있는는 각각의 토큰과 단어는 index number가 할당된다." 116 | ], 117 | "metadata": { 118 | "id": "xTaer5nqFH-r" 119 | } 120 | }, 121 | { 122 | "cell_type": "code", 123 | "source": [ 124 | "word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}\n", 125 | "for i, w in enumerate(word_list):\n", 126 | " word_dict[w] = i + 4\n", 127 | " number_dict = {i: w for i, w in enumerate(word_dict)}\n", 128 | " vocab_size = len(word_dict)" 129 | ], 130 | "metadata": { 131 | "id": "Kq4dprH2F5OG" 132 | }, 133 | "execution_count": null, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "source": [ 139 | "이 과정이 완료되면, input sequence를 3개의 유형의 embedding으로 포맷하는 함수를 생성해야 한다.\n", 140 | "\n", 141 | "- **token embedding**\n", 142 | "- **segment embedding**\n", 143 | "- **position embedding**\n", 144 | "\n", 145 | "이제 각각에 대해 알아보도록 하자." 146 | ], 147 | "metadata": { 148 | "id": "NXSxlEK6GPji" 149 | } 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "source": [ 154 | "**token embedding이 무엇일까?**\n", 155 | "\n", 156 | "예를 들어, 문장 \"The cat is walking. The dog is barking.\"이 주어졌을 때, 함수는 다음의 방식대로 sequence를 생성해야 한다.\n", 157 | "\n", 158 | "\"[CLS] the cat is walking [SEP] the dog is barking\"\n", 159 | "\n", 160 | "그 후에, 모든 것들은 word dictionary의 index로 바꿔야 한다. 따라서 이전의 문장은 다음과 같은 형태를 가지게 된다.\n", 161 | "\n", 162 | "\"[1, 5, 7, 9, 10, 2, 5, 6, 9, 11]\"\n", 163 | "\n", 164 | "여기서 1과 2는 각각 [CLS]와 [SEP]를 의미한다.\n", 165 | "\n", 166 | "**segment embedding이 무엇일까?**\n", 167 | "\n", 168 | "segment embedding은 두 개의 문장을 분리하는 역할을 한다. 보통 0과 1로 정의된다.\n", 169 | "\n", 170 | "**position embedding이 무엇일까?**\n", 171 | "\n", 172 | "position embedding은 sequence에서 각 embedding에게 position을 준다.\n", 173 | "\n" 174 | ], 175 | "metadata": { 176 | "id": "urJ0SqDiGwwD" 177 | } 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "source": [ 182 | "이제 다음 단계는 **masking**을 생성하는 것이다.\n", 183 | "\n", 184 | "논문에 의하면, BERT는 sequence의 15% word를 [MASK] 토큰으로 대체하고, padding을 추가하였다. Padding은 모든 문장의 길이를 똑같은 길이로 만들어준다. 예를 들어, 다음과 같은 문장을 받았다고 하였을 때,\n", 185 | "\n", 186 | "\"The cat is walking. The shog is barking at the tree\"\n", 187 | "\n", 188 | "이 문장에 padding을 적용하면 다음과 같이 바뀐다.\n", 189 | "\n", 190 | "\"[CLS] The cat is walking [PAD] [PAD] [PAD]. [CLS] The dog is barking at the tree.\"\n", 191 | "\n", 192 | "첫 번째 문장의 길이가 두 번째 문장의 길이와 같아진다." 
193 | ], 194 | "metadata": { 195 | "id": "86CK0zidJFh9" 196 | } 197 | }, 198 | { 199 | "cell_type": "code", 200 | "source": [ 201 | "def make_batch():\n", 202 | " batch = []\n", 203 | " positive = negative = 0\n", 204 | " while positive != batch_size / 2 or negative != batch_size / 2:\n", 205 | " tokens_a_index, tokens_b_index = randrange(len(sentences)), randrange(len(sentences))\n", 206 | "\n", 207 | " tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]\n", 208 | "\n", 209 | " input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP']]\n", 210 | " segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)\n", 211 | "\n", 212 | " # LM masking\n", 213 | " n_pred = min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 한 문장의 15% 정도의 토큰\n", 214 | " cand_maked_pos = [1 for i, token in enumerate(input_ids) if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]\n", 215 | " shuffle(cand_maked_pos)\n", 216 | " masked_tokens, masked_pos = [], []\n", 217 | " for pos in cand_makes_pos[:n_pred]:\n", 218 | " masked_pos.append(pos)\n", 219 | " masked_tokens.append(input_ids[pos])\n", 220 | " if random() < 0.8: # 80%는 masking\n", 221 | " input_ids[pos] = word_dict['[MASK]']\n", 222 | " elif random() < 0.5: # 10%는 vocabulary에서 random indexing\n", 223 | " index = randint(0, vocab_size - 1)\n", 224 | " input_ids[pos] = word_dict[number_dict[index]]\n", 225 | " \n", 226 | " # Zero padding\n", 227 | " n_pad = maxlen - len(input_ids)\n", 228 | " input_ids.extend([0] * n_pad)\n", 229 | " segment_ids.extend([0] * n_pad)\n", 230 | "\n", 231 | " # Zero padding (100% - 15%) tokens\n", 232 | " if max_pred > n_pred:\n", 233 | " n_pad = max_pred - n_pred\n", 234 | " masked_tokens.extend([0] * n_pad)\n", 235 | " masked_pos.extend([0] * n_pad)\n", 236 | "\n", 237 | " if tokens_a_index + 1 == tokens_b_index and positive < batch_size / 2:\n", 238 | " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext\n", 239 | " elif tokens_a_index + 1 != tokens_b_index and negative < batch_size / 2:\n", 240 | " batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext\n", 241 | " negative += 1\n", 242 | "\n", 243 | " return batch" 244 | ], 245 | "metadata": { 246 | "id": "mZitlUMPNrU-" 247 | }, 248 | "execution_count": null, 249 | "outputs": [] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "source": [ 254 | "next-word prediction을 다루기 때문에, 문장이 이어진 문장인지 아닌지를 예측하는 label을 생성해야 한다. 이것이 바로 IsNext와 NotNext이다. 그래서 다음 문장 앞에 오는 모든 문장에 True를 할당하고 이를 위해 조건문을 사용하였다.\n", 255 | "\n", 256 | "예를 들어, 두 개의 문장이 하나의 document에 있으면, 이 둘은 서로를 문맥적으로 따른다. 따라서서 첫 번째 문장이 A이면 다음 문장은 A+1이어야 한다. 직관적으로 첫 번째 문장의 위치 즉, tokens_a_index + 1 == tokens_b_index, 즉 동일한 context의 두 번째 문장인 경우 이 입력에 대한 label을을 True로 설정할 수 있도록 코드를 작성해야 한다.\n", 257 | "\n", 258 | "만약 위 조건이 tokens_a_index + 1 != tokens_b_index라면 input에 대한 label을 False로 지정해야 한다." 259 | ], 260 | "metadata": { 261 | "id": "3Ifq41KQQlD4" 262 | } 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "source": [ 267 | "### 모델링\n", 268 | "\n", 269 | "BERT는 매우 정교한 모델이라서 느리게 감지되면 논리를 잃게 된다. 그래서 BERT는 component와 함수에 의해 component를 설명하는 것이 가능하다.\n", 270 | "\n", 271 | "BERT는 다음의 component들을 가진다.\n", 272 | "\n", 273 | "1. Embedding layer\n", 274 | "2. Attention Mask\n", 275 | "3. Encoder layer\n", 276 | " - Multi-head attention\n", 277 | " - Scaled dot product attention\n", 278 | " - Position-wise feed-forward network\n", 279 | "4. 
BERT(모든 component를 합침)" 280 | ], 281 | "metadata": { 282 | "id": "G2QPTi8D1R5B" 283 | } 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "source": [ 288 | "#### Embedding Layer\n", 289 | "\n", 290 | "embedding은 BERT의 첫 번째 레이어로 input을 받아서 lookup table을 생성한다. embedding layer의 파라미터는 학습 가능하고, 이는 학습 스포레스가 끝날 때, embedding은 비슷한 단어들끼리 모여있을 거라는 것이다.\n", 291 | "\n", 292 | "embedding layer는 단어 간의 서로 다른 관계를 보존한다. 여기에는 semantic, syntactic, linear, 그리고 BERT가 양방향성이기 때문에, contextual relationship을 잘 보존한다.\n", 293 | "\n", 294 | "BERT의 경우에, 다음 3개의 embedding을 생성한다.\n", 295 | "\n", 296 | "- Token\n", 297 | "- Segments\n", 298 | "- Position\n", 299 | "\n", 300 | "아까 전에 position embedding을 생성하는 함수를 정의해두지는 않았지만, token과 segment를 생성하는 함수는 이미 정의해두었다. 그래서 이제 input을 받아서 sequence에서 각 단어에 대한 position을 생성할 수 있다. 그리고 이는 다음과 같다." 301 | ], 302 | "metadata": { 303 | "id": "k7zuhRtl2cwx" 304 | } 305 | }, 306 | { 307 | "cell_type": "code", 308 | "source": [ 309 | "print(torch.arange(30, dtype = torch.long).expand_as(input_ids))" 310 | ], 311 | "metadata": { 312 | "id": "idIqPc1H3v1q" 313 | }, 314 | "execution_count": null, 315 | "outputs": [] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "source": [ 320 | "forward function에서, 모든 embedding을 합하고 정규화하였다." 321 | ], 322 | "metadata": { 323 | "id": "bSHPzyJL38Pd" 324 | } 325 | }, 326 | { 327 | "cell_type": "code", 328 | "source": [ 329 | "class Embedding(nn.Module):\n", 330 | " def __init__(self):\n", 331 | " super(EMbedding, self).__init__()\n", 332 | " self.tok_embed = nn.Embedding(vocab_size, d_model) # token embedding\n", 333 | " self.pos_embed = nn.Embedding(maxlen, d_model) # position embedding\n", 334 | " self.seg_embed = nn.Embedding(n_segments, d_model) # segment embedding\n", 335 | " self.norm = nn.LayerNorm(d_model)\n", 336 | "\n", 337 | " def forward(self, x, seg):\n", 338 | " seq_len = x.size(1)\n", 339 | " pos = torch.arange(seq_len, dtype = torch.long)\n", 340 | " pos = pos.unsqueeze(0).expand_as(x) # (seq_len,) -> (batch_size, seq_len)\n", 341 | " embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)\n", 342 | "\n", 343 | " return self.norm(embedding)" 344 | ], 345 | "metadata": { 346 | "id": "3Z5lR_DF4FBO" 347 | }, 348 | "execution_count": null, 349 | "outputs": [] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "source": [ 354 | "#### attention mask 생성\n", 355 | "\n", 356 | "BERT는 attention mask 또한 필요로 한다. 그리고 이것은 적절한 형식이 되어야 한다. 다음의 코드가 attention mask를 생성하는 코드이다. 아래 코드에서 [PAD]는 1로 변환되고, 다른 것들은 0으로 변환된다." 357 | ], 358 | "metadata": { 359 | "id": "4jKULKiI5GKe" 360 | } 361 | }, 362 | { 363 | "cell_type": "code", 364 | "source": [ 365 | "def get_attn_pad_mask(seq_q, seq_k):\n", 366 | " batch_size, len_q = seq_q.size()\n", 367 | " batch_size, len_k = seq_k.size()\n", 368 | " # eq(0)은 PAD token이다.\n", 369 | " pad_attn_mask = seq_k.data.eq(0).unsqueeze(1) # batch_size x 1 x len_k(=len_q), 하나가 마스킹된다.\n", 370 | " return pad_attn_mask.expand(batch_size, len_q, len_k) # batch_size x len_q x len_k" 371 | ], 372 | "metadata": { 373 | "id": "6BErCR2k5Ype" 374 | }, 375 | "execution_count": null, 376 | "outputs": [] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "source": [ 381 | "#### Encoder\n", 382 | "\n", 383 | "Encdoer는 다음의 두 개의 주된 component를 가지고 있다.\n", 384 | "\n", 385 | "- Multi-head Attention\n", 386 | "- Position-wise feed-forward network\n", 387 | "\n", 388 | "encoder의 작업은 representation과 pattern을 input과 attention mask로부터 찾는 것이다." 
389 | ], 390 | "metadata": { 391 | "id": "2VC7H8lr6gyu" 392 | } 393 | }, 394 | { 395 | "cell_type": "code", 396 | "source": [ 397 | "class EncoderLayer(nn.Module):\n", 398 | " def __init__(self):\n", 399 | " super(EncoderLayer, self).__init__()\n", 400 | " self.enc_self_attn = MultiHeadAttention()\n", 401 | " self.pos_ffn = PoswiseFeedForwardNet()\n", 402 | "\n", 403 | " def forward(self, en_inputs, enc_self_attn_mask):\n", 404 | " enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs는 Q, K, V와 같음\n", 405 | " enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]\n", 406 | " return enc_outputs, attn" 407 | ], 408 | "metadata": { 409 | "id": "S1bboIq9606b" 410 | }, 411 | "execution_count": null, 412 | "outputs": [] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "source": [ 417 | "#### Multi-head attention\n", 418 | "\n", 419 | "이것이 encoder의 첫 번째 주된 component이다.\n", 420 | "\n", 421 | "attention model은 3개의 입력값 **Query, Key, Value**를 받는다.\n", 422 | "\n", 423 | "Multi-head attention은 4개의 입력값 **Query, Key, Value, Attention mask**를 받는다. embedding은 Query, Key, Value에 입력으로 주어지고, attention mask는 attention mask 인자에 입력으로 주어진다.\n", 424 | "\n", 425 | "이러한 3개의 입력과 attention mask에 대해 dot-product 연산을 수행한다. 이 dot-product 연산은 **context vector**와 **attention**을 산출한다. context vector는 선형 레이어를 지나서 최종적으로 output을 출력한다." 426 | ], 427 | "metadata": { 428 | "id": "i5Ffo_Pu7h_4" 429 | } 430 | }, 431 | { 432 | "cell_type": "code", 433 | "source": [ 434 | "class MultiHeadAttention(nn.Module):\n", 435 | " def __init__(self):\n", 436 | " super(MultiHeadAttention, self).__init__()\n", 437 | " self.W_Q = nn.Linear(d_model, d_k * n_heads)\n", 438 | " self.W_K = nn.Linear(d_model, d_k * n_heads)\n", 439 | " self.W_V = nn.Linear(d_model, d_v * n_heads)\n", 440 | "\n", 441 | " def forward(self, Q, K, V, attn_mask):\n", 442 | " # q: [batch_size x len_q x d_model]\n", 443 | " # k: [batch_size x len_k x d_model]\n", 444 | " # v: [batch_size x len_k x d_model]\n", 445 | " residual, batch_size = Q, Q.size(0)\n", 446 | " # (B, S, D) -proj- -> (B, S, D) -split- -> (B, S, H, W) -trans- -> (B, H, S, W)\n", 447 | " q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2) # q_s: [batch_size x n_heads x len_q x d_k]\n", 448 | " k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1, 2) # k_s: [batch_size x n_heads x len_k x d_k]\n", 449 | " v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1, 2) # v_s: [batch_size x n_heads x len_k x d_v]\n", 450 | "\n", 451 | " attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask: [batch_size x n_heads x len_q x len_k]\n", 452 | "\n", 453 | " # context: [batch_size x n_heads x len_q x d_v]\n", 454 | " # attn: [batch_size x n_heads x len_q x len_k]\n", 455 | " context, attn = ScaleDotProductAttention()(q_s, k_s, v_s, attn_mask)\n", 456 | " context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]\n", 457 | " output = nn.Linear(n_heads * d_v, d_model)(context)\n", 458 | "\n", 459 | " return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]" 460 | ], 461 | "metadata": { 462 | "id": "hIE8aZIn80LD" 463 | }, 464 | "execution_count": null, 465 | "outputs": [] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "source": [ 470 | "이제 이 Scaled Dot-Product attention에 대해 알아보도록 하자.\n", 471 | "\n", 472 | "- scaled dot-product attention 클래스는 4개의 인자 Query, Key, 
Value, Attention mask를 받는다. 본질적으로, 앞에 3개의 인자들은 word embedding과 함께 주어지고, attention mask 인자는 attention mask embedding과 함께 주어진다.\n", 473 | "- 그리고 scaled dot-product attention은 **query**와 **key**간에 행렬곱을 해서 점수를 얻는다.\n", 474 | "\n", 475 | "우리 코드에서는 scores.masked_fill_(attn_mask, -1e9)를 사용한다. 이 속성은 attention mask가 **True**인 -1e9로 score 요소를 채우고 나머지 요소는 attention score를 얻은 다음 0과 1 사이의 score를 제공하는 softmax 함수를 통해 전달된다.마지막으로, attention 과 value 간에 행렬곱을 수행함으로써 context vector을 얻었다." 476 | ], 477 | "metadata": { 478 | "id": "EcSZkO3u_Y6T" 479 | } 480 | }, 481 | { 482 | "cell_type": "code", 483 | "source": [ 484 | "class ScaledDotProductAttention(nn.Module):\n", 485 | " def __init__(self):\n", 486 | " super(ScaledDotProductAttention, self).__init__()\n", 487 | "\n", 488 | " def forward(self, Q, K, V, attn_mask):\n", 489 | " scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores: [batch_size x n_heads x len_q x len_k]\n", 490 | " scores.masked_fill_(attn_mask, -1e9) # mask가 하나인 self tensor의 요소를 value로 채운다.\n", 491 | " attn = nn.Softmax(dim = -1)(scores)\n", 492 | " context = torch.matmul(attn, V)\n", 493 | " return score, context, attn\n", 494 | " " 495 | ], 496 | "metadata": { 497 | "id": "yHfeJSJKBmVo" 498 | }, 499 | "execution_count": null, 500 | "outputs": [] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "source": [ 505 | "#### Position-Wise Feed Forward Network\n", 506 | "\n", 507 | "multi-head attention의 출력값은 feed-forward network로 가고 이는 encoder part를 결론 짓는다.\n", 508 | "\n", 509 | "#### 모든 component를 합치기\n", 510 | "\n", 511 | "encoder는 다음의 2개의 출력값을 내놓는다.\n", 512 | "\n", 513 | "- feed-forward layer의 출력값\n", 514 | "- Attention mask\n", 515 | "\n", 516 | "여기서 중요한 것은 BERT는 decoder를 사용하지 않는다는 것이다. 대시넹, output과 attention mask를 사용해서 원하는 결과를 얻는다.\n", 517 | "\n", 518 | "transformer의 decoder 부분은 아래 코드처럼 분류하는데 사용되는 얕은 네트워크로 대체된다. BERT 또한 **classifier**와 **masked** 2개의 출력값을 내놓는다." 
519 | ], 520 | "metadata": { 521 | "id": "6KxhEHWVCbci" 522 | } 523 | }, 524 | { 525 | "cell_type": "code", 526 | "source": [ 527 | "class BERT(nn.Module):\n", 528 | " def __init__(self):\n", 529 | " super(BERT, self).__init__()\n", 530 | " self.embedding = Embedding()\n", 531 | " self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])\n", 532 | " self.fc = nn.Linear(d_model, d_model)\n", 533 | " self.activ1 = nn.Train()\n", 534 | " self.linear = nn.Linear(d_model, d_model)\n", 535 | " self.activ2 = gelu\n", 536 | " self.norm = nn.LayerNorm(d_model)\n", 537 | " self.classifier = nn.Linear(d_model, 2)\n", 538 | " # decoder는 embedding layer와 공유됌\n", 539 | " embed_weight = self.embedding.tok_embed.weight\n", 540 | " n_vocab, n_dim = embed_weight.size()\n", 541 | " self.decoder = nn.Linear(n_dim, n_vocab, bias = False)\n", 542 | " self.decoder.weight = embed_weight\n", 543 | " self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))\n", 544 | "\n", 545 | " def forward(self, input_ids, segment_ids, masked_pos):\n", 546 | " output = self.embedding(input_ids, segment_ids)\n", 547 | " enc_self_attn_mask = deg_attn_pad_mask(input_ids, input_ids)\n", 548 | " for layer in self.layers:\n", 549 | " output, enc_self_attn = layer(output, enc_self_attn_mask)\n", 550 | " # output: [batch_size, len, d_model]\n", 551 | " # attn: [batch_size, n_heads, d_model, d_model]\n", 552 | " # 이는 첫 번째 토큰 (CLS)에 의해 결정됌\n", 553 | " h_pooled = self.activ1(self.fc(output[:, 0])) # [batch_size, d_model]\n", 554 | " logits_clsf = self.classification(h_pooled) # [batch_size, 2]\n", 555 | "\n", 556 | " masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]\n", 557 | "\n", 558 | " # transformer의 최종 출력으로부터 masked position을 얻음\n", 559 | " h_masked = torch.gather(output, 1, masked_pos) # masking position: [batch_size, max_pred, d_model]\n", 560 | " h_masked = self.norm(self.activ2(self.linear(h_masked)))\n", 561 | " logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]\n", 562 | "\n", 563 | " return logits_lm, logits_clsf" 564 | ], 565 | "metadata": { 566 | "id": "LU4v48ovDlvF" 567 | }, 568 | "execution_count": null, 569 | "outputs": [] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "source": [ 574 | "몇 가지 기억해두어야 할 사항이 있다.\n", 575 | "\n", 576 | "1. encoder의 수를 지정할 수 있다. 논문에서는 base model의 경우 12개였다.\n", 577 | "2. BERT에는 2개의 활성화 함수가 있는데, Tanh와 GELU이다." 578 | ], 579 | "metadata": { 580 | "id": "OILDxtxtF_3F" 581 | } 582 | }, 583 | { 584 | "cell_type": "code", 585 | "source": [ 586 | "def gelu(x):\n", 587 | " return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))" 588 | ], 589 | "metadata": { 590 | "id": "xaRraoH5GPqJ" 591 | }, 592 | "execution_count": null, 593 | "outputs": [] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "source": [ 598 | "### Loss & Optimization\n", 599 | "\n", 600 | "논문에서는 모든 vocabulary에 대해 확률 분포를 계산하였지만, softmax 근사치를 이용해서 계산이 가능하다. 하지만, 확률 분포를 구하는 깔끔한 방법은 **cross-entropy**를 사용하는 것이다. cross-entropy loss는 *softmax*와 *negative log-likelihood*의 조합이다.\n", 601 | "\n", 602 | "그래서 모델을 구축하는 동안 softmax를 포함할 필요 없이 softmax 정규화 없이 feed-forward network에서 깔끔한 출력을 얻을 수 있다. \n", 603 | "\n", 604 | "optimization으로 넘어가서 BERT에서는 Adam optimizer를 사용하였다." 
605 | ], 606 | "metadata": { 607 | "id": "9t7Z4xBFGW4s" 608 | } 609 | }, 610 | { 611 | "cell_type": "code", 612 | "source": [ 613 | "criterion = nn.CrossEntropyLoss()\n", 614 | "optimizer = optim.Adam(model.parameters(), lr = 0.001)" 615 | ], 616 | "metadata": { 617 | "id": "Vk5q2c4FHKB_" 618 | }, 619 | "execution_count": null, 620 | "outputs": [] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "source": [ 625 | "### 훈련\n", 626 | "\n", 627 | "마지막으로 모델 훈련을 해보도록 하자." 628 | ], 629 | "metadata": { 630 | "id": "qP9szqwBHWtM" 631 | } 632 | }, 633 | { 634 | "cell_type": "code", 635 | "source": [ 636 | "model = BERT()\n", 637 | "batch = make_batch()\n", 638 | "input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))\n", 639 | "\n", 640 | " for epoch in range(100):\n", 641 | " optimizer.zero_grad()\n", 642 | " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n", 643 | " loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # masked LM을 위해\n", 644 | " loss_lm = (loss_lm.float()).mean()\n", 645 | " loss_clsf = criterion(logits_clsf, isNext) # sentence classification을 위해\n", 646 | " loss = loss_lm + loss_clsf\n", 647 | " if (epoch + 1) % 10 == 0:\n", 648 | " print('Epoch:', '%04d' % (epoch + 1), 'cost = ', '{:.6f}'.format(loss))\n", 649 | " loss.backward()\n", 650 | " optimizer.step()\n", 651 | "\n", 652 | " # mask token 예측하기\n", 653 | " input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0]))\n", 654 | " print(text)\n", 655 | " print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])\n", 656 | "\n", 657 | " logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)\n", 658 | " logits_lm = logits_lm.data.mix(2)[1][0].data.numpy()\n", 659 | " print('masked tokens list: ', [pos.item() for pos in masked_tokens[0] if pos.item() != 0])\n", 660 | " print('predict masked tokens list: ', [pos for pos in logits_lm if pos != 0])\n", 661 | "\n", 662 | " logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]\n", 663 | " print('isNext: ', True if isNext else False)\n", 664 | " print('predict isNext: ', True is logits_clsf else False)" 665 | ], 666 | "metadata": { 667 | "id": "Q9-I6oFuHV_c" 668 | }, 669 | "execution_count": null, 670 | "outputs": [] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "source": [ 675 | "Output:\n", 676 | "\n", 677 | "Hello, how are you? I am Romeo.\n", 678 | "Hello, Romeo My name is Juliet. Nice to meet you.\n", 679 | "Nice meet you too. How are you today?\n", 680 | "Great. My baseball team won the competition.\n", 681 | "Oh Congratulations, Juliet\n", 682 | "Thanks you Romeo\n", 683 | "['[CLS]', 'nice', 'meet', 'you', 'too', 'how', 'are', 'you', 'today', '[SEP]', '[MASK]', 'congratulations', '[MASK]', '[SEP]']\n", 684 | "masked tokens list : [27, 22]\n", 685 | "predict masked tokens list : []\n", 686 | "isNext : False\n", 687 | "predict isNext : True" 688 | ], 689 | "metadata": { 690 | "id": "mRbWNVR5Jkx8" 691 | }, 692 | "execution_count": null, 693 | "outputs": [] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "source": [ 698 | "이렇게 해서 BERT를 모두 구현하였다. 좀 더 큰 corpus에 대해서도 똑같은 BERT 모델을 사용할 수 있다.\n", 699 | "\n", 700 | "1. Pre-training: corpus를 사용하지만 앞서 언급한 input representation의 정확한 형식을 사용\n", 701 | "2. FIne-tuning: 지도학습 데이터를 사용해야 한다.\n", 702 | "3. 
다양한 task 또는 topic modeling을 위한 feature extractor가 있어야 함" 703 | ], 704 | "metadata": { 705 | "id": "a9ikGiiNJtmn" 706 | } 707 | } 708 | ] 709 | } 710 | -------------------------------------------------------------------------------- /Natural Language Processing/BERT/README.md: -------------------------------------------------------------------------------- 1 | # BERT Implementation 2 | 3 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading2-BERT-Pre-training-of-Deep-Bidirectional-Transformers-for-Language-Understanding 4 | -------------------------------------------------------------------------------- /Natural Language Processing/ELECTRA/README.md: -------------------------------------------------------------------------------- 1 | # ELECTRA Implementation 2 | 3 | https://github.com/google-research/electra/blob/master/model/modeling.py 참고하여 작성됨 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/ELECTRA-Pre-training-Text-Encoders-as-Discriminators-rather-than-Generators 6 | -------------------------------------------------------------------------------- /Natural Language Processing/ELMo/ELMo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNanKVFKMnCVZJm48NJvEOL", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "EAKmz65EfvqQ" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from typing import LIst, Tuple\n", 38 | "import torch\n", 39 | "import torch.nn as nn\n", 40 | "from char_cnn import CharEmbedding\n", 41 | "\n", 42 | "class ELMo(nn.Module):\n", 43 | " def __init__(self, vocab_size, output_dim, emb_dim, hid_dim, prj_dim, kernel_sizes,\n", 44 | " seq_len, n_layers, dropout):\n", 45 | " #파라미터 설명(몇 개만)\n", 46 | " #output_dim: word vocaulary 크기\n", 47 | " #n_layers: LSTM의 레이어 수. 
기본값은 2\n", 48 | "\n", 49 | " super(ELMo, self).__init__()\n", 50 | "\n", 51 | " self.embedding = CharEmbedding(vocab_size, emb_dim, prj_dim, kernel_sizes, seq_len)\n", 52 | " self.bilms = BidirectionalLanguageModel(hid_dim, hid_dim, n_layers, dropout)\n", 53 | "\n", 54 | " self.predict = nn.Linear(hid_dim, output_dim)\n", 55 | "\n", 56 | " def forward(self, x):\n", 57 | " #파라미터: x(Sentence)\n", 58 | " #차원: x([batch, seq_len])\n", 59 | " emb = self.embedding(x)\n", 60 | " _, last_output = self.bilms(emb)\n", 61 | " y = self.predict(last_output)\n", 62 | "\n", 63 | " return y #훈련 단계에서는 오직 biLM의 마지막 LSTM의 output만을 사용하여라\n", 64 | "\n", 65 | " def get_embed_layer(self, x): #torch.Tensor --> List\n", 66 | " #순전파와 똑같지만, 모든 레이어의 임베딩을 반환함\n", 67 | " #파라미터: x(character로 이루어진 sentence)\n", 68 | " #차원: x([batch, seq_len])\n", 69 | " emb = self.embedding(x)\n", 70 | " first_output, last_output = self.bilms(emb)\n", 71 | "\n", 72 | " return emb, (first_output, last_output)\n", 73 | "\n", 74 | " def init_weights(self):\n", 75 | " for p in self.parameters():\n", 76 | " if p.dim() > 1:\n", 77 | " nn.init.xavier_uniform_(p)\n", 78 | "\n", 79 | " for lstm in self.bilms.lstms:\n", 80 | " for names in lstm._all_weights:\n", 81 | " for name in filter(lambda n: 'bias' in n, names):\n", 82 | " bias = getattr(lstm, name)\n", 83 | " n = bias.size(0)\n", 84 | " start, end = n // 4, n // 2\n", 85 | " bias.data[start:end].fill_(1.)\n", 86 | "\n", 87 | "class BidirectionalLanguageModel(nn.Module):\n", 88 | " def __init__(self, emb_dim, hid_dim, prj_emb, dropout):\n", 89 | " #LSTM 레이어의 이전과 이후 모두에 dropout 사용\n", 90 | " super(BidirectionalLanguageModel, self).__init__()\n", 91 | " self.lstms = nn.ModuleList([nn.LSTM(emb_dim, hid_dim, bidirectional = True, dropout = dropout,\n", 92 | " batch_first = True), nn.LSTM(prj_emb, hid_dim, bidirectional = True, dropout = dropout, bacth_first = True)])\n", 93 | " self.projection_layer = nn.Linear(2 * hid_dim, prj_emb)\n", 94 | "\n", 95 | " def forward(self, x, hidden = None):\n", 96 | " #파라미터: x(임베딩된 sentence tensor), hidden(hidden과 cell의 tuple)\n", 97 | " #차원: x([Batch, Seq_len, Emb_size]),\n", 98 | " #hidden([num_layers * num_directions, batch, hidden_size], [num_layers * num_directions, batch, hidden_size])\n", 99 | " \n", 100 | " #LSTM 레이어 사이에 residual connection 추가\n", 101 | " first_output, (hidden, cell) = self.lstms[0](x, hidden)\n", 102 | "\n", 103 | " projected = self.projection_layer(first_output)\n", 104 | " second_output, (hidden, cell) = self.lstms[1](projected, (hidden, cell))\n", 105 | "\n", 106 | " second_output = second_output.view(second_output.size(0), second_output.size(1), 2, -1)\n", 107 | "\n", 108 | " second_output = second_output[:, :, 0, :] + second_output[:, :, 1, :]\n", 109 | "\n", 110 | " return first_output, second_output" 111 | ] 112 | } 113 | ] 114 | } 115 | -------------------------------------------------------------------------------- /Natural Language Processing/ELMo/README.md: -------------------------------------------------------------------------------- 1 | # ELMo 2 | 3 | https://github.com/InhyeokYoo/NLP/blob/master/papers/4.ELMo 참고하여 작성 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading1-ELMo-Deep-contextualized-word-representations 6 | -------------------------------------------------------------------------------- /Natural Language Processing/ELMo/char_cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | 
"nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMpeFn+h3cVx7Sm4BlKoscT", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU", 18 | "gpuClass": "standard" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "view-in-github", 25 | "colab_type": "text" 26 | }, 27 | "source": [ 28 | "\"Open" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "id": "E414FoesNyVv" 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "#char_cnn\n", 40 | "import torch\n", 41 | "import torch.nn as nn\n", 42 | "from typing import List\n", 43 | "\n", 44 | "class CharEmbedding(nn.Module):\n", 45 | " def __init__(self, vocab_size, emb_dim, prj_dim, kernel_sizes, char_len, device):\n", 46 | " super().__init__()\n", 47 | " self.device = device\n", 48 | " self.kernel_dim = sum([kernel_size for num_features, kernel_size in kernel_sizes]) #embedding dimenstion과 같음\n", 49 | " self.charcnn = CharCNN(vocab_size, emb_dim, self.kernel_dim, kernel_sizes, char_len, device)\n", 50 | " self.highway_net = HighWayNetwork(self.kernel_dim)\n", 51 | " self.highwat_net._init_bias()\n", 52 | " self.projection_layer = nn.Linear(self.kernel_dim, prj_dim)\n", 53 | "\n", 54 | " def forward(self, x):\n", 55 | " #파라미터: 문장의 캐릭터로 이루어져 있는 문장 벡터\n", 56 | " #차원: [Batch, Seq_len, Char_len]\n", 57 | " batch_size, seq_len, _ = x.size()\n", 58 | " y = torch.zeros(batch_size, seq_len, self.kernel_dim).to(self.device)\n", 59 | "\n", 60 | " for i in range(seq_len):\n", 61 | " char_emb = self.charcnn(x[:, i, :])\n", 62 | " highway_emb = self.highway_net(char_emb)\n", 63 | " y[:, i, :] = highway_emb.squeeze(1)\n", 64 | "\n", 65 | " emb = self.projection_layer(y)\n", 66 | " return emb\n", 67 | "\n", 68 | "class CharCNN(nn.Module):\n", 69 | " def __init__(self, vocab_size, char_emb_dim, word_emb_dim, kernel_sizes, char_len, device):\n", 70 | " super(CharCNN, self).__init__()\n", 71 | " self.device = device\n", 72 | " self.char_len = char_len\n", 73 | " self.word_emb_dim = word_emb_dim\n", 74 | " self.kernel_sizes = kernel_sizes\n", 75 | "\n", 76 | " self.embedding = nn.Embedding(vocab_size, char_meb_dim)\n", 77 | " self.kernels = nn.ModuleList([nn.Conv1d(in_channels = char_emb_dim, out_channels = num_features,\n", 78 | " kernel_size = kernel_size) for kernel_size, num_features in kernel_sizes])\n", 79 | "\n", 80 | " def forward(self, word):\n", 81 | " #파라미터: word(입력 텐서)\n", 82 | " #차원\n", 83 | " #입력: 단어([Batch, Emb_dim, Seq_len])\n", 84 | " #출력: y([Batch, Kernel_dim])\n", 85 | " batch_size = word.size(0)\n", 86 | " y = torch.zeros(batch_size, self.word_meb_dim).to(self.device)\n", 87 | "\n", 88 | " cnt = 0 #indec for y\n", 89 | "\n", 90 | " #torch.cat보다 비어있는 텐서를 채우는 것이 더 빠름\n", 91 | " for kernel in self.kernels:\n", 92 | " emb = self.embedding(word)\n", 93 | " emb = emb.permute(0, 2, 1)\n", 94 | " temp = kernel(emb)\n", 95 | " pooled = torch.max(temp, dim = 2)[0]\n", 96 | " y[:, cnt] = pooled\n", 97 | " cnt += pooled_size(1)\n", 98 | "\n", 99 | " return y\n", 100 | "\n", 101 | "class HighwayNetwork(nn.Module):\n", 102 | " def __init__(self, kernel_sizes):\n", 103 | " super(HighwayNetwork, self).__init__()\n", 104 | " self.h_gate = nn.Linear(kernel_sizes, kernel_sizes)\n", 105 | " self.t_gate = nn.Sequential(nn.Linear(kernel_sizes, kernel_sizes), nn.Sigmoid())\n", 106 | " 
self.relu = torch.nn.ReLU()\n", 107 | "\n", 108 | " def forward(self, x):\n", 109 | " #차원: x(Batch, Kernel_dim)\n", 110 | " x = x.unsqueeze(1)\n", 111 | " h = self.relu(self.h_gate(x))\n", 112 | " t = self.t_gate(x)\n", 113 | " c = 1 - t\n", 114 | " return t * h + c * x\n", 115 | "\n", 116 | " def _init_bias(self):\n", 117 | " self.t_gate[0].bias.data.fill_(-2)" 118 | ] 119 | } 120 | ] 121 | } 122 | -------------------------------------------------------------------------------- /Natural Language Processing/ELMo/character_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNgGR09iOxTjA3Q3sX+iuzH", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "0b8fNKIbbE0p" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import torchtext\n", 38 | "from torchtext.data import NestedField\n", 39 | "import math\n", 40 | "\n", 41 | "class BPTTIterator(torchtext.data.BPTTIterator):\n", 42 | " def __iter__(self):\n", 43 | " text = self.dataset[0].text\n", 44 | " TEXT = self.dataset.fields['text']\n", 45 | " TEXT.eos_token = None\n", 46 | " text = text + ([TEXT.pad_token] * int(math.ceil(len(text) / self.batch_size) * self.batch_size - len(text)))\n", 47 | " data = TEXT.pad([text]) #new\n", 48 | " data = TEXT.numericalize(data, device = self.device)\n", 49 | "\n", 50 | " #new line start\n", 51 | " size = list(data.size())\n", 52 | " size[0] = self.batch_size\n", 53 | " size[1] = -1\n", 54 | "\n", 55 | " data = data.view(*size).transpose(0, 1).contiguous()\n", 56 | " dataset = torchtext.data.Dataset(examples = self.dataset.examples, fields = [('text', 'TEXT'), ('target', 'TEXT')])\n", 57 | "\n", 58 | " while True:\n", 59 | " for i in range(0, len(self) * self.bptt_len, self.bptt_len):\n", 60 | " self.ierations += 1\n", 61 | " seq_len = min(self.bptt_len, len(data) - i - 1)\n", 62 | " batch_text = data[i:i + seq_len]\n", 63 | " if TEXT.batch_first:\n", 64 | " batch_text = batch_text.transpose(0, 1).contiguous()\n", 65 | " batch_target = batch_target.transpose(0, 1).contiguous()\n", 66 | " yield torchtext.data.Batch.fromvars(\n", 67 | " dataset, self.batch_size, text = batch_text, target = batch_target\n", 68 | " )\n", 69 | " if not self.repeat:\n", 70 | " return\n", 71 | "\n", 72 | "def gen_bptt_iter(dataset, batch_size, bptt_len, device):\n", 73 | " #dataset: tuple of dataset\n", 74 | " for batch_word, batch_char in zip(\n", 75 | " BPTTIterator(dataset[0], batch_size, bptt_len, device = device),\n", 76 | " BPTTIterator(dataset[1], batch_size, bptt_len, device = device),\n", 77 | " ):\n", 78 | " yield batch_word.text, batch_char.text, batch_word.target, batch_char.target\n", 79 | "\n", 80 | "def gen_language_model_corpus(dataset_cls: torchtext.datasets.LanguageModelingDataset):\n", 81 | " field_char = NestedField(Field(pad_token = PAD_WORD, tokenize = list, init_token = SOS_WORD,\n", 82 | " eos_token = EOS_WORD, batch_first = True), pad_token = PAD_WORD,)\n", 83 | " \n", 84 | " field_word = Field(batch_first = True)\n", 85 | " 
dataset_char = dataset_cls.splits(field_char)\n", 86 | " dataset_word = dataset_cls.splits(dielf_word)\n", 87 | " field_char.build_vocab(dataset_char[0])\n", 88 | " field_word.build_vocab(dataset_char[0])\n", 89 | " return [_ for _ in zip(dataset_word, dataset_char)], field_word, field_char\n", 90 | "\n", 91 | "#How to use\n", 92 | "if __name__ == '__main__':\n", 93 | " from torchtext.dataset import WIkiText2\n", 94 | " from torchtext.data import Field\n", 95 | "\n", 96 | " #FINAL\n", 97 | " PAD_WORD = ''\n", 98 | " SOS_WORD = ''\n", 99 | " EOS_WORD = ''\n", 100 | "\n", 101 | " datasets, field_word, field_char = gen_language_model_corpus(WikiText2)\n", 102 | " train_data, valid_data, test_data = datasets" 103 | ] 104 | } 105 | ] 106 | } 107 | -------------------------------------------------------------------------------- /Natural Language Processing/GPT-1/GPT-1 Implementation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNMCURwdSd6LE/DF4oH8QYA", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "# GPT-1 Implementation\n", 33 | "\n", 34 | "GPT-1 구현 코드는 [GPT 구현하기](https://paul-hyun.github.io/gpt-01/)를 참고하여 작성되었다.\n", 35 | "\n", 36 | "우선 GPT를 구현하기 전에 GPT에 대해 간략하게 설명하면 GPT는 Transformer의 Decoder만을 사용한 Pre-trained LM이다.\n", 37 | "\n", 38 | "### 1. Config\n", 39 | "\n", 40 | "Transformer와 파라미터를 동일하게 설정하였다. GPT는 Transformer의 Decoder만을 사용하므로 Encoder 부분은 제거하고 사용하였다." 41 | ], 42 | "metadata": { 43 | "id": "hMbeV9Y6mqNK" 44 | } 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "id": "XpRTodgTmhUR" 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "config = Config({\n", 55 | " 'n_dec_vocab': len(vocab),\n", 56 | " 'n_dec_seq': 256,\n", 57 | " 'n_layer': 6,\n", 58 | " 'd_hidn': 256,\n", 59 | " 'i_pad': 0,\n", 60 | " 'd_ff': 1024,\n", 61 | " 'n_head': 4,\n", 62 | " 'd_head': 64,\n", 63 | " 'dropout': 0.1,\n", 64 | " 'layer_norm_epsilon': 1e-12\n", 65 | "})\n", 66 | "print(config)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "source": [ 72 | "# 2. Decoder\n", 73 | "\n", 74 | "GPT는 Transformer의 Encoder는 사용하지 않고 Decoder만 사용하므로 Decoder에서 Encoder의 출력과 Attention을 하는 부분인 Encoder-Decoder-Multi-Head Attention 부분은 제거하고 사용하였다. 그 외에 나머지 부분은 Transformer와 동일하다." 
75 | ], 76 | "metadata": { 77 | "id": "0u1Nu0LroUzG" 78 | } 79 | }, 80 | { 81 | "cell_type": "code", 82 | "source": [ 83 | "# Decoder Layer\n", 84 | "class DecoderLayer(nn.Module):\n", 85 | " def __init__(self, config):\n", 86 | " super().__init__()\n", 87 | " self.config = config\n", 88 | "\n", 89 | " self.self_attn = MultiHeadAttention(self.config)\n", 90 | " self.layer_norm1 = nn.LayerNorm(self.config.d_hidn, eps = self.config.layer_norm_epsilon)\n", 91 | " self.pos_ffn = PoswiseFeedForwardNet(self.config)\n", 92 | " self.layer_norm3 = nn.LayerNorm(self.config.d_hidn, eps = self.config.layer_norm_epsilon)\n", 93 | "\n", 94 | " def forward(self, dec_inputs, self_attn_mask):\n", 95 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_head, n_dec_seq, n_dec_seq)\n", 96 | " self_att_outputs, self_attn_prob = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)\n", 97 | " self_att_outputs = self.layer_norm1(dec_inputs + self_att_outputs)\n", 98 | " # (batch_size, n_dec_seq, d_hidn)\n", 99 | " ffn_outputs = self.po_ffn(self_att_outputs)\n", 100 | " ffn_outputs = self.layer_norm3(self_att_outputs + ffn_outputs)\n", 101 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_head, n_dec_seq, n_dec_seq), (batch_size, n_head, n_dec_seq, n_enc_seq)\n", 102 | " return ffn_outputs, self_attn_prob" 103 | ], 104 | "metadata": { 105 | "id": "ZbvA3ofcom9U" 106 | }, 107 | "execution_count": null, 108 | "outputs": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "source": [ 113 | "# Decoder\n", 114 | "class Decoder(nn.Module):\n", 115 | " def __init__(self, config):\n", 116 | " super().__init__()\n", 117 | " self.config = config\n", 118 | "\n", 119 | " self.dec_emb = nn.Embedding(self.config.n_dec_vocab, self.config.d_hidn)\n", 120 | " sinusoid_table = torch.FloatTensor(det_sinusoid_encoding_table(self.config.n_dec_seq + 1, self.config.d_hidn))\n", 121 | " self.pos_emb = nn.Embedding.from_pretrained(sinusoid_table, freeze = True)\n", 122 | "\n", 123 | " self.layers = nn.ModuleList([DecoderLayer(self.config) for _ in range(self.config.n_layer)])\n", 124 | "\n", 125 | " def forward(self, dec_inputs):\n", 126 | " positions = torch.arange(dec_inputs.size(1), device = dec_inputs.device, dtype = dec_inputs.dtype).expand(dec_inputs.size(0), dec_inputs.size(1)).contiguous() + 1\n", 127 | " pos_mask = dec_inputs.eq(self.config.i_pad)\n", 128 | " positions.masked_fill_(pos_mask, 0)\n", 129 | "\n", 130 | " # (batch_size, n_dec_seq, d_hidn)\n", 131 | " dec_outputs = self.dec_emb(dec_inputs) + self.pos_emb(positions)\n", 132 | "\n", 133 | " # (batch_size, n_dec_seq, n_dec_seq)\n", 134 | " dec_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, self.config.i_pad)\n", 135 | " # (batch_size, n_dec_seq, n_dec_seq)\n", 136 | " dec_attn_decoder_mask = get_attn_decoder_mask(dec_inputs)\n", 137 | " # (batch_size, n_dec_seq, n_dec_seq)\n", 138 | " dec_self_attn_mask = torch.gt((dec_attn_mask + dec_attn_decoder_mask), 0)\n", 139 | "\n", 140 | " self_attn_probs = []\n", 141 | " for layer in self.layers:\n", 142 | " # (batch_size, n_dec_seq, d_hidn), (batch_size, n_dec_seq, n_dec_seq)\n", 143 | " dec_outputs, self_attn_prob = layer(dec_outputs, dec_self_attn_mask)\n", 144 | " self_attn_probs.append(self_attn_prob)\n", 145 | " # (batch_size, n_dec_seq, d_hidn), [(batch_size, n_dec_seq, n_dec_seq)]\n", 146 | " return dec_outputs, self_attn_probs" 147 | ], 148 | "metadata": { 149 | "id": "Z89dmNpSqq9K" 150 | }, 151 | "execution_count": null, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "markdown", 
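(The DecoderLayer and Decoder cells above call `get_sinusoid_encoding_table` (written `det_sinusoid_encoding_table` in the cell), `get_attn_pad_mask`, and `get_attn_decoder_mask`, none of which are defined in this notebook; they are assumed from the accompanying Transformer implementation. A minimal sketch of what these helpers conventionally compute, following the standard Transformer recipe rather than the referenced source:

import numpy as np
import torch

def get_sinusoid_encoding_table(n_seq, d_hidn):
    # sinusoidal position table: even dimensions use sin, odd dimensions use cos
    pos = np.arange(n_seq)[:, None]
    i = np.arange(d_hidn)[None, :]
    angle = pos / np.power(10000, 2 * (i // 2) / d_hidn)
    table = np.zeros((n_seq, d_hidn))
    table[:, 0::2] = np.sin(angle[:, 0::2])
    table[:, 1::2] = np.cos(angle[:, 1::2])
    return table

def get_attn_pad_mask(seq_q, seq_k, i_pad):
    # (batch_size, len_q, len_k) mask that is True where the key position is padding
    return seq_k.eq(i_pad).unsqueeze(1).expand(seq_q.size(0), seq_q.size(1), seq_k.size(1))

def get_attn_decoder_mask(seq):
    # causal (look-ahead) mask: True above the diagonal so future tokens are hidden
    n = seq.size(1)
    return torch.ones(seq.size(0), n, n, device=seq.device).triu(diagonal=1)

The Decoder's forward pass adds the padding mask and the causal mask and keeps any position where either is set, which is what the `torch.gt(..., 0)` line above expresses.)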
156 | "source": [ 157 | "# 3. GPT\n", 158 | "\n", 159 | "GPT는 단순히 Transformer Decoder를 실행\n", 160 | "Pre-traing 모델을 저장하기 위한 save, 저장된 모델을 읽기 위한 load 함수가 추가로 정의의" 161 | ], 162 | "metadata": { 163 | "id": "x59jUSdgF2FU" 164 | } 165 | }, 166 | { 167 | "cell_type": "code", 168 | "source": [ 169 | "class GPT(nn.Module):\n", 170 | " def __init__(self, config):\n", 171 | " super().__init__()\n", 172 | " self.config = config\n", 173 | "\n", 174 | " self.decoder = Decoder(self.config)\n", 175 | "\n", 176 | " def forward(self, dec_inputs):\n", 177 | " # (batch_size, n_seq, d_hidn), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n", 178 | " dec_outputs, dec_self_attn_probs = self.decoder(dec_inputs)\n", 179 | " # (batch_size, n_dec_seq, n_dec_vocab), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n", 180 | " return dec_outputs, dec_self_attn_probs\n", 181 | "\n", 182 | " def save(self, epoch, loss, path):\n", 183 | " torch.save({\n", 184 | " 'epoch': epoch, \n", 185 | " 'loss': loss, \n", 186 | " 'state_dict': self.state_dict()\n", 187 | " }, path)\n", 188 | "\n", 189 | " def load(self, path):\n", 190 | " save = torch.load(path)\n", 191 | " self.load_state_dict(save['state_dict'])\n", 192 | " return save['epoch'], save['loss']" 193 | ], 194 | "metadata": { 195 | "id": "uEbD9YQ8GEd-" 196 | }, 197 | "execution_count": null, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "source": [ 203 | "# 4. Pre-traing Model\n", 204 | "\n", 205 | "GPT를 pre-train 하기 위한 클래스. GPT pre-train 클래스의 목적은 입력 단어에 대한 다음 단어를 예측하는 것이다." 206 | ], 207 | "metadata": { 208 | "id": "T5X1B3EzHaQ6" 209 | } 210 | }, 211 | { 212 | "cell_type": "code", 213 | "source": [ 214 | "class GPTPretraing(nn.Module):\n", 215 | " def __init__(self, config):\n", 216 | " super().__init__()\n", 217 | " self.config = config\n", 218 | "\n", 219 | " self.gpt = GPT(self.config)\n", 220 | " # 단어를 예측하기 위한 projection_lm을 선언\n", 221 | " self.projection_lm = nn.Linear(self.config.d_hidn, self.config.n_dec_vocab, bias = False)\n", 222 | " # Decoder의 Embedding & weight를 공유\n", 223 | " self.projection_lm.weight = self.gpt.decoder.dec_emb.weight\n", 224 | "\n", 225 | " def forward(self, dec_inputs):\n", 226 | " # (batch_size, n_dec_seq, d_hidn), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n", 227 | " dec_outputs, dec_self_attn_probs = self.gpt(dec_inputs)\n", 228 | " # (batch_size, n_dec_seq, n_dec_vocab)\n", 229 | " # GPT 실행 결과를 입력으로 projection_lm을 실행해서 단어를 예측측\n", 230 | " logits_lm = self.projection_lm(dec_outputs)\n", 231 | " # (batch_size, n_dec_seq - 1, n_dec_vocab), (batch_size, n_output), [(batch_size, n_head, n_dec_seq, n_dec_seq)]\n", 232 | " # 결과의 마지막을 제외한 나머지를 리턴\n", 233 | " return logits_lm[:, :-1, :].contiguous(), dec_self_attn_probs" 234 | ], 235 | "metadata": { 236 | "id": "tdUtbgF7HuRi" 237 | }, 238 | "execution_count": null, 239 | "outputs": [] 240 | } 241 | ] 242 | } 243 | -------------------------------------------------------------------------------- /Natural Language Processing/GPT-1/README.md: -------------------------------------------------------------------------------- 1 | # GPT-1 Implementation 2 | 3 | https://paul-hyun.github.io/gpt-01/ 참고하여 작성됌 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/Pre-trained-Language-Modeling-paper-reading3-GPT-1-Improving-Language-Understanding-by-Generative-Pre-Training 6 | -------------------------------------------------------------------------------- /Natural Language Processing/RoBERTa/README.md: 
-------------------------------------------------------------------------------- 1 | # RoBERTa Implementation 2 | 3 | https://github.com/facebookresearch/fairseq/blob/main/fairseq/models/roberta/model.py 참고하여 작성됌 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/RoBERTa-A-Robustly-Optimized-BERT-Pretraining-Approach-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/Transformer-XL/README.md: -------------------------------------------------------------------------------- 1 | # Transformer-XL Implementation 2 | 3 | https://github.com/kimiyoung/transformer-xl/blob/master/tf/model.py 참고하여 작성 4 | 5 | paper review is here!! https://cartinoe5930.tistory.com/entry/Transformer-XL-Attentive-Language-Models-Beyond-a-Fixed-Length-Context-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/Transformer-XL/Transformer_XL_구현_실습.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOtb06YYh5iyXi4CRZAWjAR", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "70fsbBslZZ7I" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import tensorflow as tf\n", 38 | "\n", 39 | "def positional_embedding(pos_seq, inv_freq, bsz = None):\n", 40 | " sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)\n", 41 | " pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)\n", 42 | " if bsz is not None:\n", 43 | " return tf.tile(pos_emb[:, None, :], [1, bsz, 1])\n", 44 | " else:\n", 45 | " return pos_emb[:, None, :]\n", 46 | "\n", 47 | "def positionwise_FF(inp, d_model d_inner, dropout, kernel_initializer, scope = 'ff', is_training = True):\n", 48 | " output = inp\n", 49 | " with tf.variable_scope(scope):\n", 50 | " output = tf.layers.dense(inp, d_inner, activation = tf.nn.relu,\n", 51 | " kernel_initializer = kernel_initializer,\n", 52 | " name = 'layer_1')\n", 53 | " output = tf.layers.dropout(output, dropout, training = is_training, name = 'drop_1')\n", 54 | " output = tf.layers.dense(output, d_model, kernel_initializer = kernel_initializer,\n", 55 | " name = 'layer2')\n", 56 | " output = tf.layers.dropout(output, dropout, training = is_training, name = 'drop_2')\n", 57 | " output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis = -1)\n", 58 | " return output\n", 59 | "\n", 60 | "def rel_shift(x):\n", 61 | " x_size = tf.shape(x)\n", 62 | "\n", 63 | " x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])\n", 64 | " x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])\n", 65 | " x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])\n", 66 | " x = tf.reshape(x, x_size)\n", 67 | "\n", 68 | " return x\n", 69 | "\n", 70 | "def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,\n", 71 | " n_head, d_head, dropout, dropatt, is_training,\n", 72 | " kernel_initializer, scope = 'rel_attn'):\n", 73 | " scale = 1 / 
(d_head ** 0.5)\n", 74 | " with tf.variable_scope(scope):\n", 75 | " qlen = tf.shape(w)[0]\n", 76 | " rlen = tf.shape(r)[0]\n", 77 | " bsz = tf.shape(w)[1]\n", 78 | "\n", 79 | " cat = tf.concat([mems, w], 0) if mems is not None and mems.shape.ndims > 1 else w\n", 80 | " w_heads = tf.layers.dense(cat, 3 * n_head, d_head, use_bias = False, kernel_initializer = kernel_initializer,\n", 81 | " name = 'qkv')\n", 82 | " r_head_k = tf.layers.dense(r, n_head * d_head, use_bias = False, kernel_initializer = kernel_initializer,\n", 83 | " name = 'r')\n", 84 | " \n", 85 | " w_head_q, w_kead_k, w_head_v = tf.split(w_heads, 3, -1)\n", 86 | " w_head_q = w_head_q[-qlen:]\n", 87 | "\n", 88 | " klen = tf.shape(w_head_k)[0]\n", 89 | "\n", 90 | " w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])\n", 91 | " w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])\n", 92 | " w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])\n", 93 | "\n", 94 | " r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])\n", 95 | "\n", 96 | " rw_head_q = w_head_q + r_w_bias\n", 97 | " rr_head_q = w_head_q + r_r_bias\n", 98 | "\n", 99 | " AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)\n", 100 | " BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)\n", 101 | " BD = rel_shift(BD)\n", 102 | "\n", 103 | " attn_score = (AC + BD) * scale\n", 104 | " attn_mask_t = attn_mask[:, :, None, None]\n", 105 | " attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t\n", 106 | "\n", 107 | " attn_prob = tf.nn.softmax(attn_score, 1)\n", 108 | " attn_prob = tf.layers.dropout(attn_prob, dropatt, training = is_training)\n", 109 | "\n", 110 | " attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)\n", 111 | " size_t = tf.shape(attn_vec)\n", 112 | " attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])\n", 113 | "\n", 114 | " attn_out = tf.layers.dense(attn_vec, d_model, use_bias = False,\n", 115 | " kernel_initializer = kernel_initializer, name ='o')\n", 116 | " attn_out = tf.layers.dropout(attn_out, dropout, training = is_training)\n", 117 | "\n", 118 | " output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis = -1)\n", 119 | "\n", 120 | " return output\n", 121 | "\n", 122 | "def embedding_lookup(lookup_table, x, use_tpu = True):\n", 123 | " if use_tpu:\n", 124 | " n_token = tf.shape(lookup_table)[0]\n", 125 | " one_hot_idx = tf.one_hot(x, n_token)\n", 126 | " if one_hot_idx.shape.ndims == 2:\n", 127 | " return tf.einsum('nd,in->id', lookup_table, one_hot_idx)\n", 128 | " else:\n", 129 | " return tf.einsum('nb,ibn->ibd', lookup_table, one_hot_idx)\n", 130 | " else:\n", 131 | " return tf.nn.embedding_lookup(lookup_table, x)\n", 132 | "\n", 133 | "def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,\n", 134 | " proj_initializer, div_val = 1,\n", 135 | " proj_same_dim = True,\n", 136 | " scope = 'adaptive_embed', **kwargs):\n", 137 | " emb_scale = d_proj ** 0.5\n", 138 | " with tf.variable_scope(scope):\n", 139 | " if div_val == 1:\n", 140 | " lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], initializer = initializer)\n", 141 | " y = embedding_lookup(lookup_table, x, use_tpu = False)\n", 142 | " if d_proj != d_embed:\n", 143 | " proj_W = tf.get_variable('proj_W', [d_embed, d_proj], initializer = proj_initializer)\n", 144 | " y = tf.einsum('ibe,ed->ibd', y, proj_w)\n", 145 | " else:\n", 146 | " proj_w = None\n", 147 | " ret_params = [lookup_table, proj_W]\n", 148 | " else:\n", 149 | " tables, projs = [], []\n", 150 | " 
curoff_ends = [0] + cutoffs + [n_token]\n", 151 | " x_size = tf.shape(x)\n", 152 | " y = tf.zeros([x_size[0], x_size[1], d_proj])\n", 153 | " for i in range(len(cutoff_ends) - 1):\n", 154 | " with tf.variable_scope('cutoff_{}'.format(i)):\n", 155 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n", 156 | " mask = (x >= l_idx) & (x < r_idx)\n", 157 | " cur_x = tf.boolean_mask(x, mask) - l_idx\n", 158 | " cur_d_embed = d_embed // (div_val ** i)\n", 159 | " lookup_table = tf.get_variable('lookup_table', [r_idx - l_idx, cur_d_embed].\n", 160 | " initializer = initializer)\n", 161 | " cur_y = embedding_lookup(lookup_table, cur_x, use_tpu = False)\n", 162 | " if d_proj == cur_d_embed and not proj_same_dim:\n", 163 | " proj_W = None\n", 164 | " else:\n", 165 | " proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],\n", 166 | " initializer = proj_initializer)\n", 167 | " cur_y = tf.einsum('id,de->ie', cur_y, proj_W)\n", 168 | " mask_idx = tf.to_int64(tf.where(mask))\n", 169 | " y += tf.scatter_nd(mask_idx, cur_y, tf.to_int64(tf.shape(y)))\n", 170 | " tables.append(lookup_table)\n", 171 | " projs.append(proj_W)\n", 172 | " ret_params = [tables, projs]\n", 173 | " \n", 174 | " y *= emb_scale\n", 175 | " return y, ret_params\n", 176 | "\n", 177 | "def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,\n", 178 | " proj_initializer, div_val = 1, perms = None,\n", 179 | " proj_same_dim = True, scope = 'adaptive_embed'):\n", 180 | " #만약 perm이 None이라면\n", 181 | " #W = W1 X W2와 같이 각각 projection되고, 그 다음에 X x W (embedding lookup)을 계산\n", 182 | " #None이 아니라면\n", 183 | " #bin-based embedding lookup을 사용\n", 184 | "\n", 185 | " emb_scale = d_proj ** 0.5\n", 186 | " with tf.variable_scope(scope):\n", 187 | " if div_val == 1:\n", 188 | " lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], initializer = initializer)\n", 189 | " y = embedding_lookup(lookup_table, x)\n", 190 | " if d_proj != d_embed:\n", 191 | " proj_W = tf.get_variable('proj_W', [d_embed, d_proj], initializer = proj_initializer)\n", 192 | " y = tf.einsum('ibe,ed->ibd', y, proj_W)\n", 193 | " else:\n", 194 | " proj_W = None\n", 195 | " ret_params = [lookup_table, proj_W]\n", 196 | " else:\n", 197 | " tables, projs = [], []\n", 198 | " cutoff_ends = [0] + cutoffs + [n_token]\n", 199 | " x_size = tf.shape(x)\n", 200 | " if perms is None:\n", 201 | " cat_lookup = []\n", 202 | " else:\n", 203 | " cat_lookup = tf.zeros([x_size[0], x_size[1], d_proj])\n", 204 | " for i in range(len(cutoff_ends) - 1):\n", 205 | " with tf.variable_scope('cutoff_{}'.format(i)):\n", 206 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n", 207 | " cur_d_embed = d_embed // (div_val ** i)\n", 208 | " lookup_table = tf.get_variable('lookup_table',\n", 209 | " [r_idx - l_idx, cur_d_embed],\n", 210 | " initializer = initializer)\n", 211 | " if cur_d_embed == d_proj and not proj_same_dim:\n", 212 | " proj_W = None\n", 213 | " else:\n", 214 | " proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],\n", 215 | " initializer = proj_initializer)\n", 216 | " if perms is None:\n", 217 | " cat_lookup.append(tf.einsum('ie,ed->id', lookup_table, proj_W))\n", 218 | " else:\n", 219 | " if i == 0:\n", 220 | " cur_y = embedding_lookup(lookup_table, tf.minimum(x, r_idx - 1))\n", 221 | " if proj_W is not None:\n", 222 | " cur_y = tf.einsum('ibe,ed->ibd', cur_y, proj_W)\n", 223 | " cur_y *= perms[i][:, :, None]\n", 224 | " cat_lookup += cur_y\n", 225 | " else:\n", 226 | " cur_x = tf.einsum('ib,ibk->k', tf.to_float(x - l_idx), perms[i])\n", 227 
| " cur_x = tf.to_int32(cur_x)\n", 228 | " cur_y = embedding_lookup(lookup_table, cur_x)\n", 229 | " if proj_W is not None:\n", 230 | " cur_y = tf.einsum('ke,ed->kd', cur_y, proj_W)\n", 231 | " cat_lookup += tf.einsum('kd,idk->ibd', cur_y, perms[i])\n", 232 | " tables.append(lookup_table)\n", 233 | " projs.append(proj_W)\n", 234 | " if perms is None:\n", 235 | " cat_lookup = tf.concat(cat_lookup, 0)\n", 236 | " y = embedding_lookup(cat_lookup, x)\n", 237 | " else:\n", 238 | " y = cat_lookup\n", 239 | " ret_params = [tables, projs]\n", 240 | " \n", 241 | " y *= emb_scale\n", 242 | " return y, ret_params\n", 243 | "\n", 244 | "def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs, params,\n", 245 | " tie_projs, initializer = None, proj_initializer = None,\n", 246 | " div_val = 1, scope = 'adaptive_softmax', proj_same_dim = True,\n", 247 | " return_mean = True, **kwargs):\n", 248 | " def _logit(x, W, b, proj):\n", 249 | " y = x\n", 250 | " if proj is not None:\n", 251 | " y = tf.einsum('ibd,ed->ibe', y, proj)\n", 252 | " return tf.einsum('ibd, nd->ibn', y, W) + b\n", 253 | "\n", 254 | " params_W, params_projs = params[0], params[1]\n", 255 | "\n", 256 | " def _gather_logprob(logprob, target):\n", 257 | " lp_size = tf.shape(logprob)\n", 258 | " r = tf.range(lp_size[0])\n", 259 | " idx = tf.stack([r, target], 1)\n", 260 | " return tf.gather_nd(logprob, idx)\n", 261 | "\n", 262 | " with tf.variable_scope(scope):\n", 263 | " if len(cutoffs) == 0:\n", 264 | " softmax_b = tf.get_variable('bias', [n_token],\n", 265 | " initializer = tf.zeros_initializer())\n", 266 | " output = _logit(hidden, prams_W, softmax_b, params_projs)\n", 267 | " nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = target, logits = output)\n", 268 | " else:\n", 269 | " cutoff_ends = [0] + cutoffs + [n_token]\n", 270 | " nll = tf.zeros_like(target, dtype = tf.float32)\n", 271 | " for i in range(len(cutoff_ends) - 1):\n", 272 | " with tf.variable_scope('cutoff_{}'.format(i)):\n", 273 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n", 274 | " mask = (target >= l_idx) & (target < r_idx)\n", 275 | " mask_idx = tf.where(mask)\n", 276 | " cur_target = tf.boolean_mask(target, mask) - l_idx\n", 277 | " cur_d_embed = d_embed // (div_val ** i)\n", 278 | "\n", 279 | " if div_val == 1:\n", 280 | " cur_W = params_W[l_idx: r_idx]\n", 281 | " else:\n", 282 | " cur_W = params_W[i]\n", 283 | " cur_b = tf.get_variable('b', [r_idx - l_idx], initializer = tf.zeros_initializer())\n", 284 | " if tie_projs[i]:\n", 285 | " if div_val == 1:\n", 286 | " cur_proj = params_projs\n", 287 | " else:\n", 288 | " cur_proj = params_projs[i]\n", 289 | " else:\n", 290 | " if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:\n", 291 | " cur_proj = None\n", 292 | " else:\n", 293 | " cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],\n", 294 | " initializer = proj_initializer)\n", 295 | " if i == 0:\n", 296 | " cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],\n", 297 | " initializer = tf.zeros_initializer())\n", 298 | " cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],\n", 299 | " initializer = tf.zeros_initializer())\n", 300 | " cur_W = tf.concat([cur_W, cluster_W], 0)\n", 301 | " cur_b = tf.concat([cur_b, cluster_b], 0)\n", 302 | "\n", 303 | " head_logit = _logit(hidden, cur_W, cur_b, cur_proj)\n", 304 | " head_logprob = tf.nn.log_softmax(head_logit)\n", 305 | " cur_head_logprob = tf.boolean_mask(head_logprob, mask)\n", 306 | " cur_logprob = _gather_logprob(cur_head_logprob, 
cur_target)\n", 307 | " else:\n", 308 | " cur_head_logprob = tf.boolean_mask(head_logprob, mask)\n", 309 | " cur_hidden = tf.boolean_mask(hidden_mask)\n", 310 | " tail_logit = tf.squeeze(_logit(cur_hidden[None], cur_W, cur_b, cur_proj), 0)\n", 311 | " tail_logprob = tf.nn.log_softmax(tail_logit)\n", 312 | " cur_logprob = (cur_head_logprob[:, cutoff_ends[1]+i-1] + _gather_logprob(tail_logprob, cur_target))\n", 313 | " nll += tf.scatter_nd(mask_idx, -cur_logprob, tf.to_int64(tf.shape(nll)))\n", 314 | "\n", 315 | " if return_mean:\n", 316 | " nll = tf.reduce_mean(nll)\n", 317 | " return nll\n", 318 | "\n", 319 | "def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,\n", 320 | " params, tie_projs,\n", 321 | " initializer=None, proj_initializer=None,\n", 322 | " div_val=1, perms=None, proj_same_dim=True,\n", 323 | " scope='adaptive_softmax',\n", 324 | " **kwargs):\n", 325 | " def _logit(x, W, b, proj):\n", 326 | " y = x\n", 327 | " if x.shape.ndims == 3:\n", 328 | " if proj is not None:\n", 329 | " y = tf.einsum('ibd,ed->ibe', y, proj)\n", 330 | " return tf.einsum('ibd,nd->ibn', y, W) + b\n", 331 | " else:\n", 332 | " if proj is not None:\n", 333 | " y = tf.einsum('id,ed->ie', y, proj)\n", 334 | " return tf.einsum('id,nd->in', y, W) + b\n", 335 | "\n", 336 | " params_W, params_projs = params[0], params[1]\n", 337 | "\n", 338 | " with tf.variable_scope(scope):\n", 339 | " if len(cutoffs) == 0:\n", 340 | " softmax_b = tf.get_variable('bias', [n_token], initializer = tf.zeros_initializer())\n", 341 | " output = _logit(hidden, params_W, softmax_b, params_projs)\n", 342 | " nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = target, logits = output)\n", 343 | " nll = tf.reduce_mean(nll)\n", 344 | " else:\n", 345 | " total_loss, total_cnt = 0, 0\n", 346 | " cutoff_ends = [0] + cutoffs + [n_token]\n", 347 | " for i in range(len(cutoff_ends) - 1):\n", 348 | " with tf.variable_scope('cutoff_{}'.format(i)):\n", 349 | " l_idx, r_idx = cutoff_ends[i], cutoff_ends[i+1]\n", 350 | "\n", 351 | " cur_d_embed = d_embed // (div_val ** i)\n", 352 | "\n", 353 | " if div_val == 1:\n", 354 | " cur_W = params_W[l_idx: r_idx]\n", 355 | " else:\n", 356 | " cur_W = params_W[i]\n", 357 | " cur_b = tf.get_variable('b', [r_idx - l_idx], initializer = tf.zeros_initializer())\n", 358 | "\n", 359 | " if tie_projs[i]:\n", 360 | " if div_val == 1:\n", 361 | " cur_proj = params_projs\n", 362 | " else:\n", 363 | " cur_proj = params_projs[i]\n", 364 | " else:\n", 365 | " if (div_val == 1 of not proj_same_dim) and d_proj == cur_d_embed:\n", 366 | " cur_proj = None\n", 367 | " else:\n", 368 | " cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj], initializer = tf.zeros_initializer())\n", 369 | "\n", 370 | " if i == 0:\n", 371 | " cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],\n", 372 | " initializer = tf.zeros_initializer())\n", 373 | " cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],\n", 374 | " initializer = tf.zeros_initializer())\n", 375 | " cur_W = tf.concat([cur_W, cluster_W], 0)\n", 376 | " cur_b = tf.concat([cur_b, cluster_b], 0)\n", 377 | "\n", 378 | " head_logit = _logit(hidden, cur_W, cur_b, cur_proj)\n", 379 | "\n", 380 | " head_target = kwargs.get('head_target')\n", 381 | " head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(\n", 382 | " labels = head_target,\n", 383 | " logits = head_logit\n", 384 | " )\n", 385 | "\n", 386 | " masked_loss = head_nll * perms[i]\n", 387 | " total_loss += tf.reduce_sum(masked_loss)\n", 388 | " total_cnt += 
tf.reduce_sum(perms[i])\n", 389 | "\n", 390 | " else:\n", 391 | " cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])\n", 392 | "\n", 393 | " cur_hidden_tf.einsum('ibd,ibk->kd', hidden, perms[i])\n", 394 | " tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)\n", 395 | "\n", 396 | " tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx), perms[i])\n", 397 | " tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(\n", 398 | " labels = tf.to_int43(tail_target), logits = tail_logit\n", 399 | " )\n", 400 | "\n", 401 | " sum_nll = cur_head_nll + tail_nll\n", 402 | " mask = tf.reduce_sum(perms[i], [0, 1])\n", 403 | "\n", 404 | " masked_loss = sum_nll * mask\n", 405 | " total_loss += tf.reduce_sum(masked_loss)\n", 406 | " total_cnt += tf.reduce_sum(mask)\n", 407 | "\n", 408 | " nll = total_loss / total_cnt\n", 409 | "\n", 410 | " return nll\n", 411 | "\n", 412 | "def _create_mask(qlen, mlen, same_length = False):\n", 413 | " attn_mask = tf.ones([qlen, qlen])\n", 414 | " mask_u = tf.matrix_band_part(attn_mask, 0, -1)\n", 415 | " mask_dia = tf.matrix_band_part(attn_mask, 0, 0)\n", 416 | " attn_mask_pad = tf.zeros([qlen, mlen])\n", 417 | " ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)\n", 418 | " if same_length:\n", 419 | " mask_l = tf.matrix_band_part(attn_mask, -1, 0)\n", 420 | " ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)\n", 421 | " return ret\n", 422 | "\n", 423 | "def _cache_mem(curr_out, prev_mem, mem_len = None):\n", 424 | " if mem_len is None or prev_mem is None:\n", 425 | " new_mem = curr_out\n", 426 | " elif mem_len == 0:\n", 427 | " return prev_mem\n", 428 | " else:\n", 429 | " new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:]\n", 430 | "\n", 431 | " return tf.stop_gradient(new_mem)\n", 432 | "\n", 433 | "def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,\n", 434 | " n_head, d_head, d_inner, dropout, dropatt,\n", 435 | " initializer, is_training, proj_initializer=None,\n", 436 | " mem_len=None, cutoffs=[], div_val=1, tie_projs=[],\n", 437 | " same_length=False, clamp_len=-1, use_tpu=True,\n", 438 | " input_perms=None, target_perms=None, head_target=None,\n", 439 | " untie_r=False, proj_same_dim=True,\n", 440 | " scope='transformer'):\n", 441 | " new_mems = []\n", 442 | " with tf.variable_scope(scope):\n", 443 | " if untie_r:\n", 444 | " r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],\n", 445 | " initializer = initializer)\n", 446 | " r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],\n", 447 | " initializer = initializer)\n", 448 | " else:\n", 449 | " r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],\n", 450 | " initializer = initializer)\n", 451 | " r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],\n", 452 | " initializer = initializer)\n", 453 | " \n", 454 | " qlen = tf.shape(dec_inp)[0]\n", 455 | " mlen = tf.shape(mems[0])[0] is mems is not None else 0\n", 456 | " klen = mlen + qlen\n", 457 | "\n", 458 | " if proj_initializer is None:\n", 459 | " proj_initializer = initializer\n", 460 | " lookup_fn = (mul_adaptive_embedding_lookup is use_tpu else\n", 461 | " mask_adaptive_embedding_lookup)\n", 462 | " embeddings, shared_params = lookup_fn(\n", 463 | " x=dec_inp,\n", 464 | " n_token=n_token,\n", 465 | " d_embed=d_embed,\n", 466 | " d_proj=d_model,\n", 467 | " cutoffs=cutoffs,\n", 468 | " initializer=initializer,\n", 469 | " proj_initializer=proj_initializer,\n", 470 | " div_val= div_val,\n", 471 | " perms=input_perms,\n", 472 | " 
proj_same_dim=proj_same_dim)\n", 473 | " \n", 474 | " attn_mask = _create_mask(qlen, mlen, same_length)\n", 475 | "\n", 476 | " pos_seq = tf.range(klen - 1, -1, -1.0)\n", 477 | " if clasm_len > 0:\n", 478 | " pos_seq = tf.minimum(pos_seq, clamp_len)\n", 479 | " inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))\n", 480 | " pos_emb = positional_embedding(pos_seq, inv_freq)\n", 481 | "\n", 482 | " output = tf.layers.dropout(embeddings, dropot, training = is_training)\n", 483 | " pos_emb = tf.layers.dropout(pos_emb, dropout, training = is_training)\n", 484 | "\n", 485 | " if mems is None:\n", 486 | " mems = [None] * n_layer\n", 487 | "\n", 488 | " for i in range(n_layer):\n", 489 | " new_mems.append(_cache_mem(output, mems[i], mem_len))\n", 490 | "\n", 491 | " with tf.variable_scope('layer_{}'.format(i)):\n", 492 | " output = rel_multihead_attn(\n", 493 | " w=output,\n", 494 | " r=pos_emb,\n", 495 | " r_w_bias=r_w_bias if not untie_r else r_w_bias[i],\n", 496 | " r_r_bias=r_r_bias if not untie_r else r_r_bias[i],\n", 497 | " attn_mask=attn_mask,\n", 498 | " mems=mems[i],\n", 499 | " d_model=d_model,\n", 500 | " n_head=n_head,\n", 501 | " d_head=d_head,\n", 502 | " dropout=dropout,\n", 503 | " dropatt=dropatt,\n", 504 | " is_training=is_training,\n", 505 | " kernel_initializer=initializer\n", 506 | " )\n", 507 | " output = positionwise_FF(\n", 508 | " inp=output,\n", 509 | " d_model=d_model,\n", 510 | " d_inner=d_inner,\n", 511 | " dropout=dropout,\n", 512 | " kernel_initializer=initializer,\n", 513 | " is_training=is_training\n", 514 | " )\n", 515 | "\n", 516 | " output = tf.layers.dropout(output, dropout, training = is_training)\n", 517 | "\n", 518 | " logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else\n", 519 | " mask_adaptive_logsoftmax)\n", 520 | " loss = logsoftmax_fn(\n", 521 | " hidden=output,\n", 522 | " target=target,\n", 523 | " n_token=n_token,\n", 524 | " d_embed=d_embed,\n", 525 | " d_proj=d_model,\n", 526 | " cutoffs=cutoffs,\n", 527 | " params=shared_params,\n", 528 | " tie_projs=tie_projs,\n", 529 | " initializer=initializer,\n", 530 | " proj_initializer=proj_initializer,\n", 531 | " div_val=div_val,\n", 532 | " perms=target_perms,\n", 533 | " head_target=head_target,\n", 534 | " proj_same_dim=proj_same_dim\n", 535 | " )\n", 536 | "\n", 537 | " return loss, new_mems" 538 | ] 539 | } 540 | ] 541 | } 542 | -------------------------------------------------------------------------------- /Natural Language Processing/Transformer/README.md: -------------------------------------------------------------------------------- 1 | # Transformer Implementation 2 | 3 | https://github.com/tunz/transformer-pytorch/blob/e7266679f0b32fd99135ea617213f986ceede056/model/transformer.py#L201 참고하여 작성 4 | 5 | Transformer paper review: https://cartinoe5930.tistory.com/entry/Transformer-Attention-Is-All-You-Need-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/Transformer/Transformer_구현_실습.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyPEjZ5/XN13lrmM3kUVgIFW", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 
22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "nYoZgseydKyf" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import math\n", 38 | "\n", 39 | "import torch\n", 40 | "import torch.nn as nn\n", 41 | "import torch.nn.function as F\n", 42 | "\n", 43 | "from utils import utils\n", 44 | "\n", 45 | "def initialize_weight(x):\n", 46 | " nn.init.xavier_uniform_(x.weight)\n", 47 | " if x.bias is not None:\n", 48 | " nn.init.constant_(x.bias, 0)\n", 49 | "\n", 50 | "class FeedForwardNetwork(nn.Module):\n", 51 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n", 52 | " super(FeedForwardNetwork, self).__init__()\n", 53 | "\n", 54 | " self.layer1 = nn.Linear(hidden_size, filter_size)\n", 55 | " self.relu = nn.ReLU()\n", 56 | " self.dropout = nn.Dropout(dropout_rate)\n", 57 | " self.layer2 = nn.Linear(filter_size, hidden_size)\n", 58 | "\n", 59 | " initialize_weight(self.layer1)\n", 60 | " initialize_weight(self.layer2)\n", 61 | "\n", 62 | " def forward(self, x):\n", 63 | " x = self.layer1(x)\n", 64 | " x = self.relu(x)\n", 65 | " x = self.dropout(x)\n", 66 | " x = self.layer2(x)\n", 67 | " return x\n", 68 | "\n", 69 | "class MultiHeadAttention(nn.Moculde):\n", 70 | " def __init__(self, hidden_size, dropout_rate, head_size = 8):\n", 71 | " super(MultiHeadAttention, self).__init__()\n", 72 | "\n", 73 | " self.head_size = head_size\n", 74 | "\n", 75 | " self.att_size = att_size = hidden_size // head_size\n", 76 | " self.scale = arr_size ** -0.5\n", 77 | "\n", 78 | " self.linear_q = nn.Linear(hidden_size, head_size * att_size, bias = False)\n", 79 | " self.linear_k = nn.Linear(hidden_size, head_size * att_size, bias = False)\n", 80 | " self.linear_v = nn.Linear(hidden_size, head_size * att_size, bias = False)\n", 81 | " initialize_weight(self.linear_q)\n", 82 | " initialize_weight(self.linear_k)\n", 83 | " initialize_weight(self.linear_v)\n", 84 | "\n", 85 | " self.att_dropout = nn.Dropout(dropout_rate)\n", 86 | " \n", 87 | " self.output_layer = nn.Linear(head_size * att_size, hidden_size, bias = False)\n", 88 | " initialize_weight(self.output_layer)\n", 89 | "\n", 90 | " def forward(self, q, k, v, mask, cache = None):\n", 91 | " orig_q_size = q.size()\n", 92 | "\n", 93 | " d_k = self.att_size\n", 94 | " d_v = self.att_size\n", 95 | " batch_size = q.size(0)\n", 96 | "\n", 97 | " #head_i = Attention(Q(W^Q)_i, K(W^K)_i, V(W^V)_i)\n", 98 | " q = self.linear_q(q).view(batch_size, -1, self.head_size, d_k)\n", 99 | " if cache is not None and 'endec_k' in cache:\n", 100 | " k, v = cache['endec_k'], cahce['endec_v']\n", 101 | " else:\n", 102 | " k = self.linear_k(k).view(bacth_size, -1, self.head_size, d_k)\n", 103 | " v = self.linear_v(v).view(batch_size, -1, self.head_size, d_v)\n", 104 | "\n", 105 | " if cache is not None:\n", 106 | " cache['endec_k'], cache['endec_v'] = k, v\n", 107 | "\n", 108 | " q = q.transpose(1, 2) # [b, h, q_len, d_k]\n", 109 | " v = v.transpose(1, 2) # [b, h, v_len, d_v]\n", 110 | " k = k.transpose(1, 2).transpose(2, 3) # [b, h, d_k, k_len]\n", 111 | "\n", 112 | " #Scaled Dot-Product Attention\n", 113 | " #Attention(Q, K, V) = softmax((QK^T)/sqrt(d_k))V\n", 114 | " q.mul_(self.scale)\n", 115 | " x = torch.matmul(q, k) # [b, h, q_len, k_len]\n", 116 | " x.masked_fill_(mask.unsqueeze(1), -1e9)\n", 117 | " x = torch.softmax(x, dim = 3)\n", 118 | " x = self.att_dropout(x)\n", 119 | " x = x.matmul(v) # [b, h, q_len, 
attn]\n", 120 | "\n", 121 | " x = x.transpose(1, 2).contiguous() # [b, q_len, h, attn]\n", 122 | " x = x.view(batch_size, -1, self.head_size * d_v)\n", 123 | "\n", 124 | " x = self.output_layer(x)\n", 125 | "\n", 126 | " assert x.size() == orig_q_size\n", 127 | " return x\n", 128 | "\n", 129 | "class EncoderLayer(nn.Module):\n", 130 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n", 131 | " super(EncoderLayer, self).__init__()\n", 132 | "\n", 133 | " self.self_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 134 | " self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)\n", 135 | " self.self_attention_dropout = nn.Dropout(dropout_rate)\n", 136 | "\n", 137 | " self.enc_dec_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 138 | " self.enc_dec_attention = MultiHeadAttention(hidden_size, dropout_rate)\n", 139 | " self.enc_dec_attention_dropout = nn.Dropout(dropout_rate)\n", 140 | "\n", 141 | " self.ffn_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 142 | " self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)\n", 143 | " self.ffn_dropout = nn.Dropout(dropout_rate)\n", 144 | "\n", 145 | " def forward(self, x, enc_output, self_mask, i_mask, cache):\n", 146 | " y = self.self_attention_norm(x)\n", 147 | " y = self.self_attention(y, y, y, self_mask) #(q, k, v, mask)\n", 148 | " y = self.self_attention_dropout(y)\n", 149 | " x = x + y #skip connection\n", 150 | "\n", 151 | " y = self.ffn_norm(x)\n", 152 | " y = ffn(y)\n", 153 | " y = self.ffn_dropout(y)\n", 154 | " x = x + y #skip connection\n", 155 | " return x\n", 156 | "\n", 157 | "class DecoderLayer(nn.Module):\n", 158 | " def __init__(self, hidden_size, filter_size, dropout_rate):\n", 159 | " super(DecoderLayer, self).__init__()\n", 160 | "\n", 161 | " self.self_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 162 | " self.self_attention = MultiHeadAttention(hidden_size, dropout_rate)\n", 163 | " self.self_attention_dropout = nn.Dropout(dropout_rate)\n", 164 | "\n", 165 | " self.enc_dec_attention_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 166 | " self.enc_dec_attention = MultiHeadAttention(hidden_size, dropout_rate)\n", 167 | " self.enc_dec_attention_dropout = nn.Dropout(dropout_rate)\n", 168 | "\n", 169 | " self.ffn_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 170 | " self.ffn = FeedForwardNetwork(hidden_size, filter_size, dropout_rate)\n", 171 | " self.ffn_dropout = nn.Dropout(dropout_rate)\n", 172 | "\n", 173 | " def forward(self, x, enc_output, self_mask, i_mask, cache):\n", 174 | " y = self.self_attention_norm(x)\n", 175 | " y = self.self_attention(y, y, y, self_mask)\n", 176 | " y = self.self_attention_dropout(y)\n", 177 | " x = x + y\n", 178 | "\n", 179 | " if enc_output is not None:\n", 180 | " y = self.self_attention_norm(x)\n", 181 | " y = self.self_attention(y, enc_output, enc_output, i_mask, cache)\n", 182 | " y = self.enc_dec_attention_dropout(y)\n", 183 | " x = x + y\n", 184 | "\n", 185 | " y = self.ffn_norm(x)\n", 186 | " y = self.ffn(y)\n", 187 | " y = self.ffn_dropout(y)\n", 188 | " x = x + y\n", 189 | " return x\n", 190 | "\n", 191 | "class Encoder(nn.Module):\n", 192 | " def __init__(self, hidden_size, filter_size, dropout_rate, n_layers):\n", 193 | " super(Encoder, self).__init__()\n", 194 | "\n", 195 | " encoders = [EncoderLayer(hidden_size, filter_size, dropout_rate) for _ in range(n_layers)]\n", 196 | " self.layers = nn.ModuleList(encoders)\n", 197 | "\n", 198 | " self.last_norm = nn.LayerNorm(gidden_size, eps = 1e-6)\n", 199 | "\n", 
200 | " def forward(self, inputs, mask):\n", 201 | " encoder_output = inputs\n", 202 | " for enc_layer in self.layers:\n", 203 | " encoder_output = enc_layer(encoder_output, mask)\n", 204 | " return self.last_norm(encoder_output)\n", 205 | "\n", 206 | "class Decoder(nn.Module):\n", 207 | " def __init__(self, hidden_size, filter_size, dropout_rate, n_layers):\n", 208 | " super(Decoder, self).__init__()\n", 209 | "\n", 210 | " decoders = [DecoderLayer(hidden_size, filter_size, dropout_rate) for _ in range(n_layers)]\n", 211 | " self.layers = nn.ModuleList(decoders)\n", 212 | "\n", 213 | " self.last_norm = nn.LayerNorm(hidden_size, eps = 1e-6)\n", 214 | "\n", 215 | " def forward(self, targets, enc_output, i_mask, t_self_mask, cache):\n", 216 | " decoder_output = targets\n", 217 | " for i, dec_layer in enumerate(self.layers):\n", 218 | " layer_cache = None\n", 219 | " if cache is not None:\n", 220 | " if i not in cache:\n", 221 | " cache[i] = {}\n", 222 | " layer_cache = cache[i]\n", 223 | " decoder_output = dec_layer(decoder_output, enc_output, t_self_mask, i_mask, layer_cache)\n", 224 | "\n", 225 | " return self.last_norm(decoder_output)\n", 226 | "\n", 227 | "class Transformer(nn.Module):\n", 228 | " def __init__(self, i_vocab_size, t_vocab_size, n_layers = 6, hidden_size = 512, \n", 229 | " filter_size = 2048, dropout_rate = 0.1, share_target_embedding = True,\n", 230 | " has_inputs = True, src_pad_idx = None, trg_pad_idx = None):\n", 231 | " super(Transformer, self).__init__()\n", 232 | "\n", 233 | " self.hidden_size = hidden_size\n", 234 | " self.emb_scale = hidden_size ** 0.5\n", 235 | " self.has_inputs = has_inputs\n", 236 | " self.src_pad_idx = src_pad_idx\n", 237 | " self.trg_pad_idx = trg_pad_idx\n", 238 | "\n", 239 | " self.t_vocab_embedding = nn.Embedding(t_vocab_size, hidden_size)\n", 240 | " nn.init.normal_(self.t_vocab_embedding.weight, mead = 0, std = hidden_size ** -0.5)\n", 241 | " self.t_emb_dropout = nn.Dropout(dropout_rate)\n", 242 | " self.decoder = Decoder(hidden_size, filter_size, dropout_rate, n_layers)\n", 243 | "\n", 244 | " if has_inputs:\n", 245 | " if not share_target_embedding:\n", 246 | " self.i_vocab_embedding = nn.Embedding(i_vocab_size, hidden_size)\n", 247 | " nn.init.normal_(self.i_vocab_embedding.weight, mean = 0, std = hidden_size ** -0.5)\n", 248 | " else:\n", 249 | " self.i_vocab_embedding = self.t_vocab_embedding\n", 250 | "\n", 251 | " self.i_emb_dropout = nn.Dropout(dropout_rate)\n", 252 | "\n", 253 | " self.encoder = Encoder(hidden_size, filter_size, dropout_rate, n_layers)\n", 254 | "\n", 255 | " #Positional Encoding\n", 256 | " num_timescales = self.hidden_size // 2\n", 257 | " max_timescale = 10000.0\n", 258 | " min_timescale = 1.0\n", 259 | " log_timescale_increment = (\n", 260 | " math.log(floast(max_timescale) / float(min_timescale)) / \n", 261 | " max(num_timescale - 1, 1))\n", 262 | " inv_timescales = min_timescale * torch.exp(\n", 263 | " torch.arange(num_timescales, dtype = torch.float32) * \n", 264 | " -log_timescale_increment)\n", 265 | " self.register_buffer('inv_timescales', inv_timescales)\n", 266 | "\n", 267 | " def forward(self, inputs, targets):\n", 268 | " enc_output, i_mask = None, None\n", 269 | " if self.has_inputs:\n", 270 | " i_mask = utils.create_pad_mask(inputs, self.src_pad_idx)\n", 271 | " enc_output = self.encode(inputs, i_mask)\n", 272 | "\n", 273 | " t_mask = utils.create_pad_mask(targets, self.trg_pad_idx)\n", 274 | " target_size = targets.size()[1]\n", 275 | " t_self_mask = utils.create_trg_self_mask(target_size, 
device = targets.device)\n", 276 | "\n", 277 | " return self.decode(targets, enc_output, i_mask, t_self_mask, t_mask)\n", 278 | "\n", 279 | " def encode(self, inputs, i_mask):\n", 280 | " #Input Embedding\n", 281 | " input_embedded = self.i_vocab_embedding(inputs)\n", 282 | " input_embedded.masked_fill_(i_mask.squeeze(1).unaqueeze(-1), 0)\n", 283 | " input_embedded *= self.emb_scale\n", 284 | " input_embedded += self.get_position_encoding(inputs)\n", 285 | " input_embedded = self.i_emb_dropout(input_embedded)\n", 286 | "\n", 287 | " return self.encoder(input_embedded, i_mask)\n", 288 | "\n", 289 | " def decoder(self, targets, enc_output, i_mask, t_self_mask, t_mask, cache = None):\n", 290 | " #target embedding\n", 291 | " target_embedded = self.t_vocab_embedding(targets)\n", 292 | " target_embedded.masked_fill(t_mask.squeeze(1).unsqueeze(-1), 0)\n", 293 | "\n", 294 | " #Shfting\n", 295 | " target_embedded = target_embedded[:, :-1]\n", 296 | " target_embedded = F.pad(target_embedded, (0, 0, 1, 0))\n", 297 | "\n", 298 | " target_embedded *= self.emb_scale\n", 299 | " target_embedded += self.get_position_encoding(targets)\n", 300 | " target_embedded = self.t_emb_dropout(target_embedded)\n", 301 | "\n", 302 | " #decoder\n", 303 | " decoder_output = self.decoder(target_embedded, enc_output, i_mask, t_self_mask, cache)\n", 304 | "\n", 305 | " #linear\n", 306 | " output = torch.matmul(decoder_output, self.t_vocab_embedding.weight.transpose(0, 1))\n", 307 | "\n", 308 | " return output\n", 309 | "\n", 310 | " def get_position_encoding(self, x):\n", 311 | " max_length = x.size()[1]\n", 312 | " position = torch.arange(max_length, dtype = torch.float32, device = x.device)\n", 313 | " scaled_time = position.unsqueeze(1) * self.inv_timescales.unsqueeze(0)\n", 314 | " signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim = 1)\n", 315 | " signal = F.pad(signal, (0, 0, 0, self.hidden_size % 2))\n", 316 | " signal = signal.view(1, max_length, self.hidden_size)\n", 317 | " return signal" 318 | ] 319 | } 320 | ] 321 | } 322 | -------------------------------------------------------------------------------- /Natural Language Processing/XLNet/README.md: -------------------------------------------------------------------------------- 1 | # XLNet Implementaion 2 | 3 | https://github.com/graykode/xlnet-Pytorch/blob/master/xlnet.py 참고하여 작성됌 4 | 5 | paper review: https://cartinoe5930.tistory.com/entry/XLNet-Generalized-Autoregressive-Pretraining-for-Language-Understanding-%EB%85%BC%EB%AC%B8-%EB%A6%AC%EB%B7%B0 6 | -------------------------------------------------------------------------------- /Natural Language Processing/XLNet/XLNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyO7PtJswjOPtdrbUQcICrcp", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "GgYufJCgpn7Z" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from __future__ import absolute_import\n", 38 | "from __future__ import division\n", 39 | "from __future__ 
import print_function\n", 40 | "\n", 41 | "import json\n", 42 | "import os\n", 43 | "import tensorflow as tf\n", 44 | "import modeling\n", 45 | "\n", 46 | "def _get_initializer(FLAGS):\n", 47 | " # 변수 초기화\n", 48 | " if FLAGS.init == 'uniform':\n", 49 | " initializer = tf.initializers.random_uniform(\n", 50 | " minval = -FLAGS.init_range,\n", 51 | " maxval = FLAGS.init_range,\n", 52 | " seed = None\n", 53 | " )\n", 54 | "\n", 55 | " elif FLAGS.init == 'normal':\n", 56 | " initializer = tf.initializers.random_normal(\n", 57 | " stddev = FLAGS.init_std,\n", 58 | " seed = None\n", 59 | " )\n", 60 | "\n", 61 | " else:\n", 62 | " raise ValueError('Initializer {} not supported'.format(FALGS.init))\n", 63 | " return initializer\n", 64 | "\n", 65 | "class XLNetConfig(object):\n", 66 | " ''' XLNetConfig는 model checkpoint에 특정된 하이퍼 파라미터를 포함하고 있음\n", 67 | " 이 하이퍼 파라미터들은 pre-training 시와 fine-tuning 시에 모두 같아야 함\n", 68 | "\n", 69 | " n_layer: 레이어의 수\n", 70 | " d_model: hidden size\n", 71 | " n_head: attention head의 수\n", 72 | " d_head: 각 attention head의 차원 크기\n", 73 | " d_inner: feed-forward layer에서 hidden size\n", 74 | " ff_activation: 'relu' 또는 'gelu'\n", 75 | " untie_r: attention에서 bias들을 untie할 지 말지 결정\n", 76 | " n_token: vocab_size\n", 77 | " '''\n", 78 | "\n", 79 | " def __init__(self, FLAGS = None, json_path = None):\n", 80 | " '''\n", 81 | " XLNetConfig 구조\n", 82 | " 하나의 FLAGS 또는 json_path는 제공되어야 한다.\n", 83 | " '''\n", 84 | "\n", 85 | " assert FLAGS is not None or json_path is not None\n", 86 | "\n", 87 | " self.keys = ['n_layer', 'd_model', 'n_head', 'd_head', 'd_inner', 'ff_activation', \n", 88 | " 'untie_r', 'n_token']\n", 89 | "\n", 90 | " if FLAGS is not None:\n", 91 | " self.init_from_flags(FLAGS)\n", 92 | "\n", 93 | " if json_path is not None:\n", 94 | " self.init_from_json(json_path)\n", 95 | "\n", 96 | " def init_from_flags(self, FLAGS):\n", 97 | " for key in self.keys:\n", 98 | " setattr(self, key, getattr(FLAGS, key))\n", 99 | "\n", 100 | " def init_from_json(self, FLAGS):\n", 101 | " with tf.gfile.Open(json_path) as f:\n", 102 | " json_data = json.load(f)\n", 103 | " for key in self.keys:\n", 104 | " setattr(self, key, json_data[key])\n", 105 | "\n", 106 | " def to_json(self, json_path):\n", 107 | " # XLNetConfig를 json 파일로 저장\n", 108 | " json_data = {}\n", 109 | " for key in self.keys:\n", 110 | " json_data[key] = getattr(self, key)\n", 111 | "\n", 112 | " json_dir = os.path.dirname(json_path)\n", 113 | " if not tf.gfile.Exists(json_dir):\n", 114 | " tf.gfile.MakeDirs(json_dir)\n", 115 | " with tf.gfile.Open(json_path, 'w') as f:\n", 116 | " json.dump(json_data, f, indent = 4, sort_keys = True)\n", 117 | "\n", 118 | "def create_run_config(is_training, is_finetune, FLAGS):\n", 119 | " kwargs = dict(\n", 120 | " is_training=is_training,\n", 121 | " use_tpu=FLAGS.use_tpu,\n", 122 | " use_bfloat16=FLAGS.use_bfloat16,\n", 123 | " dropout=FLAGS.dropout,\n", 124 | " dropatt=FLAGS.dropatt,\n", 125 | " init=FLAGS.init,\n", 126 | " init_range=FLAGS.init_range,\n", 127 | " init_std=FLAGS.init_std,\n", 128 | " clamp_len=FLAGS.clamp_len\n", 129 | " )\n", 130 | "\n", 131 | " if not is_finetune:\n", 132 | " kwargs.update(dict(\n", 133 | " mem_len=FLAGS.mem_len,\n", 134 | " reuse_len=FLAGS.reuse_len,\n", 135 | " bi_data=FLAGS.bi_data,\n", 136 | " clamp_len=FLAGS.clamp_len,\n", 137 | " same_length=FLAGS.same_length\n", 138 | " ))\n", 139 | "\n", 140 | " return RunConfig(**kwargs)\n", 141 | "\n", 142 | "class RunConfig(object):\n", 143 | " '''\n", 144 | " RunConfig는 pre-training과 fine-tuning에서 서로 다른 하이퍼 
파라미터를 가져야 함.\n", 145 | " 이 하이퍼 파라미터들은 실행할 때마다 변경할 수 있다.\n", 146 | " '''\n", 147 | "\n", 148 | " def __init__(self, is_training, use_tpu, use_bfloat16, dropout, dropatt,\n", 149 | " init = 'normal', init_range = 0.1, init_std = 0.02, mem_len = None,\n", 150 | " reuse_len = None, bi_data = False, clamp_len = -1, same_length = False):\n", 151 | " '''\n", 152 | " is_training: 학습 모드인지 아닌지 확인\n", 153 | " use_tpu: TPU를 사용할 지 말 지 확인\n", 154 | " use_bfloat16: float32 대신에 bfloat16 사용\n", 155 | " dropout: dropout 비율\n", 156 | " dropatt: attention 확률에 dropout 비율\n", 157 | " init: 초기화 scheme. 'normal' 또는 'uniform' 둘 중 하나\n", 158 | " init_range: [-init_range, init_range]에서 균일한 분포를 사용해서 파라미터를 초기화\n", 159 | " init='uniform'일 때 가장 효과적임\n", 160 | " mem_len: 캐시해둘 토큰의 수\n", 161 | " reuse_len: 캐시되고 향후 재사용될 현재 배치의 토큰 수이다.\n", 162 | " bi_data: 양방향성 입력 파이프라인을 사용할 지 말 지 정함. \n", 163 | " pre-training 중에는 True를 사용, fine-tuning 중에는 False를 사용\n", 164 | " clamp_len: clamp_len보다 큰 모든 상대 거리를 고정한다다. -1은 클램핑이 없음을 의미한다.\n", 165 | " same_length: 각 토큰에 대해 똑같은 attention length를 사용할 지 말 지 결정\n", 166 | " '''\n", 167 | "\n", 168 | " self.init = init\n", 169 | " self.init_range = init_range\n", 170 | " self.init_std = init_std\n", 171 | " self.is_training = is_training\n", 172 | " self.dropout = dropout\n", 173 | " self.dropatt = dropatt\n", 174 | " self.use_tpu = use_tpu\n", 175 | " self.use_bfloat16 = use_bfloat16\n", 176 | " self.mem_len = mem_len\n", 177 | " self.reuse_len = reuse_len\n", 178 | " self.bi_data = bi_data\n", 179 | " self.clamp_len = clamp_len\n", 180 | " self.same_length = same_length\n", 181 | "\n", 182 | "class XLNetModel(object):\n", 183 | " # pre-training 및 fine-tuning 중에 사용되는 XLNet 모델의 wrapper이다.\n", 184 | "\n", 185 | " def __init__(self, xlnet_config, run_config, input_ids, seg_ids, input_mask,\n", 186 | " memes = None, perm_mask = None, target_mapping = None, inp_q = None,\n", 187 | " **kwargs):\n", 188 | " \n", 189 | " initializer = _get_initializer(run_config)\n", 190 | "\n", 191 | " tfm_args = dict(\n", 192 | " n_token=xlnet_config.n_token,\n", 193 | " initializer=initializer,\n", 194 | " attn_type=\"bi\",\n", 195 | " n_layer=xlnet_config.n_layer,\n", 196 | " d_model=xlnet_config.d_model,\n", 197 | " n_head=xlnet_config.n_head,\n", 198 | " d_head=xlnet_config.d_head,\n", 199 | " d_inner=xlnet_config.d_inner,\n", 200 | " ff_activation=xlnet_config.ff_activation,\n", 201 | " untie_r=xlnet_config.untie_r,\n", 202 | "\n", 203 | " is_training=run_config.is_training,\n", 204 | " use_bfloat16=run_config.use_bfloat16,\n", 205 | " use_tpu=run_config.use_tpu,\n", 206 | " dropout=run_config.dropout,\n", 207 | " dropatt=run_config.dropatt,\n", 208 | "\n", 209 | " mem_len=run_config.mem_len,\n", 210 | " reuse_len=run_config.reuse_len,\n", 211 | " bi_data=run_config.bi_data,\n", 212 | " clamp_len=run_config.clamp_len,\n", 213 | " same_length=run_config.same_length\n", 214 | " )\n", 215 | "\n", 216 | " input_args = dict(\n", 217 | " inp_k=input_ids,\n", 218 | " seg_id=seg_ids,\n", 219 | " input_mask=input_mask,\n", 220 | " mems=mems,\n", 221 | " perm_mask=perm_mask,\n", 222 | " target_mapping=target_mapping,\n", 223 | " inp_q=inp_q\n", 224 | " )\n", 225 | "\n", 226 | " with tf.variable_scope('model', reuse = tf.AUTO_REUSE):\n", 227 | " (self.output, self.new_mems, self.lookup_table) = modeling.transformer_xl(**tfm_args)\n", 228 | "\n", 229 | " self.input_mask = input_mask\n", 230 | " self.initializer = initializer\n", 231 | " self.clnet_config = clnet_config\n", 232 | " self.run_config = run_config\n", 233 | "\n", 
234 | " def get_pooled_out(self, summary_type, use_summ_proj = True):\n", 235 | " xlnet_config = self.xlnet_config\n", 236 | " run_config = self.run_config\n", 237 | "\n", 238 | " with tf.variable_scope('model', reuse = tf.AUTO_REUSE):\n", 239 | " summary = modeling.summarize_sequence(\n", 240 | " summary_type=summary_type,\n", 241 | " hidden=self.output,\n", 242 | " d_model=xlnet_config.d_model,\n", 243 | " n_head=xlnet_config.n_head,\n", 244 | " d_head=xlnet_config.d_head,\n", 245 | " dropout=run_config.dropout,\n", 246 | " dropatt=run_config.dropatt,\n", 247 | " is_training=run_config.is_training,\n", 248 | " input_mask=self.input_mask,\n", 249 | " initializer=self.initializer,\n", 250 | " use_proj=use_summ_proj\n", 251 | " )\n", 252 | "\n", 253 | " return summary\n", 254 | "\n", 255 | " def get_sequence_output(self):\n", 256 | " # XLNet의 마지막 레이어의 hidden representation\n", 257 | " \n", 258 | " return self.output\n", 259 | "\n", 260 | " def get_new_memory(self):\n", 261 | " # 이전 메모리와 현재 input representation을 합친 new memory\n", 262 | " # list의 길이는 n_layer와 같음\n", 263 | " return self.new_mems\n", 264 | "\n", 265 | " def get_embedding_table(self):\n", 266 | " # embedding lookup table\n", 267 | " # input 레이어와 output 레이어 간의 embedding tie\n", 268 | " return self.lookup_table\n", 269 | "\n", 270 | " def get_initializer(self):\n", 271 | " # tf initilizer\n", 272 | " # XLNet의 top layer에서 변수들을 초기화하기 위해 사용\n", 273 | " return self.initializer" 274 | ] 275 | } 276 | ] 277 | } 278 | --------------------------------------------------------------------------------
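(A hedged usage sketch, not part of the original notebook, of how the XLNetModel wrapper above is typically driven for sentence-level fine-tuning. `FLAGS`, `max_seq_len`, `batch_size`, and `n_class` are illustrative placeholders, and the tensors are assumed time-major, i.e. [seq_len, batch_size], because the wrapper passes them straight to modeling.transformer_xl:

import tensorflow as tf

# hyperparameters fixed at pre-training time are read from the saved json
xlnet_config = XLNetConfig(json_path='xlnet_config.json')
# run-time hyperparameters (dropout, TPU usage, ...) come from FLAGS
run_config = create_run_config(is_training=True, is_finetune=True, FLAGS=FLAGS)

input_ids  = tf.placeholder(tf.int32,   [max_seq_len, batch_size])
seg_ids    = tf.placeholder(tf.int32,   [max_seq_len, batch_size])
input_mask = tf.placeholder(tf.float32, [max_seq_len, batch_size])

model = XLNetModel(xlnet_config=xlnet_config, run_config=run_config,
                   input_ids=input_ids, seg_ids=seg_ids, input_mask=input_mask)

# (batch_size, d_model) summary of the sequence, then a task-specific head on top
summary = model.get_pooled_out(summary_type='last', use_summ_proj=True)
logits = tf.layers.dense(summary, n_class, kernel_initializer=model.get_initializer())
)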