├── .gitignore ├── 20_07_02_박장원님_ELECTRA.ipynb ├── 20_07_09_김현님_gpt.ipynb ├── 20_07_23_김보섭_BERT.ipynb ├── 20_08_13_박혜웅_ALBERT.ipynb └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /20_07_02_박장원님_ELECTRA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ELECTRA on 🤗 Transformers 🤗" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/plain": [ 18 | "'2.11.0'" 19 | ] 20 | }, 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "output_type": "execute_result" 24 | } 25 | ], 26 | "source": [ 27 | "import transformers\n", 28 | "transformers.__version__" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# 불필요한 로깅 메시지 제거용\n", 38 | "import logging\n", 39 | "\n", 40 | "logging.getLogger().setLevel(logging.WARN)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Model Architecture" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | 
"metadata": {}, 53 | "source": [ 54 | "![](https://user-images.githubusercontent.com/28896432/80024445-0f444e00-851a-11ea-9137-9da2abfd553d.png)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## TL;DR (Example)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### 1. Discriminator\n", 69 | "- [electra-base-discriminator](https://huggingface.co/google/electra-base-discriminator#how-to-use-the-discriminator-in-transformers)\n", 70 | "- Fake Token Detection" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stderr", 80 | "output_type": "stream", 81 | "text": [ 82 | "C:\\Users\\Park_Jang_Won\\Anaconda3\\envs\\tensorflow\\lib\\site-packages\\transformers\\tokenization_utils.py:831: FutureWarning: Parameter max_len is deprecated and will be removed in a future release. Use model_max_length instead.\n", 83 | " category=FutureWarning,\n" 84 | ] 85 | }, 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "[('나는', 0.0), ('왜', 1.0), ('밥을', 0.0), ('먹었다', 0.0), ('.', 0.0)]\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "import torch\n", 96 | "from transformers import ElectraForPreTraining, ElectraTokenizer\n", 97 | "from pprint import pprint\n", 98 | "\n", 99 | "discriminator = ElectraForPreTraining.from_pretrained(\"monologg/koelectra-base-discriminator\")\n", 100 | "tokenizer = ElectraTokenizer.from_pretrained(\"monologg/koelectra-base-discriminator\")\n", 101 | "\n", 102 | "sentence = \"나는 방금 밥을 먹었다.\"\n", 103 | "fake_sentence = \"나는 왜 밥을 먹었다.\"\n", 104 | "\n", 105 | "fake_tokens = tokenizer.tokenize(fake_sentence)\n", 106 | "fake_inputs = tokenizer.encode(fake_sentence, return_tensors=\"pt\")\n", 107 | "\n", 108 | "discriminator_outputs = discriminator(fake_inputs)\n", 109 | "predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2)\n", 110 | "\n", 111 
| "pprint(list(zip(fake_tokens, predictions.tolist()[1:-1])))" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "### 2. Generator\n", 119 | "\n", 120 | "- [electra-base-generator](https://huggingface.co/google/electra-base-generator#how-to-use-the-generator-in-transformers)\n", 121 | "- 기존 BERT의 Mask Token Prediction과 동일하다고 생각하면 됨!" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "application/vnd.jupyter.widget-view+json": { 132 | "model_id": "a37afcb376274353a0dcdc9df541a442", 133 | "version_major": 2, 134 | "version_minor": 0 135 | }, 136 | "text/plain": [ 137 | "HBox(children=(IntProgress(value=0, description='Downloading', max=463, style=ProgressStyle(description_width=…" 138 | ] 139 | }, 140 | "metadata": {}, 141 | "output_type": "display_data" 142 | }, 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "\n" 148 | ] 149 | }, 150 | { 151 | "data": { 152 | "application/vnd.jupyter.widget-view+json": { 153 | "model_id": "6f0bab4fb58342dc97d67b364f6c652e", 154 | "version_major": 2, 155 | "version_minor": 0 156 | }, 157 | "text/plain": [ 158 | "HBox(children=(IntProgress(value=0, description='Downloading', max=279173, style=ProgressStyle(description_wid…" 159 | ] 160 | }, 161 | "metadata": {}, 162 | "output_type": "display_data" 163 | }, 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "\n" 169 | ] 170 | }, 171 | { 172 | "data": { 173 | "application/vnd.jupyter.widget-view+json": { 174 | "model_id": "b20f94c8a8a343639705214492c92fd1", 175 | "version_major": 2, 176 | "version_minor": 0 177 | }, 178 | "text/plain": [ 179 | "HBox(children=(IntProgress(value=0, description='Downloading', max=51, style=ProgressStyle(description_width='…" 180 | ] 181 | }, 182 | "metadata": {}, 183 | "output_type": "display_data" 184 | }, 185 | { 186 | "name": 
"stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "\n" 190 | ] 191 | }, 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "C:\\Users\\Park_Jang_Won\\Anaconda3\\envs\\tensorflow\\lib\\site-packages\\transformers\\tokenization_utils.py:831: FutureWarning: Parameter max_len is deprecated and will be removed in a future release. Use model_max_length instead.\n", 197 | " category=FutureWarning,\n" 198 | ] 199 | }, 200 | { 201 | "data": { 202 | "application/vnd.jupyter.widget-view+json": { 203 | "model_id": "cec86e2841ee48aabc43964c03edc668", 204 | "version_major": 2, 205 | "version_minor": 0 206 | }, 207 | "text/plain": [ 208 | "HBox(children=(IntProgress(value=0, description='Downloading', max=140170987, style=ProgressStyle(description_…" 209 | ] 210 | }, 211 | "metadata": {}, 212 | "output_type": "display_data" 213 | }, 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "\n", 219 | "[{'score': 0.0713089108467102,\n", 220 | " 'sequence': '[CLS] 나는 식당에서 밥을 먹었다. [SEP]',\n", 221 | " 'token': 26194},\n", 222 | " {'score': 0.04359067603945732,\n", 223 | " 'sequence': '[CLS] 나는 방금 밥을 먹었다. [SEP]',\n", 224 | " 'token': 24499},\n", 225 | " {'score': 0.02970987744629383,\n", 226 | " 'sequence': '[CLS] 나는 다시 밥을 먹었다. [SEP]',\n", 227 | " 'token': 10715},\n", 228 | " {'score': 0.027878431603312492,\n", 229 | " 'sequence': '[CLS] 나는 앉아서 밥을 먹었다. [SEP]',\n", 230 | " 'token': 23755},\n", 231 | " {'score': 0.025679852813482285,\n", 232 | " 'sequence': '[CLS] 나는 내 밥을 먹었다. 
[SEP]',\n", 233 | " 'token': 783}]\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "from transformers import pipeline\n", 239 | "from pprint import pprint\n", 240 | "\n", 241 | "fill_mask = pipeline(\n", 242 | " \"fill-mask\",\n", 243 | " model=\"monologg/koelectra-base-generator\",\n", 244 | " tokenizer=\"monologg/koelectra-base-generator\"\n", 245 | ")\n", 246 | "\n", 247 | "pprint(fill_mask(\"나는 {} 밥을 먹었다.\".format(fill_mask.tokenizer.mask_token)))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "## Detail Review for Code" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "- Pretraining의 경우는 원 저자의 Tensorflow 코드를 쓰는 것을 추천\n", 262 | "- Huggingface Transformers의 코드는 Pretraining이 완료된 모델을 가져다 쓰는 용도\n", 263 | "- [modeling_electra.py](https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_electra.py)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 24, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "import torch\n", 273 | "import torch.nn as nn\n", 274 | "\n", 275 | "from transformers.modeling_bert import BertEmbeddings\n", 276 | "from transformers.modeling_electra import ElectraPreTrainedModel" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "### 1. ElectraEmbeddings\n", 284 | "\n", 285 | "BertEmbeddings와 동일!" 
286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 21, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "class ElectraEmbeddings(BertEmbeddings):\n", 295 | " \"\"\"Construct the embeddings from word, position and token_type embeddings.\"\"\"\n", 296 | "\n", 297 | " def __init__(self, config):\n", 298 | " super().__init__(config)\n", 299 | " self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id)\n", 300 | " self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)\n", 301 | " self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size)\n", 302 | "\n", 303 | " # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load\n", 304 | " # any TensorFlow checkpoint file\n", 305 | " self.LayerNorm = BertLayerNorm(config.embedding_size, eps=config.layer_norm_eps)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### 2. ElectraModel\n", 313 | "\n", 314 | "BertModel과 동일하지만 Pooler는 없음!" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "### 3. 
ElectraForPreTraining\n", 322 | "\n", 323 | "- Electra model with a binary classification head on top as used during pre-training for identifying generated tokens.\n", 324 | "- 개인적으로 이 클래스 이름을 좋아하진 않음\n", 325 | "- 논문에서는 `ElectraModel`도 discriminator의 것을 사용하라고 하고 있음\n", 326 | "\n", 327 | "```python\n", 328 | "model = ElectraForPreTraining.from_pretrained(\"google/electra-base-discriminator\")\n", 329 | "```" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 15, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "from transformers import ElectraForPreTraining\n", 339 | "\n", 340 | "model = ElectraForPreTraining.from_pretrained(\"google/electra-base-discriminator\")" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "- `nn.Linear(config.hidden_size, 1)`를 통과한 후, `BCEWithLogitsLoss`를 사용하여 Sigmoid 적용!" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 25, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "class ElectraDiscriminatorPredictions(nn.Module):\n", 357 | " \"\"\"Prediction module for the discriminator, made up of two dense layers.\"\"\"\n", 358 | "\n", 359 | " def __init__(self, config):\n", 360 | " super().__init__()\n", 361 | "\n", 362 | " self.dense = nn.Linear(config.hidden_size, config.hidden_size)\n", 363 | " self.dense_prediction = nn.Linear(config.hidden_size, 1)\n", 364 | " self.config = config\n", 365 | "\n", 366 | " def forward(self, discriminator_hidden_states, attention_mask):\n", 367 | " hidden_states = self.dense(discriminator_hidden_states)\n", 368 | " hidden_states = get_activation(self.config.hidden_act)(hidden_states)\n", 369 | " logits = self.dense_prediction(hidden_states).squeeze()\n", 370 | "\n", 371 | " return logits" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 26, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "class 
ElectraForPreTraining(ElectraPreTrainedModel):\n", 381 | " def __init__(self, config):\n", 382 | " super().__init__(config)\n", 383 | "\n", 384 | " self.electra = ElectraModel(config)\n", 385 | " self.discriminator_predictions = ElectraDiscriminatorPredictions(config)\n", 386 | " self.init_weights()\n", 387 | " \n", 388 | " def forward(\n", 389 | " self,\n", 390 | " input_ids=None,\n", 391 | " attention_mask=None,\n", 392 | " token_type_ids=None,\n", 393 | " position_ids=None,\n", 394 | " head_mask=None,\n", 395 | " inputs_embeds=None,\n", 396 | " labels=None,\n", 397 | " output_attentions=None,\n", 398 | " output_hidden_states=None,\n", 399 | " ):\n", 400 | " discriminator_hidden_states = self.electra(\n", 401 | " input_ids,\n", 402 | " attention_mask,\n", 403 | " token_type_ids,\n", 404 | " position_ids,\n", 405 | " head_mask,\n", 406 | " inputs_embeds,\n", 407 | " output_attentions,\n", 408 | " output_hidden_states,\n", 409 | " )\n", 410 | " discriminator_sequence_output = discriminator_hidden_states[0]\n", 411 | "\n", 412 | " logits = self.discriminator_predictions(discriminator_sequence_output, attention_mask)\n", 413 | "\n", 414 | " output = (logits,)\n", 415 | "\n", 416 | " if labels is not None:\n", 417 | " loss_fct = nn.BCEWithLogitsLoss()\n", 418 | " if attention_mask is not None:\n", 419 | " active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1\n", 420 | " active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss]\n", 421 | " active_labels = labels[active_loss]\n", 422 | " loss = loss_fct(active_logits, active_labels.float())\n", 423 | " else:\n", 424 | " loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float())\n", 425 | "\n", 426 | " output = (loss,) + output\n", 427 | "\n", 428 | " output += discriminator_hidden_states[1:]\n", 429 | "\n", 430 | " return output # (loss), scores, (hidden_states), (attentions)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 
| "metadata": {}, 436 | "source": [ 437 | "### 4. ElectraForMaskedLM\n", 438 | "\n", 439 | "- Electra model with a language modeling head on top.\n", 440 | "- 우리가 아는 BERT의 Masked Token Prediction\n", 441 | "\n", 442 | "```python\n", 443 | "model = ElectraForMaskedLM.from_pretrained('google/electra-base-generator')\n", 444 | "```" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 27, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "from transformers import ElectraForMaskedLM\n", 454 | "\n", 455 | "model = ElectraForMaskedLM.from_pretrained('google/electra-base-generator')" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 29, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "class ElectraGeneratorPredictions(nn.Module):\n", 465 | " \"\"\"Prediction module for the generator, made up of two dense layers.\"\"\"\n", 466 | "\n", 467 | " def __init__(self, config):\n", 468 | " super().__init__()\n", 469 | "\n", 470 | " self.LayerNorm = BertLayerNorm(config.embedding_size)\n", 471 | " self.dense = nn.Linear(config.hidden_size, config.embedding_size)\n", 472 | "\n", 473 | " def forward(self, generator_hidden_states):\n", 474 | " hidden_states = self.dense(generator_hidden_states)\n", 475 | " hidden_states = get_activation(\"gelu\")(hidden_states)\n", 476 | " hidden_states = self.LayerNorm(hidden_states)\n", 477 | "\n", 478 | " return hidden_states" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 30, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "class ElectraForMaskedLM(ElectraPreTrainedModel):\n", 488 | " def __init__(self, config):\n", 489 | " super().__init__(config)\n", 490 | "\n", 491 | " self.electra = ElectraModel(config)\n", 492 | " self.generator_predictions = ElectraGeneratorPredictions(config)\n", 493 | "\n", 494 | " self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size)\n", 495 | " 
self.init_weights()\n", 496 | "\n", 497 | " def get_output_embeddings(self):\n", 498 | " return self.generator_lm_head\n", 499 | " \n", 500 | " def forward(\n", 501 | " self,\n", 502 | " input_ids=None,\n", 503 | " attention_mask=None,\n", 504 | " token_type_ids=None,\n", 505 | " position_ids=None,\n", 506 | " head_mask=None,\n", 507 | " inputs_embeds=None,\n", 508 | " labels=None,\n", 509 | " output_attentions=None,\n", 510 | " output_hidden_states=None,\n", 511 | " **kwargs\n", 512 | " ):\n", 513 | " if \"masked_lm_labels\" in kwargs:\n", 514 | " warnings.warn(\n", 515 | " \"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.\",\n", 516 | " DeprecationWarning,\n", 517 | " )\n", 518 | " labels = kwargs.pop(\"masked_lm_labels\")\n", 519 | " assert kwargs == {}, f\"Unexpected keyword arguments: {list(kwargs.keys())}.\"\n", 520 | "\n", 521 | " generator_hidden_states = self.electra(\n", 522 | " input_ids,\n", 523 | " attention_mask,\n", 524 | " token_type_ids,\n", 525 | " position_ids,\n", 526 | " head_mask,\n", 527 | " inputs_embeds,\n", 528 | " output_attentions,\n", 529 | " output_hidden_states,\n", 530 | " )\n", 531 | " generator_sequence_output = generator_hidden_states[0]\n", 532 | "\n", 533 | " prediction_scores = self.generator_predictions(generator_sequence_output)\n", 534 | " prediction_scores = self.generator_lm_head(prediction_scores)\n", 535 | "\n", 536 | " output = (prediction_scores,)\n", 537 | "\n", 538 | " # Masked language modeling softmax layer\n", 539 | " if labels is not None:\n", 540 | " loss_fct = nn.CrossEntropyLoss() # -100 index = padding token\n", 541 | " loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))\n", 542 | " output = (loss,) + output\n", 543 | "\n", 544 | " output += generator_hidden_states[1:]\n", 545 | "\n", 546 | " return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)" 547 | ] 548 | } 549 | ], 550 | "metadata": 
{ 551 | "kernelspec": { 552 | "display_name": "tensorflow", 553 | "language": "python", 554 | "name": "tensorflow" 555 | }, 556 | "language_info": { 557 | "codemirror_mode": { 558 | "name": "ipython", 559 | "version": 3 560 | }, 561 | "file_extension": ".py", 562 | "mimetype": "text/x-python", 563 | "name": "python", 564 | "nbconvert_exporter": "python", 565 | "pygments_lexer": "ipython3", 566 | "version": "3.6.6" 567 | } 568 | }, 569 | "nbformat": 4, 570 | "nbformat_minor": 2 571 | } 572 | -------------------------------------------------------------------------------- /20_07_23_김보섭_BERT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# BERT\n", 8 | "`kcbert`를 기반으로 파악해보기\n", 9 | "- `kcbert`\n", 10 | " - huggingface.co: https://huggingface.co/beomi/kcbert-base\n", 11 | " - github: https://github.com/Beomi/KcBERT\n", 12 | "\n", 13 | "- `config.json`\n", 14 | "```json\n", 15 | "{\n", 16 | "max_position_embeddings: 300,\n", 17 | "hidden_dropout_prob: 0.1,\n", 18 | "pooler_size_per_head: 128,\n", 19 | "hidden_act: \"gelu\",\n", 20 | "initializer_range: 0.02,\n", 21 | "num_hidden_layers: 12,\n", 22 | "pooler_num_attention_heads: 12,\n", 23 | "type_vocab_size: 2,\n", 24 | "vocab_size: 30000,\n", 25 | "hidden_size: 768,\n", 26 | "attention_probs_dropout_prob: 0.1,\n", 27 | "directionality: \"bidi\",\n", 28 | "num_attention_heads: 12,\n", 29 | "pooler_fc_size: 768,\n", 30 | "pooler_type: \"first_token_transform\",\n", 31 | "pooler_num_fc_layers: 3,\n", 32 | "intermediate_size: 3072,\n", 33 | "architectures: [\n", 34 | "\"BertForMaskedLM\"\n", 35 | "],\n", 36 | "model_type: \"bert\"\n", 37 | "}\n", 38 | "```\n", 39 | "- `tokenizer_config.json`\n", 40 | "```json\n", 41 | "{\n", 42 | "do_lower_case: false,\n", 43 | "model_max_length: 300\n", 44 | "}\n", 45 | "```" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | 
"source": [ 52 | "## loading BERT\n", 53 | "예시는 [KcBERT](https://github.com/Beomi/KcBERT)로 " 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 14, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | "/home/kbs/.pyenv/versions/3.7.7/envs/hacking_transformers/lib/python3.7/site-packages/transformers/modeling_auto.py:798: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", 66 | " FutureWarning,\n", 67 | "Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", 68 | "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPretraining model).\n", 69 | "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "from transformers import AutoTokenizer, AutoModelWithLMHead\n", 75 | "\n", 76 | "tokenizer = AutoTokenizer.from_pretrained(\"beomi/kcbert-base\")\n", 77 | "model = AutoModelWithLMHead.from_pretrained(\"beomi/kcbert-base\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 15, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "[['bert.embeddings.word_embeddings.weight', torch.Size([30000, 768])],\n", 89 | " ['bert.embeddings.position_embeddings.weight', torch.Size([300, 768])],\n", 90 | " ['bert.embeddings.token_type_embeddings.weight', torch.Size([2, 768])],\n", 91 | " ['bert.embeddings.LayerNorm.weight', torch.Size([768])],\n", 92 | " ['bert.embeddings.LayerNorm.bias', torch.Size([768])],\n", 93 | " ['bert.encoder.layer.0.attention.self.query.weight', torch.Size([768, 768])],\n", 94 | " ['bert.encoder.layer.0.attention.self.query.bias', torch.Size([768])],\n", 95 | " ['bert.encoder.layer.0.attention.self.key.weight', torch.Size([768, 768])],\n", 96 | " ['bert.encoder.layer.0.attention.self.key.bias', torch.Size([768])],\n", 97 | " ['bert.encoder.layer.0.attention.self.value.weight', torch.Size([768, 768])],\n", 98 | " ['bert.encoder.layer.0.attention.self.value.bias', torch.Size([768])],\n", 99 | " ['bert.encoder.layer.0.attention.output.dense.weight',\n", 100 | " torch.Size([768, 768])],\n", 101 | " ['bert.encoder.layer.0.attention.output.dense.bias', torch.Size([768])],\n", 102 | " ['bert.encoder.layer.0.attention.output.LayerNorm.weight', torch.Size([768])],\n", 103 | " ['bert.encoder.layer.0.attention.output.LayerNorm.bias', 
torch.Size([768])],\n", 104 | " ['bert.encoder.layer.0.intermediate.dense.weight', torch.Size([3072, 768])],\n", 105 | " ['bert.encoder.layer.0.intermediate.dense.bias', torch.Size([3072])],\n", 106 | " ['bert.encoder.layer.0.output.dense.weight', torch.Size([768, 3072])],\n", 107 | " ['bert.encoder.layer.0.output.dense.bias', torch.Size([768])],\n", 108 | " ['bert.encoder.layer.0.output.LayerNorm.weight', torch.Size([768])],\n", 109 | " ['bert.encoder.layer.0.output.LayerNorm.bias', torch.Size([768])],\n", 110 | " ['bert.encoder.layer.1.attention.self.query.weight', torch.Size([768, 768])],\n", 111 | " ['bert.encoder.layer.1.attention.self.query.bias', torch.Size([768])],\n", 112 | " ['bert.encoder.layer.1.attention.self.key.weight', torch.Size([768, 768])],\n", 113 | " ['bert.encoder.layer.1.attention.self.key.bias', torch.Size([768])],\n", 114 | " ['bert.encoder.layer.1.attention.self.value.weight', torch.Size([768, 768])],\n", 115 | " ['bert.encoder.layer.1.attention.self.value.bias', torch.Size([768])],\n", 116 | " ['bert.encoder.layer.1.attention.output.dense.weight',\n", 117 | " torch.Size([768, 768])],\n", 118 | " ['bert.encoder.layer.1.attention.output.dense.bias', torch.Size([768])],\n", 119 | " ['bert.encoder.layer.1.attention.output.LayerNorm.weight', torch.Size([768])],\n", 120 | " ['bert.encoder.layer.1.attention.output.LayerNorm.bias', torch.Size([768])],\n", 121 | " ['bert.encoder.layer.1.intermediate.dense.weight', torch.Size([3072, 768])],\n", 122 | " ['bert.encoder.layer.1.intermediate.dense.bias', torch.Size([3072])],\n", 123 | " ['bert.encoder.layer.1.output.dense.weight', torch.Size([768, 3072])],\n", 124 | " ['bert.encoder.layer.1.output.dense.bias', torch.Size([768])],\n", 125 | " ['bert.encoder.layer.1.output.LayerNorm.weight', torch.Size([768])],\n", 126 | " ['bert.encoder.layer.1.output.LayerNorm.bias', torch.Size([768])],\n", 127 | " ['bert.encoder.layer.2.attention.self.query.weight', torch.Size([768, 768])],\n", 128 | " 
['bert.encoder.layer.2.attention.self.query.bias', torch.Size([768])],\n", 129 | " ['bert.encoder.layer.2.attention.self.key.weight', torch.Size([768, 768])],\n", 130 | " ['bert.encoder.layer.2.attention.self.key.bias', torch.Size([768])],\n", 131 | " ['bert.encoder.layer.2.attention.self.value.weight', torch.Size([768, 768])],\n", 132 | " ['bert.encoder.layer.2.attention.self.value.bias', torch.Size([768])],\n", 133 | " ['bert.encoder.layer.2.attention.output.dense.weight',\n", 134 | " torch.Size([768, 768])],\n", 135 | " ['bert.encoder.layer.2.attention.output.dense.bias', torch.Size([768])],\n", 136 | " ['bert.encoder.layer.2.attention.output.LayerNorm.weight', torch.Size([768])],\n", 137 | " ['bert.encoder.layer.2.attention.output.LayerNorm.bias', torch.Size([768])],\n", 138 | " ['bert.encoder.layer.2.intermediate.dense.weight', torch.Size([3072, 768])],\n", 139 | " ['bert.encoder.layer.2.intermediate.dense.bias', torch.Size([3072])],\n", 140 | " ['bert.encoder.layer.2.output.dense.weight', torch.Size([768, 3072])],\n", 141 | " ['bert.encoder.layer.2.output.dense.bias', torch.Size([768])],\n", 142 | " ['bert.encoder.layer.2.output.LayerNorm.weight', torch.Size([768])],\n", 143 | " ['bert.encoder.layer.2.output.LayerNorm.bias', torch.Size([768])],\n", 144 | " ['bert.encoder.layer.3.attention.self.query.weight', torch.Size([768, 768])],\n", 145 | " ['bert.encoder.layer.3.attention.self.query.bias', torch.Size([768])],\n", 146 | " ['bert.encoder.layer.3.attention.self.key.weight', torch.Size([768, 768])],\n", 147 | " ['bert.encoder.layer.3.attention.self.key.bias', torch.Size([768])],\n", 148 | " ['bert.encoder.layer.3.attention.self.value.weight', torch.Size([768, 768])],\n", 149 | " ['bert.encoder.layer.3.attention.self.value.bias', torch.Size([768])],\n", 150 | " ['bert.encoder.layer.3.attention.output.dense.weight',\n", 151 | " torch.Size([768, 768])],\n", 152 | " ['bert.encoder.layer.3.attention.output.dense.bias', torch.Size([768])],\n", 153 | " 
['bert.encoder.layer.3.attention.output.LayerNorm.weight', torch.Size([768])],\n", 154 | " ['bert.encoder.layer.3.attention.output.LayerNorm.bias', torch.Size([768])],\n", 155 | " ['bert.encoder.layer.3.intermediate.dense.weight', torch.Size([3072, 768])],\n", 156 | " ['bert.encoder.layer.3.intermediate.dense.bias', torch.Size([3072])],\n", 157 | " ['bert.encoder.layer.3.output.dense.weight', torch.Size([768, 3072])],\n", 158 | " ['bert.encoder.layer.3.output.dense.bias', torch.Size([768])],\n", 159 | " ['bert.encoder.layer.3.output.LayerNorm.weight', torch.Size([768])],\n", 160 | " ['bert.encoder.layer.3.output.LayerNorm.bias', torch.Size([768])],\n", 161 | " ['bert.encoder.layer.4.attention.self.query.weight', torch.Size([768, 768])],\n", 162 | " ['bert.encoder.layer.4.attention.self.query.bias', torch.Size([768])],\n", 163 | " ['bert.encoder.layer.4.attention.self.key.weight', torch.Size([768, 768])],\n", 164 | " ['bert.encoder.layer.4.attention.self.key.bias', torch.Size([768])],\n", 165 | " ['bert.encoder.layer.4.attention.self.value.weight', torch.Size([768, 768])],\n", 166 | " ['bert.encoder.layer.4.attention.self.value.bias', torch.Size([768])],\n", 167 | " ['bert.encoder.layer.4.attention.output.dense.weight',\n", 168 | " torch.Size([768, 768])],\n", 169 | " ['bert.encoder.layer.4.attention.output.dense.bias', torch.Size([768])],\n", 170 | " ['bert.encoder.layer.4.attention.output.LayerNorm.weight', torch.Size([768])],\n", 171 | " ['bert.encoder.layer.4.attention.output.LayerNorm.bias', torch.Size([768])],\n", 172 | " ['bert.encoder.layer.4.intermediate.dense.weight', torch.Size([3072, 768])],\n", 173 | " ['bert.encoder.layer.4.intermediate.dense.bias', torch.Size([3072])],\n", 174 | " ['bert.encoder.layer.4.output.dense.weight', torch.Size([768, 3072])],\n", 175 | " ['bert.encoder.layer.4.output.dense.bias', torch.Size([768])],\n", 176 | " ['bert.encoder.layer.4.output.LayerNorm.weight', torch.Size([768])],\n", 177 | " 
['bert.encoder.layer.4.output.LayerNorm.bias', torch.Size([768])],\n", 178 | " ['bert.encoder.layer.5.attention.self.query.weight', torch.Size([768, 768])],\n", 179 | " ['bert.encoder.layer.5.attention.self.query.bias', torch.Size([768])],\n", 180 | " ['bert.encoder.layer.5.attention.self.key.weight', torch.Size([768, 768])],\n", 181 | " ['bert.encoder.layer.5.attention.self.key.bias', torch.Size([768])],\n", 182 | " ['bert.encoder.layer.5.attention.self.value.weight', torch.Size([768, 768])],\n", 183 | " ['bert.encoder.layer.5.attention.self.value.bias', torch.Size([768])],\n", 184 | " ['bert.encoder.layer.5.attention.output.dense.weight',\n", 185 | " torch.Size([768, 768])],\n", 186 | " ['bert.encoder.layer.5.attention.output.dense.bias', torch.Size([768])],\n", 187 | " ['bert.encoder.layer.5.attention.output.LayerNorm.weight', torch.Size([768])],\n", 188 | " ['bert.encoder.layer.5.attention.output.LayerNorm.bias', torch.Size([768])],\n", 189 | " ['bert.encoder.layer.5.intermediate.dense.weight', torch.Size([3072, 768])],\n", 190 | " ['bert.encoder.layer.5.intermediate.dense.bias', torch.Size([3072])],\n", 191 | " ['bert.encoder.layer.5.output.dense.weight', torch.Size([768, 3072])],\n", 192 | " ['bert.encoder.layer.5.output.dense.bias', torch.Size([768])],\n", 193 | " ['bert.encoder.layer.5.output.LayerNorm.weight', torch.Size([768])],\n", 194 | " ['bert.encoder.layer.5.output.LayerNorm.bias', torch.Size([768])],\n", 195 | " ['bert.encoder.layer.6.attention.self.query.weight', torch.Size([768, 768])],\n", 196 | " ['bert.encoder.layer.6.attention.self.query.bias', torch.Size([768])],\n", 197 | " ['bert.encoder.layer.6.attention.self.key.weight', torch.Size([768, 768])],\n", 198 | " ['bert.encoder.layer.6.attention.self.key.bias', torch.Size([768])],\n", 199 | " ['bert.encoder.layer.6.attention.self.value.weight', torch.Size([768, 768])],\n", 200 | " ['bert.encoder.layer.6.attention.self.value.bias', torch.Size([768])],\n", 201 | " 
['bert.encoder.layer.6.attention.output.dense.weight',\n", 202 | " torch.Size([768, 768])],\n", 203 | " ['bert.encoder.layer.6.attention.output.dense.bias', torch.Size([768])],\n", 204 | " ['bert.encoder.layer.6.attention.output.LayerNorm.weight', torch.Size([768])],\n", 205 | " ['bert.encoder.layer.6.attention.output.LayerNorm.bias', torch.Size([768])],\n", 206 | " ['bert.encoder.layer.6.intermediate.dense.weight', torch.Size([3072, 768])],\n", 207 | " ['bert.encoder.layer.6.intermediate.dense.bias', torch.Size([3072])],\n", 208 | " ['bert.encoder.layer.6.output.dense.weight', torch.Size([768, 3072])],\n", 209 | " ['bert.encoder.layer.6.output.dense.bias', torch.Size([768])],\n", 210 | " ['bert.encoder.layer.6.output.LayerNorm.weight', torch.Size([768])],\n", 211 | " ['bert.encoder.layer.6.output.LayerNorm.bias', torch.Size([768])],\n", 212 | " ['bert.encoder.layer.7.attention.self.query.weight', torch.Size([768, 768])],\n", 213 | " ['bert.encoder.layer.7.attention.self.query.bias', torch.Size([768])],\n", 214 | " ['bert.encoder.layer.7.attention.self.key.weight', torch.Size([768, 768])],\n", 215 | " ['bert.encoder.layer.7.attention.self.key.bias', torch.Size([768])],\n", 216 | " ['bert.encoder.layer.7.attention.self.value.weight', torch.Size([768, 768])],\n", 217 | " ['bert.encoder.layer.7.attention.self.value.bias', torch.Size([768])],\n", 218 | " ['bert.encoder.layer.7.attention.output.dense.weight',\n", 219 | " torch.Size([768, 768])],\n", 220 | " ['bert.encoder.layer.7.attention.output.dense.bias', torch.Size([768])],\n", 221 | " ['bert.encoder.layer.7.attention.output.LayerNorm.weight', torch.Size([768])],\n", 222 | " ['bert.encoder.layer.7.attention.output.LayerNorm.bias', torch.Size([768])],\n", 223 | " ['bert.encoder.layer.7.intermediate.dense.weight', torch.Size([3072, 768])],\n", 224 | " ['bert.encoder.layer.7.intermediate.dense.bias', torch.Size([3072])],\n", 225 | " ['bert.encoder.layer.7.output.dense.weight', torch.Size([768, 3072])],\n", 226 | " 
['bert.encoder.layer.7.output.dense.bias', torch.Size([768])],\n", 227 | " ['bert.encoder.layer.7.output.LayerNorm.weight', torch.Size([768])],\n", 228 | " ['bert.encoder.layer.7.output.LayerNorm.bias', torch.Size([768])],\n", 229 | " ['bert.encoder.layer.8.attention.self.query.weight', torch.Size([768, 768])],\n", 230 | " ['bert.encoder.layer.8.attention.self.query.bias', torch.Size([768])],\n", 231 | " ['bert.encoder.layer.8.attention.self.key.weight', torch.Size([768, 768])],\n", 232 | " ['bert.encoder.layer.8.attention.self.key.bias', torch.Size([768])],\n", 233 | " ['bert.encoder.layer.8.attention.self.value.weight', torch.Size([768, 768])],\n", 234 | " ['bert.encoder.layer.8.attention.self.value.bias', torch.Size([768])],\n", 235 | " ['bert.encoder.layer.8.attention.output.dense.weight',\n", 236 | " torch.Size([768, 768])],\n", 237 | " ['bert.encoder.layer.8.attention.output.dense.bias', torch.Size([768])],\n", 238 | " ['bert.encoder.layer.8.attention.output.LayerNorm.weight', torch.Size([768])],\n", 239 | " ['bert.encoder.layer.8.attention.output.LayerNorm.bias', torch.Size([768])],\n", 240 | " ['bert.encoder.layer.8.intermediate.dense.weight', torch.Size([3072, 768])],\n", 241 | " ['bert.encoder.layer.8.intermediate.dense.bias', torch.Size([3072])],\n", 242 | " ['bert.encoder.layer.8.output.dense.weight', torch.Size([768, 3072])],\n", 243 | " ['bert.encoder.layer.8.output.dense.bias', torch.Size([768])],\n", 244 | " ['bert.encoder.layer.8.output.LayerNorm.weight', torch.Size([768])],\n", 245 | " ['bert.encoder.layer.8.output.LayerNorm.bias', torch.Size([768])],\n", 246 | " ['bert.encoder.layer.9.attention.self.query.weight', torch.Size([768, 768])],\n", 247 | " ['bert.encoder.layer.9.attention.self.query.bias', torch.Size([768])],\n", 248 | " ['bert.encoder.layer.9.attention.self.key.weight', torch.Size([768, 768])],\n", 249 | " ['bert.encoder.layer.9.attention.self.key.bias', torch.Size([768])],\n", 250 | " 
['bert.encoder.layer.9.attention.self.value.weight', torch.Size([768, 768])],\n", 251 | " ['bert.encoder.layer.9.attention.self.value.bias', torch.Size([768])],\n", 252 | " ['bert.encoder.layer.9.attention.output.dense.weight',\n", 253 | " torch.Size([768, 768])],\n", 254 | " ['bert.encoder.layer.9.attention.output.dense.bias', torch.Size([768])],\n", 255 | " ['bert.encoder.layer.9.attention.output.LayerNorm.weight', torch.Size([768])],\n", 256 | " ['bert.encoder.layer.9.attention.output.LayerNorm.bias', torch.Size([768])],\n", 257 | " ['bert.encoder.layer.9.intermediate.dense.weight', torch.Size([3072, 768])],\n", 258 | " ['bert.encoder.layer.9.intermediate.dense.bias', torch.Size([3072])],\n", 259 | " ['bert.encoder.layer.9.output.dense.weight', torch.Size([768, 3072])],\n", 260 | " ['bert.encoder.layer.9.output.dense.bias', torch.Size([768])],\n", 261 | " ['bert.encoder.layer.9.output.LayerNorm.weight', torch.Size([768])],\n", 262 | " ['bert.encoder.layer.9.output.LayerNorm.bias', torch.Size([768])],\n", 263 | " ['bert.encoder.layer.10.attention.self.query.weight', torch.Size([768, 768])],\n", 264 | " ['bert.encoder.layer.10.attention.self.query.bias', torch.Size([768])],\n", 265 | " ['bert.encoder.layer.10.attention.self.key.weight', torch.Size([768, 768])],\n", 266 | " ['bert.encoder.layer.10.attention.self.key.bias', torch.Size([768])],\n", 267 | " ['bert.encoder.layer.10.attention.self.value.weight', torch.Size([768, 768])],\n", 268 | " ['bert.encoder.layer.10.attention.self.value.bias', torch.Size([768])],\n", 269 | " ['bert.encoder.layer.10.attention.output.dense.weight',\n", 270 | " torch.Size([768, 768])],\n", 271 | " ['bert.encoder.layer.10.attention.output.dense.bias', torch.Size([768])],\n", 272 | " ['bert.encoder.layer.10.attention.output.LayerNorm.weight',\n", 273 | " torch.Size([768])],\n", 274 | " ['bert.encoder.layer.10.attention.output.LayerNorm.bias', torch.Size([768])],\n", 275 | " ['bert.encoder.layer.10.intermediate.dense.weight', 
torch.Size([3072, 768])],\n", 276 | " ['bert.encoder.layer.10.intermediate.dense.bias', torch.Size([3072])],\n", 277 | " ['bert.encoder.layer.10.output.dense.weight', torch.Size([768, 3072])],\n", 278 | " ['bert.encoder.layer.10.output.dense.bias', torch.Size([768])],\n", 279 | " ['bert.encoder.layer.10.output.LayerNorm.weight', torch.Size([768])],\n", 280 | " ['bert.encoder.layer.10.output.LayerNorm.bias', torch.Size([768])],\n", 281 | " ['bert.encoder.layer.11.attention.self.query.weight', torch.Size([768, 768])],\n", 282 | " ['bert.encoder.layer.11.attention.self.query.bias', torch.Size([768])],\n", 283 | " ['bert.encoder.layer.11.attention.self.key.weight', torch.Size([768, 768])],\n", 284 | " ['bert.encoder.layer.11.attention.self.key.bias', torch.Size([768])],\n", 285 | " ['bert.encoder.layer.11.attention.self.value.weight', torch.Size([768, 768])],\n", 286 | " ['bert.encoder.layer.11.attention.self.value.bias', torch.Size([768])],\n", 287 | " ['bert.encoder.layer.11.attention.output.dense.weight',\n", 288 | " torch.Size([768, 768])],\n", 289 | " ['bert.encoder.layer.11.attention.output.dense.bias', torch.Size([768])],\n", 290 | " ['bert.encoder.layer.11.attention.output.LayerNorm.weight',\n", 291 | " torch.Size([768])],\n", 292 | " ['bert.encoder.layer.11.attention.output.LayerNorm.bias', torch.Size([768])],\n", 293 | " ['bert.encoder.layer.11.intermediate.dense.weight', torch.Size([3072, 768])],\n", 294 | " ['bert.encoder.layer.11.intermediate.dense.bias', torch.Size([3072])],\n", 295 | " ['bert.encoder.layer.11.output.dense.weight', torch.Size([768, 3072])],\n", 296 | " ['bert.encoder.layer.11.output.dense.bias', torch.Size([768])],\n", 297 | " ['bert.encoder.layer.11.output.LayerNorm.weight', torch.Size([768])],\n", 298 | " ['bert.encoder.layer.11.output.LayerNorm.bias', torch.Size([768])],\n", 299 | " ['bert.pooler.dense.weight', torch.Size([768, 768])],\n", 300 | " ['bert.pooler.dense.bias', torch.Size([768])],\n", 301 | " ['cls.predictions.bias', 
torch.Size([30000])],\n", 302 | " ['cls.predictions.transform.dense.weight', torch.Size([768, 768])],\n", 303 | " ['cls.predictions.transform.dense.bias', torch.Size([768])],\n", 304 | " ['cls.predictions.transform.LayerNorm.weight', torch.Size([768])],\n", 305 | " ['cls.predictions.transform.LayerNorm.bias', torch.Size([768])]]" 306 | ] 307 | }, 308 | "execution_count": 15, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "# ops 별 이름과 ops를 구성하는 weight의 dimension을 확인\n", 315 | "[[ops[0], ops[1].size()] for ops in model.named_parameters()]" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "## tokenizer" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 73, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "max_len=300\n", 335 | "do_basic_tokenize=True\n", 336 | "안녕하세요, 반갑습니다.\n" 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "print(f\"max_len={tokenizer.max_len}\")\n", 342 | "print(f\"do_basic_tokenize={tokenizer.do_basic_tokenize}\")\n", 343 | "\n", 344 | "example_0 = \"안녕하세요, 반갑습니다.\"\n", 345 | "print(example_0)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 91, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "[['안녕', '##하세요'], [','], ['반', '##갑', '##습니다'], ['.']]\n", 358 | "['안녕', '##하세요', '.', '반', '##갑', '##습니다', '.']\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "tokenized_by_basic =tokenizer.basic_tokenizer.tokenize(cleaned)\n", 364 | "print([(tokenizer.wordpiece_tokenizer.tokenize(token)) for token in tokenized_by_basic])\n", 365 | "print(tokenizer.tokenize(\"안녕하세요. 
반갑습니다.\"))" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 85, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "ename": "AttributeError", 375 | "evalue": "'list' object has no attribute 'get'", 376 | "output_type": "error", 377 | "traceback": [ 378 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 379 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 380 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mall_special_tokens_extended\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 381 | "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'get'" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "tokenizer.all_special_tokens_extended.get()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 82, 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "('안녕하세요, 반갑습니다.', {})" 398 | ] 399 | }, 400 | "execution_count": 82, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "tokenizer.prepare_for_tokenization(cleaned)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 53, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "name": "stdout", 416 | "output_type": "stream", 417 | "text": [ 418 | "tokenized=['모두', '##연', '자연', '##어', '##처리', '##랩', '##에', '오신', '것을', '환영합니다', '.']\n", 419 | "tokenized2indices=[8248, 4132, 10459, 4071, 11385, 5116, 4113, 28914, 9153, 29502, 17]\n", 420 | "encoded=[2, 8248, 4132, 10459, 4071, 11385, 5116, 4113, 28914, 9153, 29502, 17, 3]\n", 421 | "['[CLS]', '모두', '##연', '자연', '##어', '##처리', '##랩', '##에', '오신', '것을', '환영합니다', '.', '[SEP]']\n", 422 | "decoded=[CLS] 모두연 
자연어처리랩에 오신 것을 환영합니다. [SEP]\n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "example_text = \"모두연 자연어처리랩에 오신 것을 환영합니다.\"\n", 428 | "tokenized = tokenizer.tokenize(example_text)\n", 429 | "tokenized2indices = tokenizer.convert_tokens_to_ids(tokenized)\n", 430 | "encoded = tokenizer.encode(example_text)\n", 431 | "decoded = tokenizer.decode(encoded)\n", 432 | "print(f\"tokenized={tokenized}\")\n", 433 | "print(f\"tokenized2indices={tokenized2indices}\")\n", 434 | "print(f\"encoded={encoded}\")\n", 435 | "print(tokenizer.convert_ids_to_tokens(encoded))\n", 436 | "print(f\"decoded={decoded}\")" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 54, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "data": { 446 | "text/plain": [ 447 | "" 448 | ] 449 | }, 450 | "execution_count": 54, 451 | "metadata": {}, 452 | "output_type": "execute_result" 453 | } 454 | ], 455 | "source": [] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 36, 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "data": { 464 | "text/plain": [ 465 | "{'input_ids': [2, 19017, 8482, 3], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}" 466 | ] 467 | }, 468 | "execution_count": 36, 469 | "metadata": {}, 470 | "output_type": "execute_result" 471 | } 472 | ], 473 | "source": [ 474 | "tokenizer.encode_plus(\"안녕하세요\")" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 23, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "['[CLS]',\n", 486 | " '모두',\n", 487 | " '##연',\n", 488 | " '자연',\n", 489 | " '##어',\n", 490 | " '##처리',\n", 491 | " '##랩',\n", 492 | " '##에',\n", 493 | " '오신',\n", 494 | " '것을',\n", 495 | " '환영합니다',\n", 496 | " '.',\n", 497 | " '[SEP]']" 498 | ] 499 | }, 500 | "execution_count": 23, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "tokenizer.convert_ids_to_tokens(encoded)\n", 507 | 
"tokenizer.convert_tokens_to_ids()" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [] 516 | } 517 | ], 518 | "metadata": { 519 | "kernelspec": { 520 | "display_name": "Python 3", 521 | "language": "python", 522 | "name": "python3" 523 | }, 524 | "language_info": { 525 | "codemirror_mode": { 526 | "name": "ipython", 527 | "version": 3 528 | }, 529 | "file_extension": ".py", 530 | "mimetype": "text/x-python", 531 | "name": "python", 532 | "nbconvert_exporter": "python", 533 | "pygments_lexer": "ipython3", 534 | "version": "3.7.7" 535 | } 536 | }, 537 | "nbformat": 4, 538 | "nbformat_minor": 4 539 | } 540 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 ModuNLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | --------------------------------------------------------------------------------