├── asset ├── explain_1.png ├── explain_2.png ├── explain_3.png └── explain_4.png ├── README.md ├── 7.PRETRAIN_METHOD ├── 7.4.1.gpt2_finetune_novel_LM.ipynb ├── 7.4.2.gpt2_finetune_NSMC.ipynb ├── 7.2.1.bert_finetune_NSMC.ipynb ├── 7.2.2.bert_finetune_KorNLI.ipynb ├── 7.4.3.gpt2_finetune_KorNLI.ipynb ├── 7.2.3.bert_finetune_NER.ipynb ├── 7.2.4.bert_finetune_KorSTS.ipynb ├── 7.4.4.gpt2_finetune_KorSTS.ipynb └── 7.2.5.bert_finetune_KorQuAD.ipynb └── 8.GPT3 ├── 8.3.gpt2_fewshot_NSMC.ipynb └── 8.4.gpt2_p_tuning_NSMC.ipynb /asset/explain_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2-colab/HEAD/asset/explain_1.png -------------------------------------------------------------------------------- /asset/explain_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2-colab/HEAD/asset/explain_2.png -------------------------------------------------------------------------------- /asset/explain_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2-colab/HEAD/asset/explain_3.png -------------------------------------------------------------------------------- /asset/explain_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2-colab/HEAD/asset/explain_4.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLPBOOK (Revised Edition) Colab Exercises 2 | 3 | Colab exercise repository for 텐서플로2와 머신러닝으로 시작하는 자연어처리 (Natural Language Processing with TensorFlow 2 and Machine Learning, from logistic regression to BERT and GPT-2) 4 | 5 | ## Introduction 6 | 7 | This repository collects the exercise files from the book's natural language processing examples that require substantial computing resources. 8 | 9 | You can load these exercise files into Colab and run them yourself. 10 | 11 | ## Running in Colab 12 | 13 | 1. Go to https://colab.research.google.com. 14 | 15 | 2. Open a notebook file from this GitHub repository. 16 | 17 | (screenshot: asset/explain_1.png) 18 | 19 |
20 | 21 | 3. Click 'Copy to Drive' to copy the notebook to your own Google Drive. 22 | 23 | (screenshot: asset/explain_2.png) 24 | 25 |
26 | 27 | 4. Open the 'Change runtime type' menu and set the runtime resource to GPU. 28 | 29 | > - Click 'Change runtime type'. 30 | 31 | (screenshot: asset/explain_3.png) 32 | 33 | > - Select the GPU resource. 34 | 35 | (screenshot: asset/explain_4.png) 36 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.4.1.gpt2_finetune_novel_LM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 데이터 다운로드" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!mkdir -p data_in/KOR\n", 35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/finetune_data.txt \\\n", 36 | " -O data_in/KOR/finetune_data.txt " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import os\n", 46 | "\n", 47 | "import numpy as np\n", 48 | "import tensorflow as tf\n", 49 | "\n", 50 | "import gluonnlp as nlp\n", 51 | "from gluonnlp.data import SentencepieceTokenizer\n", 52 | "from transformers import TFGPT2LMHeadModel\n", 53 | "\n", 54 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 55 | "\n", 56 | "from nltk.tokenize import sent_tokenize" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "아래 실행 커맨드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요."
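A minimal sketch of how that check can be made explicit in code, assuming the same ./gpt_ckpt path and the download URL used in the next cell:

import os

# Download and unpack the pretrained Korean GPT-2 checkpoint only when the
# gpt_ckpt folder is not already present next to the notebook.
if not os.path.exists('./gpt_ckpt'):
    os.system('wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip')
    os.system('unzip -o gpt_ckpt.zip')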
64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip\n", 73 | "!unzip -o gpt_ckpt.zip" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "class GPT2Model(tf.keras.Model):\n", 83 | " def __init__(self, dir_path):\n", 84 | " super(GPT2Model, self).__init__()\n", 85 | " self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)\n", 86 | " \n", 87 | " def call(self, inputs):\n", 88 | " return self.gpt2(inputs)[0]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "BASE_MODEL_PATH = './gpt_ckpt'\n", 98 | "gpt_model = GPT2Model(BASE_MODEL_PATH)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "BATCH_SIZE = 16\n", 108 | "NUM_EPOCHS = 10\n", 109 | "MAX_LEN = 30\n", 110 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n", 111 | "\n", 112 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n", 113 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n", 114 | " mask_token=None,\n", 115 | " sep_token=None,\n", 116 | " cls_token=None,\n", 117 | " unknown_token='',\n", 118 | " padding_token='',\n", 119 | " bos_token='',\n", 120 | " eos_token='')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "def tf_top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-99999):\n", 130 | " _logits = logits.numpy()\n", 131 | " top_k = min(top_k, logits.shape[-1]) \n", 132 | " if top_k > 0:\n", 133 | " indices_to_remove = logits < tf.math.top_k(logits, top_k)[0][..., -1, None]\n", 134 | " _logits[indices_to_remove] = filter_value\n", 135 | "\n", 136 | " if top_p > 0.0:\n", 137 | " sorted_logits = tf.sort(logits, direction='DESCENDING')\n", 138 | " sorted_indices = tf.argsort(logits, direction='DESCENDING')\n", 139 | " cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)\n", 140 | "\n", 141 | " sorted_indices_to_remove = cumulative_probs > top_p\n", 142 | " sorted_indices_to_remove = tf.concat([[False], sorted_indices_to_remove[..., :-1]], axis=0)\n", 143 | " indices_to_remove = sorted_indices[sorted_indices_to_remove].numpy().tolist()\n", 144 | " \n", 145 | " _logits[indices_to_remove] = filter_value\n", 146 | " return tf.constant([_logits])\n", 147 | "\n", 148 | "\n", 149 | "def generate_sent(seed_word, model, max_step=100, greedy=False, top_k=0, top_p=0.):\n", 150 | " sent = seed_word\n", 151 | " toked = tokenizer(sent)\n", 152 | " \n", 153 | " for _ in range(max_step):\n", 154 | " input_ids = tf.constant([vocab[vocab.bos_token],] + vocab[toked])[None, :] \n", 155 | " outputs = model(input_ids)[:, -1, :]\n", 156 | " if greedy:\n", 157 | " gen = vocab.to_tokens(tf.argmax(outputs, axis=-1).numpy().tolist()[0])\n", 158 | " else:\n", 159 | " output_logit = tf_top_k_top_p_filtering(outputs[0], top_k=top_k, top_p=top_p)\n", 160 | " gen = vocab.to_tokens(tf.random.categorical(output_logit, 1).numpy().tolist()[0])[0]\n", 161 | " if gen == '':\n", 162 | " break\n", 163 | " sent += gen.replace('▁', ' ')\n", 164 | " toked = tokenizer(sent)\n", 165 | "\n", 166 | " return sent" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 
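A small, self-contained illustration of what the top-p (nucleus) filtering above computes, using made-up logits over a five-token vocabulary; note that the actual tf_top_k_top_p_filtering additionally shifts the removal mask so the single most probable token is always kept:

import tensorflow as tf

toy_logits = tf.constant([3.0, 1.5, 0.5, -1.0, -2.0])     # made-up logits, for illustration only
toy_probs = tf.nn.softmax(toy_logits)                      # convert logits to probabilities

sorted_probs = tf.sort(toy_probs, direction='DESCENDING')  # most likely tokens first
cumulative_probs = tf.math.cumsum(sorted_probs)            # running probability mass

top_p = 0.95
outside_nucleus = cumulative_probs > top_p                 # these positions would be masked to filter_value

print(toy_probs.numpy())
print(cumulative_probs.numpy())
print(outside_nucleus.numpy())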
171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "generate_sent('이때', gpt_model, greedy=True)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "generate_sent('이때', gpt_model, top_k=0, top_p=0.95)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "DATA_IN_PATH = './data_in/KOR/'\n", 194 | "TRAIN_DATA_FILE = 'finetune_data.txt'\n", 195 | "\n", 196 | "sents = [s[:-1] for s in open(DATA_IN_PATH + TRAIN_DATA_FILE).readlines()]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "input_data = []\n", 206 | "output_data = []\n", 207 | "\n", 208 | "for s in sents:\n", 209 | " tokens = [vocab[vocab.bos_token],] + vocab[tokenizer(s)] + [vocab[vocab.eos_token],]\n", 210 | " input_data.append(tokens[:-1])\n", 211 | " output_data.append(tokens[1:])\n", 212 | "\n", 213 | "input_data = pad_sequences(input_data, MAX_LEN, value=vocab[vocab.padding_token])\n", 214 | "output_data = pad_sequences(output_data, MAX_LEN, value=vocab[vocab.padding_token])\n", 215 | "\n", 216 | "input_data = np.array(input_data, dtype=np.int64)\n", 217 | "output_data = np.array(output_data, dtype=np.int64)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "loss_object = tf.keras.losses.SparseCategoricalCrossentropy(\n", 227 | " from_logits=True, reduction='none')\n", 228 | "\n", 229 | "train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')\n", 230 | "\n", 231 | "def loss_function(real, pred):\n", 232 | " mask = tf.math.logical_not(tf.math.equal(real, vocab[vocab.padding_token]))\n", 233 | " loss_ = loss_object(real, pred)\n", 234 | "\n", 235 | " mask = tf.cast(mask, dtype=loss_.dtype)\n", 236 | " loss_ *= mask\n", 237 | "\n", 238 | " return tf.reduce_mean(loss_)\n", 239 | "\n", 240 | "def accuracy_function(real, pred):\n", 241 | " mask = tf.math.logical_not(tf.math.equal(real, vocab[vocab.padding_token]))\n", 242 | " mask = tf.expand_dims(tf.cast(mask, dtype=pred.dtype), axis=-1)\n", 243 | " pred *= mask \n", 244 | " acc = train_accuracy(real, pred)\n", 245 | "\n", 246 | " return tf.reduce_mean(acc)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "gpt_model.compile(loss=loss_function,\n", 256 | " optimizer=tf.keras.optimizers.Adam(1e-4),\n", 257 | " metrics=[accuracy_function])" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "history = gpt_model.fit(input_data, output_data, \n", 267 | " batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,\n", 268 | " validation_split=0.1)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "DATA_OUT_PATH = './data_out'\n", 278 | "model_name = \"tf2_gpt2_finetuned_model\"\n", 279 | "\n", 280 | "save_path = os.path.join(DATA_OUT_PATH, model_name)\n", 281 | "\n", 282 | "if not os.path.exists(save_path):\n", 283 | " os.makedirs(save_path)\n", 284 | "\n", 285 | "gpt_model.gpt2.save_pretrained(save_path)\n", 286 | "\n", 287 | "loaded_gpt_model = 
GPT2Model(save_path)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "generate_sent('이때', gpt_model, greedy=True)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "generate_sent('이때', gpt_model, top_k=0, top_p=0.95)" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 3", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.7.4" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 2 330 | } 331 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.4.2.gpt2_finetune_NSMC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 데이터 다운로드" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!mkdir -p data_in/KOR/naver_movie\n", 35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \\\n", 36 | " -O data_in/KOR/naver_movie/ratings_train.txt\n", 37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \\\n", 38 | " -O data_in/KOR/naver_movie/ratings_test.txt" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "scrolled": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "import os\n", 50 | "import tensorflow as tf\n", 51 | "from transformers import TFGPT2Model\n", 52 | "\n", 53 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 54 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 55 | "\n", 56 | "import gluonnlp as nlp\n", 57 | "from gluonnlp.data import SentencepieceTokenizer\n", 58 | "\n", 59 | "import pandas as pd\n", 60 | "import matplotlib.pyplot as plt\n", 61 | "\n", 62 | "import numpy as np\n", 63 | "import re" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요." 
71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip\n", 80 | "!unzip -o gpt_ckpt.zip" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# 시각화\n", 90 | "\n", 91 | "def plot_graphs(history, string):\n", 92 | " plt.plot(history.history[string])\n", 93 | " plt.plot(history.history['val_'+string], '')\n", 94 | " plt.xlabel(\"Epochs\")\n", 95 | " plt.ylabel(string)\n", 96 | " plt.legend([string, 'val_'+string])\n", 97 | " plt.show()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "SEED_NUM = 1234\n", 107 | "tf.random.set_seed(SEED_NUM)\n", 108 | "np.random.seed(SEED_NUM)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## 데이터 준비하기" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n", 125 | "\n", 126 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n", 127 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n", 128 | " mask_token=None,\n", 129 | " sep_token='',\n", 130 | " cls_token=None,\n", 131 | " unknown_token='',\n", 132 | " padding_token='',\n", 133 | " bos_token='',\n", 134 | " eos_token='')" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "BATCH_SIZE = 32\n", 144 | "NUM_EPOCHS = 3\n", 145 | "VALID_SPLIT = 0.1\n", 146 | "SENT_MAX_LEN = 39\n", 147 | "\n", 148 | "DATA_IN_PATH = './data_in/KOR'\n", 149 | "DATA_OUT_PATH = \"./data_out/KOR\"" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# 데이터 전처리 준비\n", 159 | "\n", 160 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_train.txt\")\n", 161 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_test.txt\")\n", 162 | "\n", 163 | "train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\\t', quoting = 3)\n", 164 | "train_data = train_data.dropna()\n", 165 | "train_data.head()\n", 166 | "\n", 167 | "print(\"Total # dataset: train - {}\".format(len(train_data)))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# 텍스트 전처리\n", 177 | "\n", 178 | "def clean_text(sent):\n", 179 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \"\", sent)\n", 180 | " return sent_clean" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# train_data = train_data[:50] # for test\n", 190 | "\n", 191 | "train_data_sents = []\n", 192 | "train_data_labels = []\n", 193 | "\n", 194 | "for train_sent, train_label in train_data[['document', 'label']].values:\n", 195 | " train_tokenized_text = vocab[tokenizer(clean_text(train_sent))]\n", 196 | "\n", 197 | " tokens = [vocab[vocab.bos_token]] \n", 198 | " tokens += pad_sequences([train_tokenized_text], \n", 199 | " SENT_MAX_LEN, \n", 200 | " value=vocab[vocab.padding_token], \n", 201 | " 
padding='post').tolist()[0] \n", 202 | " tokens += [vocab[vocab.eos_token]]\n", 203 | "\n", 204 | " train_data_sents.append(tokens)\n", 205 | " train_data_labels.append(train_label)\n", 206 | "\n", 207 | "train_data_sents = np.array(train_data_sents, dtype=np.int64)\n", 208 | "train_data_labels = np.array(train_data_labels, dtype=np.int64)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## 모델 학습" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "class TFGPT2Classifier(tf.keras.Model):\n", 225 | " def __init__(self, dir_path, num_class):\n", 226 | " super(TFGPT2Classifier, self).__init__()\n", 227 | " \n", 228 | " self.gpt2 = TFGPT2Model.from_pretrained(dir_path)\n", 229 | " self.num_class = num_class\n", 230 | " \n", 231 | " self.dropout = tf.keras.layers.Dropout(self.gpt2.config.summary_first_dropout)\n", 232 | " self.classifier = tf.keras.layers.Dense(self.num_class, \n", 233 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.gpt2.config.initializer_range), \n", 234 | " name=\"classifier\")\n", 235 | " \n", 236 | " def call(self, inputs):\n", 237 | " outputs = self.gpt2(inputs)\n", 238 | " pooled_output = outputs[0][:, -1]\n", 239 | "\n", 240 | " pooled_output = self.dropout(pooled_output)\n", 241 | " logits = self.classifier(pooled_output)\n", 242 | "\n", 243 | " return logits" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "BASE_MODEL_PATH = './gpt_ckpt'\n", 253 | "cls_model = TFGPT2Classifier(dir_path=BASE_MODEL_PATH, num_class=2)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "optimizer = tf.keras.optimizers.Adam(learning_rate=6.25e-5)\n", 263 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", 264 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n", 265 | "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "model_name = \"tf2_gpt2_naver_movie\"\n", 275 | "\n", 276 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)\n", 277 | "\n", 278 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 279 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 280 | "\n", 281 | "if os.path.exists(checkpoint_dir):\n", 282 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 283 | "else:\n", 284 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 285 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 286 | " \n", 287 | "cp_callback = ModelCheckpoint(\n", 288 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n", 289 | "\n", 290 | "history = cls_model.fit(train_data_sents, train_data_labels, \n", 291 | " epochs=NUM_EPOCHS, \n", 292 | " batch_size=BATCH_SIZE,\n", 293 | " validation_split=VALID_SPLIT, \n", 294 | " callbacks=[earlystop_callback, cp_callback])" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "plot_graphs(history, 'accuracy')" 304 | ] 305 | }, 
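Before the test-set evaluation further below, a quick sanity check on a single review can be run; this sketch assumes tokenizer, vocab, clean_text, SENT_MAX_LEN, pad_sequences, np, tf, and the trained cls_model from the cells above, and the example sentence is made up:

# Encode one review the same way the training data was built: [BOS] + padded tokens + [EOS].
sample_review = '생각보다 지루하지 않고 재미있었다'
sample_tokens = [vocab[vocab.bos_token]]
sample_tokens += pad_sequences([vocab[tokenizer(clean_text(sample_review))]],
                               SENT_MAX_LEN,
                               value=vocab[vocab.padding_token],
                               padding='post').tolist()[0]
sample_tokens += [vocab[vocab.eos_token]]

# The classifier returns two-class logits; argmax gives 0 (negative) or 1 (positive).
sample_logits = cls_model(np.array([sample_tokens], dtype=np.int64))
print(tf.argmax(sample_logits, axis=-1).numpy())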
306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "plot_graphs(history, 'loss')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "## 모델 테스트" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "test_data = pd.read_csv(DATA_TEST_PATH, header=0, delimiter='\\t', quoting=3)\n", 329 | "test_data = test_data.dropna()\n", 330 | "test_data.head()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# test_data = test_data[:50] # for test\n", 340 | "\n", 341 | "test_data_sents = []\n", 342 | "test_data_labels = []\n", 343 | "\n", 344 | "for test_sent, test_label in test_data[['document','label']].values:\n", 345 | " test_tokenized_text = vocab[tokenizer(clean_text(test_sent))]\n", 346 | "\n", 347 | " tokens = [vocab[vocab.bos_token]] \n", 348 | " tokens += pad_sequences([test_tokenized_text], \n", 349 | " SENT_MAX_LEN, \n", 350 | " value=vocab[vocab.padding_token], \n", 351 | " padding='post').tolist()[0] \n", 352 | " tokens += [vocab[vocab.eos_token]]\n", 353 | "\n", 354 | " test_data_sents.append(tokens)\n", 355 | " test_data_labels.append(test_label)\n", 356 | "\n", 357 | "test_data_sents = np.array(test_data_sents, dtype=np.int64)\n", 358 | "test_data_labels = np.array(test_data_labels, dtype=np.int64)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "print(\"num sents, labels {}, {}\".format(len(test_data_sents), len(test_data_labels)))" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "cls_model.load_weights(checkpoint_path)\n", 377 | "\n", 378 | "results = cls_model.evaluate(test_data_sents, test_data_labels, batch_size=1024)\n", 379 | "print(\"test loss, test acc: \", results)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [] 388 | } 389 | ], 390 | "metadata": { 391 | "kernelspec": { 392 | "display_name": "Python 3", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.7.4" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 2 411 | } 412 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.2.1.bert_finetune_NSMC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | 
"source": [ 25 | "## 데이터 다운로드" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!mkdir -p data_in/KOR/naver_movie\n", 35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \\\n", 36 | " -O data_in/KOR/naver_movie/ratings_train.txt\n", 37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \\\n", 38 | " -O data_in/KOR/naver_movie/ratings_test.txt" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import os\n", 48 | "import re\n", 49 | "import numpy as np\n", 50 | "from tqdm import tqdm\n", 51 | "\n", 52 | "import tensorflow as tf\n", 53 | "from transformers import *\n", 54 | "\n", 55 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 56 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 57 | "\n", 58 | "import pandas as pd\n", 59 | "import matplotlib.pyplot as plt" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# 시각화\n", 69 | "\n", 70 | "def plot_graphs(history, string):\n", 71 | " plt.plot(history.history[string])\n", 72 | " plt.plot(history.history['val_'+string], '')\n", 73 | " plt.xlabel(\"Epochs\")\n", 74 | " plt.ylabel(string)\n", 75 | " plt.legend([string, 'val_'+string])\n", 76 | " plt.show()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "#random seed 고정\n", 86 | "tf.random.set_seed(1234)\n", 87 | "np.random.seed(1234)\n", 88 | "\n", 89 | "BATCH_SIZE = 32\n", 90 | "NUM_EPOCHS = 3\n", 91 | "VALID_SPLIT = 0.2\n", 92 | "MAX_LEN = 39 # EDA에서 추출된 Max Length\n", 93 | "DATA_IN_PATH = 'data_in/KOR'\n", 94 | "DATA_OUT_PATH = \"data_out/KOR\"" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", cache_dir='bert_ckpt', do_lower_case=False)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## 토크나이저 테스트" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "test_sentence = \"안녕하세요, 반갑습니다.\"\n", 120 | "\n", 121 | "encode = tokenizer.encode(test_sentence)\n", 122 | "token_print = [tokenizer.decode(token) for token in encode]\n", 123 | "\n", 124 | "print(encode)\n", 125 | "print(token_print)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "kor_encode = tokenizer.encode(\"안녕하세요, 반갑습니다\")\n", 135 | "eng_encode = tokenizer.encode(\"Hello world\")\n", 136 | "kor_decode = tokenizer.decode(kor_encode)\n", 137 | "eng_decode = tokenizer.decode(eng_encode)\n", 138 | "\n", 139 | "print(kor_encode)\n", 140 | "# [101, 9521, 118741, 35506, 24982, 48549, 117, 9321, 118610, 119081, 48345, 102]\n", 141 | "print(eng_encode)\n", 142 | "# [101, 31178, 11356, 102]\n", 143 | "print(kor_decode)\n", 144 | "# [CLS] 안녕하세요, 반갑습니다 [SEP]\n", 145 | "print(eng_decode)\n", 146 | "# [CLS] Hello world [SEP]" 147 | ] 148 | }, 149 
| { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# Korean Movie Review Classification" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "scrolled": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# 데이터 전처리 준비\n", 165 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_train.txt\")\n", 166 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_test.txt\")\n", 167 | "\n", 168 | "train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\\t', quoting = 3)\n", 169 | "train_data = train_data.dropna()\n", 170 | "train_data.head()" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "# 스페셜 토큰\n", 180 | "print(tokenizer.all_special_tokens, \"\\n\", tokenizer.all_special_ids)\n", 181 | "\n", 182 | "# 토크나이저 테스트하기\n", 183 | "kor_encode = tokenizer.encode(\"안녕하세요, 반갑습니다. \")\n", 184 | "eng_encode = tokenizer.encode(\"Hello world\")\n", 185 | "\n", 186 | "kor_decode = tokenizer.decode(kor_encode)\n", 187 | "eng_decode = tokenizer.decode(eng_encode)\n", 188 | "\n", 189 | "print(kor_encode)\n", 190 | "print(eng_encode)\n", 191 | "print(kor_decode)\n", 192 | "print(eng_decode)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# Bert Tokenizer\n", 202 | "\n", 203 | "# 참조: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus\n", 204 | "\n", 205 | "def bert_tokenizer(sent, MAX_LEN):\n", 206 | " \n", 207 | " encoded_dict = tokenizer.encode_plus(\n", 208 | " text = sent,\n", 209 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", 210 | " max_length = MAX_LEN, # Pad & truncate all sentences.\n", 211 | " pad_to_max_length = True,\n", 212 | " return_attention_mask = True # Construct attn. 
masks.\n", 213 | " \n", 214 | " )\n", 215 | " \n", 216 | " input_id = encoded_dict['input_ids']\n", 217 | " attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).\n", 218 | " token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences\n", 219 | " \n", 220 | " return input_id, attention_mask, token_type_id" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# train_data = train_data[:1000] # for test\n", 230 | "\n", 231 | "input_ids = []\n", 232 | "attention_masks = []\n", 233 | "token_type_ids = []\n", 234 | "train_data_labels = []\n", 235 | "\n", 236 | "for train_sent, train_label in tqdm(zip(train_data[\"document\"], train_data[\"label\"]), total=len(train_data)):\n", 237 | " try:\n", 238 | " input_id, attention_mask, token_type_id = bert_tokenizer(train_sent, MAX_LEN)\n", 239 | " \n", 240 | " input_ids.append(input_id)\n", 241 | " attention_masks.append(attention_mask)\n", 242 | " token_type_ids.append(token_type_id)\n", 243 | " train_data_labels.append(train_label)\n", 244 | "\n", 245 | " except Exception as e:\n", 246 | " print(e)\n", 247 | " print(train_sent)\n", 248 | " pass\n", 249 | "\n", 250 | "train_movie_input_ids = np.array(input_ids, dtype=int)\n", 251 | "train_movie_attention_masks = np.array(attention_masks, dtype=int)\n", 252 | "train_movie_type_ids = np.array(token_type_ids, dtype=int)\n", 253 | "train_movie_inputs = (train_movie_input_ids, train_movie_attention_masks, train_movie_type_ids)\n", 254 | "\n", 255 | "train_data_labels = np.asarray(train_data_labels, dtype=np.int32) #레이블 토크나이징 리스트\n", 256 | "\n", 257 | "print(\"# sents: {}, # labels: {}\".format(len(train_movie_input_ids), len(train_data_labels)))" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "# 최대 길이: 39\n", 267 | "input_id = train_movie_input_ids[1]\n", 268 | "attention_mask = train_movie_attention_masks[1]\n", 269 | "token_type_id = train_movie_type_ids[1]\n", 270 | "\n", 271 | "print(input_id)\n", 272 | "print(attention_mask)\n", 273 | "print(token_type_id)\n", 274 | "print(tokenizer.decode(input_id))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "class TFBertClassifier(tf.keras.Model):\n", 284 | " def __init__(self, model_name, dir_path, num_class):\n", 285 | " super(TFBertClassifier, self).__init__()\n", 286 | "\n", 287 | " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n", 288 | " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n", 289 | " self.classifier = tf.keras.layers.Dense(num_class, \n", 290 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range), \n", 291 | " name=\"classifier\")\n", 292 | " \n", 293 | " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n", 294 | " \n", 295 | " #outputs 값: # sequence_output, pooled_output, (hidden_states), (attentions)\n", 296 | " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n", 297 | " pooled_output = outputs[1] \n", 298 | " pooled_output = self.dropout(pooled_output, training=training)\n", 299 | " logits = self.classifier(pooled_output)\n", 300 | "\n", 301 | " return logits\n", 302 | "\n", 303 | 
"cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',\n", 304 | " dir_path='bert_ckpt',\n", 305 | " num_class=2)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# 학습 준비하기\n", 315 | "optimizer = tf.keras.optimizers.Adam(3e-5)\n", 316 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", 317 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n", 318 | "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "model_name = \"tf2_bert_naver_movie\"\n", 328 | "\n", 329 | "# overfitting을 막기 위한 ealrystop 추가\n", 330 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)\n", 331 | "# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)\n", 332 | "# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\\\n", 333 | "\n", 334 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 335 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 336 | "\n", 337 | "# Create path if exists\n", 338 | "if os.path.exists(checkpoint_dir):\n", 339 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 340 | "else:\n", 341 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 342 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 343 | " \n", 344 | "cp_callback = ModelCheckpoint(\n", 345 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n", 346 | "\n", 347 | "# 학습과 eval 시작\n", 348 | "history = cls_model.fit(train_movie_inputs, train_data_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,\n", 349 | " validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])\n", 350 | "\n", 351 | "#steps_for_epoch\n", 352 | "\n", 353 | "print(history.history)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "plot_graphs(history, 'loss')" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "# Korean Movie Review Test 데이터" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "test_data = pd.read_csv(DATA_TEST_PATH, header = 0, delimiter = '\\t', quoting = 3)\n", 379 | "test_data = test_data.dropna()\n", 380 | "test_data.head()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "input_ids = []\n", 390 | "attention_masks = []\n", 391 | "token_type_ids = []\n", 392 | "test_data_labels = []\n", 393 | "\n", 394 | "for test_sent, test_label in tqdm(zip(test_data[\"document\"], test_data[\"label\"])):\n", 395 | " try:\n", 396 | " input_id, attention_mask, token_type_id = bert_tokenizer(test_sent, MAX_LEN)\n", 397 | "\n", 398 | " input_ids.append(input_id)\n", 399 | " attention_masks.append(attention_mask)\n", 400 | " token_type_ids.append(token_type_id)\n", 401 | " test_data_labels.append(test_label)\n", 402 | " except Exception as e:\n", 403 | " print(e)\n", 404 | " print(test_sent)\n", 405 | " pass\n", 406 | "\n", 407 | "test_movie_input_ids = np.array(input_ids, 
dtype=int)\n", 408 | "test_movie_attention_masks = np.array(attention_masks, dtype=int)\n", 409 | "test_movie_type_ids = np.array(token_type_ids, dtype=int)\n", 410 | "test_movie_inputs = (test_movie_input_ids, test_movie_attention_masks, test_movie_type_ids)\n", 411 | "\n", 412 | "test_data_labels = np.asarray(test_data_labels, dtype=np.int32) #레이블 토크나이징 리스트\n", 413 | "\n", 414 | "print(\"num sents, labels {}, {}\".format(len(test_movie_input_ids), len(test_data_labels)))" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "results = cls_model.evaluate(test_movie_inputs, test_data_labels, batch_size=1024)\n", 424 | "print(\"test loss, test acc: \", results)" 425 | ] 426 | } 427 | ], 428 | "metadata": { 429 | "kernelspec": { 430 | "display_name": "Python 3", 431 | "language": "python", 432 | "name": "python3" 433 | }, 434 | "language_info": { 435 | "codemirror_mode": { 436 | "name": "ipython", 437 | "version": 3 438 | }, 439 | "file_extension": ".py", 440 | "mimetype": "text/x-python", 441 | "name": "python", 442 | "nbconvert_exporter": "python", 443 | "pygments_lexer": "ipython3", 444 | "version": "3.7.4" 445 | } 446 | }, 447 | "nbformat": 4, 448 | "nbformat_minor": 2 449 | } 450 | -------------------------------------------------------------------------------- /8.GPT3/8.3.gpt2_fewshot_NSMC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### 주의!!\n", 8 | "\n", 9 | "이 실습은 가급적 NVIDIA GPU가 설치된 컴퓨터 환경이거나 Google Colab에서 진행해주세요." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "2NmYZYYhXrcZ" 16 | }, 17 | "source": [ 18 | "## 환경 준비 \n", 19 | "(Google Colab 환경에서 사용하세요)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "id": "6-bFpckCXrcb" 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 31 | "!pip install -r requirements.txt\n", 32 | "!pip install tensorflow==2.2.0" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "cvFHjoTCXrcc" 39 | }, 40 | "source": [ 41 | "## 데이터 다운로드\n", 42 | "(Google Colab 환경에서 사용하세요)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "id": "HbKNloVoXrcd" 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "!mkdir -p data_in/KOR/naver_movie\n", 54 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \\\n", 55 | " -O data_in/KOR/naver_movie/ratings_train.txt\n", 56 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \\\n", 57 | " -O data_in/KOR/naver_movie/ratings_test.txt" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "id": "xs88fDX8Xrcd", 65 | "scrolled": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "import os\n", 70 | "import tensorflow as tf\n", 71 | "from transformers import TFGPT2LMHeadModel\n", 72 | "\n", 73 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 74 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 75 | "\n", 76 | "import gluonnlp as nlp\n", 77 | "from 
gluonnlp.data import SentencepieceTokenizer\n", 78 | "\n", 79 | "import pandas as pd\n", 80 | "import matplotlib.pyplot as plt\n", 81 | "\n", 82 | "import numpy as np\n", 83 | "import re\n", 84 | "\n", 85 | "import random\n", 86 | "from random import sample" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "id": "XgV0aK1KXrce" 93 | }, 94 | "source": [ 95 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "id": "XmofLC_rXrce" 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "import wget\n", 107 | "import zipfile\n", 108 | "\n", 109 | "wget.download('https://github.com/NLP-kr/tensorflow-ml-nlp-tf2/releases/download/v1.0/gpt_ckpt.zip')\n", 110 | "\n", 111 | "with zipfile.ZipFile('gpt_ckpt.zip') as z:\n", 112 | " z.extractall()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "id": "TVExOYgEXrcf" 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "# 시각화\n", 124 | "\n", 125 | "def plot_graphs(history, string):\n", 126 | " plt.plot(history.history[string])\n", 127 | " plt.plot(history.history['val_'+string], '')\n", 128 | " plt.xlabel('Epochs')\n", 129 | " plt.ylabel(string)\n", 130 | " plt.legend([string, 'val_'+string])\n", 131 | " plt.show()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "id": "s6dM4ebxXrcg" 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "SEED_NUM = 1234\n", 143 | "tf.random.set_seed(SEED_NUM)\n", 144 | "np.random.seed(SEED_NUM)\n", 145 | "random.seed(SEED_NUM)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "id": "WSYro-2hvbOI" 152 | }, 153 | "source": [ 154 | "## 퓨샷 러닝을 위한 네이버 영화 리뷰 모델 구성\n" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "id": "lAaKKUqbXrch" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n", 166 | "\n", 167 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n", 168 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n", 169 | " mask_token=None,\n", 170 | " sep_token='',\n", 171 | " cls_token=None,\n", 172 | " unknown_token='',\n", 173 | " padding_token='',\n", 174 | " bos_token='',\n", 175 | " eos_token='')" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "id": "AypWVja1Xrcj" 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "class TFGPT2FewshotClassifier(tf.keras.Model):\n", 187 | " def __init__(self, dir_path):\n", 188 | " super(TFGPT2FewshotClassifier, self).__init__()\n", 189 | " self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)\n", 190 | " \n", 191 | " def call(self, inputs):\n", 192 | " outputs = self.gpt2({'input_ids': inputs})[0][:, -1, :]\n", 193 | "\n", 194 | " return outputs" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "colab": { 202 | "base_uri": "https://localhost:8080/" 203 | }, 204 | "id": "9J5VOzCwXrcj", 205 | "outputId": "537cde6c-958a-4bc7-f98d-996b3bb13bb3" 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "BASE_MODEL_PATH = './gpt_ckpt'\n", 210 | "cls_model = TFGPT2FewshotClassifier(dir_path=BASE_MODEL_PATH)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "id": "pCN8Lh7gXrch" 217 | }, 218 | "source": [ 219 | "## 퓨샷 러닝을 위한 네이버 영화 
리뷰 데이터 구성" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "id": "Ct1IbwATXrci" 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "# 데이터 전처리 준비\n", 231 | "DATA_IN_PATH = './data_in/KOR'\n", 232 | "DATA_OUT_PATH = './data_out/KOR'\n", 233 | "\n", 234 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, 'naver_movie', 'ratings_train.txt')\n", 235 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, 'naver_movie', 'ratings_test.txt')\n", 236 | "\n", 237 | "train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\\t', quoting = 3)\n", 238 | "train_data = train_data.dropna()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "colab": { 246 | "base_uri": "https://localhost:8080/" 247 | }, 248 | "id": "WED9P9SUSyR9", 249 | "outputId": "5c4ba8bd-9a78-49fa-ad19-9fe14603723f" 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "print('데이터 positive 라벨: ', tokenizer('긍정'))\n", 254 | "print('데이터 negative 라벨: ', tokenizer('부정'))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "colab": { 262 | "base_uri": "https://localhost:8080/" 263 | }, 264 | "id": "WaQ1miXfwQfn", 265 | "outputId": "6d506ff6-ac64-4478-b3cc-dbbf7aa9526d" 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "print('학습 예시 케이스 구조: ', tokenizer('문장: 오늘 기분이 좋아\\n감정: 긍정\\n'))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "colab": { 277 | "base_uri": "https://localhost:8080/" 278 | }, 279 | "id": "7h0USc0RxQqG", 280 | "outputId": "7ad08962-3798-4653-ce76-b6a69a1eb4e9" 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "print('gpt2 최대 토큰 길이: ', cls_model.gpt2.config.n_ctx)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "colab": { 292 | "base_uri": "https://localhost:8080/" 293 | }, 294 | "id": "MRwI-RcOyFRj", 295 | "outputId": "ccbf5e78-27a2-4c4e-8b74-327de16c6673" 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "sent_lens = [len(tokenizer(s)) for s in train_data['document']]\n", 300 | "\n", 301 | "print('Few shot 케이스 토큰 평균 길이: ', np.mean(sent_lens))\n", 302 | "print('Few shot 케이스 토큰 최대 길이: ', np.max(sent_lens))\n", 303 | "print('Few shot 케이스 토큰 길이 표준편차: ',np.std(sent_lens))\n", 304 | "print('Few shot 케이스 토큰 길이 80 퍼센타일: ',np.percentile(sent_lens, 80))" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "id": "PdIWfc6Pzyfz" 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "train_fewshot_data = []\n", 316 | "\n", 317 | "for train_sent, train_label in train_data[['document', 'label']].values:\n", 318 | " tokens = vocab[tokenizer(train_sent)]\n", 319 | "\n", 320 | " if len(tokens) <= 25:\n", 321 | " train_fewshot_data.append((train_sent, train_label))" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": { 327 | "id": "4jFe7XMeXrcl" 328 | }, 329 | "source": [ 330 | "## 네이버 영화 리뷰 데이터를 활용한 퓨샷 러닝 및 평가" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "colab": { 338 | "base_uri": "https://localhost:8080/", 339 | "height": 206 340 | }, 341 | "id": "1_OhF3hVhK0y", 342 | "outputId": "6a661ba0-e27e-4aaf-ba2e-5d49809ef866" 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "test_data = pd.read_csv(DATA_TEST_PATH, header=0, delimiter='\\t', quoting=3)\n", 347 | "test_data = 
test_data.dropna()\n", 348 | "test_data.head()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": { 355 | "id": "liE91_rhsQdY" 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "sample_size = 5000\n", 360 | "\n", 361 | "train_fewshot_samples = []\n", 362 | "\n", 363 | "for _ in range(sample_size):\n", 364 | " fewshot_examples = sample(train_fewshot_data, 30)\n", 365 | " train_fewshot_samples.append(fewshot_examples)\n", 366 | "\n", 367 | "if sample_size < len(test_data['id']):\n", 368 | " test_data = test_data.sample(sample_size, random_state=SEED_NUM)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "id": "s-ZiFs-aRIXy" 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "def build_prompt_text(sent):\n", 380 | " return \"문장: \" + sent + '\\n감정: '\n", 381 | "\n", 382 | "def clean_text(sent):\n", 383 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \"\", sent)\n", 384 | " return sent_clean\n", 385 | "\n", 386 | "real_labels = []\n", 387 | "pred_tokens = []\n", 388 | "\n", 389 | "for i, (test_sent, test_label) in enumerate(test_data[['document','label']].values):\n", 390 | " tokens = [vocab[vocab.bos_token]]\n", 391 | "\n", 392 | " for ex in train_fewshot_samples[i]:\n", 393 | " example_text, example_label = ex\n", 394 | " cleaned_example_text = clean_text(example_text)\n", 395 | " appended_prompt_example_text = build_prompt_text(cleaned_example_text)\n", 396 | " appended_prompt_example_text += '긍정' if example_label == 1 else '부정' + '\\n'\n", 397 | "\n", 398 | " tokens += vocab[tokenizer(appended_prompt_example_text)]\n", 399 | "\n", 400 | " cleaned_sent = clean_text(test_sent)\n", 401 | " appended_prompt_sent = build_prompt_text(cleaned_sent)\n", 402 | " test_tokens = vocab[tokenizer(appended_prompt_sent)]\n", 403 | "\n", 404 | " tokens += test_tokens\n", 405 | "\n", 406 | " pred = tf.argmax(cls_model(np.array([tokens], dtype=np.int64)), axis=-1).numpy()\n", 407 | " label = vocab[tokenizer('긍정')] if test_label == 1 else vocab[tokenizer('부정')]\n", 408 | "\n", 409 | " pred_tokens.append(pred[0])\n", 410 | " real_labels.append(label[0])" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "colab": { 418 | "base_uri": "https://localhost:8080/" 419 | }, 420 | "id": "0oZ1GfUPeuec", 421 | "outputId": "becdea25-d59b-4dd2-d6d6-9caece719dad" 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "accuracy_match = [p == t for p, t in zip(pred_tokens, real_labels)]\n", 426 | "accuracy = len([m for m in accuracy_match if m]) / len(real_labels)\n", 427 | "\n", 428 | "print(accuracy)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "id": "vMYgRRGI-Gu4" 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "def build_prompt_text(sent):\n", 440 | " return '감정 분석 문장: ' + sent + '\\n결과: '\n", 441 | "\n", 442 | "real_labels = []\n", 443 | "pred_tokens = []\n", 444 | "\n", 445 | "\n", 446 | "for i, (test_sent, test_label) in enumerate(test_data[['document','label']].values):\n", 447 | " tokens = [vocab[vocab.bos_token]]\n", 448 | "\n", 449 | " for ex in train_fewshot_samples[i]:\n", 450 | " example_text, example_label = ex\n", 451 | " cleaned_example_text = clean_text(example_text)\n", 452 | " appended_prompt_example_text = build_prompt_text(cleaned_example_text)\n", 453 | " appended_prompt_example_text += '긍정' if example_label == 1 else '부정' + '\\n'\n", 454 | "\n", 455 | " tokens 
+= vocab[tokenizer(appended_prompt_example_text)]\n", 456 | "\n", 457 | " cleaned_sent = clean_text(test_sent)\n", 458 | " appended_prompt_sent = build_prompt_text(cleaned_sent)\n", 459 | " test_tokens = vocab[tokenizer(appended_prompt_sent)]\n", 460 | "\n", 461 | " tokens += test_tokens\n", 462 | "\n", 463 | " pred = tf.argmax(cls_model(np.array([tokens], dtype=np.int64)), axis=-1).numpy()\n", 464 | " label = vocab[tokenizer('긍정')] if test_label == 1 else vocab[tokenizer('부정')]\n", 465 | "\n", 466 | " pred_tokens.append(pred[0])\n", 467 | " real_labels.append(label[0])" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "colab": { 475 | "base_uri": "https://localhost:8080/" 476 | }, 477 | "id": "8ufjRihAzNBK", 478 | "outputId": "580a3580-8120-41bd-b0e6-5b94616fe0d1" 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "accuracy_match = [p == t for p, t in zip(pred_tokens, real_labels)]\n", 483 | "accuracy = len([m for m in accuracy_match if m]) / len(real_labels)\n", 484 | "\n", 485 | "print(accuracy)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": { 492 | "id": "uDbMaaxr2kfL" 493 | }, 494 | "outputs": [], 495 | "source": [] 496 | } 497 | ], 498 | "metadata": { 499 | "accelerator": "GPU", 500 | "colab": { 501 | "collapsed_sections": [], 502 | "machine_shape": "hm", 503 | "name": "7.4.2.gpt2_fewshot_NSMC.ipynb", 504 | "provenance": [] 505 | }, 506 | "kernelspec": { 507 | "display_name": "Python 3", 508 | "language": "python", 509 | "name": "python3" 510 | }, 511 | "language_info": { 512 | "codemirror_mode": { 513 | "name": "ipython", 514 | "version": 3 515 | }, 516 | "file_extension": ".py", 517 | "mimetype": "text/x-python", 518 | "name": "python", 519 | "nbconvert_exporter": "python", 520 | "pygments_lexer": "ipython3", 521 | "version": "3.8.3" 522 | } 523 | }, 524 | "nbformat": 4, 525 | "nbformat_minor": 4 526 | } 527 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.2.2.bert_finetune_KorNLI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 데이터 다운로드" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!mkdir -p data_in/KOR/KorNLI\n", 35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/multinli.train.ko.tsv \\\n", 36 | " -O data_in/KOR/KorNLI/multinli.train.ko.tsv\n", 37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/snli_1.0_train.kor.tsv \\\n", 38 | " -O data_in/KOR/KorNLI/snli_1.0_train.kor.tsv\n", 39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/xnli.dev.ko.tsv \\\n", 40 | " -O data_in/KOR/KorNLI/xnli.dev.ko.tsv\n", 41 | "!wget 
https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/xnli.test.ko.tsv \\\n", 42 | " -O data_in/KOR/KorNLI/xnli.test.ko.tsv" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import os\n", 52 | "import tensorflow as tf\n", 53 | "from transformers import BertTokenizer, TFBertModel\n", 54 | "\n", 55 | "import numpy as np\n", 56 | "import pandas as pd\n", 57 | "\n", 58 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 59 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 60 | "\n", 61 | "import matplotlib.pyplot as plt" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# 시각화\n", 71 | "\n", 72 | "def plot_graphs(history, string):\n", 73 | " plt.plot(history.history[string])\n", 74 | " plt.plot(history.history['val_'+string], '')\n", 75 | " plt.xlabel(\"Epochs\")\n", 76 | " plt.ylabel(string)\n", 77 | " plt.legend([string, 'val_'+string])\n", 78 | " plt.show()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "#random seed 고정\n", 88 | "\n", 89 | "tf.random.set_seed(1234)\n", 90 | "np.random.seed(1234)\n", 91 | "\n", 92 | "# BASE PARAM\n", 93 | "\n", 94 | "BATCH_SIZE = 32\n", 95 | "NUM_EPOCHS = 3\n", 96 | "MAX_LEN = 24 * 2 # Average total * 2\n", 97 | "\n", 98 | "DATA_IN_PATH = './data_in/KOR'\n", 99 | "DATA_OUT_PATH = \"./data_out/KOR\"" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# KorNLI Dataset\n", 107 | "\n", 108 | "Data from Kakaobrain: https://github.com/kakaobrain/KorNLUDatasets" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# Load Train dataset\n", 118 | "\n", 119 | "TRAIN_SNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'snli_1.0_train.kor.tsv')\n", 120 | "TRAIN_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'multinli.train.ko.tsv')\n", 121 | "DEV_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.dev.ko.tsv')\n", 122 | "\n", 123 | "train_data_snli = pd.read_csv(TRAIN_SNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n", 124 | "train_data_xnli = pd.read_csv(TRAIN_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n", 125 | "dev_data_xnli = pd.read_csv(DEV_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n", 126 | "\n", 127 | "train_data_snli_xnli = train_data_snli.append(train_data_xnli)\n", 128 | "train_data_snli_xnli = train_data_snli_xnli.dropna()\n", 129 | "train_data_snli_xnli = train_data_snli_xnli.reset_index()\n", 130 | "\n", 131 | "dev_data_xnli = dev_data_xnli.dropna()\n", 132 | "\n", 133 | "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data_snli_xnli), len(dev_data_xnli)))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "# Bert Tokenizer\n", 143 | "\n", 144 | "# 참조: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus\n", 145 | "\n", 146 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", cache_dir='bert_ckpt', do_lower_case=False)\n", 147 | "\n", 148 | "def bert_tokenizer_v2(sent1, sent2, MAX_LEN):\n", 149 | " \n", 
150 | " # For Two setenece input\n", 151 | " \n", 152 | " encoded_dict = tokenizer.encode_plus(\n", 153 | " text = sent1,\n", 154 | " text_pair = sent2,\n", 155 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", 156 | " max_length = MAX_LEN, # Pad & truncate all sentences.\n", 157 | " pad_to_max_length = True,\n", 158 | " return_attention_mask = True # Construct attn. masks.\n", 159 | " \n", 160 | " )\n", 161 | " \n", 162 | " input_id = encoded_dict['input_ids']\n", 163 | " attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).\n", 164 | " token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences\n", 165 | " \n", 166 | " return input_id, attention_mask, token_type_id" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "input_ids = []\n", 176 | "attention_masks = []\n", 177 | "token_type_ids = []\n", 178 | "\n", 179 | "for sent1, sent2 in zip(train_data_snli_xnli['sentence1'], train_data_snli_xnli['sentence2']):\n", 180 | " try:\n", 181 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n", 182 | "\n", 183 | " input_ids.append(input_id)\n", 184 | " attention_masks.append(attention_mask)\n", 185 | " token_type_ids.append(token_type_id)\n", 186 | " except Exception as e:\n", 187 | " print(e)\n", 188 | " print(sent1, sent2)\n", 189 | " pass\n", 190 | " \n", 191 | "train_snli_xnli_input_ids = np.array(input_ids, dtype=int)\n", 192 | "train_snli_xnli_attention_masks = np.array(attention_masks, dtype=int)\n", 193 | "train_snli_xnli_type_ids = np.array(token_type_ids, dtype=int)\n", 194 | "train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "# DEV SET Preprocessing" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# 토크나이저를 제외하고는 5장에서 처리한 방식과 유사하게 접근\n", 211 | "input_ids = []\n", 212 | "attention_masks = []\n", 213 | "token_type_ids = []\n", 214 | "\n", 215 | "for sent1, sent2 in zip(dev_data_xnli['sentence1'], dev_data_xnli['sentence2']):\n", 216 | " try:\n", 217 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n", 218 | "\n", 219 | " input_ids.append(input_id)\n", 220 | " attention_masks.append(attention_mask)\n", 221 | " token_type_ids.append(token_type_id)\n", 222 | " except Exception as e:\n", 223 | " print(e)\n", 224 | " print(sent1, sent2)\n", 225 | " pass\n", 226 | " \n", 227 | "dev_xnli_input_ids = np.array(input_ids, dtype=int)\n", 228 | "dev_xnli_attention_masks = np.array(attention_masks, dtype=int)\n", 229 | "dev_xnli_type_ids = np.array(token_type_ids, dtype=int)\n", 230 | "dev_xnli_inputs = (dev_xnli_input_ids, dev_xnli_attention_masks, dev_xnli_type_ids)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "scrolled": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "# Label을 Netural, Contradiction, Entailment 에서 숫자 형으로 변경한다.\n", 242 | "label_dict = {\"entailment\": 0, \"contradiction\": 1, \"neutral\": 2}\n", 243 | "def convert_int(label):\n", 244 | " num_label = label_dict[label] \n", 245 | " return num_label\n", 246 | "\n", 247 | "train_data_snli_xnli[\"gold_label_int\"] = 
train_data_snli_xnli[\"gold_label\"].apply(convert_int)\n", 248 | "train_data_labels = np.array(train_data_snli_xnli['gold_label_int'], dtype=int)\n", 249 | "\n", 250 | "dev_data_xnli[\"gold_label_int\"] = dev_data_xnli[\"gold_label\"].apply(convert_int)\n", 251 | "dev_data_labels = np.array(dev_data_xnli['gold_label_int'], dtype=int)\n", 252 | "\n", 253 | "print(\"# train labels: {}, #dev labels: {}\".format(len(train_data_labels), len(dev_data_labels)))" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "class TFBertClassifier(tf.keras.Model):\n", 263 | " def __init__(self, model_name, dir_path, num_class):\n", 264 | " super(TFBertClassifier, self).__init__()\n", 265 | "\n", 266 | " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n", 267 | " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n", 268 | " self.classifier = tf.keras.layers.Dense(num_class, \n", 269 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range), \n", 270 | " name=\"classifier\")\n", 271 | " \n", 272 | " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n", 273 | " \n", 274 | " #outputs 값: # sequence_output, pooled_output, (hidden_states), (attentions)\n", 275 | " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n", 276 | " pooled_output = outputs[1] \n", 277 | " pooled_output = self.dropout(pooled_output, training=training)\n", 278 | " logits = self.classifier(pooled_output)\n", 279 | "\n", 280 | " return logits\n", 281 | "\n", 282 | "cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',\n", 283 | " dir_path='bert_ckpt',\n", 284 | " num_class=3)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "# 학습 준비하기\n", 294 | "optimizer = tf.keras.optimizers.Adam(3e-5)\n", 295 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", 296 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n", 297 | "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "#학습 진행하기\n", 307 | "model_name = \"tf2_KorNLI\"\n", 308 | "\n", 309 | "# overfitting을 막기 위한 ealrystop 추가\n", 310 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)\n", 311 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 312 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 313 | "\n", 314 | "# Create path if exists\n", 315 | "if os.path.exists(checkpoint_dir):\n", 316 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 317 | "else:\n", 318 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 319 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 320 | " \n", 321 | "cp_callback = ModelCheckpoint(\n", 322 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n", 323 | "\n", 324 | "# 학습과 eval 시작\n", 325 | "history = cls_model.fit(train_snli_xnli_inputs, train_data_labels, epochs=NUM_EPOCHS,\n", 326 | " validation_data = (dev_xnli_inputs, dev_data_labels),\n", 327 | " batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])\n", 328 
| "\n", 329 | "#steps_for_epoch\n", 330 | "print(history.history)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "plot_graphs(history, 'accuracy')\n", 340 | "plot_graphs(history, 'loss')" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "# KorNLI Test dataset" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "# Load Test dataset\n", 357 | "TEST_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.test.ko.tsv')\n", 358 | "\n", 359 | "test_data_xnli = pd.read_csv(TEST_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n", 360 | "test_data_xnli = test_data_xnli.dropna()\n", 361 | "test_data_xnli.head()" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "# Test set도 똑같은 방법으로 구성한다.\n", 371 | "\n", 372 | "input_ids = []\n", 373 | "attention_masks = []\n", 374 | "token_type_ids = []\n", 375 | "\n", 376 | "for sent1, sent2 in zip(test_data_xnli['sentence1'], test_data_xnli['sentence2']):\n", 377 | " \n", 378 | " try:\n", 379 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n", 380 | "\n", 381 | " input_ids.append(input_id)\n", 382 | " attention_masks.append(attention_mask)\n", 383 | " token_type_ids.append(token_type_id)\n", 384 | " except Exception as e:\n", 385 | " print(e)\n", 386 | " print(sent1, sent2)\n", 387 | " pass\n", 388 | " \n", 389 | " \n", 390 | "test_xnli_input_ids = np.array(input_ids, dtype=int)\n", 391 | "test_xnli_attention_masks = np.array(attention_masks, dtype=int)\n", 392 | "test_xnli_type_ids = np.array(token_type_ids, dtype=int)\n", 393 | "test_xnli_inputs = (test_xnli_input_ids, test_xnli_attention_masks, test_xnli_type_ids)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "test_data_xnli[\"gold_label_int\"] = test_data_xnli[\"gold_label\"].apply(convert_int)\n", 403 | "test_data_xnli_labels = np.array(test_data_xnli['gold_label_int'], dtype=int)\n", 404 | "\n", 405 | "print(\"# sents: {}, # labels: {}\".format(len(test_xnli_input_ids), len(test_data_xnli_labels)))" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "results = cls_model.evaluate(test_xnli_inputs, test_data_xnli_labels, batch_size=512)\n", 415 | "print(\"test loss, test acc: \", results)" 416 | ] 417 | } 418 | ], 419 | "metadata": { 420 | "kernelspec": { 421 | "display_name": "Python 3", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.7.4" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 2 440 | } 441 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.4.3.gpt2_finetune_KorNLI.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 
환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 데이터 다운로드" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!mkdir -p data_in/KOR/KorNLI\n", 35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/multinli.train.ko.tsv \\\n", 36 | " -O data_in/KOR/KorNLI/multinli.train.ko.tsv\n", 37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/snli_1.0_train.kor.tsv \\\n", 38 | " -O data_in/KOR/KorNLI/snli_1.0_train.kor.tsv\n", 39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/xnli.dev.ko.tsv \\\n", 40 | " -O data_in/KOR/KorNLI/xnli.dev.ko.tsv\n", 41 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/xnli.test.ko.tsv \\\n", 42 | " -O data_in/KOR/KorNLI/xnli.test.ko.tsv" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import os\n", 52 | "import tensorflow as tf\n", 53 | "from transformers import TFGPT2Model\n", 54 | "\n", 55 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 56 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 57 | "\n", 58 | "import gluonnlp as nlp\n", 59 | "from gluonnlp.data import SentencepieceTokenizer\n", 60 | "\n", 61 | "import pandas as pd\n", 62 | "import matplotlib.pyplot as plt\n", 63 | "\n", 64 | "import numpy as np\n", 65 | "import re" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요." 
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip\n", 82 | "!unzip -o gpt_ckpt.zip" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# 시각화\n", 92 | "\n", 93 | "def plot_graphs(history, string):\n", 94 | " plt.plot(history.history[string])\n", 95 | " plt.plot(history.history['val_'+string], '')\n", 96 | " plt.xlabel(\"Epochs\")\n", 97 | " plt.ylabel(string)\n", 98 | " plt.legend([string, 'val_'+string])\n", 99 | " plt.show()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "SEED_NUM = 1234\n", 109 | "tf.random.set_seed(SEED_NUM)\n", 110 | "np.random.seed(SEED_NUM)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## 데이터 준비하기" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n", 127 | "\n", 128 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n", 129 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n", 130 | " mask_token=None,\n", 131 | " sep_token='',\n", 132 | " cls_token=None,\n", 133 | " unknown_token='',\n", 134 | " padding_token='',\n", 135 | " bos_token='',\n", 136 | " eos_token='')" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "BATCH_SIZE = 32\n", 146 | "NUM_EPOCHS = 3\n", 147 | "SENT_MAX_LEN = 31\n", 148 | "\n", 149 | "DATA_IN_PATH = './data_in/KOR'\n", 150 | "DATA_OUT_PATH = \"./data_out/KOR\"" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# Load Train dataset\n", 160 | "\n", 161 | "TRAIN_SNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'snli_1.0_train.kor.tsv')\n", 162 | "TRAIN_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'multinli.train.ko.tsv')\n", 163 | "DEV_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.dev.ko.tsv')\n", 164 | "\n", 165 | "train_data_snli = pd.read_csv(TRAIN_SNLI_DF, header=0, delimiter='\\t', quoting=3)\n", 166 | "train_data_xnli = pd.read_csv(TRAIN_XNLI_DF, header=0, delimiter='\\t', quoting=3)\n", 167 | "dev_data_xnli = pd.read_csv(DEV_XNLI_DF, header=0, delimiter='\\t', quoting=3)\n", 168 | "\n", 169 | "train_data_snli_xnli = train_data_snli.append(train_data_xnli)\n", 170 | "train_data_snli_xnli = train_data_snli_xnli.dropna()\n", 171 | "train_data_snli_xnli = train_data_snli_xnli.reset_index()\n", 172 | "\n", 173 | "dev_data_xnli = dev_data_xnli.dropna()\n", 174 | "\n", 175 | "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data_snli_xnli), len(dev_data_xnli)))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# 토크나이저를 제외하고는 5장에서 처리한 방식과 유사하게 접근\n", 185 | "def clean_text(sent):\n", 186 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \" \", sent)\n", 187 | " return sent_clean\n", 188 | "\n", 189 | "train_data_sents = []\n", 190 | "\n", 191 | "for train_sent_1, train_sent_2 in train_data_snli_xnli[['sentence1', 'sentence2']].values:\n", 192 | 
" train_tokenized_sent_1 = vocab[tokenizer(clean_text(train_sent_1))]\n", 193 | " train_tokenized_sent_2 = vocab[tokenizer(clean_text(train_sent_2))]\n", 194 | "\n", 195 | " tokens = [vocab[vocab.bos_token]] \n", 196 | " tokens += pad_sequences([train_tokenized_sent_1], \n", 197 | " SENT_MAX_LEN, \n", 198 | " value=vocab[vocab.padding_token], \n", 199 | " padding='post').tolist()[0] \n", 200 | " tokens += [vocab[vocab.sep_token]] \n", 201 | " tokens += pad_sequences([train_tokenized_sent_2], \n", 202 | " SENT_MAX_LEN, \n", 203 | " value=vocab[vocab.padding_token], \n", 204 | " padding='post').tolist()[0] \n", 205 | " tokens += [vocab[vocab.eos_token]]\n", 206 | "\n", 207 | " train_data_sents.append(tokens) \n", 208 | "\n", 209 | "train_data_sents = np.array(train_data_sents, dtype=np.int64)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "dev_data_sents = []\n", 219 | "\n", 220 | "for dev_sent_1, dev_sent_2 in dev_data_xnli[['sentence1', 'sentence2']].values:\n", 221 | " dev_tokenized_sent_1 = vocab[tokenizer(clean_text(dev_sent_1))]\n", 222 | " dev_tokenized_sent_2 = vocab[tokenizer(clean_text(dev_sent_2))]\n", 223 | "\n", 224 | " tokens = [vocab[vocab.bos_token]] \n", 225 | " tokens += pad_sequences([dev_tokenized_sent_1], \n", 226 | " SENT_MAX_LEN, \n", 227 | " value=vocab[vocab.padding_token], \n", 228 | " padding='post').tolist()[0] \n", 229 | " tokens += [vocab[vocab.sep_token]] \n", 230 | " tokens += pad_sequences([dev_tokenized_sent_2], \n", 231 | " SENT_MAX_LEN, \n", 232 | " value=vocab[vocab.padding_token], \n", 233 | " padding='post').tolist()[0] \n", 234 | " tokens += [vocab[vocab.eos_token]]\n", 235 | "\n", 236 | " dev_data_sents.append(tokens) \n", 237 | "\n", 238 | "dev_data_sents = np.array(dev_data_sents, dtype=np.int64)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "# Label을 Netural, Contradiction, Entailment 에서 숫자 형으로 변경한다.\n", 248 | "label_dict = {\"entailment\": 0, \"contradiction\": 1, \"neutral\": 2}\n", 249 | "\n", 250 | "def convert_int(label):\n", 251 | " num_label = label_dict[label] \n", 252 | " return num_label\n", 253 | "\n", 254 | "train_data_snli_xnli[\"gold_label_int\"] = train_data_snli_xnli[\"gold_label\"].apply(convert_int)\n", 255 | "train_data_labels = np.array(train_data_snli_xnli['gold_label_int'], dtype=int)\n", 256 | "\n", 257 | "dev_data_xnli[\"gold_label_int\"] = dev_data_xnli[\"gold_label\"].apply(convert_int)\n", 258 | "dev_data_labels = np.array(dev_data_xnli['gold_label_int'], dtype=int)\n", 259 | "\n", 260 | "print(\"# train labels: {}, #dev labels: {}\".format(len(train_data_labels), len(dev_data_labels)))" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "## 모델 학습" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "class TFGPT2Classifier(tf.keras.Model):\n", 277 | " def __init__(self, dir_path, num_class):\n", 278 | " super(TFGPT2Classifier, self).__init__()\n", 279 | " \n", 280 | " self.gpt2 = TFGPT2Model.from_pretrained(dir_path)\n", 281 | " self.num_class = num_class\n", 282 | " \n", 283 | " self.dropout = tf.keras.layers.Dropout(self.gpt2.config.summary_first_dropout)\n", 284 | " self.classifier = tf.keras.layers.Dense(self.num_class, \n", 285 | " 
kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.gpt2.config.initializer_range), \n", 286 | " name=\"classifier\")\n", 287 | " \n", 288 | " def call(self, inputs):\n", 289 | " outputs = self.gpt2(inputs)\n", 290 | " pooled_output = outputs[0][:, -1]\n", 291 | "\n", 292 | " pooled_output = self.dropout(pooled_output)\n", 293 | " logits = self.classifier(pooled_output)\n", 294 | "\n", 295 | " return logits" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "BASE_MODEL_PATH = './gpt_ckpt'\n", 305 | "sim_model = TFGPT2Classifier(dir_path=BASE_MODEL_PATH, num_class=3)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "optimizer = tf.keras.optimizers.Adam(6.25e-5)\n", 315 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", 316 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n", 317 | "sim_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "model_name = \"tf2_gpt_kornli\"\n", 327 | "\n", 328 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)\n", 329 | "\n", 330 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 331 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 332 | "\n", 333 | "if os.path.exists(checkpoint_dir):\n", 334 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 335 | "else:\n", 336 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 337 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 338 | " \n", 339 | "cp_callback = ModelCheckpoint(\n", 340 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n", 341 | "\n", 342 | "history = sim_model.fit(train_data_sents, train_data_labels, \n", 343 | " epochs=NUM_EPOCHS,\n", 344 | " validation_data=(dev_data_sents, dev_data_labels),\n", 345 | " batch_size=BATCH_SIZE, \n", 346 | " callbacks=[earlystop_callback, cp_callback])" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "plot_graphs(history, 'accuracy')" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "scrolled": false 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "plot_graphs(history, 'loss')" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "## 모델 테스트" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "scrolled": true 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "# Load Test dataset\n", 385 | "TEST_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.test.ko.tsv')\n", 386 | "\n", 387 | "test_data_xnli = pd.read_csv(TEST_XNLI_DF, header=0, delimiter='\\t', quoting=3)" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "# test_data_xnli = test_data_xnli[:50] # for test\n", 397 | "\n", 398 | "test_data_sents = []\n", 399 | "\n", 400 | "for test_sent_1, test_sent_2 in test_data_xnli[['sentence1', 'sentence2']].values:\n", 
401 | " test_tokenized_sent_1 = vocab[tokenizer(clean_text(test_sent_1))]\n", 402 | " test_tokenized_sent_2 = vocab[tokenizer(clean_text(test_sent_2))]\n", 403 | "\n", 404 | " tokens = [vocab[vocab.bos_token]] \n", 405 | " tokens += pad_sequences([test_tokenized_sent_1], \n", 406 | " SENT_MAX_LEN, \n", 407 | " value=vocab[vocab.padding_token], \n", 408 | " padding='post').tolist()[0] \n", 409 | " tokens += [vocab[vocab.sep_token]] \n", 410 | " tokens += pad_sequences([test_tokenized_sent_2], \n", 411 | " SENT_MAX_LEN, \n", 412 | " value=vocab[vocab.padding_token], \n", 413 | " padding='post').tolist()[0] \n", 414 | " tokens += [vocab[vocab.eos_token]]\n", 415 | "\n", 416 | " test_data_sents.append(tokens) \n", 417 | "\n", 418 | "test_data_sents = np.array(test_data_sents, dtype=np.int64)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "test_data_xnli[\"gold_label_int\"] = test_data_xnli[\"gold_label\"].apply(convert_int)\n", 428 | "test_data_labels = np.array(test_data_xnli['gold_label_int'], dtype=int)\n", 429 | "\n", 430 | "print(\"# sents: {}, # labels: {}\".format(len(test_data_sents), len(test_data_labels)))" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "sim_model.load_weights(checkpoint_path)\n", 440 | "\n", 441 | "results = sim_model.evaluate(test_data_sents, test_data_labels, batch_size=1024)\n", 442 | "print(\"test loss, test acc: \", results)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [] 451 | } 452 | ], 453 | "metadata": { 454 | "kernelspec": { 455 | "display_name": "Python 3", 456 | "language": "python", 457 | "name": "python3" 458 | }, 459 | "language_info": { 460 | "codemirror_mode": { 461 | "name": "ipython", 462 | "version": 3 463 | }, 464 | "file_extension": ".py", 465 | "mimetype": "text/x-python", 466 | "name": "python", 467 | "nbconvert_exporter": "python", 468 | "pygments_lexer": "ipython3", 469 | "version": "3.7.4" 470 | } 471 | }, 472 | "nbformat": 4, 473 | "nbformat_minor": 2 474 | } 475 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.2.3.bert_finetune_NER.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 데이터 다운로드" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!mkdir -p data_in/KOR/NER\n", 35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/NER/label.txt \\\n", 36 | " -O data_in/KOR/NER/label.txt\n", 37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/NER/test.tsv \\\n", 38 | " -O data_in/KOR/NER/test.tsv\n", 39 | "!wget 
https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/NER/train.tsv \\\n", 40 | " -O data_in/KOR/NER/train.txt" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import os\n", 50 | "import re\n", 51 | "import numpy as np\n", 52 | "from tqdm import tqdm\n", 53 | "import json\n", 54 | "import copy\n", 55 | "\n", 56 | "import tensorflow as tf\n", 57 | "from transformers import *\n", 58 | "\n", 59 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 60 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 61 | "\n", 62 | "\n", 63 | "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n", 64 | "\n", 65 | "import pandas as pd\n", 66 | "import matplotlib.pyplot as plt" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# 시각화\n", 76 | "\n", 77 | "def plot_graphs(history, string):\n", 78 | " plt.plot(history.history[string])\n", 79 | " plt.xlabel(\"Epochs\")\n", 80 | " plt.ylabel(string)\n", 81 | " plt.legend([string])\n", 82 | " plt.show()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "#random seed 고정\n", 92 | "tf.random.set_seed(1234)\n", 93 | "np.random.seed(1234)\n", 94 | "\n", 95 | "BATCH_SIZE = 32\n", 96 | "NUM_EPOCHS = 3\n", 97 | "MAX_LEN = 111 # EDA에서 추출된 Max Length\n", 98 | "DATA_IN_PATH = 'data_in/KOR'\n", 99 | "DATA_OUT_PATH = \"data_out/KOR\"" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# 데이터 전처리 준비\n", 109 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, \"NER\", \"train.tsv\")\n", 110 | "DATA_LABEL_PATH = os.path.join(DATA_IN_PATH, \"NER\", \"label.txt\")\n", 111 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, \"NER\", \"test.tsv\")\n", 112 | "\n", 113 | "def read_file(input_path):\n", 114 | " \"\"\"Read tsv file, and return words and label as list\"\"\"\n", 115 | " with open(input_path, \"r\", encoding=\"utf-8\") as f:\n", 116 | " sentences = []\n", 117 | " labels = []\n", 118 | " for line in f:\n", 119 | " split_line = line.strip().split(\"\\t\")\n", 120 | " sentences.append(split_line[0])\n", 121 | " labels.append(split_line[1])\n", 122 | " return sentences, labels\n", 123 | "\n", 124 | "train_sentences, train_labels = read_file(DATA_TRAIN_PATH)\n", 125 | "\n", 126 | "train_ner_dict = {\"sentence\": train_sentences, \"label\": train_labels}\n", 127 | "train_ner_df = pd.DataFrame(train_ner_dict)\n", 128 | "\n", 129 | "test_sentences, test_labels = read_file(DATA_TEST_PATH)\n", 130 | "test_ner_dict = {\"sentence\": test_sentences, \"label\": test_labels}\n", 131 | "test_ner_df = pd.DataFrame(test_ner_dict)\n", 132 | "\n", 133 | "print(\"개체명 인식 학습 데이터 개수: {}\".format(len(train_ner_df)))\n", 134 | "print(\"개체명 인식 테스트 데이터 개수: {}\".format(len(test_ner_df)))\n", 135 | "\n", 136 | "# 개체명 인식 학습 데이터 개수: 81000\n", 137 | "# 개체명 인식 테스트 데이터 개수: 9000" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# Label 불러오기\n", 147 | "\n", 148 | "def get_labels(label_path):\n", 149 | " return [label.strip() for label in open(os.path.join(label_path), 'r', encoding='utf-8')]\n", 150 | "\n", 151 | 
"ner_labels = get_labels(DATA_LABEL_PATH)\n", 152 | "\n", 153 | "print(\"개체명 인식 레이블 개수: {}\".format(len(ner_labels)))" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# 버트 토크나이저 설정\n", 163 | "\n", 164 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", cache_dir='bert_ckpt')\n", 165 | "\n", 166 | "pad_token_id = tokenizer.pad_token_id # 0\n", 167 | "pad_token_label_id = 0\n", 168 | "cls_token_label_id = 0\n", 169 | "sep_token_label_id = 0" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "def bert_tokenizer(sent, MAX_LEN):\n", 179 | " \n", 180 | " encoded_dict = tokenizer.encode_plus(\n", 181 | " text = sent,\n", 182 | " truncation=True,\n", 183 | " add_special_tokens = True, #'[CLS]'와 '[SEP]' 추가\n", 184 | " max_length = MAX_LEN, # 문장 패딩 및 자르기 진행\n", 185 | " pad_to_max_length = True,\n", 186 | " return_attention_mask = True # 어탠션 마스크 생성\n", 187 | " )\n", 188 | " \n", 189 | " input_id = encoded_dict['input_ids']\n", 190 | " attention_mask = encoded_dict['attention_mask'] \n", 191 | " token_type_id = encoded_dict['token_type_ids']\n", 192 | " \n", 193 | " return input_id, attention_mask, token_type_id\n", 194 | "\n", 195 | "def convert_label(words, labels_idx, ner_begin_label, max_seq_len):\n", 196 | " \n", 197 | " tokens = []\n", 198 | " label_ids = []\n", 199 | "\n", 200 | " for word, slot_label in zip(words, labels_idx):\n", 201 | "\n", 202 | " word_tokens = tokenizer.tokenize(word)\n", 203 | " if not word_tokens:\n", 204 | " word_tokens = [unk_token]\n", 205 | " tokens.extend(word_tokens)\n", 206 | " \n", 207 | " # 슬롯 레이블 값이 Begin이면 I로 추가\n", 208 | " if int(slot_label) in ner_begin_label:\n", 209 | " label_ids.extend([int(slot_label)] + [int(slot_label) + 1] * (len(word_tokens) - 1))\n", 210 | " else:\n", 211 | " label_ids.extend([int(slot_label)] * len(word_tokens))\n", 212 | " \n", 213 | " # [CLS] and [SEP] 설정\n", 214 | " special_tokens_count = 2\n", 215 | " if len(label_ids) > max_seq_len - special_tokens_count:\n", 216 | " label_ids = label_ids[: (max_seq_len - special_tokens_count)]\n", 217 | "\n", 218 | " # [SEP] 토큰 추가\n", 219 | " label_ids += [sep_token_label_id]\n", 220 | "\n", 221 | " # [CLS] 토큰 추가\n", 222 | " label_ids = [cls_token_label_id] + label_ids\n", 223 | " \n", 224 | " padding_length = max_seq_len - len(label_ids)\n", 225 | " label_ids = label_ids + ([pad_token_label_id] * padding_length)\n", 226 | " \n", 227 | " return label_ids" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "# 테스트용\n", 237 | "ner_begin_label = [ner_labels.index(begin_label) for begin_label in ner_labels if \"B\" in begin_label]\n", 238 | "ner_begin_label_string = [ner_labels[label_index] for label_index in ner_begin_label]\n", 239 | "\n", 240 | "print(ner_begin_label)\n", 241 | "print(ner_begin_label_string)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "ner_begin_label = [ner_labels.index(begin_label) for begin_label in ner_labels if \"B\" in begin_label]\n", 251 | "\n", 252 | "def create_inputs_targets(df):\n", 253 | " input_ids = []\n", 254 | " attention_masks = []\n", 255 | " token_type_ids = []\n", 256 | " label_list = []\n", 257 | "\n", 258 | " for i, data in 
enumerate(df[['sentence', 'label']].values):\n", 259 | " sentence, labels = data\n", 260 | " words = sentence.split()\n", 261 | " labels = labels.split()\n", 262 | " labels_idx = []\n", 263 | " \n", 264 | " for label in labels:\n", 265 | " labels_idx.append(ner_labels.index(label) if label in ner_labels else ner_labels.index(\"UNK\"))\n", 266 | "\n", 267 | " assert len(words) == len(labels_idx)\n", 268 | "\n", 269 | " input_id, attention_mask, token_type_id = bert_tokenizer(sentence, MAX_LEN)\n", 270 | "\n", 271 | " convert_label_id = convert_label(words, labels_idx, ner_begin_label, MAX_LEN)\n", 272 | "\n", 273 | " input_ids.append(input_id)\n", 274 | " attention_masks.append(attention_mask)\n", 275 | " token_type_ids.append(token_type_id)\n", 276 | " label_list.append(convert_label_id)\n", 277 | "\n", 278 | " input_ids = np.array(input_ids, dtype=int)\n", 279 | " attention_masks = np.array(attention_masks, dtype=int)\n", 280 | " token_type_ids = np.array(token_type_ids, dtype=int)\n", 281 | " label_list = np.asarray(label_list, dtype=int) #레이블 토크나이징 리스트\n", 282 | " inputs = (input_ids, attention_masks, token_type_ids)\n", 283 | " \n", 284 | " return inputs, label_list\n", 285 | "\n", 286 | "train_inputs, train_labels = create_inputs_targets(train_ner_df)\n", 287 | "test_inputs, test_labels = create_inputs_targets(test_ner_df)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "class TFBertNERClassifier(tf.keras.Model):\n", 297 | " def __init__(self, model_name, dir_path, num_class):\n", 298 | " super(TFBertNERClassifier, self).__init__()\n", 299 | "\n", 300 | " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n", 301 | " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n", 302 | " self.classifier = tf.keras.layers.Dense(num_class, \n", 303 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),\n", 304 | " name=\"ner_classifier\")\n", 305 | "\n", 306 | " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n", 307 | "\n", 308 | " #outputs 값: # sequence_output, pooled_output, (hidden_states), (attentions)\n", 309 | " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n", 310 | " sequence_output = outputs[0]\n", 311 | " \n", 312 | " sequence_output = self.dropout(sequence_output, training=training)\n", 313 | " logits = self.classifier(sequence_output)\n", 314 | " \n", 315 | "\n", 316 | " return logits" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "ner_model = TFBertNERClassifier(model_name='bert-base-multilingual-cased',\n", 326 | " dir_path='bert_ckpt',\n", 327 | " num_class=len(ner_labels))" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "def compute_loss(labels, logits):\n", 337 | " loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(\n", 338 | " from_logits=True, reduction=tf.keras.losses.Reduction.NONE\n", 339 | " )\n", 340 | "\n", 341 | " # 0의 레이블 값은 손실 값을 계산할 때 제외\n", 342 | " active_loss = tf.reshape(labels, (-1,)) != 0\n", 343 | " \n", 344 | " reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)\n", 345 | " \n", 346 | " labels = tf.boolean_mask(tf.reshape(labels, (-1,)), 
active_loss)\n", 347 | " \n", 348 | " return loss_fn(labels, reduced_logits)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "class F1Metrics(tf.keras.callbacks.Callback):\n", 358 | " def __init__(self, x_eval, y_eval):\n", 359 | " self.x_eval = x_eval\n", 360 | " self.y_eval = y_eval\n", 361 | "\n", 362 | " def compute_f1_pre_rec(self, labels, preds):\n", 363 | "\n", 364 | " return {\n", 365 | " \"precision\": precision_score(labels, preds, suffix=True),\n", 366 | " \"recall\": recall_score(labels, preds, suffix=True),\n", 367 | " \"f1\": f1_score(labels, preds, suffix=True)\n", 368 | " }\n", 369 | "\n", 370 | "\n", 371 | " def show_report(self, labels, preds):\n", 372 | " return classification_report(labels, preds, suffix=True)\n", 373 | " \n", 374 | " def on_epoch_end(self, epoch, logs=None):\n", 375 | "\n", 376 | " results = {}\n", 377 | " \n", 378 | " pred = self.model.predict(self.x_eval)\n", 379 | " label = self.y_eval\n", 380 | " pred_argmax = np.argmax(pred, axis = 2)\n", 381 | "\n", 382 | " slot_label_map = {i: label for i, label in enumerate(ner_labels)}\n", 383 | "\n", 384 | " out_label_list = [[] for _ in range(label.shape[0])]\n", 385 | " preds_list = [[] for _ in range(label.shape[0])]\n", 386 | "\n", 387 | " for i in range(label.shape[0]):\n", 388 | " for j in range(label.shape[1]):\n", 389 | " if label[i, j] != 0:\n", 390 | " out_label_list[i].append(slot_label_map[label[i][j]])\n", 391 | " preds_list[i].append(slot_label_map[pred_argmax[i][j]])\n", 392 | " \n", 393 | " result = self.compute_f1_pre_rec(out_label_list, preds_list)\n", 394 | " results.update(result)\n", 395 | "\n", 396 | " print(\"********\")\n", 397 | " print(\"F1 Score\")\n", 398 | " for key in sorted(results.keys()):\n", 399 | " print(\"{}, {:.4f}\".format(key, results[key]))\n", 400 | " print(\"\\n\" + self.show_report(out_label_list, preds_list))\n", 401 | " print(\"********\")\n", 402 | "\n", 403 | "f1_score_callback = F1Metrics(test_inputs, test_labels)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule\n", 413 | "optimizer = tf.keras.optimizers.Adam(3e-5)\n", 414 | "# ner_model.compile(optimizer=optimizer, loss=compute_loss, run_eagerly=True)\n", 415 | "ner_model.compile(optimizer=optimizer, loss=compute_loss)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "model_name = \"tf2_bert_ner\"\n", 425 | "\n", 426 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 427 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 428 | "\n", 429 | "# Create path if exists\n", 430 | "if os.path.exists(checkpoint_dir):\n", 431 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 432 | "else:\n", 433 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 434 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 435 | " \n", 436 | "cp_callback = ModelCheckpoint(\n", 437 | " checkpoint_path, verbose=1, save_best_only=True, save_weights_only=True)\n", 438 | "\n", 439 | "history = ner_model.fit(train_inputs, train_labels, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,\n", 440 | " callbacks=[cp_callback, f1_score_callback])\n", 441 | "\n", 442 | "print(history.history)" 
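For reference, the `F1Metrics` callback above already prints precision, recall, and F1 on the test split at the end of every epoch. If a single standalone evaluation after `fit` is preferred, a minimal sketch along the following lines could be appended; it is not part of the original notebook and simply reuses the callback's label-mapping logic and the names defined above (`ner_model`, `checkpoint_path`, `test_inputs`, `test_labels`, `ner_labels`):

```python
# Illustrative sketch (not in the original notebook): one-off test-set F1 after training.
# The saved weights are reloaded only if ModelCheckpoint actually wrote the file.
if os.path.exists(checkpoint_path):
    ner_model.load_weights(checkpoint_path)

preds = ner_model.predict(test_inputs)        # (num_examples, MAX_LEN, num_label_classes)
pred_argmax = np.argmax(preds, axis=2)

slot_label_map = {i: label for i, label in enumerate(ner_labels)}

true_list = [[] for _ in range(test_labels.shape[0])]
pred_list = [[] for _ in range(test_labels.shape[0])]

for i in range(test_labels.shape[0]):
    for j in range(test_labels.shape[1]):
        if test_labels[i, j] != 0:            # label id 0 marks padding / special tokens
            true_list[i].append(slot_label_map[test_labels[i][j]])
            pred_list[i].append(slot_label_map[pred_argmax[i][j]])

print("test F1: {:.4f}".format(f1_score(true_list, pred_list, suffix=True)))
```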
443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "plot_graphs(history, 'loss')" 452 | ] 453 | } 454 | ], 455 | "metadata": { 456 | "kernelspec": { 457 | "display_name": "Python 3", 458 | "language": "python", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 471 | "version": "3.7.4" 472 | } 473 | }, 474 | "nbformat": 4, 475 | "nbformat_minor": 4 476 | } 477 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.2.4.bert_finetune_KorSTS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 데이터 다운로드" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!mkdir -p data_in/KOR/KorSTS\n", 35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-dev.tsv \\\n", 36 | " -O data_in/KOR/KorSTS/sts-dev.tsv\n", 37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-test.tsv \\\n", 38 | " -O data_in/KOR/KorSTS/sts-test.tsv\n", 39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-train.tsv \\\n", 40 | " -O data_in/KOR/KorSTS/sts-train.tsv" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "scrolled": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "import os\n", 52 | "import tensorflow as tf\n", 53 | "from transformers import BertTokenizer, TFBertModel\n", 54 | "\n", 55 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 56 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 57 | "\n", 58 | "import pandas as pd\n", 59 | "import matplotlib.pyplot as plt\n", 60 | "\n", 61 | "from tqdm import tqdm\n", 62 | "import numpy as np\n", 63 | "import re" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# 시각화\n", 73 | "def plot_graphs(history, string):\n", 74 | " plt.plot(history.history[string])\n", 75 | " plt.plot(history.history['val_'+string], '')\n", 76 | " plt.xlabel(\"Epochs\")\n", 77 | " plt.ylabel(string)\n", 78 | " plt.legend([string, 'val_'+string])\n", 79 | " plt.show()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "SEED_NUM = 1234\n", 89 | "tf.random.set_seed(SEED_NUM)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 
| "scrolled": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\",\n", 101 | " cache_dir='bert_ckpt',\n", 102 | " do_lower_case=False)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "#random seed 고정\n", 112 | "\n", 113 | "tf.random.set_seed(0)\n", 114 | "np.random.seed(0)\n", 115 | "\n", 116 | "# BASE PARAM\n", 117 | "\n", 118 | "BATCH_SIZE = 32\n", 119 | "NUM_EPOCHS = 3\n", 120 | "VALID_SPLIT = 0.2\n", 121 | "MAX_LEN = 28 * 2 \n", 122 | "\n", 123 | "DATA_IN_PATH = 'data_in/KOR'\n", 124 | "DATA_OUT_PATH = \"data_out/KOR\"" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# Special Tokens\n", 134 | "print(tokenizer.all_special_tokens, \"\\n\", tokenizer.all_special_ids)\n", 135 | "\n", 136 | "# Test Tokenizers\n", 137 | "kor_encode = tokenizer.encode(\"안녕하세요, 반갑습니다\")\n", 138 | "eng_encode = tokenizer.encode(\"Hello world\")\n", 139 | "\n", 140 | "kor_decode = tokenizer.decode(kor_encode)\n", 141 | "eng_decode = tokenizer.decode(eng_encode)\n", 142 | "\n", 143 | "print(kor_encode)\n", 144 | "print(eng_encode)\n", 145 | "print(kor_decode)\n", 146 | "print(eng_decode)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# KorSTS Dataset\n", 154 | "\n", 155 | "Data from Kakaobrain: https://github.com/kakaobrain/KorNLUDatasets" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# Load Train dataset\n", 165 | "\n", 166 | "TRAIN_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-train.tsv')\n", 167 | "DEV_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-dev.tsv')\n", 168 | "\n", 169 | "train_data = pd.read_csv(TRAIN_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n", 170 | "dev_data = pd.read_csv(DEV_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n", 171 | "\n", 172 | "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data), len(dev_data)))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# Bert Tokenizer\n", 182 | "\n", 183 | "# 참조: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus\n", 184 | "\n", 185 | "def bert_tokenizer_v2(sent1, sent2, MAX_LEN):\n", 186 | " \n", 187 | " # For Two setenece input\n", 188 | " \n", 189 | " encoded_dict = tokenizer.encode_plus(\n", 190 | " text = sent1,\n", 191 | " text_pair = sent2,\n", 192 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n", 193 | " max_length = MAX_LEN, # Pad & truncate all sentences.\n", 194 | " pad_to_max_length = True,\n", 195 | " return_attention_mask = True # Construct attn. 
masks.\n", 196 | " \n", 197 | " )\n", 198 | " \n", 199 | " input_id = encoded_dict['input_ids']\n", 200 | " attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).\n", 201 | " token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences\n", 202 | " \n", 203 | " return input_id, attention_mask, token_type_id" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "# 토크나이저를 제외하고는 5장에서 처리한 방식과 유사하게 접근\n", 213 | "def clean_text(sent):\n", 214 | " sent_clean = re.sub(\"[^a-zA-Z0-9ㄱ-ㅣ가-힣\\\\s]\", \" \", sent)\n", 215 | " return sent_clean\n", 216 | "\n", 217 | "input_ids = []\n", 218 | "attention_masks = []\n", 219 | "token_type_ids = []\n", 220 | "data_labels = []\n", 221 | "\n", 222 | "\n", 223 | "for sent1, sent2, score in train_data[['sentence1', 'sentence2', 'score']].values:\n", 224 | " try:\n", 225 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(clean_text(sent1), clean_text(sent2), MAX_LEN)\n", 226 | " input_ids.append(input_id)\n", 227 | " attention_masks.append(attention_mask)\n", 228 | " token_type_ids.append(token_type_id)\n", 229 | " data_labels.append(score)\n", 230 | " except Exception as e:\n", 231 | " print(e)\n", 232 | " print(sent1, sent2)\n", 233 | " pass\n", 234 | " \n", 235 | "train_input_ids = np.array(input_ids, dtype=int)\n", 236 | "train_attention_masks = np.array(attention_masks, dtype=int)\n", 237 | "train_type_ids = np.array(token_type_ids, dtype=int)\n", 238 | "train_inputs = (train_input_ids, train_attention_masks, train_type_ids)\n", 239 | "train_data_labels = np.array(data_labels)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "# DEV SET Preprocessing" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# 토크나이저를 제외하고는 5장에서 처리한 방식과 유사하게 접근\n", 256 | "input_ids = []\n", 257 | "attention_masks = []\n", 258 | "token_type_ids = []\n", 259 | "data_labels = []\n", 260 | "\n", 261 | "for sent1, sent2, score in dev_data[['sentence1', 'sentence2', 'score']].values:\n", 262 | " try:\n", 263 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(clean_text(sent1), clean_text(sent2), MAX_LEN)\n", 264 | " input_ids.append(input_id)\n", 265 | " attention_masks.append(attention_mask)\n", 266 | " token_type_ids.append(token_type_id)\n", 267 | " data_labels.append(score)\n", 268 | " except Exception as e:\n", 269 | " print(e)\n", 270 | " print(sent1, sent2)\n", 271 | " pass\n", 272 | " \n", 273 | "dev_input_ids = np.array(input_ids, dtype=int)\n", 274 | "dev_attention_masks = np.array(attention_masks, dtype=int)\n", 275 | "dev_type_ids = np.array(token_type_ids, dtype=int)\n", 276 | "dev_inputs = (dev_input_ids, dev_attention_masks, dev_type_ids)\n", 277 | "dev_data_labels = np.array(data_labels)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "print(\"# train labels: {}, #dev labels: {}\".format(len(train_data_labels), len(dev_data_labels)))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "class TFBertRegressor(tf.keras.Model):\n", 296 | " def __init__(self, model_name, dir_path, num_class):\n", 297 | " super(TFBertRegressor, 
self).__init__()\n", 298 | " \n", 299 | " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n", 300 | " self.num_class = num_class\n", 301 | " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n", 302 | " self.regressor = tf.keras.layers.Dense(self.num_class, \n", 303 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range), \n", 304 | " name=\"regressor\")\n", 305 | " \n", 306 | " \n", 307 | " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n", 308 | " \n", 309 | " #outputs 값: # sequence_output, pooled_output, (hidden_states), (attentions)\n", 310 | " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n", 311 | " pooled_output = outputs[1]\n", 312 | " pooled_output = self.dropout(pooled_output, training=training)\n", 313 | " logits = self.regressor(pooled_output)\n", 314 | "\n", 315 | " return logits" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "scrolled": true 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "regression_model = TFBertRegressor(model_name='bert-base-multilingual-cased',\n", 327 | " dir_path='bert_ckpt',\n", 328 | " num_class=1)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "class PearsonCorrelationMetric(tf.keras.metrics.Metric):\n", 338 | " def __init__(self, name=\"pearson_correlation\", **kwargs):\n", 339 | " super(PearsonCorrelationMetric, self).__init__(name=name, **kwargs)\n", 340 | " self.y_true_list = []\n", 341 | " self.y_pred_list = []\n", 342 | "\n", 343 | " def update_state(self, y_true, y_pred, sample_weight=None):\n", 344 | " y_true = tf.reshape(y_true, shape=[-1])\n", 345 | " y_pred = tf.reshape(y_pred, shape=[-1])\n", 346 | " self.y_true_list.append(y_true)\n", 347 | " self.y_pred_list.append(y_pred)\n", 348 | "\n", 349 | " def result(self):\n", 350 | " y_true = tf.concat(self.y_true_list, -1)\n", 351 | " y_pred = tf.concat(self.y_pred_list, -1)\n", 352 | " pearson_correlation = self.pearson(y_true, y_pred)\n", 353 | " \n", 354 | " return pearson_correlation\n", 355 | "\n", 356 | " def reset_states(self):\n", 357 | " self.y_true_list = []\n", 358 | " self.y_pred_list = []\n", 359 | " \n", 360 | "\n", 361 | " def pearson(self, true, pred):\n", 362 | " m_true = tf.reduce_mean(true)\n", 363 | " m_pred = tf.reduce_mean(pred)\n", 364 | " m_true, m_pred = true-m_true, pred-m_pred\n", 365 | " num = tf.reduce_sum(tf.multiply(m_true, m_pred))\n", 366 | " den = tf.sqrt(tf.multiply(tf.reduce_sum(tf.square(m_true)), tf.reduce_sum(tf.square(m_pred)))) + 1e-12\n", 367 | " return num / den" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "optimizer = tf.keras.optimizers.Adam(3e-5)\n", 377 | "loss = tf.keras.losses.MeanSquaredError()\n", 378 | "metric = PearsonCorrelationMetric()\n", 379 | "regression_model.compile(optimizer=optimizer, loss=loss, metrics=[metric], run_eagerly=True)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": { 386 | "scrolled": true 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "#학습 진행하기\n", 391 | "model_name = \"tf2_BERT_KorSTS\"\n", 392 | "\n", 393 | "# overfitting을 막기 위한 ealrystop 추가\n", 394 | "earlystop_callback = 
EarlyStopping(monitor='val_pearson_correlation', min_delta=0.0001,patience=2,mode='max')\n", 395 | "# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)\n", 396 | "# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\\\n", 397 | "\n", 398 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 399 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 400 | "\n", 401 | "# Create path if exists\n", 402 | "if os.path.exists(checkpoint_dir):\n", 403 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 404 | "else:\n", 405 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 406 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 407 | " \n", 408 | "cp_callback = ModelCheckpoint(\n", 409 | " checkpoint_path, monitor='val_pearson_correlation', verbose=1, save_best_only=True, save_weights_only=True,mode='max')\n", 410 | "\n", 411 | "# 학습과 eval 시작\n", 412 | "history = regression_model.fit(train_inputs, train_data_labels, epochs=NUM_EPOCHS,\n", 413 | " validation_data = (dev_inputs, dev_data_labels),\n", 414 | " batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])\n", 415 | "\n", 416 | "#steps_for_epoch\n", 417 | "print(history.history)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "plot_graphs(history, 'pearson_correlation')" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "plot_graphs(history, 'loss')" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "# KorSTS Test dataset" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "# Load Test dataset\n", 452 | "TEST_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-test.tsv')\n", 453 | "\n", 454 | "test_data = pd.read_csv(TEST_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n", 455 | "test_data.head()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "# Test set도 똑같은 방법으로 구성한다.\n", 465 | "input_ids = []\n", 466 | "attention_masks = []\n", 467 | "token_type_ids = []\n", 468 | "data_labels = []\n", 469 | "\n", 470 | "for sent1, sent2, score in test_data[['sentence1', 'sentence2', 'score']].values:\n", 471 | " try:\n", 472 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(clean_text(sent1), clean_text(sent2), MAX_LEN)\n", 473 | " input_ids.append(input_id)\n", 474 | " attention_masks.append(attention_mask)\n", 475 | " token_type_ids.append(token_type_id)\n", 476 | " data_labels.append(score)\n", 477 | " except Exception as e:\n", 478 | " print(e)\n", 479 | " print(sent1, sent2)\n", 480 | " pass\n", 481 | " \n", 482 | "test_input_ids = np.array(input_ids, dtype=int)\n", 483 | "test_attention_masks = np.array(attention_masks, dtype=int)\n", 484 | "test_type_ids = np.array(token_type_ids, dtype=int)\n", 485 | "test_inputs = (test_input_ids, test_attention_masks, test_type_ids)\n", 486 | "test_data_labels = np.array(data_labels)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "print(\"# sents: {}, # labels: {}\".format(len(test_input_ids), 
len(test_data_labels)))" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "regression_model.load_weights(checkpoint_path)\n", 505 | "\n", 506 | "results = regression_model.evaluate(test_inputs, test_data_labels, batch_size=512)\n", 507 | "print(\"test loss, test pearson correlation: \", results)" 508 | ] 509 | } 510 | ], 511 | "metadata": { 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.7.4" 528 | } 529 | }, 530 | "nbformat": 4, 531 | "nbformat_minor": 2 532 | } 533 | -------------------------------------------------------------------------------- /8.GPT3/8.4.gpt2_p_tuning_NSMC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### 주의!!\n", 8 | "\n", 9 | "이 실습은 가급적 NVIDIA GPU가 설치된 컴퓨터 환경이거나 Google Colab에서 진행해주세요." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "id": "2NmYZYYhXrcZ" 16 | }, 17 | "source": [ 18 | "## 환경 준비\n", 19 | "(Google Colab 환경에서 사용하세요)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "colab": { 27 | "base_uri": "https://localhost:8080/" 28 | }, 29 | "id": "6-bFpckCXrcb", 30 | "outputId": "041269a9-fc3e-44f9-cebd-7d26e4bd006f" 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 35 | "!pip install -r requirements.txt\n", 36 | "!pip install tensorflow==2.2.0" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "cvFHjoTCXrcc" 43 | }, 44 | "source": [ 45 | "## 데이터 다운로드\n", 46 | "(Google Colab 환경에서 사용하세요)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "colab": { 54 | "base_uri": "https://localhost:8080/" 55 | }, 56 | "id": "HbKNloVoXrcd", 57 | "outputId": "7b70fd06-d1f8-48b2-b316-0c25d432261f" 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "!mkdir -p data_in/KOR/naver_movie\n", 62 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \\\n", 63 | " -O data_in/KOR/naver_movie/ratings_train.txt\n", 64 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \\\n", 65 | " -O data_in/KOR/naver_movie/ratings_test.txt" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "id": "xs88fDX8Xrcd", 73 | "scrolled": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "import os\n", 78 | "import tensorflow as tf\n", 79 | "from transformers import TFGPT2LMHeadModel\n", 80 | "\n", 81 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 82 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 83 | "\n", 84 | "import gluonnlp as nlp\n", 85 | "from gluonnlp.data import SentencepieceTokenizer\n", 86 | "\n", 87 | "import pandas as pd\n", 88 | "import matplotlib.pyplot as 
plt\n", 89 | "\n", 90 | "import numpy as np\n", 91 | "import re" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "id": "XgV0aK1KXrce" 98 | }, 99 | "source": [ 100 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "id": "XmofLC_rXrce" 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "import wget\n", 112 | "import zipfile\n", 113 | "\n", 114 | "wget.download('https://github.com/NLP-kr/tensorflow-ml-nlp-tf2/releases/download/v1.0/gpt_ckpt.zip')\n", 115 | "\n", 116 | "with zipfile.ZipFile('gpt_ckpt.zip') as z:\n", 117 | " z.extractall()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "id": "TVExOYgEXrcf" 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "# 시각화\n", 129 | "\n", 130 | "def plot_graphs(history, string):\n", 131 | " plt.plot(history.history[string])\n", 132 | " plt.plot(history.history['val_'+string], '')\n", 133 | " plt.xlabel(\"Epochs\")\n", 134 | " plt.ylabel(string)\n", 135 | " plt.legend([string, 'val_'+string])\n", 136 | " plt.show()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "id": "s6dM4ebxXrcg" 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "SEED_NUM = 1234\n", 148 | "tf.random.set_seed(SEED_NUM)\n", 149 | "np.random.seed(SEED_NUM)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "id": "WQrjLpuV_cnI" 156 | }, 157 | "source": [ 158 | "## 피-튜닝 모델 구현" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "id": "g8V_Qsv3_NVE" 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "class TFGPT2PtuningClassifier(tf.keras.Model):\n", 170 | " def __init__(self, dir_path):\n", 171 | " super(TFGPT2PtuningClassifier, self).__init__()\n", 172 | " \n", 173 | " self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)\n", 174 | " self.gpt2.trainable = False\n", 175 | "\n", 176 | " self.prompt_embedding_size = self.gpt2.config.hidden_size\n", 177 | " self.prompt_emgedding = tf.keras.layers.Embedding(2, self.prompt_embedding_size, name='prompt_embedding')\n", 178 | " \n", 179 | " self.bilstm = tf.keras.Sequential(name='prompt_bilstm')\n", 180 | " self.bilstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.prompt_embedding_size, return_sequences=True)))\n", 181 | " self.bilstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.prompt_embedding_size, return_sequences=True)))\n", 182 | " \n", 183 | " self.mlp = tf.keras.Sequential(name='prompt_mlp')\n", 184 | " self.mlp.add(tf.keras.layers.Dense(self.prompt_embedding_size))\n", 185 | " self.mlp.add(tf.keras.layers.ReLU())\n", 186 | " self.mlp.add(tf.keras.layers.Dense(self.prompt_embedding_size))\n", 187 | "\n", 188 | " def generate_prompt_input(self, inputs_ids):\n", 189 | " inputs_embeds = self.gpt2.transformer.wte(inputs_ids[:, 1:-1])\n", 190 | "\n", 191 | " prompt_indexs = tf.concat([inputs_ids[:, 0:1], inputs_ids[:, -1:]], axis=-1)\n", 192 | " prompt_embeds = self.prompt_emgedding(prompt_indexs)\n", 193 | " prompt_embeds = self.bilstm(prompt_embeds)\n", 194 | " prompt_embeds = self.mlp(prompt_embeds)\n", 195 | " \n", 196 | " prompt_updated_inputs = tf.concat([prompt_embeds[:, 0:1, :], inputs_embeds, \n", 197 | " prompt_embeds[:, 1:, :]],\n", 198 | " axis=1)\n", 199 | " \n", 200 | " return prompt_updated_inputs\n", 201 | " \n", 202 | " def call(self, 
inputs):\n", 203 | " input_ids = inputs[0]\n", 204 | " attention_mask = inputs[1] if len(inputs) > 1 else None\n", 205 | "\n", 206 | " inputs_embeds = self.generate_prompt_input(input_ids)\n", 207 | " last_hidden_states, _ = self.gpt2({'inputs_ids': None, 'inputs_embeds': inputs_embeds, 'attention_mask': attention_mask})\n", 208 | " output = last_hidden_states[:, -1, :]\n", 209 | "\n", 210 | " return outputs" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "id": "pCN8Lh7gXrch" 217 | }, 218 | "source": [ 219 | "## 피-튜닝을 위한 네이버 영화 리뷰 데이터 전처리" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "id": "lr76g28XA1BP" 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "BATCH_SIZE = 32\n", 231 | "NUM_EPOCHS = 3\n", 232 | "VALID_SPLIT = 0.1\n", 233 | "SENT_MAX_LEN = 39" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "id": "lAaKKUqbXrch" 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n", 245 | "\n", 246 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n", 247 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n", 248 | " mask_token=None,\n", 249 | " sep_token='',\n", 250 | " cls_token=None,\n", 251 | " unknown_token='',\n", 252 | " padding_token='',\n", 253 | " bos_token='',\n", 254 | " eos_token='')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "id": "6DlePiINXrch" 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "DATA_IN_PATH = './data_in/KOR'\n", 266 | "DATA_OUT_PATH = \"./data_out/KOR\"\n", 267 | "\n", 268 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_train.txt\")\n", 269 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_test.txt\")\n", 270 | "\n", 271 | "train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\\t', quoting = 3)\n", 272 | "train_data = train_data.dropna()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "id": "4GKNnSYuXrcj" 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "# train_data = train_data[:50] # for test\n", 284 | "\n", 285 | "def clean_text(sent):\n", 286 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \"\", sent)\n", 287 | " return sent_clean\n", 288 | "\n", 289 | "def add_prompt_token(tokens):\n", 290 | " return [0] + tokens + [1]\n", 291 | "\n", 292 | "train_data_sents = []\n", 293 | "train_attn_mask = []\n", 294 | "train_data_labels = []\n", 295 | "\n", 296 | "for train_sent, train_label in train_data[['document', 'label']].values:\n", 297 | " train_text_label = '긍정' if train_label == 1 else '부정'\n", 298 | "\n", 299 | " train_tokenized_text = vocab[tokenizer(clean_text(train_sent))]\n", 300 | "\n", 301 | " tokens = [vocab[vocab.bos_token]] \n", 302 | " tokens += pad_sequences([train_tokenized_text], \n", 303 | " SENT_MAX_LEN, \n", 304 | " value=vocab[vocab.padding_token], \n", 305 | " padding='post').tolist()[0] \n", 306 | " tokens = add_prompt_token(tokens)\n", 307 | "\n", 308 | " train_attn_mask.append([1 if t != 3 else 0 for t in tokens])\n", 309 | " train_data_sents.append(tokens)\n", 310 | "\n", 311 | " label = vocab[tokenizer('긍정')] if train_label == 1 else vocab[tokenizer('부정')]\n", 312 | " train_data_labels.append(label)\n", 313 | "\n", 314 | "\n", 315 | "train_attn_mask = np.array(train_attn_mask, dtype=np.int64)\n", 316 | 
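# A minimal standalone sketch (toy ids, not the real KoGPT2 vocab) of the sequence layout
# built above: prompt index 0, then [BOS] + the padded review, then prompt index 1.
# The two prompt indices are embedded by the 2-row prompt table in generate_prompt_input();
# everything in between goes through the frozen GPT2 wte embedding. Because the stored labels
# are the vocab id(s) of '긍정'/'부정', the LM logits at the final position (the tensor
# assigned to `output` in call() above) act as the class scores.
PAD_ID, BOS_ID = 3, 2          # 3 matches the `t != 3` mask check above; 2 is a made-up BOS id
TOY_MAX_LEN = 4                # SENT_MAX_LEN is 39 in this notebook

toy_review = [1054, 2203]      # a pretend tokenized review
toy_padded = toy_review + [PAD_ID] * (TOY_MAX_LEN - len(toy_review))
toy_tokens = [0] + ([BOS_ID] + toy_padded) + [1]          # what add_prompt_token() produces
toy_mask = [1 if t != PAD_ID else 0 for t in toy_tokens]

print(toy_tokens)              # [0, 2, 1054, 2203, 3, 3, 1]
print(toy_mask)                # [1, 1, 1, 1, 0, 0, 1]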
"train_data_sents = np.array(train_data_sents, dtype=np.int64)\n", 317 | "train_data_labels = np.array(train_data_labels, dtype=np.int64)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "colab": { 325 | "base_uri": "https://localhost:8080/" 326 | }, 327 | "id": "c-w5GU2IxkWv", 328 | "outputId": "29c84da5-4cdd-47ca-e575-77bcff453233" 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "print('입력 토큰 인덱스: ', train_data_sents[0])\n", 333 | "print('어텐션 마스크: ', train_attn_mask[0])\n", 334 | "print('정답 라벨: ', train_data_labels[0])" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": { 340 | "id": "12MlbiqIXrcj" 341 | }, 342 | "source": [ 343 | "## 네이버 영화 리뷰 감정 분석을 위한 피-튜닝 학습 " 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "colab": { 351 | "base_uri": "https://localhost:8080/" 352 | }, 353 | "id": "9J5VOzCwXrcj", 354 | "outputId": "c8eecfd7-6e68-4b14-f939-a5c0934ebd04" 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "BASE_MODEL_PATH = './gpt_ckpt'\n", 359 | "cls_model = TFGPT2PtuningClassifier(dir_path=BASE_MODEL_PATH)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "id": "FAKyQBJ_Xrck" 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)\n", 371 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", 372 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n", 373 | "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "colab": { 381 | "base_uri": "https://localhost:8080/" 382 | }, 383 | "id": "YCNdkkALXrck", 384 | "outputId": "23f2964e-5793-4518-fc7d-f53e0c056d52" 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "model_name = \"tf2_gpt2_ptuning_naver_movie\"\n", 389 | "\n", 390 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)\n", 391 | "\n", 392 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 393 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 394 | "\n", 395 | "if os.path.exists(checkpoint_dir):\n", 396 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 397 | "else:\n", 398 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 399 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 400 | " \n", 401 | "cp_callback = ModelCheckpoint(\n", 402 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n", 403 | "\n", 404 | "history = cls_model.fit((train_data_sents, train_attn_mask), train_data_labels, \n", 405 | " epochs=NUM_EPOCHS, \n", 406 | " batch_size=BATCH_SIZE,\n", 407 | " validation_split=VALID_SPLIT, \n", 408 | " callbacks=[earlystop_callback, cp_callback])" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "colab": { 416 | "base_uri": "https://localhost:8080/", 417 | "height": 279 418 | }, 419 | "id": "J8s2xkMcXrck", 420 | "outputId": "07b8d787-7bd0-46cc-e1ee-8a1ce00dea70" 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "plot_graphs(history, 'accuracy')" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": { 431 | "colab": { 432 | "base_uri": "https://localhost:8080/", 433 | 
"height": 279 434 | }, 435 | "id": "sWVxJEbRXrcl", 436 | "outputId": "f71c7219-b11b-4bd7-bffe-c624b7736279" 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "plot_graphs(history, 'loss')" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": { 446 | "id": "4jFe7XMeXrcl" 447 | }, 448 | "source": [ 449 | "## 네이버 영화 리뷰 모델 피-튜닝 테스트\n" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 | "colab": { 457 | "base_uri": "https://localhost:8080/", 458 | "height": 206 459 | }, 460 | "id": "za_BFNJsXrcl", 461 | "outputId": "16cdfa32-acd1-48be-88a6-66225338f537" 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "test_data = pd.read_csv(DATA_TEST_PATH, header=0, delimiter='\\t', quoting=3)\n", 466 | "test_data = test_data.dropna()\n", 467 | "test_data.head()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "id": "ZvJV4mOcXrcl" 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "# test_data = test_data[:50] # for test\n", 479 | "\n", 480 | "test_data_sents = []\n", 481 | "test_data_labels = []\n", 482 | "test_attn_mask = []\n", 483 | "\n", 484 | "pred_tokens = []\n", 485 | "\n", 486 | "\n", 487 | "for test_sent, test_label in test_data[['document', 'label']].values:\n", 488 | " test_tokenized_text = vocab[tokenizer(clean_text(test_sent))]\n", 489 | "\n", 490 | " tokens = []\n", 491 | " tokens += pad_sequences([test_tokenized_text], \n", 492 | " SENT_MAX_LEN, \n", 493 | " value=vocab[vocab.padding_token], \n", 494 | " padding='post').tolist()[0] \n", 495 | " tokens = add_prompt_token(tokens)\n", 496 | " test_data_sents.append(tokens)\n", 497 | " mask = [1 if t != 3 else 0 for t in tokens]\n", 498 | " test_attn_mask.append(mask)\n", 499 | "\n", 500 | " label = vocab[tokenizer('긍정')] if test_label == 1 else vocab[tokenizer('부정')]\n", 501 | " test_data_labels.append(label)\n", 502 | " \n", 503 | "test_attn_mask = np.array(test_attn_mask, dtype=np.int64)\n", 504 | "test_data_sents = np.array(test_data_sents, dtype=np.int64)\n", 505 | "test_data_labels = np.array(test_data_labels, dtype=np.int64)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": { 512 | "colab": { 513 | "base_uri": "https://localhost:8080/" 514 | }, 515 | "id": "lrHok3-CXrcl", 516 | "outputId": "908bef9c-133b-4cb0-a9e1-baa7cbc221e7" 517 | }, 518 | "outputs": [], 519 | "source": [ 520 | "print(\"num sents, labels {}, {}\".format(len(test_data_sents), len(test_data_labels)))" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": { 527 | "colab": { 528 | "base_uri": "https://localhost:8080/" 529 | }, 530 | "id": "hAHf4b0JXrcm", 531 | "outputId": "9b26654e-c5ea-439e-ed33-4508c9add548" 532 | }, 533 | "outputs": [], 534 | "source": [ 535 | "cls_model.load_weights(checkpoint_path)\n", 536 | "\n", 537 | "results = cls_model.evaluate((test_data_sents, test_attn_mask), test_data_labels, batch_size=1024)\n", 538 | "print(\"test loss, test acc: \", results)" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "id": "Ns83PcVeDGq3" 546 | }, 547 | "outputs": [], 548 | "source": [] 549 | } 550 | ], 551 | "metadata": { 552 | "accelerator": "GPU", 553 | "colab": { 554 | "collapsed_sections": [], 555 | "machine_shape": "hm", 556 | "name": "7.4.2.gpt2_ptune_w_mask_NSMC.ipynb", 557 | "provenance": [] 558 | }, 559 | "kernelspec": { 560 | "display_name": "Python 3", 
561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.8.3" 575 | } 576 | }, 577 | "nbformat": 4, 578 | "nbformat_minor": 4 579 | } 580 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.4.4.gpt2_finetune_KorSTS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 데이터 다운로드" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "!mkdir -p data_in/KOR/KorSTS\n", 35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-dev.tsv \\\n", 36 | " -O data_in/KOR/KorSTS/sts-dev.tsv\n", 37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-test.tsv \\\n", 38 | " -O data_in/KOR/KorSTS/sts-test.tsv\n", 39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-train.tsv \\\n", 40 | " -O data_in/KOR/KorSTS/sts-train.tsv" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import os\n", 50 | "import tensorflow as tf\n", 51 | "from transformers import *\n", 52 | "\n", 53 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", 54 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 55 | "\n", 56 | "import gluonnlp as nlp\n", 57 | "from gluonnlp.data import SentencepieceTokenizer\n", 58 | "\n", 59 | "import pandas as pd\n", 60 | "import matplotlib.pyplot as plt\n", 61 | "\n", 62 | "from tqdm import tqdm\n", 63 | "import numpy as np\n", 64 | "import re" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요." 
72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip\n", 81 | "!unzip -o gpt_ckpt.zip" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# 시각화\n", 91 | "\n", 92 | "def plot_graphs(history, string):\n", 93 | " plt.plot(history.history[string])\n", 94 | " plt.plot(history.history['val_'+string], '')\n", 95 | " plt.xlabel(\"Epochs\")\n", 96 | " plt.ylabel(string)\n", 97 | " plt.legend([string, 'val_'+string])\n", 98 | " plt.show()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "SEED_NUM = 1234\n", 108 | "tf.random.set_seed(SEED_NUM)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n", 118 | "\n", 119 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH, alpha=0)\n", 120 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n", 121 | " mask_token=None,\n", 122 | " sep_token='',\n", 123 | " cls_token=None,\n", 124 | " unknown_token='',\n", 125 | " padding_token='',\n", 126 | " bos_token='',\n", 127 | " eos_token='')" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "# KoSTS Simliarity " 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "tf.random.set_seed(0)\n", 144 | "np.random.seed(0)\n", 145 | "\n", 146 | "BATCH_SIZE = 10\n", 147 | "NUM_EPOCHS = 3\n", 148 | "VALID_SPLIT = 0.2\n", 149 | "SENT_MAX_LEN = 14\n", 150 | "\n", 151 | "DATA_IN_PATH = 'data_in/KOR'\n", 152 | "DATA_OUT_PATH = \"data_out/KOR\"" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# Load Train dataset\n", 162 | "\n", 163 | "TRAIN_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-train.tsv')\n", 164 | "DEV_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-dev.tsv')\n", 165 | "\n", 166 | "train_data = pd.read_csv(TRAIN_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n", 167 | "dev_data = pd.read_csv(DEV_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n", 168 | "\n", 169 | "train_data = train_data.dropna()\n", 170 | "\n", 171 | "dev_data = dev_data.dropna()\n", 172 | "\n", 173 | "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data), len(dev_data)))" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "# 토크나이저를 제외하고는 5장에서 처리한 방식과 유사하게 접근\n", 183 | "def clean_text(sent):\n", 184 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \" \", sent)\n", 185 | " return sent_clean\n", 186 | "\n", 187 | "train_data_sents1 = []\n", 188 | "train_data_sents2 = []\n", 189 | "train_labels = []\n", 190 | "\n", 191 | "\n", 192 | "for sent1, sent2, score in train_data[['sentence1', 'sentence2', 'score']].values:\n", 193 | " train_tokenized_sent_1 = vocab[tokenizer(clean_text(sent1))]\n", 194 | " train_tokenized_sent_2 = vocab[tokenizer(clean_text(sent2))]\n", 195 | " tokens1 = [vocab[vocab.bos_token]] \n", 196 | " tokens1 += 
pad_sequences([train_tokenized_sent_1], \n", 197 | " SENT_MAX_LEN, \n", 198 | " value=vocab[vocab.padding_token], \n", 199 | " padding='post').tolist()[0] \n", 200 | " tokens1 += [vocab[vocab.sep_token]] \n", 201 | " tokens1 += pad_sequences([train_tokenized_sent_2], \n", 202 | " SENT_MAX_LEN, \n", 203 | " value=vocab[vocab.padding_token], \n", 204 | " padding='post').tolist()[0] \n", 205 | " tokens1 += [vocab[vocab.eos_token]]\n", 206 | " tokens2 = [vocab[vocab.bos_token]] \n", 207 | " tokens2 += pad_sequences([train_tokenized_sent_2], \n", 208 | " SENT_MAX_LEN, \n", 209 | " value=vocab[vocab.padding_token], \n", 210 | " padding='post').tolist()[0] \n", 211 | " tokens2 += [vocab[vocab.sep_token]] \n", 212 | " tokens2 += pad_sequences([train_tokenized_sent_1], \n", 213 | " SENT_MAX_LEN, \n", 214 | " value=vocab[vocab.padding_token], \n", 215 | " padding='post').tolist()[0] \n", 216 | " tokens2 += [vocab[vocab.eos_token]]\n", 217 | " \n", 218 | " train_data_sents1.append(tokens1)\n", 219 | " train_data_sents2.append(tokens2)\n", 220 | " train_labels.append(score)\n", 221 | "\n", 222 | "train_data_sents1 = np.array(train_data_sents1, dtype=np.int64)\n", 223 | "train_data_sents2 = np.array(train_data_sents2, dtype=np.int64)\n", 224 | "train_data_sents = (train_data_sents1, train_data_sents2)\n", 225 | "train_data_labels = np.array(train_labels)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "dev_data_sents1 = []\n", 235 | "dev_data_sents2 = []\n", 236 | "dev_labels = []\n", 237 | "\n", 238 | "\n", 239 | "for sent1, sent2, score in dev_data[['sentence1', 'sentence2', 'score']].values:\n", 240 | " dev_tokenized_sent_1 = vocab[tokenizer(clean_text(sent1))]\n", 241 | " dev_tokenized_sent_2 = vocab[tokenizer(clean_text(sent2))]\n", 242 | " tokens1 = [vocab[vocab.bos_token]] \n", 243 | " tokens1 += pad_sequences([dev_tokenized_sent_1], \n", 244 | " SENT_MAX_LEN, \n", 245 | " value=vocab[vocab.padding_token], \n", 246 | " padding='post').tolist()[0] \n", 247 | " tokens1 += [vocab[vocab.sep_token]] \n", 248 | " tokens1 += pad_sequences([dev_tokenized_sent_2], \n", 249 | " SENT_MAX_LEN, \n", 250 | " value=vocab[vocab.padding_token], \n", 251 | " padding='post').tolist()[0] \n", 252 | " tokens1 += [vocab[vocab.eos_token]]\n", 253 | " tokens2 = [vocab[vocab.bos_token]] \n", 254 | " tokens2 += pad_sequences([dev_tokenized_sent_2], \n", 255 | " SENT_MAX_LEN, \n", 256 | " value=vocab[vocab.padding_token], \n", 257 | " padding='post').tolist()[0] \n", 258 | " tokens2 += [vocab[vocab.sep_token]] \n", 259 | " tokens2 += pad_sequences([dev_tokenized_sent_1], \n", 260 | " SENT_MAX_LEN, \n", 261 | " value=vocab[vocab.padding_token], \n", 262 | " padding='post').tolist()[0] \n", 263 | " tokens2 += [vocab[vocab.eos_token]]\n", 264 | " \n", 265 | " dev_data_sents1.append(tokens1)\n", 266 | " dev_data_sents2.append(tokens2)\n", 267 | " dev_labels.append(score)\n", 268 | "\n", 269 | "dev_data_sents1 = np.array(dev_data_sents1, dtype=np.int64)\n", 270 | "dev_data_sents2 = np.array(dev_data_sents2, dtype=np.int64)\n", 271 | "dev_data_sents = (dev_data_sents1, dev_data_sents2)\n", 272 | "dev_data_labels = np.array(dev_labels)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "print(\"Shape of dataset: train - ({}, {}), dev - ({}, {})\".format(train_data_sents[0].shape, train_data_sents[1].shape, 
dev_data_sents[0].shape, dev_data_sents[1].shape))" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "class TFGPT2Regressor(tf.keras.Model):\n", 291 | " def __init__(self, dir_path, num_class):\n", 292 | " super(TFGPT2Regressor, self).__init__()\n", 293 | " \n", 294 | " self.gpt2 = TFGPT2Model.from_pretrained(dir_path)\n", 295 | " self.num_class = num_class\n", 296 | " self.dropout = tf.keras.layers.Dropout(self.gpt2.config.summary_first_dropout)\n", 297 | " self.regressor = tf.keras.layers.Dense(self.num_class, \n", 298 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.gpt2.config.initializer_range), \n", 299 | " name=\"regressior\")\n", 300 | " \n", 301 | " def call(self, inputs):\n", 302 | " outputs1 = self.gpt2(inputs[0])\n", 303 | " outputs2 = self.gpt2(inputs[1])\n", 304 | " outputs = outputs1[0] + outputs2[0]\n", 305 | " pooled_output = outputs[:, -1, :]\n", 306 | "\n", 307 | " pooled_output = self.dropout(pooled_output)\n", 308 | " logits = self.regressor(pooled_output)\n", 309 | "\n", 310 | " return logits" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "regression_model = TFGPT2Regressor('./gpt_ckpt', 1)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "class PearsonCorrelationMetric(tf.keras.metrics.Metric):\n", 329 | " def __init__(self, name=\"pearson_correlation\", **kwargs):\n", 330 | " super(PearsonCorrelationMetric, self).__init__(name=name, **kwargs)\n", 331 | " self.y_true_list = []\n", 332 | " self.y_pred_list = []\n", 333 | "\n", 334 | " def update_state(self, y_true, y_pred, sample_weight=None):\n", 335 | " y_true = tf.reshape(y_true, shape=[-1])\n", 336 | " y_pred = tf.reshape(y_pred, shape=[-1])\n", 337 | " self.y_true_list.append(y_true)\n", 338 | " self.y_pred_list.append(y_pred)\n", 339 | "\n", 340 | " def result(self):\n", 341 | " y_true = tf.concat(self.y_true_list, -1)\n", 342 | " y_pred = tf.concat(self.y_pred_list, -1)\n", 343 | " pearson_correlation = self.pearson(y_true, y_pred)\n", 344 | " \n", 345 | " return pearson_correlation\n", 346 | "\n", 347 | " def reset_states(self):\n", 348 | " self.y_true_list = []\n", 349 | " self.y_pred_list = []\n", 350 | " \n", 351 | "\n", 352 | " def pearson(self, true, pred):\n", 353 | " m_true = tf.reduce_mean(true)\n", 354 | " m_pred = tf.reduce_mean(pred)\n", 355 | " m_true, m_pred = true-m_true, pred-m_pred\n", 356 | " r_num = tf.reduce_sum(tf.multiply(m_true, m_pred))\n", 357 | " r_den = tf.sqrt(tf.multiply(tf.reduce_sum(tf.square(m_true)), tf.reduce_sum(tf.square(m_pred)))) + 1e-12\n", 358 | " return r_num / r_den" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "optimizer = tf.keras.optimizers.Adam(6.25e-5)\n", 368 | "loss = tf.keras.losses.MeanSquaredError()\n", 369 | "metric = PearsonCorrelationMetric()\n", 370 | "regression_model.compile(optimizer=optimizer, loss=loss, metrics=[metric], run_eagerly=True)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "scrolled": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "model_name = \"tf2_gpt_korsts\"\n", 382 | "\n", 383 | "earlystop_callback = 
EarlyStopping(monitor='val_pearson_correlation', min_delta=0.0001,patience=3,mode='max')\n", 384 | "\n", 385 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 386 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 387 | "\n", 388 | "if os.path.exists(checkpoint_dir):\n", 389 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 390 | "else:\n", 391 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 392 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 393 | " \n", 394 | "cp_callback = ModelCheckpoint(\n", 395 | " checkpoint_path, monitor='val_pearson_correlation', verbose=1, save_best_only=True, save_weights_only=True,mode='max')\n", 396 | "\n", 397 | "history = regression_model.fit(train_data_sents, train_data_labels, epochs=NUM_EPOCHS,\n", 398 | " validation_data = (dev_data_sents, dev_data_labels),\n", 399 | " batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "plot_graphs(history, 'pearson_correlation')" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": { 415 | "scrolled": false 416 | }, 417 | "outputs": [], 418 | "source": [ 419 | "plot_graphs(history, 'loss')" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "# KorSTSTEST" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "scrolled": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "# Load Test dataset\n", 438 | "TEST_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-test.tsv')\n", 439 | "\n", 440 | "test_data = pd.read_csv(TEST_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n", 441 | "test_data = test_data.dropna()\n", 442 | "test_data.head()" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "test_data_sents1 = []\n", 452 | "test_data_sents2 = []\n", 453 | "test_labels = []\n", 454 | "\n", 455 | "\n", 456 | "for sent1, sent2, score in test_data[['sentence1', 'sentence2', 'score']].values:\n", 457 | " test_tokenized_sent_1 = vocab[tokenizer(clean_text(sent1))]\n", 458 | " test_tokenized_sent_2 = vocab[tokenizer(clean_text(sent2))]\n", 459 | " tokens1 = [vocab[vocab.bos_token]] \n", 460 | " tokens1 += pad_sequences([test_tokenized_sent_1], \n", 461 | " SENT_MAX_LEN, \n", 462 | " value=vocab[vocab.padding_token], \n", 463 | " padding='post').tolist()[0] \n", 464 | " tokens1 += [vocab[vocab.sep_token]] \n", 465 | " tokens1 += pad_sequences([test_tokenized_sent_2], \n", 466 | " SENT_MAX_LEN, \n", 467 | " value=vocab[vocab.padding_token], \n", 468 | " padding='post').tolist()[0] \n", 469 | " tokens1 += [vocab[vocab.eos_token]]\n", 470 | " tokens2 = [vocab[vocab.bos_token]] \n", 471 | " tokens2 += pad_sequences([test_tokenized_sent_2], \n", 472 | " SENT_MAX_LEN, \n", 473 | " value=vocab[vocab.padding_token], \n", 474 | " padding='post').tolist()[0] \n", 475 | " tokens2 += [vocab[vocab.sep_token]] \n", 476 | " tokens2 += pad_sequences([test_tokenized_sent_1], \n", 477 | " SENT_MAX_LEN, \n", 478 | " value=vocab[vocab.padding_token], \n", 479 | " padding='post').tolist()[0] \n", 480 | " tokens2 += [vocab[vocab.eos_token]]\n", 481 | " \n", 482 | " test_data_sents1.append(tokens1)\n", 483 | " test_data_sents2.append(tokens2)\n", 
484 | " test_labels.append(score)\n", 485 | "\n", 486 | "test_data_sents1 = np.array(test_data_sents1, dtype=np.int64)\n", 487 | "test_data_sents2 = np.array(test_data_sents2, dtype=np.int64)\n", 488 | "test_data_sents = (test_data_sents1, test_data_sents2)\n", 489 | "test_data_labels = np.array(test_labels)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "print(\"# sents: {}, # labels: {}\".format(len(test_data_sents), len(test_data_labels)))" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "regression_model.load_weights(checkpoint_path)\n", 508 | "\n", 509 | "results = regression_model.evaluate(test_data_sents, test_data_labels, batch_size=512)\n", 510 | "print(\"test loss, test pearson correlation: \", results)" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [] 519 | } 520 | ], 521 | "metadata": { 522 | "kernelspec": { 523 | "display_name": "Python 3", 524 | "language": "python", 525 | "name": "python3" 526 | }, 527 | "language_info": { 528 | "codemirror_mode": { 529 | "name": "ipython", 530 | "version": 3 531 | }, 532 | "file_extension": ".py", 533 | "mimetype": "text/x-python", 534 | "name": "python", 535 | "nbconvert_exporter": "python", 536 | "pygments_lexer": "ipython3", 537 | "version": "3.7.4" 538 | } 539 | }, 540 | "nbformat": 4, 541 | "nbformat_minor": 2 542 | } 543 | -------------------------------------------------------------------------------- /7.PRETRAIN_METHOD/7.2.5.bert_finetune_KorQuAD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 환경 준비" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n", 17 | "!pip install -r requirements.txt\n", 18 | "!pip install tensorflow==2.2.0" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "colab": {}, 26 | "colab_type": "code", 27 | "executionInfo": { 28 | "elapsed": 12607, 29 | "status": "ok", 30 | "timestamp": 1594010753269, 31 | "user": { 32 | "displayName": "ChangWook Jun", 33 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 34 | "userId": "00685987924881157185" 35 | }, 36 | "user_tz": -540 37 | }, 38 | "id": "B9WLyWEWgdDR" 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import os\n", 43 | "import re\n", 44 | "import json\n", 45 | "import string\n", 46 | "import numpy as np\n", 47 | "import tensorflow as tf\n", 48 | "from tensorflow import keras\n", 49 | "from tensorflow.keras import layers\n", 50 | "from tokenizers import BertWordPieceTokenizer\n", 51 | "from transformers import BertTokenizer, TFBertModel\n", 52 | "\n", 53 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n", 54 | "import matplotlib.pyplot as plt\n", 55 | "import urllib\n", 56 | "\n", 57 | "MAX_LEN = 384\n", 58 | "EPOCHS = 3\n", 59 | "VERBOSE = 2\n", 60 | "BATCH_SIZE = 16" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "colab": {}, 68 | "colab_type": 
"code", 69 | "executionInfo": { 70 | "elapsed": 556, 71 | "status": "ok", 72 | "timestamp": 1594010762115, 73 | "user": { 74 | "displayName": "ChangWook Jun", 75 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 76 | "userId": "00685987924881157185" 77 | }, 78 | "user_tz": -540 79 | }, 80 | "id": "68HVB3dYgi0w" 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "DATA_OUT_PATH = './data_out/KOR'" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "colab": {}, 92 | "colab_type": "code", 93 | "executionInfo": { 94 | "elapsed": 639, 95 | "status": "ok", 96 | "timestamp": 1594010763471, 97 | "user": { 98 | "displayName": "ChangWook Jun", 99 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 100 | "userId": "00685987924881157185" 101 | }, 102 | "user_tz": -540 103 | }, 104 | "id": "zvoswBdyglTQ" 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "def plot_graphs(history, string, string_1, string_2):\n", 109 | " # loss \n", 110 | " plt.plot(history.history[string])\n", 111 | " plt.plot(history.history[string_1])\n", 112 | " plt.plot(history.history[string_2])\n", 113 | " plt.xlabel(\"Epochs\")\n", 114 | " plt.ylabel(string)\n", 115 | " plt.legend([string, string_1, string_2])\n", 116 | " plt.show()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "SEED_NUM = 1234\n", 126 | "tf.random.set_seed(SEED_NUM)\n", 127 | "np.random.seed(SEED_NUM)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "colab": { 135 | "base_uri": "https://localhost:8080/", 136 | "height": 65, 137 | "referenced_widgets": [ 138 | "bc7f3c579a324f77811bdd6ad6dd7dc0", 139 | "e31de13423d743e68d6c451d23c93cdf", 140 | "f8f80478dfca4894ac1ff8c2a082f734", 141 | "3be3c9704e934fb5a3d5847749d398ce", 142 | "2c0ecef646d44a0580cacefa5c3fd9f2", 143 | "1fde406732df4b5b90b7701dc7e4981e", 144 | "f58154a65f974e04bcf8af24b2884fdd", 145 | "a7d4d0c48cda4abdb106a6bcfb24359e" 146 | ] 147 | }, 148 | "colab_type": "code", 149 | "executionInfo": { 150 | "elapsed": 1217, 151 | "status": "ok", 152 | "timestamp": 1594010812799, 153 | "user": { 154 | "displayName": "ChangWook Jun", 155 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 156 | "userId": "00685987924881157185" 157 | }, 158 | "user_tz": -540 159 | }, 160 | "id": "HDI_cm3sgm6N", 161 | "outputId": "33078a97-0007-428b-9439-b67bd53cd994" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "# Save the slow pretrained tokenizer\n", 166 | "slow_tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", lowercase=False)\n", 167 | "save_path = \"bert-base-multilingual-cased/\"\n", 168 | "if not os.path.exists(save_path):\n", 169 | " os.makedirs(save_path)\n", 170 | "slow_tokenizer.save_pretrained(save_path)\n", 171 | "\n", 172 | "# Load the fast tokenizer from saved file\n", 173 | "tokenizer = BertWordPieceTokenizer(\"bert-base-multilingual-cased/vocab.txt\", lowercase=False)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "colab": { 181 | "base_uri": "https://localhost:8080/", 182 | "height": 83 183 | }, 184 | "colab_type": "code", 185 | "executionInfo": { 186 | "elapsed": 1750, 187 | "status": "ok", 188 | "timestamp": 1594010820826, 189 | "user": { 190 | 
"displayName": "ChangWook Jun", 191 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 192 | "userId": "00685987924881157185" 193 | }, 194 | "user_tz": -540 195 | }, 196 | "id": "an5cGi-GgpG4", 197 | "outputId": "c7753a24-f338-4a6d-8701-f78753f9b718" 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "train_data_url = \"https://korquad.github.io/dataset/KorQuAD_v1.0_train.json\"\n", 202 | "train_path = keras.utils.get_file(\"train.json\", train_data_url)\n", 203 | "eval_data_url = \"https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json\"\n", 204 | "eval_path = keras.utils.get_file(\"eval.json\", eval_data_url)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "!wget -P ./bert-base-multilingual-cased/ https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "!mv ./bert-base-multilingual-cased/bert-base-multilingual-cased-config.json ./bert-base-multilingual-cased/config.json" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "!wget -P ./bert-base-multilingual-cased/ https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "!mv ./bert-base-multilingual-cased/bert-base-multilingual-cased-tf_model.h5 ./bert-base-multilingual-cased/tf_model.h5" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "colab": { 248 | "base_uri": "https://localhost:8080/", 249 | "height": 50 250 | }, 251 | "colab_type": "code", 252 | "executionInfo": { 253 | "elapsed": 99893, 254 | "status": "ok", 255 | "timestamp": 1594011009085, 256 | "user": { 257 | "displayName": "ChangWook Jun", 258 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 259 | "userId": "00685987924881157185" 260 | }, 261 | "user_tz": -540 262 | }, 263 | "id": "PkuK7N_ngrMd", 264 | "outputId": "48275df3-52de-4623-dfc3-db6be9a54dfa" 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "class SquadExample:\n", 269 | " def __init__(self, question, context, start_char_idx, answer_text):\n", 270 | " self.question = question\n", 271 | " self.context = context\n", 272 | " self.start_char_idx = start_char_idx\n", 273 | " self.answer_text = answer_text\n", 274 | " self.skip = False\n", 275 | "\n", 276 | " def preprocess(self):\n", 277 | " context = self.context\n", 278 | " question = self.question\n", 279 | " answer_text = self.answer_text\n", 280 | " start_char_idx = self.start_char_idx\n", 281 | "\n", 282 | " # Clean context, answer and question\n", 283 | " context = \" \".join(str(context).split())\n", 284 | " question = \" \".join(str(question).split())\n", 285 | " answer = \" \".join(str(answer_text).split())\n", 286 | "\n", 287 | " # Find end character index of answer in context\n", 288 | " end_char_idx = start_char_idx + len(answer)\n", 289 | " if end_char_idx >= len(context):\n", 290 | " self.skip = True\n", 291 | " return\n", 292 | "\n", 293 | " # Mark the character indexes in context that are in answer\n", 294 | " 
is_char_in_ans = [0] * len(context)\n", 295 | " for idx in range(start_char_idx, end_char_idx):\n", 296 | " is_char_in_ans[idx] = 1\n", 297 | "\n", 298 | " # Tokenize context\n", 299 | " tokenized_context = tokenizer.encode(context)\n", 300 | "\n", 301 | " # Find tokens that were created from answer characters\n", 302 | " ans_token_idx = []\n", 303 | " for idx, (start, end) in enumerate(tokenized_context.offsets):\n", 304 | " if sum(is_char_in_ans[start:end]) > 0:\n", 305 | " ans_token_idx.append(idx)\n", 306 | "\n", 307 | " if len(ans_token_idx) == 0:\n", 308 | " self.skip = True\n", 309 | " return\n", 310 | "\n", 311 | " # Find start and end token index for tokens from answer\n", 312 | " start_token_idx = ans_token_idx[0]\n", 313 | " end_token_idx = ans_token_idx[-1]\n", 314 | "\n", 315 | " # Tokenize question\n", 316 | " tokenized_question = tokenizer.encode(question)\n", 317 | "\n", 318 | " # Create inputs\n", 319 | " input_ids = tokenized_context.ids + tokenized_question.ids[1:]\n", 320 | " token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(\n", 321 | " tokenized_question.ids[1:]\n", 322 | " )\n", 323 | " attention_mask = [1] * len(input_ids)\n", 324 | "\n", 325 | " # Pad and create attention masks.\n", 326 | " # Skip if truncation is needed\n", 327 | " padding_length = MAX_LEN - len(input_ids)\n", 328 | " if padding_length > 0: # pad\n", 329 | " input_ids = input_ids + ([0] * padding_length)\n", 330 | " attention_mask = attention_mask + ([0] * padding_length)\n", 331 | " token_type_ids = token_type_ids + ([0] * padding_length)\n", 332 | " elif padding_length < 0: # skip\n", 333 | " self.skip = True\n", 334 | " return\n", 335 | "\n", 336 | " self.input_ids = input_ids\n", 337 | " self.token_type_ids = token_type_ids\n", 338 | " self.attention_mask = attention_mask\n", 339 | " self.start_token_idx = start_token_idx\n", 340 | " self.end_token_idx = end_token_idx\n", 341 | " self.context_token_to_char = tokenized_context.offsets\n", 342 | "\n", 343 | "\n", 344 | "def create_squad_examples(raw_data):\n", 345 | " squad_examples = []\n", 346 | " for item in raw_data[\"data\"]:\n", 347 | " for para in item[\"paragraphs\"]:\n", 348 | " context = para[\"context\"]\n", 349 | " for qa in para[\"qas\"]:\n", 350 | " question = qa[\"question\"]\n", 351 | " answer_text = qa[\"answers\"][0][\"text\"]\n", 352 | " start_char_idx = qa[\"answers\"][0][\"answer_start\"]\n", 353 | " squad_eg = SquadExample(\n", 354 | " question, context, start_char_idx, answer_text\n", 355 | " )\n", 356 | " squad_eg.preprocess()\n", 357 | " squad_examples.append(squad_eg)\n", 358 | " return squad_examples\n", 359 | "\n", 360 | "\n", 361 | "def create_inputs_targets(squad_examples):\n", 362 | " dataset_dict = {\n", 363 | " \"input_ids\": [],\n", 364 | " \"token_type_ids\": [],\n", 365 | " \"attention_mask\": [],\n", 366 | " \"start_token_idx\": [],\n", 367 | " \"end_token_idx\": [],\n", 368 | " }\n", 369 | " for item in squad_examples:\n", 370 | " if item.skip == False:\n", 371 | " for key in dataset_dict:\n", 372 | " dataset_dict[key].append(getattr(item, key))\n", 373 | " for key in dataset_dict:\n", 374 | " dataset_dict[key] = np.array(dataset_dict[key])\n", 375 | "\n", 376 | " x = [\n", 377 | " dataset_dict[\"input_ids\"],\n", 378 | " dataset_dict[\"token_type_ids\"],\n", 379 | " dataset_dict[\"attention_mask\"],\n", 380 | " ]\n", 381 | " y = [dataset_dict[\"start_token_idx\"], dataset_dict[\"end_token_idx\"]]\n", 382 | " return x, y\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | 
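# (Illustration, not part of the original notebook.) The answer span is located by marking the
# answer characters inside the context and keeping every wordpiece token whose character
# offsets overlap that marked range -- the same logic as SquadExample.preprocess() above.
# Uses the fast `tokenizer` (BertWordPieceTokenizer) loaded earlier in this notebook.
ctx = "대한민국의 수도는 서울이다."
ans = "서울"
ans_start = ctx.index(ans)

enc = tokenizer.encode(ctx)
char_in_ans = [0] * len(ctx)
for i in range(ans_start, ans_start + len(ans)):
    char_in_ans[i] = 1

ans_token_idx = [i for i, (s, e) in enumerate(enc.offsets) if sum(char_in_ans[s:e]) > 0]
print(enc.tokens)                            # wordpiece tokens, including [CLS] and [SEP]
print(ans_token_idx[0], ans_token_idx[-1])   # start / end token indices of the answer span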
"execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "with open(train_path) as f:\n", 392 | " raw_train_data = json.load(f)\n", 393 | "\n", 394 | "with open(eval_path) as f:\n", 395 | " raw_eval_data = json.load(f)\n", 396 | "\n", 397 | "\n", 398 | "train_squad_examples = create_squad_examples(raw_train_data)\n", 399 | "x_train, y_train = create_inputs_targets(train_squad_examples)\n", 400 | "print(f\"{len(train_squad_examples)} training points created.\")\n", 401 | "\n", 402 | "eval_squad_examples = create_squad_examples(raw_eval_data)\n", 403 | "x_eval, y_eval = create_inputs_targets(eval_squad_examples)\n", 404 | "print(f\"{len(eval_squad_examples)} evaluation points created.\")" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": { 411 | "colab": {}, 412 | "colab_type": "code", 413 | "executionInfo": { 414 | "elapsed": 690, 415 | "status": "ok", 416 | "timestamp": 1594011009787, 417 | "user": { 418 | "displayName": "ChangWook Jun", 419 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 420 | "userId": "00685987924881157185" 421 | }, 422 | "user_tz": -540 423 | }, 424 | "id": "mIjk3_XeguBj" 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "class TFBERTQuestionAnswering(tf.keras.Model):\n", 429 | " def __init__(self, model_name, dir_path, num_class):\n", 430 | " super(TFBERTQuestionAnswering, self).__init__()\n", 431 | " \n", 432 | " self.encoder = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n", 433 | " self.start_logit = tf.keras.layers.Dense(num_class, name=\"start_logit\", use_bias=False)\n", 434 | " self.end_logit = tf.keras.layers.Dense(num_class, name=\"end_logit\", use_bias=False)\n", 435 | " self.flatten = tf.keras.layers.Flatten() \n", 436 | " self.softmax = tf.keras.layers.Activation(tf.keras.activations.softmax)\n", 437 | " \n", 438 | " def call(self, inputs):\n", 439 | " input_ids, token_type_ids, attention_mask = inputs\n", 440 | " embedding = self.encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]\n", 441 | " start_logits = self.start_logit(embedding)\n", 442 | " start_logits = self.flatten(start_logits)\n", 443 | " \n", 444 | " end_logits = self.end_logit(embedding)\n", 445 | " end_logits = self.flatten(end_logits)\n", 446 | " \n", 447 | " start_probs = self.softmax(start_logits)\n", 448 | " end_probs = self.softmax(end_logits)\n", 449 | " \n", 450 | " return start_probs, end_probs" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "colab": { 458 | "base_uri": "https://localhost:8080/", 459 | "height": 120 460 | }, 461 | "colab_type": "code", 462 | "executionInfo": { 463 | "elapsed": 11135, 464 | "status": "ok", 465 | "timestamp": 1594011020239, 466 | "user": { 467 | "displayName": "ChangWook Jun", 468 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 469 | "userId": "00685987924881157185" 470 | }, 471 | "user_tz": -540 472 | }, 473 | "id": "k4t_2T7vgwOu", 474 | "outputId": "fd7dcb5d-bf36-496c-b53d-53e89962360a" 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "korquad_model = TFBERTQuestionAnswering(model_name='./bert-base-multilingual-cased/',dir_path='bert_ckpt', num_class=1)\n", 479 | "optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)\n", 480 | "loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)" 481 | ] 482 | }, 483 | { 484 | "cell_type": 
"code", 485 | "execution_count": null, 486 | "metadata": { 487 | "colab": {}, 488 | "colab_type": "code", 489 | "executionInfo": { 490 | "elapsed": 590, 491 | "status": "ok", 492 | "timestamp": 1594011103474, 493 | "user": { 494 | "displayName": "ChangWook Jun", 495 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 496 | "userId": "00685987924881157185" 497 | }, 498 | "user_tz": -540 499 | }, 500 | "id": "YZtVFA3PgyL0" 501 | }, 502 | "outputs": [], 503 | "source": [ 504 | "def normalized_answer(s): \n", 505 | " def remove_(text):\n", 506 | " ''' 불필요한 기호 제거 '''\n", 507 | " text = re.sub(\"'\", \" \", text)\n", 508 | " text = re.sub('\"', \" \", text)\n", 509 | " text = re.sub('《', \" \", text)\n", 510 | " text = re.sub('》', \" \", text)\n", 511 | " text = re.sub('<', \" \", text)\n", 512 | " text = re.sub('>', \" \", text) \n", 513 | " text = re.sub('〈', \" \", text)\n", 514 | " text = re.sub('〉', \" \", text) \n", 515 | " text = re.sub(\"\\(\", \" \", text)\n", 516 | " text = re.sub(\"\\)\", \" \", text)\n", 517 | " text = re.sub(\"‘\", \" \", text)\n", 518 | " text = re.sub(\"’\", \" \", text) \n", 519 | " return text\n", 520 | "\n", 521 | " def white_space_fix(text):\n", 522 | " return ' '.join(text.split())\n", 523 | "\n", 524 | " def remove_punc(text):\n", 525 | " exclude = set(string.punctuation)\n", 526 | " return ''.join(ch for ch in text if ch not in exclude)\n", 527 | "\n", 528 | " def lower(text):\n", 529 | " return text.lower()\n", 530 | "\n", 531 | " return white_space_fix(remove_punc(lower(remove_(s))))" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "colab": {}, 539 | "colab_type": "code", 540 | "executionInfo": { 541 | "elapsed": 720, 542 | "status": "ok", 543 | "timestamp": 1594011104061, 544 | "user": { 545 | "displayName": "ChangWook Jun", 546 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 547 | "userId": "00685987924881157185" 548 | }, 549 | "user_tz": -540 550 | }, 551 | "id": "rVTh1qKng1p8" 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "class ExactMatch(keras.callbacks.Callback):\n", 556 | " def __init__(self, x_eval, y_eval):\n", 557 | " self.x_eval = x_eval\n", 558 | " self.y_eval = y_eval\n", 559 | "\n", 560 | " def on_epoch_end(self, epoch, logs=None):\n", 561 | " pred_start, pred_end = self.model.predict(self.x_eval)\n", 562 | " count = 0\n", 563 | " eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]\n", 564 | " for idx, (start, end) in enumerate(zip(pred_start, pred_end)):\n", 565 | " squad_eg = eval_examples_no_skip[idx]\n", 566 | " offsets = squad_eg.context_token_to_char\n", 567 | " start = np.argmax(start)\n", 568 | " end = np.argmax(end)\n", 569 | " if start >= len(offsets):\n", 570 | " continue\n", 571 | " pred_char_start = offsets[start][0]\n", 572 | " if end < len(offsets):\n", 573 | " pred_char_end = offsets[end][1]\n", 574 | " pred_ans = squad_eg.context[pred_char_start:pred_char_end]\n", 575 | " else:\n", 576 | " pred_ans = squad_eg.context[pred_char_start:]\n", 577 | "\n", 578 | " normalized_pred_ans = normalized_answer(pred_ans)\n", 579 | " normalized_true_ans = normalized_answer(squad_eg.answer_text)\n", 580 | " if normalized_pred_ans in normalized_true_ans:\n", 581 | " count += 1\n", 582 | " acc = count / len(self.y_eval[0])\n", 583 | " print(f\"\\nepoch={epoch+1}, exact match score={acc:.2f}\")" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 
588 | "execution_count": null, 589 | "metadata": { 590 | "colab": {}, 591 | "colab_type": "code", 592 | "executionInfo": { 593 | "elapsed": 399, 594 | "status": "ok", 595 | "timestamp": 1594011104303, 596 | "user": { 597 | "displayName": "ChangWook Jun", 598 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 599 | "userId": "00685987924881157185" 600 | }, 601 | "user_tz": -540 602 | }, 603 | "id": "sTgvtk0og4Ow" 604 | }, 605 | "outputs": [], 606 | "source": [ 607 | "exact_match_callback = ExactMatch(x_eval, y_eval)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": { 614 | "colab": {}, 615 | "colab_type": "code", 616 | "executionInfo": { 617 | "elapsed": 599, 618 | "status": "ok", 619 | "timestamp": 1594011105561, 620 | "user": { 621 | "displayName": "ChangWook Jun", 622 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 623 | "userId": "00685987924881157185" 624 | }, 625 | "user_tz": -540 626 | }, 627 | "id": "7EuBYS58g6QZ" 628 | }, 629 | "outputs": [], 630 | "source": [ 631 | "korquad_model.compile(optimizer=optimizer, loss=[loss, loss])" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": { 638 | "colab": { 639 | "base_uri": "https://localhost:8080/", 640 | "height": 50 641 | }, 642 | "colab_type": "code", 643 | "executionInfo": { 644 | "elapsed": 714, 645 | "status": "ok", 646 | "timestamp": 1594011106252, 647 | "user": { 648 | "displayName": "ChangWook Jun", 649 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 650 | "userId": "00685987924881157185" 651 | }, 652 | "user_tz": -540 653 | }, 654 | "id": "ZehxFPSrg8Q2", 655 | "outputId": "6a33f8a1-84d0-48c4-ac1e-5843daf1f2fb" 656 | }, 657 | "outputs": [], 658 | "source": [ 659 | "model_name = \"tf2_bert_korquad\"\n", 660 | "\n", 661 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n", 662 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n", 663 | "\n", 664 | "# Create path if exists\n", 665 | "if os.path.exists(checkpoint_dir):\n", 666 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n", 667 | "else:\n", 668 | " os.makedirs(checkpoint_dir, exist_ok=True)\n", 669 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n", 670 | " \n", 671 | "cp_callback = ModelCheckpoint(\n", 672 | " checkpoint_path, verbose=1, save_best_only=True, save_weights_only=True)" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": { 679 | "colab": { 680 | "base_uri": "https://localhost:8080/", 681 | "height": 383 682 | }, 683 | "colab_type": "code", 684 | "executionInfo": { 685 | "elapsed": 18126376, 686 | "status": "ok", 687 | "timestamp": 1594029233934, 688 | "user": { 689 | "displayName": "ChangWook Jun", 690 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64", 691 | "userId": "00685987924881157185" 692 | }, 693 | "user_tz": -540 694 | }, 695 | "id": "2ljuajCLmyws", 696 | "outputId": "e89526e8-e795-48df-eead-1a00b28005bf" 697 | }, 698 | "outputs": [], 699 | "source": [ 700 | "history = korquad_model.fit(\n", 701 | " x_train,\n", 702 | " y_train,\n", 703 | " epochs=EPOCHS, # For demonstration, 3 epochs are recommended\n", 704 | " verbose=VERBOSE,\n", 705 | " batch_size=BATCH_SIZE,\n", 706 | " callbacks=[exact_match_callback, cp_callback]\n", 707 | 
")" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "print(history.history)" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": { 723 | "colab": {}, 724 | "colab_type": "code", 725 | "id": "QxaigHy2m4JB" 726 | }, 727 | "outputs": [], 728 | "source": [ 729 | "plot_graphs(history, 'loss', 'output_1_loss', 'output_2_loss')" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": null, 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [] 738 | } 739 | ], 740 | "metadata": { 741 | "accelerator": "GPU", 742 | "colab": { 743 | "authorship_tag": "ABX9TyMn6I90a+EqoM9Ks6eBcRWt", 744 | "collapsed_sections": [], 745 | "name": "KorQuad_class.ipynb", 746 | "provenance": [] 747 | }, 748 | "kernelspec": { 749 | "display_name": "Python 3", 750 | "language": "python", 751 | "name": "python3" 752 | }, 753 | "language_info": { 754 | "codemirror_mode": { 755 | "name": "ipython", 756 | "version": 3 757 | }, 758 | "file_extension": ".py", 759 | "mimetype": "text/x-python", 760 | "name": "python", 761 | "nbconvert_exporter": "python", 762 | "pygments_lexer": "ipython3", 763 | "version": "3.7.4" 764 | }, 765 | "widgets": { 766 | "application/vnd.jupyter.widget-state+json": { 767 | "1fde406732df4b5b90b7701dc7e4981e": { 768 | "model_module": "@jupyter-widgets/base", 769 | "model_name": "LayoutModel", 770 | "state": { 771 | "_model_module": "@jupyter-widgets/base", 772 | "_model_module_version": "1.2.0", 773 | "_model_name": "LayoutModel", 774 | "_view_count": null, 775 | "_view_module": "@jupyter-widgets/base", 776 | "_view_module_version": "1.2.0", 777 | "_view_name": "LayoutView", 778 | "align_content": null, 779 | "align_items": null, 780 | "align_self": null, 781 | "border": null, 782 | "bottom": null, 783 | "display": null, 784 | "flex": null, 785 | "flex_flow": null, 786 | "grid_area": null, 787 | "grid_auto_columns": null, 788 | "grid_auto_flow": null, 789 | "grid_auto_rows": null, 790 | "grid_column": null, 791 | "grid_gap": null, 792 | "grid_row": null, 793 | "grid_template_areas": null, 794 | "grid_template_columns": null, 795 | "grid_template_rows": null, 796 | "height": null, 797 | "justify_content": null, 798 | "justify_items": null, 799 | "left": null, 800 | "margin": null, 801 | "max_height": null, 802 | "max_width": null, 803 | "min_height": null, 804 | "min_width": null, 805 | "object_fit": null, 806 | "object_position": null, 807 | "order": null, 808 | "overflow": null, 809 | "overflow_x": null, 810 | "overflow_y": null, 811 | "padding": null, 812 | "right": null, 813 | "top": null, 814 | "visibility": null, 815 | "width": null 816 | } 817 | }, 818 | "2c0ecef646d44a0580cacefa5c3fd9f2": { 819 | "model_module": "@jupyter-widgets/controls", 820 | "model_name": "ProgressStyleModel", 821 | "state": { 822 | "_model_module": "@jupyter-widgets/controls", 823 | "_model_module_version": "1.5.0", 824 | "_model_name": "ProgressStyleModel", 825 | "_view_count": null, 826 | "_view_module": "@jupyter-widgets/base", 827 | "_view_module_version": "1.2.0", 828 | "_view_name": "StyleView", 829 | "bar_color": null, 830 | "description_width": "initial" 831 | } 832 | }, 833 | "3be3c9704e934fb5a3d5847749d398ce": { 834 | "model_module": "@jupyter-widgets/controls", 835 | "model_name": "HTMLModel", 836 | "state": { 837 | "_dom_classes": [], 838 | "_model_module": "@jupyter-widgets/controls", 839 | "_model_module_version": "1.5.0", 840 | 
"_model_name": "HTMLModel", 841 | "_view_count": null, 842 | "_view_module": "@jupyter-widgets/controls", 843 | "_view_module_version": "1.5.0", 844 | "_view_name": "HTMLView", 845 | "description": "", 846 | "description_tooltip": null, 847 | "layout": "IPY_MODEL_a7d4d0c48cda4abdb106a6bcfb24359e", 848 | "placeholder": "​", 849 | "style": "IPY_MODEL_f58154a65f974e04bcf8af24b2884fdd", 850 | "value": " 872k/872k [00:00<00:00, 3.17MB/s]" 851 | } 852 | }, 853 | "a7d4d0c48cda4abdb106a6bcfb24359e": { 854 | "model_module": "@jupyter-widgets/base", 855 | "model_name": "LayoutModel", 856 | "state": { 857 | "_model_module": "@jupyter-widgets/base", 858 | "_model_module_version": "1.2.0", 859 | "_model_name": "LayoutModel", 860 | "_view_count": null, 861 | "_view_module": "@jupyter-widgets/base", 862 | "_view_module_version": "1.2.0", 863 | "_view_name": "LayoutView", 864 | "align_content": null, 865 | "align_items": null, 866 | "align_self": null, 867 | "border": null, 868 | "bottom": null, 869 | "display": null, 870 | "flex": null, 871 | "flex_flow": null, 872 | "grid_area": null, 873 | "grid_auto_columns": null, 874 | "grid_auto_flow": null, 875 | "grid_auto_rows": null, 876 | "grid_column": null, 877 | "grid_gap": null, 878 | "grid_row": null, 879 | "grid_template_areas": null, 880 | "grid_template_columns": null, 881 | "grid_template_rows": null, 882 | "height": null, 883 | "justify_content": null, 884 | "justify_items": null, 885 | "left": null, 886 | "margin": null, 887 | "max_height": null, 888 | "max_width": null, 889 | "min_height": null, 890 | "min_width": null, 891 | "object_fit": null, 892 | "object_position": null, 893 | "order": null, 894 | "overflow": null, 895 | "overflow_x": null, 896 | "overflow_y": null, 897 | "padding": null, 898 | "right": null, 899 | "top": null, 900 | "visibility": null, 901 | "width": null 902 | } 903 | }, 904 | "bc7f3c579a324f77811bdd6ad6dd7dc0": { 905 | "model_module": "@jupyter-widgets/controls", 906 | "model_name": "HBoxModel", 907 | "state": { 908 | "_dom_classes": [], 909 | "_model_module": "@jupyter-widgets/controls", 910 | "_model_module_version": "1.5.0", 911 | "_model_name": "HBoxModel", 912 | "_view_count": null, 913 | "_view_module": "@jupyter-widgets/controls", 914 | "_view_module_version": "1.5.0", 915 | "_view_name": "HBoxView", 916 | "box_style": "", 917 | "children": [ 918 | "IPY_MODEL_f8f80478dfca4894ac1ff8c2a082f734", 919 | "IPY_MODEL_3be3c9704e934fb5a3d5847749d398ce" 920 | ], 921 | "layout": "IPY_MODEL_e31de13423d743e68d6c451d23c93cdf" 922 | } 923 | }, 924 | "e31de13423d743e68d6c451d23c93cdf": { 925 | "model_module": "@jupyter-widgets/base", 926 | "model_name": "LayoutModel", 927 | "state": { 928 | "_model_module": "@jupyter-widgets/base", 929 | "_model_module_version": "1.2.0", 930 | "_model_name": "LayoutModel", 931 | "_view_count": null, 932 | "_view_module": "@jupyter-widgets/base", 933 | "_view_module_version": "1.2.0", 934 | "_view_name": "LayoutView", 935 | "align_content": null, 936 | "align_items": null, 937 | "align_self": null, 938 | "border": null, 939 | "bottom": null, 940 | "display": null, 941 | "flex": null, 942 | "flex_flow": null, 943 | "grid_area": null, 944 | "grid_auto_columns": null, 945 | "grid_auto_flow": null, 946 | "grid_auto_rows": null, 947 | "grid_column": null, 948 | "grid_gap": null, 949 | "grid_row": null, 950 | "grid_template_areas": null, 951 | "grid_template_columns": null, 952 | "grid_template_rows": null, 953 | "height": null, 954 | "justify_content": null, 955 | "justify_items": null, 956 | "left": 
null, 957 | "margin": null, 958 | "max_height": null, 959 | "max_width": null, 960 | "min_height": null, 961 | "min_width": null, 962 | "object_fit": null, 963 | "object_position": null, 964 | "order": null, 965 | "overflow": null, 966 | "overflow_x": null, 967 | "overflow_y": null, 968 | "padding": null, 969 | "right": null, 970 | "top": null, 971 | "visibility": null, 972 | "width": null 973 | } 974 | }, 975 | "f58154a65f974e04bcf8af24b2884fdd": { 976 | "model_module": "@jupyter-widgets/controls", 977 | "model_name": "DescriptionStyleModel", 978 | "state": { 979 | "_model_module": "@jupyter-widgets/controls", 980 | "_model_module_version": "1.5.0", 981 | "_model_name": "DescriptionStyleModel", 982 | "_view_count": null, 983 | "_view_module": "@jupyter-widgets/base", 984 | "_view_module_version": "1.2.0", 985 | "_view_name": "StyleView", 986 | "description_width": "" 987 | } 988 | }, 989 | "f8f80478dfca4894ac1ff8c2a082f734": { 990 | "model_module": "@jupyter-widgets/controls", 991 | "model_name": "FloatProgressModel", 992 | "state": { 993 | "_dom_classes": [], 994 | "_model_module": "@jupyter-widgets/controls", 995 | "_model_module_version": "1.5.0", 996 | "_model_name": "FloatProgressModel", 997 | "_view_count": null, 998 | "_view_module": "@jupyter-widgets/controls", 999 | "_view_module_version": "1.5.0", 1000 | "_view_name": "ProgressView", 1001 | "bar_style": "success", 1002 | "description": "Downloading: 100%", 1003 | "description_tooltip": null, 1004 | "layout": "IPY_MODEL_1fde406732df4b5b90b7701dc7e4981e", 1005 | "max": 871891, 1006 | "min": 0, 1007 | "orientation": "horizontal", 1008 | "style": "IPY_MODEL_2c0ecef646d44a0580cacefa5c3fd9f2", 1009 | "value": 871891 1010 | } 1011 | } 1012 | } 1013 | } 1014 | }, 1015 | "nbformat": 4, 1016 | "nbformat_minor": 1 1017 | } 1018 | --------------------------------------------------------------------------------