├── asset
│   ├── explain_1.png
│   ├── explain_2.png
│   ├── explain_3.png
│   └── explain_4.png
├── README.md
├── 7.PRETRAIN_METHOD
│   ├── 7.4.1.gpt2_finetune_novel_LM.ipynb
│   ├── 7.4.2.gpt2_finetune_NSMC.ipynb
│   ├── 7.2.1.bert_finetune_NSMC.ipynb
│   ├── 7.2.2.bert_finetune_KorNLI.ipynb
│   ├── 7.4.3.gpt2_finetune_KorNLI.ipynb
│   ├── 7.2.3.bert_finetune_NER.ipynb
│   ├── 7.2.4.bert_finetune_KorSTS.ipynb
│   ├── 7.4.4.gpt2_finetune_KorSTS.ipynb
│   └── 7.2.5.bert_finetune_KorQuAD.ipynb
└── 8.GPT3
    ├── 8.3.gpt2_fewshot_NSMC.ipynb
    └── 8.4.gpt2_p_tuning_NSMC.ipynb
/asset/explain_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2-colab/HEAD/asset/explain_1.png
--------------------------------------------------------------------------------
/asset/explain_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2-colab/HEAD/asset/explain_2.png
--------------------------------------------------------------------------------
/asset/explain_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2-colab/HEAD/asset/explain_3.png
--------------------------------------------------------------------------------
/asset/explain_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2-colab/HEAD/asset/explain_4.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NLPBOOK (Revised Edition) Colab Exercises
2 |
3 | Colab exercise repository for 텐서플로2와 머신러닝으로 시작하는 자연어처리 (Natural Language Processing with TensorFlow 2 and Machine Learning: From Logistic Regression to BERT and GPT-2)
4 |
5 | ## Introduction
6 |
7 | This repository collects the NLP exercises from the book that require a large amount of compute resources.
8 |
9 | You can open and run these exercise notebooks in Colab.
10 |
11 | ## Running in Colab
12 |
13 | 1. Go to https://colab.research.google.com.
14 |
15 | 2. Open a notebook from the GitHub repository (see the direct-link tip at the end of this README).
16 |
17 |
18 |
19 |
20 |
21 | 3. Click 'Copy to Drive' to copy the notebook to your own Google Drive.
22 |
23 |
24 |
25 |
26 |
27 | 4. Click the 'Change runtime type' menu and select GPU as the runtime (a quick GPU check is shown at the end of this README).
28 |
29 | > - Click 'Change runtime type'.
30 |
31 |
32 |
33 | > - Select GPU as the hardware accelerator.
34 |
35 |
36 |
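37 |
38 | ## Tips
39 |
40 | Colab can also open a notebook directly from GitHub with a URL of the form `https://colab.research.google.com/github/<org>/<repo>/blob/<branch>/<path-to-notebook>`, where the `<...>` parts are placeholders for this repository's organization, name, branch, and notebook path.
41 |
42 | After switching the runtime, you can confirm that a GPU is actually visible from the notebook. A minimal check, assuming TensorFlow is already installed in the Colab runtime:
43 |
44 | ```python
45 | import tensorflow as tf
46 |
47 | # List the GPU devices TensorFlow can see; an empty list means the
48 | # runtime is still CPU-only, so re-check the runtime type setting.
49 | print(tf.config.list_physical_devices('GPU'))
50 | ```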
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.4.1.gpt2_finetune_novel_LM.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 환경 준비"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## 데이터 다운로드"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!mkdir -p data_in/KOR\n",
35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/finetune_data.txt \\\n",
36 | " -O data_in/KOR/finetune_data.txt "
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "import os\n",
46 | "\n",
47 | "import numpy as np\n",
48 | "import tensorflow as tf\n",
49 | "\n",
50 | "import gluonnlp as nlp\n",
51 | "from gluonnlp.data import SentencepieceTokenizer\n",
52 | "from transformers import TFGPT2LMHeadModel\n",
53 | "\n",
54 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
55 | "\n",
56 | "from nltk.tokenize import sent_tokenize"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요."
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip\n",
73 | "!unzip -o gpt_ckpt.zip"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "class GPT2Model(tf.keras.Model):\n",
83 | " def __init__(self, dir_path):\n",
84 | " super(GPT2Model, self).__init__()\n",
85 | " self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)\n",
86 | " \n",
87 | " def call(self, inputs):\n",
88 | " return self.gpt2(inputs)[0]"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "BASE_MODEL_PATH = './gpt_ckpt'\n",
98 | "gpt_model = GPT2Model(BASE_MODEL_PATH)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "BATCH_SIZE = 16\n",
108 | "NUM_EPOCHS = 10\n",
109 | "MAX_LEN = 30\n",
110 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n",
111 | "\n",
112 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n",
113 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n",
114 | " mask_token=None,\n",
115 | " sep_token=None,\n",
116 | " cls_token=None,\n",
117 | " unknown_token='',\n",
118 | " padding_token='',\n",
119 | " bos_token='',\n",
120 | " eos_token='')"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "def tf_top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-99999):\n",
130 | " _logits = logits.numpy()\n",
131 | " top_k = min(top_k, logits.shape[-1]) \n",
132 | " if top_k > 0:\n",
133 | " indices_to_remove = logits < tf.math.top_k(logits, top_k)[0][..., -1, None]\n",
134 | " _logits[indices_to_remove] = filter_value\n",
135 | "\n",
136 | " if top_p > 0.0:\n",
137 | " sorted_logits = tf.sort(logits, direction='DESCENDING')\n",
138 | " sorted_indices = tf.argsort(logits, direction='DESCENDING')\n",
139 | " cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1)\n",
140 | "\n",
141 | " sorted_indices_to_remove = cumulative_probs > top_p\n",
142 | " sorted_indices_to_remove = tf.concat([[False], sorted_indices_to_remove[..., :-1]], axis=0)\n",
143 | " indices_to_remove = sorted_indices[sorted_indices_to_remove].numpy().tolist()\n",
144 | " \n",
145 | " _logits[indices_to_remove] = filter_value\n",
146 | " return tf.constant([_logits])\n",
147 | "\n",
148 | "\n",
149 | "def generate_sent(seed_word, model, max_step=100, greedy=False, top_k=0, top_p=0.):\n",
150 | " sent = seed_word\n",
151 | " toked = tokenizer(sent)\n",
152 | " \n",
153 | " for _ in range(max_step):\n",
154 | " input_ids = tf.constant([vocab[vocab.bos_token],] + vocab[toked])[None, :] \n",
155 | " outputs = model(input_ids)[:, -1, :]\n",
156 | " if greedy:\n",
157 | " gen = vocab.to_tokens(tf.argmax(outputs, axis=-1).numpy().tolist()[0])\n",
158 | " else:\n",
159 | " output_logit = tf_top_k_top_p_filtering(outputs[0], top_k=top_k, top_p=top_p)\n",
160 | " gen = vocab.to_tokens(tf.random.categorical(output_logit, 1).numpy().tolist()[0])[0]\n",
161 | " if gen == '':\n",
162 | " break\n",
163 | " sent += gen.replace('▁', ' ')\n",
164 | " toked = tokenizer(sent)\n",
165 | "\n",
166 | " return sent"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "generate_sent('이때', gpt_model, greedy=True)"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "generate_sent('이때', gpt_model, top_k=0, top_p=0.95)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "DATA_IN_PATH = './data_in/KOR/'\n",
194 | "TRAIN_DATA_FILE = 'finetune_data.txt'\n",
195 | "\n",
196 | "sents = [s[:-1] for s in open(DATA_IN_PATH + TRAIN_DATA_FILE).readlines()]"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "input_data = []\n",
206 | "output_data = []\n",
207 | "\n",
208 | "for s in sents:\n",
209 | " tokens = [vocab[vocab.bos_token],] + vocab[tokenizer(s)] + [vocab[vocab.eos_token],]\n",
210 | " input_data.append(tokens[:-1])\n",
211 | " output_data.append(tokens[1:])\n",
212 | "\n",
213 | "input_data = pad_sequences(input_data, MAX_LEN, value=vocab[vocab.padding_token])\n",
214 | "output_data = pad_sequences(output_data, MAX_LEN, value=vocab[vocab.padding_token])\n",
215 | "\n",
216 | "input_data = np.array(input_data, dtype=np.int64)\n",
217 | "output_data = np.array(output_data, dtype=np.int64)"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "loss_object = tf.keras.losses.SparseCategoricalCrossentropy(\n",
227 | " from_logits=True, reduction='none')\n",
228 | "\n",
229 | "train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')\n",
230 | "\n",
231 | "def loss_function(real, pred):\n",
232 | " mask = tf.math.logical_not(tf.math.equal(real, vocab[vocab.padding_token]))\n",
233 | " loss_ = loss_object(real, pred)\n",
234 | "\n",
235 | " mask = tf.cast(mask, dtype=loss_.dtype)\n",
236 | " loss_ *= mask\n",
237 | "\n",
238 | " return tf.reduce_mean(loss_)\n",
239 | "\n",
240 | "def accuracy_function(real, pred):\n",
241 | " mask = tf.math.logical_not(tf.math.equal(real, vocab[vocab.padding_token]))\n",
242 | " mask = tf.expand_dims(tf.cast(mask, dtype=pred.dtype), axis=-1)\n",
243 | " pred *= mask \n",
244 | " acc = train_accuracy(real, pred)\n",
245 | "\n",
246 | " return tf.reduce_mean(acc)"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "gpt_model.compile(loss=loss_function,\n",
256 | " optimizer=tf.keras.optimizers.Adam(1e-4),\n",
257 | " metrics=[accuracy_function])"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "history = gpt_model.fit(input_data, output_data, \n",
267 | " batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,\n",
268 | " validation_split=0.1)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {},
275 | "outputs": [],
276 | "source": [
277 | "DATA_OUT_PATH = './data_out'\n",
278 | "model_name = \"tf2_gpt2_finetuned_model\"\n",
279 | "\n",
280 | "save_path = os.path.join(DATA_OUT_PATH, model_name)\n",
281 | "\n",
282 | "if not os.path.exists(save_path):\n",
283 | " os.makedirs(save_path)\n",
284 | "\n",
285 | "gpt_model.gpt2.save_pretrained(save_path)\n",
286 | "\n",
287 | "loaded_gpt_model = GPT2Model(save_path)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "generate_sent('이때', gpt_model, greedy=True)"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "generate_sent('이때', gpt_model, top_k=0, top_p=0.95)"
306 | ]
307 | }
308 | ],
309 | "metadata": {
310 | "kernelspec": {
311 | "display_name": "Python 3",
312 | "language": "python",
313 | "name": "python3"
314 | },
315 | "language_info": {
316 | "codemirror_mode": {
317 | "name": "ipython",
318 | "version": 3
319 | },
320 | "file_extension": ".py",
321 | "mimetype": "text/x-python",
322 | "name": "python",
323 | "nbconvert_exporter": "python",
324 | "pygments_lexer": "ipython3",
325 | "version": "3.7.4"
326 | }
327 | },
328 | "nbformat": 4,
329 | "nbformat_minor": 2
330 | }
331 |
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.4.2.gpt2_finetune_NSMC.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 환경 준비"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## 데이터 다운로드"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!mkdir -p data_in/KOR/naver_movie\n",
35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \\\n",
36 | " -O data_in/KOR/naver_movie/ratings_train.txt\n",
37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \\\n",
38 | " -O data_in/KOR/naver_movie/ratings_test.txt"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {
45 | "scrolled": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "import os\n",
50 | "import tensorflow as tf\n",
51 | "from transformers import TFGPT2Model\n",
52 | "\n",
53 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
54 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
55 | "\n",
56 | "import gluonnlp as nlp\n",
57 | "from gluonnlp.data import SentencepieceTokenizer\n",
58 | "\n",
59 | "import pandas as pd\n",
60 | "import matplotlib.pyplot as plt\n",
61 | "\n",
62 | "import numpy as np\n",
63 | "import re"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip\n",
80 | "!unzip -o gpt_ckpt.zip"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# 시각화\n",
90 | "\n",
91 | "def plot_graphs(history, string):\n",
92 | " plt.plot(history.history[string])\n",
93 | " plt.plot(history.history['val_'+string], '')\n",
94 | " plt.xlabel(\"Epochs\")\n",
95 | " plt.ylabel(string)\n",
96 | " plt.legend([string, 'val_'+string])\n",
97 | " plt.show()"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "SEED_NUM = 1234\n",
107 | "tf.random.set_seed(SEED_NUM)\n",
108 | "np.random.seed(SEED_NUM)"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "## 데이터 준비하기"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n",
125 | "\n",
126 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n",
127 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n",
128 | " mask_token=None,\n",
129 | " sep_token='',\n",
130 | " cls_token=None,\n",
131 | " unknown_token='',\n",
132 | " padding_token='',\n",
133 | " bos_token='',\n",
134 | " eos_token='')"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "BATCH_SIZE = 32\n",
144 | "NUM_EPOCHS = 3\n",
145 | "VALID_SPLIT = 0.1\n",
146 | "SENT_MAX_LEN = 39\n",
147 | "\n",
148 | "DATA_IN_PATH = './data_in/KOR'\n",
149 | "DATA_OUT_PATH = \"./data_out/KOR\""
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "# 데이터 전처리 준비\n",
159 | "\n",
160 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_train.txt\")\n",
161 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_test.txt\")\n",
162 | "\n",
163 | "train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\\t', quoting = 3)\n",
164 | "train_data = train_data.dropna()\n",
165 | "train_data.head()\n",
166 | "\n",
167 | "print(\"Total # dataset: train - {}\".format(len(train_data)))"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "# 텍스트 전처리\n",
177 | "\n",
178 | "def clean_text(sent):\n",
179 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \"\", sent)\n",
180 | " return sent_clean"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "# train_data = train_data[:50] # for test\n",
190 | "\n",
191 | "train_data_sents = []\n",
192 | "train_data_labels = []\n",
193 | "\n",
194 | "for train_sent, train_label in train_data[['document', 'label']].values:\n",
195 | " train_tokenized_text = vocab[tokenizer(clean_text(train_sent))]\n",
196 | "\n",
197 | " tokens = [vocab[vocab.bos_token]] \n",
198 | " tokens += pad_sequences([train_tokenized_text], \n",
199 | " SENT_MAX_LEN, \n",
200 | " value=vocab[vocab.padding_token], \n",
201 | " padding='post').tolist()[0] \n",
202 | " tokens += [vocab[vocab.eos_token]]\n",
203 | "\n",
204 | " train_data_sents.append(tokens)\n",
205 | " train_data_labels.append(train_label)\n",
206 | "\n",
207 | "train_data_sents = np.array(train_data_sents, dtype=np.int64)\n",
208 | "train_data_labels = np.array(train_data_labels, dtype=np.int64)"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "## 모델 학습"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "class TFGPT2Classifier(tf.keras.Model):\n",
225 | " def __init__(self, dir_path, num_class):\n",
226 | " super(TFGPT2Classifier, self).__init__()\n",
227 | " \n",
228 | " self.gpt2 = TFGPT2Model.from_pretrained(dir_path)\n",
229 | " self.num_class = num_class\n",
230 | " \n",
231 | " self.dropout = tf.keras.layers.Dropout(self.gpt2.config.summary_first_dropout)\n",
232 | " self.classifier = tf.keras.layers.Dense(self.num_class, \n",
233 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.gpt2.config.initializer_range), \n",
234 | " name=\"classifier\")\n",
235 | " \n",
236 | " def call(self, inputs):\n",
237 | " outputs = self.gpt2(inputs)\n",
238 | " pooled_output = outputs[0][:, -1]\n",
239 | "\n",
240 | " pooled_output = self.dropout(pooled_output)\n",
241 | " logits = self.classifier(pooled_output)\n",
242 | "\n",
243 | " return logits"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "BASE_MODEL_PATH = './gpt_ckpt'\n",
253 | "cls_model = TFGPT2Classifier(dir_path=BASE_MODEL_PATH, num_class=2)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "optimizer = tf.keras.optimizers.Adam(learning_rate=6.25e-5)\n",
263 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
264 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n",
265 | "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "model_name = \"tf2_gpt2_naver_movie\"\n",
275 | "\n",
276 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)\n",
277 | "\n",
278 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
279 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
280 | "\n",
281 | "if os.path.exists(checkpoint_dir):\n",
282 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
283 | "else:\n",
284 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
285 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
286 | " \n",
287 | "cp_callback = ModelCheckpoint(\n",
288 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n",
289 | "\n",
290 | "history = cls_model.fit(train_data_sents, train_data_labels, \n",
291 | " epochs=NUM_EPOCHS, \n",
292 | " batch_size=BATCH_SIZE,\n",
293 | " validation_split=VALID_SPLIT, \n",
294 | " callbacks=[earlystop_callback, cp_callback])"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": [
303 | "plot_graphs(history, 'accuracy')"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {},
310 | "outputs": [],
311 | "source": [
312 | "plot_graphs(history, 'loss')"
313 | ]
314 | },
315 | {
316 | "cell_type": "markdown",
317 | "metadata": {},
318 | "source": [
319 | "## 모델 테스트"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "test_data = pd.read_csv(DATA_TEST_PATH, header=0, delimiter='\\t', quoting=3)\n",
329 | "test_data = test_data.dropna()\n",
330 | "test_data.head()"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "# test_data = test_data[:50] # for test\n",
340 | "\n",
341 | "test_data_sents = []\n",
342 | "test_data_labels = []\n",
343 | "\n",
344 | "for test_sent, test_label in test_data[['document','label']].values:\n",
345 | " test_tokenized_text = vocab[tokenizer(clean_text(test_sent))]\n",
346 | "\n",
347 | " tokens = [vocab[vocab.bos_token]] \n",
348 | " tokens += pad_sequences([test_tokenized_text], \n",
349 | " SENT_MAX_LEN, \n",
350 | " value=vocab[vocab.padding_token], \n",
351 | " padding='post').tolist()[0] \n",
352 | " tokens += [vocab[vocab.eos_token]]\n",
353 | "\n",
354 | " test_data_sents.append(tokens)\n",
355 | " test_data_labels.append(test_label)\n",
356 | "\n",
357 | "test_data_sents = np.array(test_data_sents, dtype=np.int64)\n",
358 | "test_data_labels = np.array(test_data_labels, dtype=np.int64)"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "print(\"num sents, labels {}, {}\".format(len(test_data_sents), len(test_data_labels)))"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "cls_model.load_weights(checkpoint_path)\n",
377 | "\n",
378 | "results = cls_model.evaluate(test_data_sents, test_data_labels, batch_size=1024)\n",
379 | "print(\"test loss, test acc: \", results)"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {},
386 | "outputs": [],
387 | "source": []
388 | }
389 | ],
390 | "metadata": {
391 | "kernelspec": {
392 | "display_name": "Python 3",
393 | "language": "python",
394 | "name": "python3"
395 | },
396 | "language_info": {
397 | "codemirror_mode": {
398 | "name": "ipython",
399 | "version": 3
400 | },
401 | "file_extension": ".py",
402 | "mimetype": "text/x-python",
403 | "name": "python",
404 | "nbconvert_exporter": "python",
405 | "pygments_lexer": "ipython3",
406 | "version": "3.7.4"
407 | }
408 | },
409 | "nbformat": 4,
410 | "nbformat_minor": 2
411 | }
412 |
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.2.1.bert_finetune_NSMC.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 환경 준비"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## 데이터 다운로드"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!mkdir -p data_in/KOR/naver_movie\n",
35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \\\n",
36 | " -O data_in/KOR/naver_movie/ratings_train.txt\n",
37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \\\n",
38 | " -O data_in/KOR/naver_movie/ratings_test.txt"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "import os\n",
48 | "import re\n",
49 | "import numpy as np\n",
50 | "from tqdm import tqdm\n",
51 | "\n",
52 | "import tensorflow as tf\n",
53 | "from transformers import *\n",
54 | "\n",
55 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
56 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
57 | "\n",
58 | "import pandas as pd\n",
59 | "import matplotlib.pyplot as plt"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# 시각화\n",
69 | "\n",
70 | "def plot_graphs(history, string):\n",
71 | " plt.plot(history.history[string])\n",
72 | " plt.plot(history.history['val_'+string], '')\n",
73 | " plt.xlabel(\"Epochs\")\n",
74 | " plt.ylabel(string)\n",
75 | " plt.legend([string, 'val_'+string])\n",
76 | " plt.show()"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "#random seed 고정\n",
86 | "tf.random.set_seed(1234)\n",
87 | "np.random.seed(1234)\n",
88 | "\n",
89 | "BATCH_SIZE = 32\n",
90 | "NUM_EPOCHS = 3\n",
91 | "VALID_SPLIT = 0.2\n",
92 | "MAX_LEN = 39 # EDA에서 추출된 Max Length\n",
93 | "DATA_IN_PATH = 'data_in/KOR'\n",
94 | "DATA_OUT_PATH = \"data_out/KOR\""
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", cache_dir='bert_ckpt', do_lower_case=False)"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "## 토크나이저 테스트"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "test_sentence = \"안녕하세요, 반갑습니다.\"\n",
120 | "\n",
121 | "encode = tokenizer.encode(test_sentence)\n",
122 | "token_print = [tokenizer.decode(token) for token in encode]\n",
123 | "\n",
124 | "print(encode)\n",
125 | "print(token_print)"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "kor_encode = tokenizer.encode(\"안녕하세요, 반갑습니다\")\n",
135 | "eng_encode = tokenizer.encode(\"Hello world\")\n",
136 | "kor_decode = tokenizer.decode(kor_encode)\n",
137 | "eng_decode = tokenizer.decode(eng_encode)\n",
138 | "\n",
139 | "print(kor_encode)\n",
140 | "# [101, 9521, 118741, 35506, 24982, 48549, 117, 9321, 118610, 119081, 48345, 102]\n",
141 | "print(eng_encode)\n",
142 | "# [101, 31178, 11356, 102]\n",
143 | "print(kor_decode)\n",
144 | "# [CLS] 안녕하세요, 반갑습니다 [SEP]\n",
145 | "print(eng_decode)\n",
146 | "# [CLS] Hello world [SEP]"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "# Korean Movie Review Classification"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {
160 | "scrolled": true
161 | },
162 | "outputs": [],
163 | "source": [
164 | "# 데이터 전처리 준비\n",
165 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_train.txt\")\n",
166 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_test.txt\")\n",
167 | "\n",
168 | "train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\\t', quoting = 3)\n",
169 | "train_data = train_data.dropna()\n",
170 | "train_data.head()"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {},
177 | "outputs": [],
178 | "source": [
179 | "# 스페셜 토큰\n",
180 | "print(tokenizer.all_special_tokens, \"\\n\", tokenizer.all_special_ids)\n",
181 | "\n",
182 | "# 토크나이저 테스트하기\n",
183 | "kor_encode = tokenizer.encode(\"안녕하세요, 반갑습니다. \")\n",
184 | "eng_encode = tokenizer.encode(\"Hello world\")\n",
185 | "\n",
186 | "kor_decode = tokenizer.decode(kor_encode)\n",
187 | "eng_decode = tokenizer.decode(eng_encode)\n",
188 | "\n",
189 | "print(kor_encode)\n",
190 | "print(eng_encode)\n",
191 | "print(kor_decode)\n",
192 | "print(eng_decode)"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "# Bert Tokenizer\n",
202 | "\n",
203 | "# 참조: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus\n",
204 | "\n",
205 | "def bert_tokenizer(sent, MAX_LEN):\n",
206 | " \n",
207 | " encoded_dict = tokenizer.encode_plus(\n",
208 | " text = sent,\n",
209 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
210 | " max_length = MAX_LEN, # Pad & truncate all sentences.\n",
211 | " pad_to_max_length = True,\n",
212 | " return_attention_mask = True # Construct attn. masks.\n",
213 | " \n",
214 | " )\n",
215 | " \n",
216 | " input_id = encoded_dict['input_ids']\n",
217 | " attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).\n",
218 | " token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences\n",
219 | " \n",
220 | " return input_id, attention_mask, token_type_id"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "# train_data = train_data[:1000] # for test\n",
230 | "\n",
231 | "input_ids = []\n",
232 | "attention_masks = []\n",
233 | "token_type_ids = []\n",
234 | "train_data_labels = []\n",
235 | "\n",
236 | "for train_sent, train_label in tqdm(zip(train_data[\"document\"], train_data[\"label\"]), total=len(train_data)):\n",
237 | " try:\n",
238 | " input_id, attention_mask, token_type_id = bert_tokenizer(train_sent, MAX_LEN)\n",
239 | " \n",
240 | " input_ids.append(input_id)\n",
241 | " attention_masks.append(attention_mask)\n",
242 | " token_type_ids.append(token_type_id)\n",
243 | " train_data_labels.append(train_label)\n",
244 | "\n",
245 | " except Exception as e:\n",
246 | " print(e)\n",
247 | " print(train_sent)\n",
248 | " pass\n",
249 | "\n",
250 | "train_movie_input_ids = np.array(input_ids, dtype=int)\n",
251 | "train_movie_attention_masks = np.array(attention_masks, dtype=int)\n",
252 | "train_movie_type_ids = np.array(token_type_ids, dtype=int)\n",
253 | "train_movie_inputs = (train_movie_input_ids, train_movie_attention_masks, train_movie_type_ids)\n",
254 | "\n",
255 | "train_data_labels = np.asarray(train_data_labels, dtype=np.int32) #레이블 토크나이징 리스트\n",
256 | "\n",
257 | "print(\"# sents: {}, # labels: {}\".format(len(train_movie_input_ids), len(train_data_labels)))"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {},
264 | "outputs": [],
265 | "source": [
266 | "# 최대 길이: 39\n",
267 | "input_id = train_movie_input_ids[1]\n",
268 | "attention_mask = train_movie_attention_masks[1]\n",
269 | "token_type_id = train_movie_type_ids[1]\n",
270 | "\n",
271 | "print(input_id)\n",
272 | "print(attention_mask)\n",
273 | "print(token_type_id)\n",
274 | "print(tokenizer.decode(input_id))"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": null,
280 | "metadata": {},
281 | "outputs": [],
282 | "source": [
283 | "class TFBertClassifier(tf.keras.Model):\n",
284 | " def __init__(self, model_name, dir_path, num_class):\n",
285 | " super(TFBertClassifier, self).__init__()\n",
286 | "\n",
287 | " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n",
288 | " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n",
289 | " self.classifier = tf.keras.layers.Dense(num_class, \n",
290 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range), \n",
291 | " name=\"classifier\")\n",
292 | " \n",
293 | " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n",
294 | " \n",
295 | " #outputs 값: # sequence_output, pooled_output, (hidden_states), (attentions)\n",
296 | " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n",
297 | " pooled_output = outputs[1] \n",
298 | " pooled_output = self.dropout(pooled_output, training=training)\n",
299 | " logits = self.classifier(pooled_output)\n",
300 | "\n",
301 | " return logits\n",
302 | "\n",
303 | "cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',\n",
304 | " dir_path='bert_ckpt',\n",
305 | " num_class=2)"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "# 학습 준비하기\n",
315 | "optimizer = tf.keras.optimizers.Adam(3e-5)\n",
316 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
317 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n",
318 | "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "model_name = \"tf2_bert_naver_movie\"\n",
328 | "\n",
329 | "# overfitting을 막기 위한 ealrystop 추가\n",
330 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)\n",
331 | "# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)\n",
332 | "# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\\\n",
333 | "\n",
334 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
335 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
336 | "\n",
337 | "# Create path if exists\n",
338 | "if os.path.exists(checkpoint_dir):\n",
339 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
340 | "else:\n",
341 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
342 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
343 | " \n",
344 | "cp_callback = ModelCheckpoint(\n",
345 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n",
346 | "\n",
347 | "# 학습과 eval 시작\n",
348 | "history = cls_model.fit(train_movie_inputs, train_data_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,\n",
349 | " validation_split = VALID_SPLIT, callbacks=[earlystop_callback, cp_callback])\n",
350 | "\n",
351 | "#steps_for_epoch\n",
352 | "\n",
353 | "print(history.history)"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "plot_graphs(history, 'loss')"
363 | ]
364 | },
365 | {
366 | "cell_type": "markdown",
367 | "metadata": {},
368 | "source": [
369 | "# Korean Movie Review Test 데이터"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "test_data = pd.read_csv(DATA_TEST_PATH, header = 0, delimiter = '\\t', quoting = 3)\n",
379 | "test_data = test_data.dropna()\n",
380 | "test_data.head()"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": null,
386 | "metadata": {},
387 | "outputs": [],
388 | "source": [
389 | "input_ids = []\n",
390 | "attention_masks = []\n",
391 | "token_type_ids = []\n",
392 | "test_data_labels = []\n",
393 | "\n",
394 | "for test_sent, test_label in tqdm(zip(test_data[\"document\"], test_data[\"label\"])):\n",
395 | " try:\n",
396 | " input_id, attention_mask, token_type_id = bert_tokenizer(test_sent, MAX_LEN)\n",
397 | "\n",
398 | " input_ids.append(input_id)\n",
399 | " attention_masks.append(attention_mask)\n",
400 | " token_type_ids.append(token_type_id)\n",
401 | " test_data_labels.append(test_label)\n",
402 | " except Exception as e:\n",
403 | " print(e)\n",
404 | " print(test_sent)\n",
405 | " pass\n",
406 | "\n",
407 | "test_movie_input_ids = np.array(input_ids, dtype=int)\n",
408 | "test_movie_attention_masks = np.array(attention_masks, dtype=int)\n",
409 | "test_movie_type_ids = np.array(token_type_ids, dtype=int)\n",
410 | "test_movie_inputs = (test_movie_input_ids, test_movie_attention_masks, test_movie_type_ids)\n",
411 | "\n",
412 | "test_data_labels = np.asarray(test_data_labels, dtype=np.int32) #레이블 토크나이징 리스트\n",
413 | "\n",
414 | "print(\"num sents, labels {}, {}\".format(len(test_movie_input_ids), len(test_data_labels)))"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "results = cls_model.evaluate(test_movie_inputs, test_data_labels, batch_size=1024)\n",
424 | "print(\"test loss, test acc: \", results)"
425 | ]
426 | }
427 | ],
428 | "metadata": {
429 | "kernelspec": {
430 | "display_name": "Python 3",
431 | "language": "python",
432 | "name": "python3"
433 | },
434 | "language_info": {
435 | "codemirror_mode": {
436 | "name": "ipython",
437 | "version": 3
438 | },
439 | "file_extension": ".py",
440 | "mimetype": "text/x-python",
441 | "name": "python",
442 | "nbconvert_exporter": "python",
443 | "pygments_lexer": "ipython3",
444 | "version": "3.7.4"
445 | }
446 | },
447 | "nbformat": 4,
448 | "nbformat_minor": 2
449 | }
450 |
--------------------------------------------------------------------------------
/8.GPT3/8.3.gpt2_fewshot_NSMC.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "#### 주의!!\n",
8 | "\n",
9 | "이 실습은 가급적 NVIDIA GPU가 설치된 컴퓨터 환경이거나 Google Colab에서 진행해주세요."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {
15 | "id": "2NmYZYYhXrcZ"
16 | },
17 | "source": [
18 | "## 환경 준비 \n",
19 | "(Google Colab 환경에서 사용하세요)"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {
26 | "id": "6-bFpckCXrcb"
27 | },
28 | "outputs": [],
29 | "source": [
30 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
31 | "!pip install -r requirements.txt\n",
32 | "!pip install tensorflow==2.2.0"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {
38 | "id": "cvFHjoTCXrcc"
39 | },
40 | "source": [
41 | "## 데이터 다운로드\n",
42 | "(Google Colab 환경에서 사용하세요)"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {
49 | "id": "HbKNloVoXrcd"
50 | },
51 | "outputs": [],
52 | "source": [
53 | "!mkdir -p data_in/KOR/naver_movie\n",
54 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \\\n",
55 | " -O data_in/KOR/naver_movie/ratings_train.txt\n",
56 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \\\n",
57 | " -O data_in/KOR/naver_movie/ratings_test.txt"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {
64 | "id": "xs88fDX8Xrcd",
65 | "scrolled": true
66 | },
67 | "outputs": [],
68 | "source": [
69 | "import os\n",
70 | "import tensorflow as tf\n",
71 | "from transformers import TFGPT2LMHeadModel\n",
72 | "\n",
73 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
74 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
75 | "\n",
76 | "import gluonnlp as nlp\n",
77 | "from gluonnlp.data import SentencepieceTokenizer\n",
78 | "\n",
79 | "import pandas as pd\n",
80 | "import matplotlib.pyplot as plt\n",
81 | "\n",
82 | "import numpy as np\n",
83 | "import re\n",
84 | "\n",
85 | "import random\n",
86 | "from random import sample"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {
92 | "id": "XgV0aK1KXrce"
93 | },
94 | "source": [
95 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요."
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {
102 | "id": "XmofLC_rXrce"
103 | },
104 | "outputs": [],
105 | "source": [
106 | "import wget\n",
107 | "import zipfile\n",
108 | "\n",
109 | "wget.download('https://github.com/NLP-kr/tensorflow-ml-nlp-tf2/releases/download/v1.0/gpt_ckpt.zip')\n",
110 | "\n",
111 | "with zipfile.ZipFile('gpt_ckpt.zip') as z:\n",
112 | " z.extractall()"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "id": "TVExOYgEXrcf"
120 | },
121 | "outputs": [],
122 | "source": [
123 | "# 시각화\n",
124 | "\n",
125 | "def plot_graphs(history, string):\n",
126 | " plt.plot(history.history[string])\n",
127 | " plt.plot(history.history['val_'+string], '')\n",
128 | " plt.xlabel('Epochs')\n",
129 | " plt.ylabel(string)\n",
130 | " plt.legend([string, 'val_'+string])\n",
131 | " plt.show()"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "id": "s6dM4ebxXrcg"
139 | },
140 | "outputs": [],
141 | "source": [
142 | "SEED_NUM = 1234\n",
143 | "tf.random.set_seed(SEED_NUM)\n",
144 | "np.random.seed(SEED_NUM)\n",
145 | "random.seed(SEED_NUM)"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {
151 | "id": "WSYro-2hvbOI"
152 | },
153 | "source": [
154 | "## 퓨샷 러닝을 위한 네이버 영화 리뷰 모델 구성\n"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {
161 | "id": "lAaKKUqbXrch"
162 | },
163 | "outputs": [],
164 | "source": [
165 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n",
166 | "\n",
167 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n",
168 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n",
169 | " mask_token=None,\n",
170 | " sep_token='',\n",
171 | " cls_token=None,\n",
172 | " unknown_token='',\n",
173 | " padding_token='',\n",
174 | " bos_token='',\n",
175 | " eos_token='')"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {
182 | "id": "AypWVja1Xrcj"
183 | },
184 | "outputs": [],
185 | "source": [
186 | "class TFGPT2FewshotClassifier(tf.keras.Model):\n",
187 | " def __init__(self, dir_path):\n",
188 | " super(TFGPT2FewshotClassifier, self).__init__()\n",
189 | " self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)\n",
190 | " \n",
191 | " def call(self, inputs):\n",
192 | " outputs = self.gpt2({'input_ids': inputs})[0][:, -1, :]\n",
193 | "\n",
194 | " return outputs"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "colab": {
202 | "base_uri": "https://localhost:8080/"
203 | },
204 | "id": "9J5VOzCwXrcj",
205 | "outputId": "537cde6c-958a-4bc7-f98d-996b3bb13bb3"
206 | },
207 | "outputs": [],
208 | "source": [
209 | "BASE_MODEL_PATH = './gpt_ckpt'\n",
210 | "cls_model = TFGPT2FewshotClassifier(dir_path=BASE_MODEL_PATH)"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {
216 | "id": "pCN8Lh7gXrch"
217 | },
218 | "source": [
219 | "## 퓨샷 러닝을 위한 네이버 영화 리뷰 데이터 구성"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {
226 | "id": "Ct1IbwATXrci"
227 | },
228 | "outputs": [],
229 | "source": [
230 | "# 데이터 전처리 준비\n",
231 | "DATA_IN_PATH = './data_in/KOR'\n",
232 | "DATA_OUT_PATH = './data_out/KOR'\n",
233 | "\n",
234 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, 'naver_movie', 'ratings_train.txt')\n",
235 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, 'naver_movie', 'ratings_test.txt')\n",
236 | "\n",
237 | "train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\\t', quoting = 3)\n",
238 | "train_data = train_data.dropna()"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {
245 | "colab": {
246 | "base_uri": "https://localhost:8080/"
247 | },
248 | "id": "WED9P9SUSyR9",
249 | "outputId": "5c4ba8bd-9a78-49fa-ad19-9fe14603723f"
250 | },
251 | "outputs": [],
252 | "source": [
253 | "print('데이터 positive 라벨: ', tokenizer('긍정'))\n",
254 | "print('데이터 negative 라벨: ', tokenizer('부정'))"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {
261 | "colab": {
262 | "base_uri": "https://localhost:8080/"
263 | },
264 | "id": "WaQ1miXfwQfn",
265 | "outputId": "6d506ff6-ac64-4478-b3cc-dbbf7aa9526d"
266 | },
267 | "outputs": [],
268 | "source": [
269 | "print('학습 예시 케이스 구조: ', tokenizer('문장: 오늘 기분이 좋아\\n감정: 긍정\\n'))"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {
276 | "colab": {
277 | "base_uri": "https://localhost:8080/"
278 | },
279 | "id": "7h0USc0RxQqG",
280 | "outputId": "7ad08962-3798-4653-ce76-b6a69a1eb4e9"
281 | },
282 | "outputs": [],
283 | "source": [
284 | "print('gpt2 최대 토큰 길이: ', cls_model.gpt2.config.n_ctx)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {
291 | "colab": {
292 | "base_uri": "https://localhost:8080/"
293 | },
294 | "id": "MRwI-RcOyFRj",
295 | "outputId": "ccbf5e78-27a2-4c4e-8b74-327de16c6673"
296 | },
297 | "outputs": [],
298 | "source": [
299 | "sent_lens = [len(tokenizer(s)) for s in train_data['document']]\n",
300 | "\n",
301 | "print('Few shot 케이스 토큰 평균 길이: ', np.mean(sent_lens))\n",
302 | "print('Few shot 케이스 토큰 최대 길이: ', np.max(sent_lens))\n",
303 | "print('Few shot 케이스 토큰 길이 표준편차: ',np.std(sent_lens))\n",
304 | "print('Few shot 케이스 토큰 길이 80 퍼센타일: ',np.percentile(sent_lens, 80))"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {
311 | "id": "PdIWfc6Pzyfz"
312 | },
313 | "outputs": [],
314 | "source": [
315 | "train_fewshot_data = []\n",
316 | "\n",
317 | "for train_sent, train_label in train_data[['document', 'label']].values:\n",
318 | " tokens = vocab[tokenizer(train_sent)]\n",
319 | "\n",
320 | " if len(tokens) <= 25:\n",
321 | " train_fewshot_data.append((train_sent, train_label))"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {
327 | "id": "4jFe7XMeXrcl"
328 | },
329 | "source": [
330 | "## 네이버 영화 리뷰 데이터를 활용한 퓨샷 러닝 및 평가"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {
337 | "colab": {
338 | "base_uri": "https://localhost:8080/",
339 | "height": 206
340 | },
341 | "id": "1_OhF3hVhK0y",
342 | "outputId": "6a661ba0-e27e-4aaf-ba2e-5d49809ef866"
343 | },
344 | "outputs": [],
345 | "source": [
346 | "test_data = pd.read_csv(DATA_TEST_PATH, header=0, delimiter='\\t', quoting=3)\n",
347 | "test_data = test_data.dropna()\n",
348 | "test_data.head()"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {
355 | "id": "liE91_rhsQdY"
356 | },
357 | "outputs": [],
358 | "source": [
359 | "sample_size = 5000\n",
360 | "\n",
361 | "train_fewshot_samples = []\n",
362 | "\n",
363 | "for _ in range(sample_size):\n",
364 | " fewshot_examples = sample(train_fewshot_data, 30)\n",
365 | " train_fewshot_samples.append(fewshot_examples)\n",
366 | "\n",
367 | "if sample_size < len(test_data['id']):\n",
368 | " test_data = test_data.sample(sample_size, random_state=SEED_NUM)"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "metadata": {
375 | "id": "s-ZiFs-aRIXy"
376 | },
377 | "outputs": [],
378 | "source": [
379 | "def build_prompt_text(sent):\n",
380 | " return \"문장: \" + sent + '\\n감정: '\n",
381 | "\n",
382 | "def clean_text(sent):\n",
383 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \"\", sent)\n",
384 | " return sent_clean\n",
385 | "\n",
386 | "real_labels = []\n",
387 | "pred_tokens = []\n",
388 | "\n",
389 | "for i, (test_sent, test_label) in enumerate(test_data[['document','label']].values):\n",
390 | " tokens = [vocab[vocab.bos_token]]\n",
391 | "\n",
392 | " for ex in train_fewshot_samples[i]:\n",
393 | " example_text, example_label = ex\n",
394 | " cleaned_example_text = clean_text(example_text)\n",
395 | " appended_prompt_example_text = build_prompt_text(cleaned_example_text)\n",
396 | " appended_prompt_example_text += '긍정' if example_label == 1 else '부정' + '\\n'\n",
397 | "\n",
398 | " tokens += vocab[tokenizer(appended_prompt_example_text)]\n",
399 | "\n",
400 | " cleaned_sent = clean_text(test_sent)\n",
401 | " appended_prompt_sent = build_prompt_text(cleaned_sent)\n",
402 | " test_tokens = vocab[tokenizer(appended_prompt_sent)]\n",
403 | "\n",
404 | " tokens += test_tokens\n",
405 | "\n",
406 | " pred = tf.argmax(cls_model(np.array([tokens], dtype=np.int64)), axis=-1).numpy()\n",
407 | " label = vocab[tokenizer('긍정')] if test_label == 1 else vocab[tokenizer('부정')]\n",
408 | "\n",
409 | " pred_tokens.append(pred[0])\n",
410 | " real_labels.append(label[0])"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {
417 | "colab": {
418 | "base_uri": "https://localhost:8080/"
419 | },
420 | "id": "0oZ1GfUPeuec",
421 | "outputId": "becdea25-d59b-4dd2-d6d6-9caece719dad"
422 | },
423 | "outputs": [],
424 | "source": [
425 | "accuracy_match = [p == t for p, t in zip(pred_tokens, real_labels)]\n",
426 | "accuracy = len([m for m in accuracy_match if m]) / len(real_labels)\n",
427 | "\n",
428 | "print(accuracy)"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": null,
434 | "metadata": {
435 | "id": "vMYgRRGI-Gu4"
436 | },
437 | "outputs": [],
438 | "source": [
439 | "def build_prompt_text(sent):\n",
440 | " return '감정 분석 문장: ' + sent + '\\n결과: '\n",
441 | "\n",
442 | "real_labels = []\n",
443 | "pred_tokens = []\n",
444 | "\n",
445 | "\n",
446 | "for i, (test_sent, test_label) in enumerate(test_data[['document','label']].values):\n",
447 | " tokens = [vocab[vocab.bos_token]]\n",
448 | "\n",
449 | " for ex in train_fewshot_samples[i]:\n",
450 | " example_text, example_label = ex\n",
451 | " cleaned_example_text = clean_text(example_text)\n",
452 | " appended_prompt_example_text = build_prompt_text(cleaned_example_text)\n",
453 | " appended_prompt_example_text += '긍정' if example_label == 1 else '부정' + '\\n'\n",
454 | "\n",
455 | " tokens += vocab[tokenizer(appended_prompt_example_text)]\n",
456 | "\n",
457 | " cleaned_sent = clean_text(test_sent)\n",
458 | " appended_prompt_sent = build_prompt_text(cleaned_sent)\n",
459 | " test_tokens = vocab[tokenizer(appended_prompt_sent)]\n",
460 | "\n",
461 | " tokens += test_tokens\n",
462 | "\n",
463 | " pred = tf.argmax(cls_model(np.array([tokens], dtype=np.int64)), axis=-1).numpy()\n",
464 | " label = vocab[tokenizer('긍정')] if test_label == 1 else vocab[tokenizer('부정')]\n",
465 | "\n",
466 | " pred_tokens.append(pred[0])\n",
467 | " real_labels.append(label[0])"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "metadata": {
474 | "colab": {
475 | "base_uri": "https://localhost:8080/"
476 | },
477 | "id": "8ufjRihAzNBK",
478 | "outputId": "580a3580-8120-41bd-b0e6-5b94616fe0d1"
479 | },
480 | "outputs": [],
481 | "source": [
482 | "accuracy_match = [p == t for p, t in zip(pred_tokens, real_labels)]\n",
483 | "accuracy = len([m for m in accuracy_match if m]) / len(real_labels)\n",
484 | "\n",
485 | "print(accuracy)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": null,
491 | "metadata": {
492 | "id": "uDbMaaxr2kfL"
493 | },
494 | "outputs": [],
495 | "source": []
496 | }
497 | ],
498 | "metadata": {
499 | "accelerator": "GPU",
500 | "colab": {
501 | "collapsed_sections": [],
502 | "machine_shape": "hm",
503 | "name": "7.4.2.gpt2_fewshot_NSMC.ipynb",
504 | "provenance": []
505 | },
506 | "kernelspec": {
507 | "display_name": "Python 3",
508 | "language": "python",
509 | "name": "python3"
510 | },
511 | "language_info": {
512 | "codemirror_mode": {
513 | "name": "ipython",
514 | "version": 3
515 | },
516 | "file_extension": ".py",
517 | "mimetype": "text/x-python",
518 | "name": "python",
519 | "nbconvert_exporter": "python",
520 | "pygments_lexer": "ipython3",
521 | "version": "3.8.3"
522 | }
523 | },
524 | "nbformat": 4,
525 | "nbformat_minor": 4
526 | }
527 |
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.2.2.bert_finetune_KorNLI.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 환경 준비"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## 데이터 다운로드"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!mkdir -p data_in/KOR/KorNLI\n",
35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/multinli.train.ko.tsv \\\n",
36 | " -O data_in/KOR/KorNLI/multinli.train.ko.tsv\n",
37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/snli_1.0_train.kor.tsv \\\n",
38 | " -O data_in/KOR/KorNLI/snli_1.0_train.kor.tsv\n",
39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/xnli.dev.ko.tsv \\\n",
40 | " -O data_in/KOR/KorNLI/xnli.dev.ko.tsv\n",
41 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/xnli.test.ko.tsv \\\n",
42 | " -O data_in/KOR/KorNLI/xnli.test.ko.tsv"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "import os\n",
52 | "import tensorflow as tf\n",
53 | "from transformers import BertTokenizer, TFBertModel\n",
54 | "\n",
55 | "import numpy as np\n",
56 | "import pandas as pd\n",
57 | "\n",
58 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
59 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
60 | "\n",
61 | "import matplotlib.pyplot as plt"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# 시각화\n",
71 | "\n",
72 | "def plot_graphs(history, string):\n",
73 | " plt.plot(history.history[string])\n",
74 | " plt.plot(history.history['val_'+string], '')\n",
75 | " plt.xlabel(\"Epochs\")\n",
76 | " plt.ylabel(string)\n",
77 | " plt.legend([string, 'val_'+string])\n",
78 | " plt.show()"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "#random seed 고정\n",
88 | "\n",
89 | "tf.random.set_seed(1234)\n",
90 | "np.random.seed(1234)\n",
91 | "\n",
92 | "# BASE PARAM\n",
93 | "\n",
94 | "BATCH_SIZE = 32\n",
95 | "NUM_EPOCHS = 3\n",
96 | "MAX_LEN = 24 * 2 # Average total * 2\n",
97 | "\n",
98 | "DATA_IN_PATH = './data_in/KOR'\n",
99 | "DATA_OUT_PATH = \"./data_out/KOR\""
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "# KorNLI Dataset\n",
107 | "\n",
108 | "Data from Kakaobrain: https://github.com/kakaobrain/KorNLUDatasets"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "# Load Train dataset\n",
118 | "\n",
119 | "TRAIN_SNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'snli_1.0_train.kor.tsv')\n",
120 | "TRAIN_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'multinli.train.ko.tsv')\n",
121 | "DEV_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.dev.ko.tsv')\n",
122 | "\n",
123 | "train_data_snli = pd.read_csv(TRAIN_SNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n",
124 | "train_data_xnli = pd.read_csv(TRAIN_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n",
125 | "dev_data_xnli = pd.read_csv(DEV_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n",
126 | "\n",
127 | "train_data_snli_xnli = train_data_snli.append(train_data_xnli)\n",
128 | "train_data_snli_xnli = train_data_snli_xnli.dropna()\n",
129 | "train_data_snli_xnli = train_data_snli_xnli.reset_index()\n",
130 | "\n",
131 | "dev_data_xnli = dev_data_xnli.dropna()\n",
132 | "\n",
133 | "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data_snli_xnli), len(dev_data_xnli)))"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "# Bert Tokenizer\n",
143 | "\n",
144 | "# 참조: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus\n",
145 | "\n",
146 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", cache_dir='bert_ckpt', do_lower_case=False)\n",
147 | "\n",
148 | "def bert_tokenizer_v2(sent1, sent2, MAX_LEN):\n",
149 | " \n",
150 | "    # For two-sentence input\n",
151 | " \n",
152 | " encoded_dict = tokenizer.encode_plus(\n",
153 | " text = sent1,\n",
154 | " text_pair = sent2,\n",
155 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
156 | " max_length = MAX_LEN, # Pad & truncate all sentences.\n",
157 | " pad_to_max_length = True,\n",
158 | " return_attention_mask = True # Construct attn. masks.\n",
159 | " \n",
160 | " )\n",
161 | " \n",
162 | " input_id = encoded_dict['input_ids']\n",
163 | " attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).\n",
164 | " token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences\n",
165 | " \n",
166 | " return input_id, attention_mask, token_type_id"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "input_ids = []\n",
176 | "attention_masks = []\n",
177 | "token_type_ids = []\n",
178 | "\n",
179 | "for sent1, sent2 in zip(train_data_snli_xnli['sentence1'], train_data_snli_xnli['sentence2']):\n",
180 | " try:\n",
181 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n",
182 | "\n",
183 | " input_ids.append(input_id)\n",
184 | " attention_masks.append(attention_mask)\n",
185 | " token_type_ids.append(token_type_id)\n",
186 | " except Exception as e:\n",
187 | " print(e)\n",
188 | " print(sent1, sent2)\n",
189 | " pass\n",
190 | " \n",
191 | "train_snli_xnli_input_ids = np.array(input_ids, dtype=int)\n",
192 | "train_snli_xnli_attention_masks = np.array(attention_masks, dtype=int)\n",
193 | "train_snli_xnli_type_ids = np.array(token_type_ids, dtype=int)\n",
194 | "train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "# DEV SET Preprocessing"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "# Apart from the tokenizer, the approach follows the one used in Chapter 5\n",
211 | "input_ids = []\n",
212 | "attention_masks = []\n",
213 | "token_type_ids = []\n",
214 | "\n",
215 | "for sent1, sent2 in zip(dev_data_xnli['sentence1'], dev_data_xnli['sentence2']):\n",
216 | " try:\n",
217 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n",
218 | "\n",
219 | " input_ids.append(input_id)\n",
220 | " attention_masks.append(attention_mask)\n",
221 | " token_type_ids.append(token_type_id)\n",
222 | " except Exception as e:\n",
223 | " print(e)\n",
224 | " print(sent1, sent2)\n",
225 | " pass\n",
226 | " \n",
227 | "dev_xnli_input_ids = np.array(input_ids, dtype=int)\n",
228 | "dev_xnli_attention_masks = np.array(attention_masks, dtype=int)\n",
229 | "dev_xnli_type_ids = np.array(token_type_ids, dtype=int)\n",
230 | "dev_xnli_inputs = (dev_xnli_input_ids, dev_xnli_attention_masks, dev_xnli_type_ids)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {
237 | "scrolled": true
238 | },
239 | "outputs": [],
240 | "source": [
241 | "# Convert the labels (neutral, contradiction, entailment) to integer values.\n",
242 | "label_dict = {\"entailment\": 0, \"contradiction\": 1, \"neutral\": 2}\n",
243 | "def convert_int(label):\n",
244 | " num_label = label_dict[label] \n",
245 | " return num_label\n",
246 | "\n",
247 | "train_data_snli_xnli[\"gold_label_int\"] = train_data_snli_xnli[\"gold_label\"].apply(convert_int)\n",
248 | "train_data_labels = np.array(train_data_snli_xnli['gold_label_int'], dtype=int)\n",
249 | "\n",
250 | "dev_data_xnli[\"gold_label_int\"] = dev_data_xnli[\"gold_label\"].apply(convert_int)\n",
251 | "dev_data_labels = np.array(dev_data_xnli['gold_label_int'], dtype=int)\n",
252 | "\n",
253 | "print(\"# train labels: {}, #dev labels: {}\".format(len(train_data_labels), len(dev_data_labels)))"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "class TFBertClassifier(tf.keras.Model):\n",
263 | " def __init__(self, model_name, dir_path, num_class):\n",
264 | " super(TFBertClassifier, self).__init__()\n",
265 | "\n",
266 | " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n",
267 | " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n",
268 | " self.classifier = tf.keras.layers.Dense(num_class, \n",
269 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range), \n",
270 | " name=\"classifier\")\n",
271 | " \n",
272 | " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n",
273 | " \n",
274 | "        # outputs: sequence_output, pooled_output, (hidden_states), (attentions)\n",
275 | " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n",
276 | " pooled_output = outputs[1] \n",
277 | " pooled_output = self.dropout(pooled_output, training=training)\n",
278 | " logits = self.classifier(pooled_output)\n",
279 | "\n",
280 | " return logits\n",
281 | "\n",
282 | "cls_model = TFBertClassifier(model_name='bert-base-multilingual-cased',\n",
283 | " dir_path='bert_ckpt',\n",
284 | " num_class=3)"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {},
291 | "outputs": [],
292 | "source": [
293 | "# Prepare for training\n",
294 | "optimizer = tf.keras.optimizers.Adam(3e-5)\n",
295 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
296 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n",
297 | "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {},
304 | "outputs": [],
305 | "source": [
306 | "# Run training\n",
307 | "model_name = \"tf2_KorNLI\"\n",
308 | "\n",
309 | "# Add early stopping to prevent overfitting\n",
310 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,patience=2)\n",
311 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
312 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
313 | "\n",
314 | "# Create the output directory if it does not exist\n",
315 | "if os.path.exists(checkpoint_dir):\n",
316 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
317 | "else:\n",
318 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
319 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
320 | " \n",
321 | "cp_callback = ModelCheckpoint(\n",
322 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n",
323 | "\n",
324 | "# Start training and evaluation\n",
325 | "history = cls_model.fit(train_snli_xnli_inputs, train_data_labels, epochs=NUM_EPOCHS,\n",
326 | " validation_data = (dev_xnli_inputs, dev_data_labels),\n",
327 | " batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])\n",
328 | "\n",
329 | "#steps_for_epoch\n",
330 | "print(history.history)"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "plot_graphs(history, 'accuracy')\n",
340 | "plot_graphs(history, 'loss')"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "# KorNLI Test dataset"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": null,
353 | "metadata": {},
354 | "outputs": [],
355 | "source": [
356 | "# Load Test dataset\n",
357 | "TEST_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.test.ko.tsv')\n",
358 | "\n",
359 | "test_data_xnli = pd.read_csv(TEST_XNLI_DF, header=0, delimiter = '\\t', quoting = 3)\n",
360 | "test_data_xnli = test_data_xnli.dropna()\n",
361 | "test_data_xnli.head()"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "metadata": {},
368 | "outputs": [],
369 | "source": [
370 | "# Build the test set in the same way.\n",
371 | "\n",
372 | "input_ids = []\n",
373 | "attention_masks = []\n",
374 | "token_type_ids = []\n",
375 | "\n",
376 | "for sent1, sent2 in zip(test_data_xnli['sentence1'], test_data_xnli['sentence2']):\n",
377 | " \n",
378 | " try:\n",
379 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)\n",
380 | "\n",
381 | " input_ids.append(input_id)\n",
382 | " attention_masks.append(attention_mask)\n",
383 | " token_type_ids.append(token_type_id)\n",
384 | " except Exception as e:\n",
385 | " print(e)\n",
386 | " print(sent1, sent2)\n",
387 | " pass\n",
388 | " \n",
389 | " \n",
390 | "test_xnli_input_ids = np.array(input_ids, dtype=int)\n",
391 | "test_xnli_attention_masks = np.array(attention_masks, dtype=int)\n",
392 | "test_xnli_type_ids = np.array(token_type_ids, dtype=int)\n",
393 | "test_xnli_inputs = (test_xnli_input_ids, test_xnli_attention_masks, test_xnli_type_ids)"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "test_data_xnli[\"gold_label_int\"] = test_data_xnli[\"gold_label\"].apply(convert_int)\n",
403 | "test_data_xnli_labels = np.array(test_data_xnli['gold_label_int'], dtype=int)\n",
404 | "\n",
405 | "print(\"# sents: {}, # labels: {}\".format(len(test_xnli_input_ids), len(test_data_xnli_labels)))"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "results = cls_model.evaluate(test_xnli_inputs, test_data_xnli_labels, batch_size=512)\n",
415 | "print(\"test loss, test acc: \", results)"
416 | ]
417 | }
418 | ],
419 | "metadata": {
420 | "kernelspec": {
421 | "display_name": "Python 3",
422 | "language": "python",
423 | "name": "python3"
424 | },
425 | "language_info": {
426 | "codemirror_mode": {
427 | "name": "ipython",
428 | "version": 3
429 | },
430 | "file_extension": ".py",
431 | "mimetype": "text/x-python",
432 | "name": "python",
433 | "nbconvert_exporter": "python",
434 | "pygments_lexer": "ipython3",
435 | "version": "3.7.4"
436 | }
437 | },
438 | "nbformat": 4,
439 | "nbformat_minor": 2
440 | }
441 |
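
A minimal sketch of the two-sentence encoding that bert_tokenizer_v2 performs in the notebook above, assuming the transformers version pinned in requirements.txt; the premise/hypothesis strings below are made-up examples rather than KorNLI rows:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased", do_lower_case=False)

encoded = tokenizer.encode_plus(
    text="남자가 말을 타고 있다",           # premise (hypothetical example)
    text_pair="한 사람이 동물 위에 있다",    # hypothesis (hypothetical example)
    add_special_tokens=True,               # [CLS] premise [SEP] hypothesis [SEP]
    max_length=48,                         # same as the notebook's MAX_LEN = 24 * 2
    pad_to_max_length=True,
    return_attention_mask=True,
)

# token_type_ids mark the premise segment with 0 and the hypothesis segment with 1,
# which is how BERT tells the two sentences of an NLI pair apart.
print(len(encoded["input_ids"]), set(encoded["token_type_ids"]))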
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.4.3.gpt2_finetune_KorNLI.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Environment Setup"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Data Download"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!mkdir -p data_in/KOR/KorNLI\n",
35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/multinli.train.ko.tsv \\\n",
36 | " -O data_in/KOR/KorNLI/multinli.train.ko.tsv\n",
37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/snli_1.0_train.kor.tsv \\\n",
38 | " -O data_in/KOR/KorNLI/snli_1.0_train.kor.tsv\n",
39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/xnli.dev.ko.tsv \\\n",
40 | " -O data_in/KOR/KorNLI/xnli.dev.ko.tsv\n",
41 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorNLI/xnli.test.ko.tsv \\\n",
42 | " -O data_in/KOR/KorNLI/xnli.test.ko.tsv"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "import os\n",
52 | "import tensorflow as tf\n",
53 | "from transformers import TFGPT2Model\n",
54 | "\n",
55 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
56 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
57 | "\n",
58 | "import gluonnlp as nlp\n",
59 | "from gluonnlp.data import SentencepieceTokenizer\n",
60 | "\n",
61 | "import pandas as pd\n",
62 | "import matplotlib.pyplot as plt\n",
63 | "\n",
64 | "import numpy as np\n",
65 | "import re"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "Run the command below only if the gpt_ckpt folder does not already exist."
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip\n",
82 | "!unzip -o gpt_ckpt.zip"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# Visualization\n",
92 | "\n",
93 | "def plot_graphs(history, string):\n",
94 | " plt.plot(history.history[string])\n",
95 | " plt.plot(history.history['val_'+string], '')\n",
96 | " plt.xlabel(\"Epochs\")\n",
97 | " plt.ylabel(string)\n",
98 | " plt.legend([string, 'val_'+string])\n",
99 | " plt.show()"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "SEED_NUM = 1234\n",
109 | "tf.random.set_seed(SEED_NUM)\n",
110 | "np.random.seed(SEED_NUM)"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "## Preparing the Data"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n",
127 | "\n",
128 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n",
129 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n",
130 | " mask_token=None,\n",
131 | "                                               sep_token='<unused0>',\n",
132 | "                                               cls_token=None,\n",
133 | "                                               unknown_token='<unk>',\n",
134 | "                                               padding_token='<pad>',\n",
135 | "                                               bos_token='<s>',\n",
136 | "                                               eos_token='</s>')"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "BATCH_SIZE = 32\n",
146 | "NUM_EPOCHS = 3\n",
147 | "SENT_MAX_LEN = 31\n",
148 | "\n",
149 | "DATA_IN_PATH = './data_in/KOR'\n",
150 | "DATA_OUT_PATH = \"./data_out/KOR\""
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "# Load Train dataset\n",
160 | "\n",
161 | "TRAIN_SNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'snli_1.0_train.kor.tsv')\n",
162 | "TRAIN_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'multinli.train.ko.tsv')\n",
163 | "DEV_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.dev.ko.tsv')\n",
164 | "\n",
165 | "train_data_snli = pd.read_csv(TRAIN_SNLI_DF, header=0, delimiter='\\t', quoting=3)\n",
166 | "train_data_xnli = pd.read_csv(TRAIN_XNLI_DF, header=0, delimiter='\\t', quoting=3)\n",
167 | "dev_data_xnli = pd.read_csv(DEV_XNLI_DF, header=0, delimiter='\\t', quoting=3)\n",
168 | "\n",
169 | "train_data_snli_xnli = train_data_snli.append(train_data_xnli)\n",
170 | "train_data_snli_xnli = train_data_snli_xnli.dropna()\n",
171 | "train_data_snli_xnli = train_data_snli_xnli.reset_index()\n",
172 | "\n",
173 | "dev_data_xnli = dev_data_xnli.dropna()\n",
174 | "\n",
175 | "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data_snli_xnli), len(dev_data_xnli)))"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "# Apart from the tokenizer, the approach follows the one used in Chapter 5\n",
185 | "def clean_text(sent):\n",
186 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \" \", sent)\n",
187 | " return sent_clean\n",
188 | "\n",
189 | "train_data_sents = []\n",
190 | "\n",
191 | "for train_sent_1, train_sent_2 in train_data_snli_xnli[['sentence1', 'sentence2']].values:\n",
192 | " train_tokenized_sent_1 = vocab[tokenizer(clean_text(train_sent_1))]\n",
193 | " train_tokenized_sent_2 = vocab[tokenizer(clean_text(train_sent_2))]\n",
194 | "\n",
195 | " tokens = [vocab[vocab.bos_token]] \n",
196 | " tokens += pad_sequences([train_tokenized_sent_1], \n",
197 | " SENT_MAX_LEN, \n",
198 | " value=vocab[vocab.padding_token], \n",
199 | " padding='post').tolist()[0] \n",
200 | " tokens += [vocab[vocab.sep_token]] \n",
201 | " tokens += pad_sequences([train_tokenized_sent_2], \n",
202 | " SENT_MAX_LEN, \n",
203 | " value=vocab[vocab.padding_token], \n",
204 | " padding='post').tolist()[0] \n",
205 | " tokens += [vocab[vocab.eos_token]]\n",
206 | "\n",
207 | " train_data_sents.append(tokens) \n",
208 | "\n",
209 | "train_data_sents = np.array(train_data_sents, dtype=np.int64)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "dev_data_sents = []\n",
219 | "\n",
220 | "for dev_sent_1, dev_sent_2 in dev_data_xnli[['sentence1', 'sentence2']].values:\n",
221 | " dev_tokenized_sent_1 = vocab[tokenizer(clean_text(dev_sent_1))]\n",
222 | " dev_tokenized_sent_2 = vocab[tokenizer(clean_text(dev_sent_2))]\n",
223 | "\n",
224 | " tokens = [vocab[vocab.bos_token]] \n",
225 | " tokens += pad_sequences([dev_tokenized_sent_1], \n",
226 | " SENT_MAX_LEN, \n",
227 | " value=vocab[vocab.padding_token], \n",
228 | " padding='post').tolist()[0] \n",
229 | " tokens += [vocab[vocab.sep_token]] \n",
230 | " tokens += pad_sequences([dev_tokenized_sent_2], \n",
231 | " SENT_MAX_LEN, \n",
232 | " value=vocab[vocab.padding_token], \n",
233 | " padding='post').tolist()[0] \n",
234 | " tokens += [vocab[vocab.eos_token]]\n",
235 | "\n",
236 | " dev_data_sents.append(tokens) \n",
237 | "\n",
238 | "dev_data_sents = np.array(dev_data_sents, dtype=np.int64)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "# Convert the labels (neutral, contradiction, entailment) to integer values.\n",
248 | "label_dict = {\"entailment\": 0, \"contradiction\": 1, \"neutral\": 2}\n",
249 | "\n",
250 | "def convert_int(label):\n",
251 | " num_label = label_dict[label] \n",
252 | " return num_label\n",
253 | "\n",
254 | "train_data_snli_xnli[\"gold_label_int\"] = train_data_snli_xnli[\"gold_label\"].apply(convert_int)\n",
255 | "train_data_labels = np.array(train_data_snli_xnli['gold_label_int'], dtype=int)\n",
256 | "\n",
257 | "dev_data_xnli[\"gold_label_int\"] = dev_data_xnli[\"gold_label\"].apply(convert_int)\n",
258 | "dev_data_labels = np.array(dev_data_xnli['gold_label_int'], dtype=int)\n",
259 | "\n",
260 | "print(\"# train labels: {}, #dev labels: {}\".format(len(train_data_labels), len(dev_data_labels)))"
261 | ]
262 | },
263 | {
264 | "cell_type": "markdown",
265 | "metadata": {},
266 | "source": [
267 | "## Model Training"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {},
274 | "outputs": [],
275 | "source": [
276 | "class TFGPT2Classifier(tf.keras.Model):\n",
277 | " def __init__(self, dir_path, num_class):\n",
278 | " super(TFGPT2Classifier, self).__init__()\n",
279 | " \n",
280 | " self.gpt2 = TFGPT2Model.from_pretrained(dir_path)\n",
281 | " self.num_class = num_class\n",
282 | " \n",
283 | " self.dropout = tf.keras.layers.Dropout(self.gpt2.config.summary_first_dropout)\n",
284 | " self.classifier = tf.keras.layers.Dense(self.num_class, \n",
285 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.gpt2.config.initializer_range), \n",
286 | " name=\"classifier\")\n",
287 | " \n",
288 | " def call(self, inputs):\n",
289 | " outputs = self.gpt2(inputs)\n",
290 | " pooled_output = outputs[0][:, -1]\n",
291 | "\n",
292 | " pooled_output = self.dropout(pooled_output)\n",
293 | " logits = self.classifier(pooled_output)\n",
294 | "\n",
295 | " return logits"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "BASE_MODEL_PATH = './gpt_ckpt'\n",
305 | "sim_model = TFGPT2Classifier(dir_path=BASE_MODEL_PATH, num_class=3)"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "optimizer = tf.keras.optimizers.Adam(6.25e-5)\n",
315 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
316 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n",
317 | "sim_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "model_name = \"tf2_gpt_kornli\"\n",
327 | "\n",
328 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)\n",
329 | "\n",
330 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
331 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
332 | "\n",
333 | "if os.path.exists(checkpoint_dir):\n",
334 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
335 | "else:\n",
336 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
337 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
338 | " \n",
339 | "cp_callback = ModelCheckpoint(\n",
340 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n",
341 | "\n",
342 | "history = sim_model.fit(train_data_sents, train_data_labels, \n",
343 | " epochs=NUM_EPOCHS,\n",
344 | " validation_data=(dev_data_sents, dev_data_labels),\n",
345 | " batch_size=BATCH_SIZE, \n",
346 | " callbacks=[earlystop_callback, cp_callback])"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": null,
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "plot_graphs(history, 'accuracy')"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {
362 | "scrolled": false
363 | },
364 | "outputs": [],
365 | "source": [
366 | "plot_graphs(history, 'loss')"
367 | ]
368 | },
369 | {
370 | "cell_type": "markdown",
371 | "metadata": {},
372 | "source": [
373 | "## Model Testing"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": null,
379 | "metadata": {
380 | "scrolled": true
381 | },
382 | "outputs": [],
383 | "source": [
384 | "# Load Test dataset\n",
385 | "TEST_XNLI_DF = os.path.join(DATA_IN_PATH, 'KorNLI', 'xnli.test.ko.tsv')\n",
386 | "\n",
387 | "test_data_xnli = pd.read_csv(TEST_XNLI_DF, header=0, delimiter='\\t', quoting=3)"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {},
394 | "outputs": [],
395 | "source": [
396 | "# test_data_xnli = test_data_xnli[:50] # for test\n",
397 | "\n",
398 | "test_data_sents = []\n",
399 | "\n",
400 | "for test_sent_1, test_sent_2 in test_data_xnli[['sentence1', 'sentence2']].values:\n",
401 | " test_tokenized_sent_1 = vocab[tokenizer(clean_text(test_sent_1))]\n",
402 | " test_tokenized_sent_2 = vocab[tokenizer(clean_text(test_sent_2))]\n",
403 | "\n",
404 | " tokens = [vocab[vocab.bos_token]] \n",
405 | " tokens += pad_sequences([test_tokenized_sent_1], \n",
406 | " SENT_MAX_LEN, \n",
407 | " value=vocab[vocab.padding_token], \n",
408 | " padding='post').tolist()[0] \n",
409 | " tokens += [vocab[vocab.sep_token]] \n",
410 | " tokens += pad_sequences([test_tokenized_sent_2], \n",
411 | " SENT_MAX_LEN, \n",
412 | " value=vocab[vocab.padding_token], \n",
413 | " padding='post').tolist()[0] \n",
414 | " tokens += [vocab[vocab.eos_token]]\n",
415 | "\n",
416 | " test_data_sents.append(tokens) \n",
417 | "\n",
418 | "test_data_sents = np.array(test_data_sents, dtype=np.int64)"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": null,
424 | "metadata": {},
425 | "outputs": [],
426 | "source": [
427 | "test_data_xnli[\"gold_label_int\"] = test_data_xnli[\"gold_label\"].apply(convert_int)\n",
428 | "test_data_labels = np.array(test_data_xnli['gold_label_int'], dtype=int)\n",
429 | "\n",
430 | "print(\"# sents: {}, # labels: {}\".format(len(test_data_sents), len(test_data_labels)))"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": null,
436 | "metadata": {},
437 | "outputs": [],
438 | "source": [
439 | "sim_model.load_weights(checkpoint_path)\n",
440 | "\n",
441 | "results = sim_model.evaluate(test_data_sents, test_data_labels, batch_size=1024)\n",
442 | "print(\"test loss, test acc: \", results)"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "metadata": {},
449 | "outputs": [],
450 | "source": []
451 | }
452 | ],
453 | "metadata": {
454 | "kernelspec": {
455 | "display_name": "Python 3",
456 | "language": "python",
457 | "name": "python3"
458 | },
459 | "language_info": {
460 | "codemirror_mode": {
461 | "name": "ipython",
462 | "version": 3
463 | },
464 | "file_extension": ".py",
465 | "mimetype": "text/x-python",
466 | "name": "python",
467 | "nbconvert_exporter": "python",
468 | "pygments_lexer": "ipython3",
469 | "version": "3.7.4"
470 | }
471 | },
472 | "nbformat": 4,
473 | "nbformat_minor": 2
474 | }
475 |
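
A minimal sketch of the fixed-length input layout the loops above build for the GPT-2 classifier, [BOS] sentence1 (padded to SENT_MAX_LEN) [SEP] sentence2 (padded to SENT_MAX_LEN) [EOS]; the ids below are toy placeholders, not the KoGPT2 SentencePiece vocabulary, and the truncation policy is simplified:

SENT_MAX_LEN = 31
BOS, SEP, EOS, PAD = 0, 1, 2, 3  # hypothetical ids standing in for the BOS, SEP, EOS and PAD special tokens

def build_pair_input(ids_1, ids_2):
    """Pad each sentence to SENT_MAX_LEN and wrap the pair with the special tokens."""
    def pad(ids):
        ids = ids[:SENT_MAX_LEN]                         # truncation simplified for illustration
        return ids + [PAD] * (SENT_MAX_LEN - len(ids))
    return [BOS] + pad(ids_1) + [SEP] + pad(ids_2) + [EOS]

tokens = build_pair_input([11, 12, 13], [21, 22])
assert len(tokens) == 2 * SENT_MAX_LEN + 3               # every example is exactly 65 ids long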
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.2.3.bert_finetune_NER.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Environment Setup"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Data Download"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!mkdir -p data_in/KOR/NER\n",
35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/NER/label.txt \\\n",
36 | " -O data_in/KOR/NER/label.txt\n",
37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/NER/test.tsv \\\n",
38 | " -O data_in/KOR/NER/test.tsv\n",
39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/NER/train.tsv \\\n",
40 | "    -O data_in/KOR/NER/train.tsv"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "import os\n",
50 | "import re\n",
51 | "import numpy as np\n",
52 | "from tqdm import tqdm\n",
53 | "import json\n",
54 | "import copy\n",
55 | "\n",
56 | "import tensorflow as tf\n",
57 | "from transformers import *\n",
58 | "\n",
59 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
60 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
61 | "\n",
62 | "\n",
63 | "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n",
64 | "\n",
65 | "import pandas as pd\n",
66 | "import matplotlib.pyplot as plt"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "# 시각화\n",
76 | "\n",
77 | "def plot_graphs(history, string):\n",
78 | " plt.plot(history.history[string])\n",
79 | " plt.xlabel(\"Epochs\")\n",
80 | " plt.ylabel(string)\n",
81 | " plt.legend([string])\n",
82 | " plt.show()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "# Fix random seeds for reproducibility\n",
92 | "tf.random.set_seed(1234)\n",
93 | "np.random.seed(1234)\n",
94 | "\n",
95 | "BATCH_SIZE = 32\n",
96 | "NUM_EPOCHS = 3\n",
97 | "MAX_LEN = 111 # max length obtained from EDA\n",
98 | "DATA_IN_PATH = 'data_in/KOR'\n",
99 | "DATA_OUT_PATH = \"data_out/KOR\""
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "# Prepare data preprocessing\n",
109 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, \"NER\", \"train.tsv\")\n",
110 | "DATA_LABEL_PATH = os.path.join(DATA_IN_PATH, \"NER\", \"label.txt\")\n",
111 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, \"NER\", \"test.tsv\")\n",
112 | "\n",
113 | "def read_file(input_path):\n",
114 | " \"\"\"Read tsv file, and return words and label as list\"\"\"\n",
115 | " with open(input_path, \"r\", encoding=\"utf-8\") as f:\n",
116 | " sentences = []\n",
117 | " labels = []\n",
118 | " for line in f:\n",
119 | " split_line = line.strip().split(\"\\t\")\n",
120 | " sentences.append(split_line[0])\n",
121 | " labels.append(split_line[1])\n",
122 | " return sentences, labels\n",
123 | "\n",
124 | "train_sentences, train_labels = read_file(DATA_TRAIN_PATH)\n",
125 | "\n",
126 | "train_ner_dict = {\"sentence\": train_sentences, \"label\": train_labels}\n",
127 | "train_ner_df = pd.DataFrame(train_ner_dict)\n",
128 | "\n",
129 | "test_sentences, test_labels = read_file(DATA_TEST_PATH)\n",
130 | "test_ner_dict = {\"sentence\": test_sentences, \"label\": test_labels}\n",
131 | "test_ner_df = pd.DataFrame(test_ner_dict)\n",
132 | "\n",
133 | "print(\"Number of NER training examples: {}\".format(len(train_ner_df)))\n",
134 | "print(\"Number of NER test examples: {}\".format(len(test_ner_df)))\n",
135 | "\n",
136 | "# Number of NER training examples: 81000\n",
137 | "# Number of NER test examples: 9000"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {},
144 | "outputs": [],
145 | "source": [
146 | "# Load the labels\n",
147 | "\n",
148 | "def get_labels(label_path):\n",
149 | " return [label.strip() for label in open(os.path.join(label_path), 'r', encoding='utf-8')]\n",
150 | "\n",
151 | "ner_labels = get_labels(DATA_LABEL_PATH)\n",
152 | "\n",
153 | "print(\"Number of NER labels: {}\".format(len(ner_labels)))"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {},
160 | "outputs": [],
161 | "source": [
162 | "# Set up the BERT tokenizer\n",
163 | "\n",
164 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", cache_dir='bert_ckpt')\n",
165 | "\n",
166 | "pad_token_id = tokenizer.pad_token_id # 0\n",
167 | "pad_token_label_id = 0\n",
168 | "cls_token_label_id = 0\n",
169 | "sep_token_label_id = 0"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "def bert_tokenizer(sent, MAX_LEN):\n",
179 | " \n",
180 | " encoded_dict = tokenizer.encode_plus(\n",
181 | " text = sent,\n",
182 | " truncation=True,\n",
183 | "        add_special_tokens = True, # add '[CLS]' and '[SEP]'\n",
184 | "        max_length = MAX_LEN, # pad and truncate the sentence\n",
185 | "        pad_to_max_length = True,\n",
186 | "        return_attention_mask = True # generate the attention mask\n",
187 | " )\n",
188 | " \n",
189 | " input_id = encoded_dict['input_ids']\n",
190 | " attention_mask = encoded_dict['attention_mask'] \n",
191 | " token_type_id = encoded_dict['token_type_ids']\n",
192 | " \n",
193 | " return input_id, attention_mask, token_type_id\n",
194 | "\n",
195 | "def convert_label(words, labels_idx, ner_begin_label, max_seq_len):\n",
196 | " \n",
197 | " tokens = []\n",
198 | " label_ids = []\n",
199 | "\n",
200 | " for word, slot_label in zip(words, labels_idx):\n",
201 | "\n",
202 | " word_tokens = tokenizer.tokenize(word)\n",
203 | " if not word_tokens:\n",
204 | "            word_tokens = [tokenizer.unk_token]  # fall back to the unknown token\n",
205 | " tokens.extend(word_tokens)\n",
206 | " \n",
207 | "        # if the slot label is a B (begin) tag, tag the remaining sub-tokens with the corresponding I tag\n",
208 | " if int(slot_label) in ner_begin_label:\n",
209 | " label_ids.extend([int(slot_label)] + [int(slot_label) + 1] * (len(word_tokens) - 1))\n",
210 | " else:\n",
211 | " label_ids.extend([int(slot_label)] * len(word_tokens))\n",
212 | " \n",
213 | "    # account for [CLS] and [SEP]\n",
214 | " special_tokens_count = 2\n",
215 | " if len(label_ids) > max_seq_len - special_tokens_count:\n",
216 | " label_ids = label_ids[: (max_seq_len - special_tokens_count)]\n",
217 | "\n",
218 | "    # add the label for the [SEP] token\n",
219 | " label_ids += [sep_token_label_id]\n",
220 | "\n",
221 | "    # add the label for the [CLS] token\n",
222 | " label_ids = [cls_token_label_id] + label_ids\n",
223 | " \n",
224 | " padding_length = max_seq_len - len(label_ids)\n",
225 | " label_ids = label_ids + ([pad_token_label_id] * padding_length)\n",
226 | " \n",
227 | " return label_ids"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {},
234 | "outputs": [],
235 | "source": [
236 | "# Quick check: indices and names of the B (begin) labels\n",
237 | "ner_begin_label = [ner_labels.index(begin_label) for begin_label in ner_labels if \"B\" in begin_label]\n",
238 | "ner_begin_label_string = [ner_labels[label_index] for label_index in ner_begin_label]\n",
239 | "\n",
240 | "print(ner_begin_label)\n",
241 | "print(ner_begin_label_string)"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": [
250 | "ner_begin_label = [ner_labels.index(begin_label) for begin_label in ner_labels if \"B\" in begin_label]\n",
251 | "\n",
252 | "def create_inputs_targets(df):\n",
253 | " input_ids = []\n",
254 | " attention_masks = []\n",
255 | " token_type_ids = []\n",
256 | " label_list = []\n",
257 | "\n",
258 | " for i, data in enumerate(df[['sentence', 'label']].values):\n",
259 | " sentence, labels = data\n",
260 | " words = sentence.split()\n",
261 | " labels = labels.split()\n",
262 | " labels_idx = []\n",
263 | " \n",
264 | " for label in labels:\n",
265 | " labels_idx.append(ner_labels.index(label) if label in ner_labels else ner_labels.index(\"UNK\"))\n",
266 | "\n",
267 | " assert len(words) == len(labels_idx)\n",
268 | "\n",
269 | " input_id, attention_mask, token_type_id = bert_tokenizer(sentence, MAX_LEN)\n",
270 | "\n",
271 | " convert_label_id = convert_label(words, labels_idx, ner_begin_label, MAX_LEN)\n",
272 | "\n",
273 | " input_ids.append(input_id)\n",
274 | " attention_masks.append(attention_mask)\n",
275 | " token_type_ids.append(token_type_id)\n",
276 | " label_list.append(convert_label_id)\n",
277 | "\n",
278 | " input_ids = np.array(input_ids, dtype=int)\n",
279 | " attention_masks = np.array(attention_masks, dtype=int)\n",
280 | " token_type_ids = np.array(token_type_ids, dtype=int)\n",
281 | "    label_list = np.asarray(label_list, dtype=int)  # token-level label ids\n",
282 | " inputs = (input_ids, attention_masks, token_type_ids)\n",
283 | " \n",
284 | " return inputs, label_list\n",
285 | "\n",
286 | "train_inputs, train_labels = create_inputs_targets(train_ner_df)\n",
287 | "test_inputs, test_labels = create_inputs_targets(test_ner_df)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "class TFBertNERClassifier(tf.keras.Model):\n",
297 | " def __init__(self, model_name, dir_path, num_class):\n",
298 | " super(TFBertNERClassifier, self).__init__()\n",
299 | "\n",
300 | " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n",
301 | " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n",
302 | " self.classifier = tf.keras.layers.Dense(num_class, \n",
303 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range),\n",
304 | " name=\"ner_classifier\")\n",
305 | "\n",
306 | " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n",
307 | "\n",
308 | "        # outputs: sequence_output, pooled_output, (hidden_states), (attentions)\n",
309 | " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n",
310 | " sequence_output = outputs[0]\n",
311 | " \n",
312 | " sequence_output = self.dropout(sequence_output, training=training)\n",
313 | " logits = self.classifier(sequence_output)\n",
314 | " \n",
315 | "\n",
316 | " return logits"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": [
325 | "ner_model = TFBertNERClassifier(model_name='bert-base-multilingual-cased',\n",
326 | " dir_path='bert_ckpt',\n",
327 | " num_class=len(ner_labels))"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "def compute_loss(labels, logits):\n",
337 | " loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(\n",
338 | " from_logits=True, reduction=tf.keras.losses.Reduction.NONE\n",
339 | " )\n",
340 | "\n",
341 | "    # positions with label id 0 are excluded when computing the loss\n",
342 | " active_loss = tf.reshape(labels, (-1,)) != 0\n",
343 | " \n",
344 | " reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)\n",
345 | " \n",
346 | " labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)\n",
347 | " \n",
348 | " return loss_fn(labels, reduced_logits)"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {},
355 | "outputs": [],
356 | "source": [
357 | "class F1Metrics(tf.keras.callbacks.Callback):\n",
358 | " def __init__(self, x_eval, y_eval):\n",
359 | " self.x_eval = x_eval\n",
360 | " self.y_eval = y_eval\n",
361 | "\n",
362 | " def compute_f1_pre_rec(self, labels, preds):\n",
363 | "\n",
364 | " return {\n",
365 | " \"precision\": precision_score(labels, preds, suffix=True),\n",
366 | " \"recall\": recall_score(labels, preds, suffix=True),\n",
367 | " \"f1\": f1_score(labels, preds, suffix=True)\n",
368 | " }\n",
369 | "\n",
370 | "\n",
371 | " def show_report(self, labels, preds):\n",
372 | " return classification_report(labels, preds, suffix=True)\n",
373 | " \n",
374 | " def on_epoch_end(self, epoch, logs=None):\n",
375 | "\n",
376 | " results = {}\n",
377 | " \n",
378 | " pred = self.model.predict(self.x_eval)\n",
379 | " label = self.y_eval\n",
380 | " pred_argmax = np.argmax(pred, axis = 2)\n",
381 | "\n",
382 | " slot_label_map = {i: label for i, label in enumerate(ner_labels)}\n",
383 | "\n",
384 | " out_label_list = [[] for _ in range(label.shape[0])]\n",
385 | " preds_list = [[] for _ in range(label.shape[0])]\n",
386 | "\n",
387 | " for i in range(label.shape[0]):\n",
388 | " for j in range(label.shape[1]):\n",
389 | " if label[i, j] != 0:\n",
390 | " out_label_list[i].append(slot_label_map[label[i][j]])\n",
391 | " preds_list[i].append(slot_label_map[pred_argmax[i][j]])\n",
392 | " \n",
393 | " result = self.compute_f1_pre_rec(out_label_list, preds_list)\n",
394 | " results.update(result)\n",
395 | "\n",
396 | " print(\"********\")\n",
397 | " print(\"F1 Score\")\n",
398 | " for key in sorted(results.keys()):\n",
399 | " print(\"{}, {:.4f}\".format(key, results[key]))\n",
400 | " print(\"\\n\" + self.show_report(out_label_list, preds_list))\n",
401 | " print(\"********\")\n",
402 | "\n",
403 | "f1_score_callback = F1Metrics(test_inputs, test_labels)"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": null,
409 | "metadata": {},
410 | "outputs": [],
411 | "source": [
412 | "# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule\n",
413 | "optimizer = tf.keras.optimizers.Adam(3e-5)\n",
414 | "# ner_model.compile(optimizer=optimizer, loss=compute_loss, run_eagerly=True)\n",
415 | "ner_model.compile(optimizer=optimizer, loss=compute_loss)"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "metadata": {},
422 | "outputs": [],
423 | "source": [
424 | "model_name = \"tf2_bert_ner\"\n",
425 | "\n",
426 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
427 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
428 | "\n",
429 | "# Create the output directory if it does not exist\n",
430 | "if os.path.exists(checkpoint_dir):\n",
431 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
432 | "else:\n",
433 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
434 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
435 | " \n",
436 | "cp_callback = ModelCheckpoint(\n",
437 | " checkpoint_path, verbose=1, save_best_only=True, save_weights_only=True)\n",
438 | "\n",
439 | "history = ner_model.fit(train_inputs, train_labels, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,\n",
440 | " callbacks=[cp_callback, f1_score_callback])\n",
441 | "\n",
442 | "print(history.history)"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "plot_graphs(history, 'loss')"
452 | ]
453 | }
454 | ],
455 | "metadata": {
456 | "kernelspec": {
457 | "display_name": "Python 3",
458 | "language": "python",
459 | "name": "python3"
460 | },
461 | "language_info": {
462 | "codemirror_mode": {
463 | "name": "ipython",
464 | "version": 3
465 | },
466 | "file_extension": ".py",
467 | "mimetype": "text/x-python",
468 | "name": "python",
469 | "nbconvert_exporter": "python",
470 | "pygments_lexer": "ipython3",
471 | "version": "3.7.4"
472 | }
473 | },
474 | "nbformat": 4,
475 | "nbformat_minor": 4
476 | }
477 |
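
A minimal sketch of the masked loss used in compute_loss above: positions whose label id is 0 (padding and special tokens) are dropped before the cross-entropy is evaluated; the shapes and values below are toy examples, not the notebook's real data:

import tensorflow as tf

labels = tf.constant([[2, 5, 0, 0]])                  # (batch, seq_len); 0 marks ignored positions
logits = tf.random.uniform((1, 4, 30))                # (batch, seq_len, num_labels)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

active = tf.reshape(labels, (-1,)) != 0               # boolean mask over flattened token positions
active_logits = tf.boolean_mask(tf.reshape(logits, (-1, logits.shape[-1])), active)
active_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active)

per_token_loss = loss_fn(active_labels, active_logits)
print(per_token_loss.shape)                           # (2,): only the two real tokens contribute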
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.2.4.bert_finetune_KorSTS.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Environment Setup"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Data Download"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!mkdir -p data_in/KOR/KorSTS\n",
35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-dev.tsv \\\n",
36 | " -O data_in/KOR/KorSTS/sts-dev.tsv\n",
37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-test.tsv \\\n",
38 | " -O data_in/KOR/KorSTS/sts-test.tsv\n",
39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-train.tsv \\\n",
40 | " -O data_in/KOR/KorSTS/sts-train.tsv"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "scrolled": true
48 | },
49 | "outputs": [],
50 | "source": [
51 | "import os\n",
52 | "import tensorflow as tf\n",
53 | "from transformers import BertTokenizer, TFBertModel\n",
54 | "\n",
55 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
56 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
57 | "\n",
58 | "import pandas as pd\n",
59 | "import matplotlib.pyplot as plt\n",
60 | "\n",
61 | "from tqdm import tqdm\n",
62 | "import numpy as np\n",
63 | "import re"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "# Visualization\n",
73 | "def plot_graphs(history, string):\n",
74 | " plt.plot(history.history[string])\n",
75 | " plt.plot(history.history['val_'+string], '')\n",
76 | " plt.xlabel(\"Epochs\")\n",
77 | " plt.ylabel(string)\n",
78 | " plt.legend([string, 'val_'+string])\n",
79 | " plt.show()"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "SEED_NUM = 1234\n",
89 | "tf.random.set_seed(SEED_NUM)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "scrolled": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\",\n",
101 | " cache_dir='bert_ckpt',\n",
102 | " do_lower_case=False)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "# Fix random seeds\n",
112 | "\n",
113 | "tf.random.set_seed(0)\n",
114 | "np.random.seed(0)\n",
115 | "\n",
116 | "# BASE PARAM\n",
117 | "\n",
118 | "BATCH_SIZE = 32\n",
119 | "NUM_EPOCHS = 3\n",
120 | "VALID_SPLIT = 0.2\n",
121 | "MAX_LEN = 28 * 2 \n",
122 | "\n",
123 | "DATA_IN_PATH = 'data_in/KOR'\n",
124 | "DATA_OUT_PATH = \"data_out/KOR\""
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# Special Tokens\n",
134 | "print(tokenizer.all_special_tokens, \"\\n\", tokenizer.all_special_ids)\n",
135 | "\n",
136 | "# Test Tokenizers\n",
137 | "kor_encode = tokenizer.encode(\"안녕하세요, 반갑습니다\")\n",
138 | "eng_encode = tokenizer.encode(\"Hello world\")\n",
139 | "\n",
140 | "kor_decode = tokenizer.decode(kor_encode)\n",
141 | "eng_decode = tokenizer.decode(eng_encode)\n",
142 | "\n",
143 | "print(kor_encode)\n",
144 | "print(eng_encode)\n",
145 | "print(kor_decode)\n",
146 | "print(eng_decode)"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "# KorSTS Dataset\n",
154 | "\n",
155 | "Data from Kakaobrain: https://github.com/kakaobrain/KorNLUDatasets"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "# Load Train dataset\n",
165 | "\n",
166 | "TRAIN_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-train.tsv')\n",
167 | "DEV_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-dev.tsv')\n",
168 | "\n",
169 | "train_data = pd.read_csv(TRAIN_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n",
170 | "dev_data = pd.read_csv(DEV_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n",
171 | "\n",
172 | "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data), len(dev_data)))"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# Bert Tokenizer\n",
182 | "\n",
183 | "# 참조: https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus\n",
184 | "\n",
185 | "def bert_tokenizer_v2(sent1, sent2, MAX_LEN):\n",
186 | " \n",
187 | "    # For two-sentence input\n",
188 | " \n",
189 | " encoded_dict = tokenizer.encode_plus(\n",
190 | " text = sent1,\n",
191 | " text_pair = sent2,\n",
192 | " add_special_tokens = True, # Add '[CLS]' and '[SEP]'\n",
193 | " max_length = MAX_LEN, # Pad & truncate all sentences.\n",
194 | " pad_to_max_length = True,\n",
195 | " return_attention_mask = True # Construct attn. masks.\n",
196 | " \n",
197 | " )\n",
198 | " \n",
199 | " input_id = encoded_dict['input_ids']\n",
200 | " attention_mask = encoded_dict['attention_mask'] # And its attention mask (simply differentiates padding from non-padding).\n",
201 | " token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences\n",
202 | " \n",
203 | " return input_id, attention_mask, token_type_id"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "# Apart from the tokenizer, the approach follows the one used in Chapter 5\n",
213 | "def clean_text(sent):\n",
214 | " sent_clean = re.sub(\"[^a-zA-Z0-9ㄱ-ㅣ가-힣\\\\s]\", \" \", sent)\n",
215 | " return sent_clean\n",
216 | "\n",
217 | "input_ids = []\n",
218 | "attention_masks = []\n",
219 | "token_type_ids = []\n",
220 | "data_labels = []\n",
221 | "\n",
222 | "\n",
223 | "for sent1, sent2, score in train_data[['sentence1', 'sentence2', 'score']].values:\n",
224 | " try:\n",
225 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(clean_text(sent1), clean_text(sent2), MAX_LEN)\n",
226 | " input_ids.append(input_id)\n",
227 | " attention_masks.append(attention_mask)\n",
228 | " token_type_ids.append(token_type_id)\n",
229 | " data_labels.append(score)\n",
230 | " except Exception as e:\n",
231 | " print(e)\n",
232 | " print(sent1, sent2)\n",
233 | " pass\n",
234 | " \n",
235 | "train_input_ids = np.array(input_ids, dtype=int)\n",
236 | "train_attention_masks = np.array(attention_masks, dtype=int)\n",
237 | "train_type_ids = np.array(token_type_ids, dtype=int)\n",
238 | "train_inputs = (train_input_ids, train_attention_masks, train_type_ids)\n",
239 | "train_data_labels = np.array(data_labels)"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "# DEV SET Preprocessing"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "# Apart from the tokenizer, the approach follows the one used in Chapter 5\n",
256 | "input_ids = []\n",
257 | "attention_masks = []\n",
258 | "token_type_ids = []\n",
259 | "data_labels = []\n",
260 | "\n",
261 | "for sent1, sent2, score in dev_data[['sentence1', 'sentence2', 'score']].values:\n",
262 | " try:\n",
263 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(clean_text(sent1), clean_text(sent2), MAX_LEN)\n",
264 | " input_ids.append(input_id)\n",
265 | " attention_masks.append(attention_mask)\n",
266 | " token_type_ids.append(token_type_id)\n",
267 | " data_labels.append(score)\n",
268 | " except Exception as e:\n",
269 | " print(e)\n",
270 | " print(sent1, sent2)\n",
271 | " pass\n",
272 | " \n",
273 | "dev_input_ids = np.array(input_ids, dtype=int)\n",
274 | "dev_attention_masks = np.array(attention_masks, dtype=int)\n",
275 | "dev_type_ids = np.array(token_type_ids, dtype=int)\n",
276 | "dev_inputs = (dev_input_ids, dev_attention_masks, dev_type_ids)\n",
277 | "dev_data_labels = np.array(data_labels)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "print(\"# train labels: {}, #dev labels: {}\".format(len(train_data_labels), len(dev_data_labels)))"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "class TFBertRegressor(tf.keras.Model):\n",
296 | " def __init__(self, model_name, dir_path, num_class):\n",
297 | " super(TFBertRegressor, self).__init__()\n",
298 | " \n",
299 | " self.bert = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n",
300 | " self.num_class = num_class\n",
301 | " self.dropout = tf.keras.layers.Dropout(self.bert.config.hidden_dropout_prob)\n",
302 | " self.regressor = tf.keras.layers.Dense(self.num_class, \n",
303 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(self.bert.config.initializer_range), \n",
304 | " name=\"regressor\")\n",
305 | " \n",
306 | " \n",
307 | " def call(self, inputs, attention_mask=None, token_type_ids=None, training=False):\n",
308 | " \n",
309 | "        # outputs: sequence_output, pooled_output, (hidden_states), (attentions)\n",
310 | " outputs = self.bert(inputs, attention_mask=attention_mask, token_type_ids=token_type_ids)\n",
311 | " pooled_output = outputs[1]\n",
312 | " pooled_output = self.dropout(pooled_output, training=training)\n",
313 | " logits = self.regressor(pooled_output)\n",
314 | "\n",
315 | " return logits"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {
322 | "scrolled": true
323 | },
324 | "outputs": [],
325 | "source": [
326 | "regression_model = TFBertRegressor(model_name='bert-base-multilingual-cased',\n",
327 | " dir_path='bert_ckpt',\n",
328 | " num_class=1)"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "class PearsonCorrelationMetric(tf.keras.metrics.Metric):\n",
338 | " def __init__(self, name=\"pearson_correlation\", **kwargs):\n",
339 | " super(PearsonCorrelationMetric, self).__init__(name=name, **kwargs)\n",
340 | " self.y_true_list = []\n",
341 | " self.y_pred_list = []\n",
342 | "\n",
343 | " def update_state(self, y_true, y_pred, sample_weight=None):\n",
344 | " y_true = tf.reshape(y_true, shape=[-1])\n",
345 | " y_pred = tf.reshape(y_pred, shape=[-1])\n",
346 | " self.y_true_list.append(y_true)\n",
347 | " self.y_pred_list.append(y_pred)\n",
348 | "\n",
349 | " def result(self):\n",
350 | " y_true = tf.concat(self.y_true_list, -1)\n",
351 | " y_pred = tf.concat(self.y_pred_list, -1)\n",
352 | " pearson_correlation = self.pearson(y_true, y_pred)\n",
353 | " \n",
354 | " return pearson_correlation\n",
355 | "\n",
356 | " def reset_states(self):\n",
357 | " self.y_true_list = []\n",
358 | " self.y_pred_list = []\n",
359 | " \n",
360 | "\n",
361 | " def pearson(self, true, pred):\n",
362 | " m_true = tf.reduce_mean(true)\n",
363 | " m_pred = tf.reduce_mean(pred)\n",
364 | " m_true, m_pred = true-m_true, pred-m_pred\n",
365 | " num = tf.reduce_sum(tf.multiply(m_true, m_pred))\n",
366 | " den = tf.sqrt(tf.multiply(tf.reduce_sum(tf.square(m_true)), tf.reduce_sum(tf.square(m_pred)))) + 1e-12\n",
367 | " return num / den"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {},
374 | "outputs": [],
375 | "source": [
376 | "optimizer = tf.keras.optimizers.Adam(3e-5)\n",
377 | "loss = tf.keras.losses.MeanSquaredError()\n",
378 | "metric = PearsonCorrelationMetric()\n",
379 | "regression_model.compile(optimizer=optimizer, loss=loss, metrics=[metric], run_eagerly=True)"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {
386 | "scrolled": true
387 | },
388 | "outputs": [],
389 | "source": [
390 | "# Run training\n",
391 | "model_name = \"tf2_BERT_KorSTS\"\n",
392 | "\n",
393 | "# overfitting을 막기 위한 ealrystop 추가\n",
394 | "earlystop_callback = EarlyStopping(monitor='val_pearson_correlation', min_delta=0.0001,patience=2,mode='max')\n",
395 | "# min_delta: the threshold that triggers the termination (acc should at least improve 0.0001)\n",
396 | "# patience: no improvment epochs (patience = 1, 1번 이상 상승이 없으면 종료)\\\n",
397 | "\n",
398 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
399 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
400 | "\n",
401 | "# Create path if exists\n",
402 | "if os.path.exists(checkpoint_dir):\n",
403 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
404 | "else:\n",
405 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
406 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
407 | " \n",
408 | "cp_callback = ModelCheckpoint(\n",
409 | " checkpoint_path, monitor='val_pearson_correlation', verbose=1, save_best_only=True, save_weights_only=True,mode='max')\n",
410 | "\n",
411 | "# 학습과 eval 시작\n",
412 | "history = regression_model.fit(train_inputs, train_data_labels, epochs=NUM_EPOCHS,\n",
413 | " validation_data = (dev_inputs, dev_data_labels),\n",
414 | " batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])\n",
415 | "\n",
416 | "#steps_for_epoch\n",
417 | "print(history.history)"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": [
426 | "plot_graphs(history, 'pearson_correlation')"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {},
433 | "outputs": [],
434 | "source": [
435 | "plot_graphs(history, 'loss')"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "metadata": {},
441 | "source": [
442 | "# KorSTS Test dataset"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "# Load Test dataset\n",
452 | "TEST_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-test.tsv')\n",
453 | "\n",
454 | "test_data = pd.read_csv(TEST_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n",
455 | "test_data.head()"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": null,
461 | "metadata": {},
462 | "outputs": [],
463 | "source": [
464 | "# Test set도 똑같은 방법으로 구성한다.\n",
465 | "input_ids = []\n",
466 | "attention_masks = []\n",
467 | "token_type_ids = []\n",
468 | "data_labels = []\n",
469 | "\n",
470 | "for sent1, sent2, score in test_data[['sentence1', 'sentence2', 'score']].values:\n",
471 | " try:\n",
472 | " input_id, attention_mask, token_type_id = bert_tokenizer_v2(clean_text(sent1), clean_text(sent2), MAX_LEN)\n",
473 | " input_ids.append(input_id)\n",
474 | " attention_masks.append(attention_mask)\n",
475 | " token_type_ids.append(token_type_id)\n",
476 | " data_labels.append(score)\n",
477 | " except Exception as e:\n",
478 | " print(e)\n",
479 | " print(sent1, sent2)\n",
480 | " pass\n",
481 | " \n",
482 | "test_input_ids = np.array(input_ids, dtype=int)\n",
483 | "test_attention_masks = np.array(attention_masks, dtype=int)\n",
484 | "test_type_ids = np.array(token_type_ids, dtype=int)\n",
485 | "test_inputs = (test_input_ids, test_attention_masks, test_type_ids)\n",
486 | "test_data_labels = np.array(data_labels)"
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": null,
492 | "metadata": {},
493 | "outputs": [],
494 | "source": [
495 | "print(\"# sents: {}, # labels: {}\".format(len(test_input_ids), len(test_data_labels)))"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": null,
501 | "metadata": {},
502 | "outputs": [],
503 | "source": [
504 | "regression_model.load_weights(checkpoint_path)\n",
505 | "\n",
506 | "results = regression_model.evaluate(test_inputs, test_data_labels, batch_size=512)\n",
507 | "print(\"test loss, test pearson correlation: \", results)"
508 | ]
509 | }
510 | ],
511 | "metadata": {
512 | "kernelspec": {
513 | "display_name": "Python 3",
514 | "language": "python",
515 | "name": "python3"
516 | },
517 | "language_info": {
518 | "codemirror_mode": {
519 | "name": "ipython",
520 | "version": 3
521 | },
522 | "file_extension": ".py",
523 | "mimetype": "text/x-python",
524 | "name": "python",
525 | "nbconvert_exporter": "python",
526 | "pygments_lexer": "ipython3",
527 | "version": "3.7.4"
528 | }
529 | },
530 | "nbformat": 4,
531 | "nbformat_minor": 2
532 | }
533 |
--------------------------------------------------------------------------------
/8.GPT3/8.4.gpt2_p_tuning_NSMC.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "#### 주의!!\n",
8 | "\n",
9 | "이 실습은 가급적 NVIDIA GPU가 설치된 컴퓨터 환경이거나 Google Colab에서 진행해주세요."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {
15 | "id": "2NmYZYYhXrcZ"
16 | },
17 | "source": [
18 | "## 환경 준비\n",
19 | "(Google Colab 환경에서 사용하세요)"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {
26 | "colab": {
27 | "base_uri": "https://localhost:8080/"
28 | },
29 | "id": "6-bFpckCXrcb",
30 | "outputId": "041269a9-fc3e-44f9-cebd-7d26e4bd006f"
31 | },
32 | "outputs": [],
33 | "source": [
34 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
35 | "!pip install -r requirements.txt\n",
36 | "!pip install tensorflow==2.2.0"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {
42 | "id": "cvFHjoTCXrcc"
43 | },
44 | "source": [
45 | "## 데이터 다운로드\n",
46 | "(Google Colab 환경에서 사용하세요)"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "colab": {
54 | "base_uri": "https://localhost:8080/"
55 | },
56 | "id": "HbKNloVoXrcd",
57 | "outputId": "7b70fd06-d1f8-48b2-b316-0c25d432261f"
58 | },
59 | "outputs": [],
60 | "source": [
61 | "!mkdir -p data_in/KOR/naver_movie\n",
62 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_train.txt \\\n",
63 | " -O data_in/KOR/naver_movie/ratings_train.txt\n",
64 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/naver_movie/ratings_test.txt \\\n",
65 | " -O data_in/KOR/naver_movie/ratings_test.txt"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {
72 | "id": "xs88fDX8Xrcd",
73 | "scrolled": true
74 | },
75 | "outputs": [],
76 | "source": [
77 | "import os\n",
78 | "import tensorflow as tf\n",
79 | "from transformers import TFGPT2LMHeadModel\n",
80 | "\n",
81 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
82 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
83 | "\n",
84 | "import gluonnlp as nlp\n",
85 | "from gluonnlp.data import SentencepieceTokenizer\n",
86 | "\n",
87 | "import pandas as pd\n",
88 | "import matplotlib.pyplot as plt\n",
89 | "\n",
90 | "import numpy as np\n",
91 | "import re"
92 | ]
93 | },
94 | {
95 | "cell_type": "markdown",
96 | "metadata": {
97 | "id": "XgV0aK1KXrce"
98 | },
99 | "source": [
100 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "id": "XmofLC_rXrce"
108 | },
109 | "outputs": [],
110 | "source": [
111 | "import wget\n",
112 | "import zipfile\n",
113 | "\n",
114 | "wget.download('https://github.com/NLP-kr/tensorflow-ml-nlp-tf2/releases/download/v1.0/gpt_ckpt.zip')\n",
115 | "\n",
116 | "with zipfile.ZipFile('gpt_ckpt.zip') as z:\n",
117 | " z.extractall()"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {
124 | "id": "TVExOYgEXrcf"
125 | },
126 | "outputs": [],
127 | "source": [
128 | "# 시각화\n",
129 | "\n",
130 | "def plot_graphs(history, string):\n",
131 | " plt.plot(history.history[string])\n",
132 | " plt.plot(history.history['val_'+string], '')\n",
133 | " plt.xlabel(\"Epochs\")\n",
134 | " plt.ylabel(string)\n",
135 | " plt.legend([string, 'val_'+string])\n",
136 | " plt.show()"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "id": "s6dM4ebxXrcg"
144 | },
145 | "outputs": [],
146 | "source": [
147 | "SEED_NUM = 1234\n",
148 | "tf.random.set_seed(SEED_NUM)\n",
149 | "np.random.seed(SEED_NUM)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {
155 | "id": "WQrjLpuV_cnI"
156 | },
157 | "source": [
158 | "## 피-튜닝 모델 구현"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {
165 | "id": "g8V_Qsv3_NVE"
166 | },
167 | "outputs": [],
168 | "source": [
169 | "class TFGPT2PtuningClassifier(tf.keras.Model):\n",
170 | " def __init__(self, dir_path):\n",
171 | " super(TFGPT2PtuningClassifier, self).__init__()\n",
172 | " \n",
173 | " self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)\n",
174 | " self.gpt2.trainable = False\n",
175 | "\n",
176 | " self.prompt_embedding_size = self.gpt2.config.hidden_size\n",
177 | " self.prompt_emgedding = tf.keras.layers.Embedding(2, self.prompt_embedding_size, name='prompt_embedding')\n",
178 | " \n",
179 | " self.bilstm = tf.keras.Sequential(name='prompt_bilstm')\n",
180 | " self.bilstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.prompt_embedding_size, return_sequences=True)))\n",
181 | " self.bilstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self.prompt_embedding_size, return_sequences=True)))\n",
182 | " \n",
183 | " self.mlp = tf.keras.Sequential(name='prompt_mlp')\n",
184 | " self.mlp.add(tf.keras.layers.Dense(self.prompt_embedding_size))\n",
185 | " self.mlp.add(tf.keras.layers.ReLU())\n",
186 | " self.mlp.add(tf.keras.layers.Dense(self.prompt_embedding_size))\n",
187 | "\n",
188 | " def generate_prompt_input(self, inputs_ids):\n",
189 | " inputs_embeds = self.gpt2.transformer.wte(inputs_ids[:, 1:-1])\n",
190 | "\n",
191 | " prompt_indexs = tf.concat([inputs_ids[:, 0:1], inputs_ids[:, -1:]], axis=-1)\n",
192 | " prompt_embeds = self.prompt_emgedding(prompt_indexs)\n",
193 | " prompt_embeds = self.bilstm(prompt_embeds)\n",
194 | " prompt_embeds = self.mlp(prompt_embeds)\n",
195 | " \n",
196 | " prompt_updated_inputs = tf.concat([prompt_embeds[:, 0:1, :], inputs_embeds, \n",
197 | " prompt_embeds[:, 1:, :]],\n",
198 | " axis=1)\n",
199 | " \n",
200 | " return prompt_updated_inputs\n",
201 | " \n",
202 | " def call(self, inputs):\n",
203 | " input_ids = inputs[0]\n",
204 | " attention_mask = inputs[1] if len(inputs) > 1 else None\n",
205 | "\n",
206 | " inputs_embeds = self.generate_prompt_input(input_ids)\n",
207 | " last_hidden_states, _ = self.gpt2({'inputs_ids': None, 'inputs_embeds': inputs_embeds, 'attention_mask': attention_mask})\n",
208 | " output = last_hidden_states[:, -1, :]\n",
209 | "\n",
210 | " return outputs"
211 | ]
212 | },
213 | {
214 | "cell_type": "markdown",
215 | "metadata": {
216 | "id": "pCN8Lh7gXrch"
217 | },
218 | "source": [
219 | "## 피-튜닝을 위한 네이버 영화 리뷰 데이터 전처리"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {
226 | "id": "lr76g28XA1BP"
227 | },
228 | "outputs": [],
229 | "source": [
230 | "BATCH_SIZE = 32\n",
231 | "NUM_EPOCHS = 3\n",
232 | "VALID_SPLIT = 0.1\n",
233 | "SENT_MAX_LEN = 39"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {
240 | "id": "lAaKKUqbXrch"
241 | },
242 | "outputs": [],
243 | "source": [
244 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n",
245 | "\n",
246 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH)\n",
247 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n",
248 | " mask_token=None,\n",
249 | " sep_token='',\n",
250 | " cls_token=None,\n",
251 | " unknown_token='',\n",
252 | " padding_token='',\n",
253 | " bos_token='',\n",
254 | " eos_token='')"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {
261 | "id": "6DlePiINXrch"
262 | },
263 | "outputs": [],
264 | "source": [
265 | "DATA_IN_PATH = './data_in/KOR'\n",
266 | "DATA_OUT_PATH = \"./data_out/KOR\"\n",
267 | "\n",
268 | "DATA_TRAIN_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_train.txt\")\n",
269 | "DATA_TEST_PATH = os.path.join(DATA_IN_PATH, \"naver_movie\", \"ratings_test.txt\")\n",
270 | "\n",
271 | "train_data = pd.read_csv(DATA_TRAIN_PATH, header = 0, delimiter = '\\t', quoting = 3)\n",
272 | "train_data = train_data.dropna()"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {
279 | "id": "4GKNnSYuXrcj"
280 | },
281 | "outputs": [],
282 | "source": [
283 | "# train_data = train_data[:50] # for test\n",
284 | "\n",
285 | "def clean_text(sent):\n",
286 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \"\", sent)\n",
287 | " return sent_clean\n",
288 | "\n",
289 | "def add_prompt_token(tokens):\n",
290 | " return [0] + tokens + [1]\n",
291 | "\n",
292 | "train_data_sents = []\n",
293 | "train_attn_mask = []\n",
294 | "train_data_labels = []\n",
295 | "\n",
296 | "for train_sent, train_label in train_data[['document', 'label']].values:\n",
297 | " train_text_label = '긍정' if train_label == 1 else '부정'\n",
298 | "\n",
299 | " train_tokenized_text = vocab[tokenizer(clean_text(train_sent))]\n",
300 | "\n",
301 | " tokens = [vocab[vocab.bos_token]] \n",
302 | " tokens += pad_sequences([train_tokenized_text], \n",
303 | " SENT_MAX_LEN, \n",
304 | " value=vocab[vocab.padding_token], \n",
305 | " padding='post').tolist()[0] \n",
306 | " tokens = add_prompt_token(tokens)\n",
307 | "\n",
308 | " train_attn_mask.append([1 if t != 3 else 0 for t in tokens])\n",
309 | " train_data_sents.append(tokens)\n",
310 | "\n",
311 | " label = vocab[tokenizer('긍정')] if train_label == 1 else vocab[tokenizer('부정')]\n",
312 | " train_data_labels.append(label)\n",
313 | "\n",
314 | "\n",
315 | "train_attn_mask = np.array(train_attn_mask, dtype=np.int64)\n",
316 | "train_data_sents = np.array(train_data_sents, dtype=np.int64)\n",
317 | "train_data_labels = np.array(train_data_labels, dtype=np.int64)"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": null,
323 | "metadata": {
324 | "colab": {
325 | "base_uri": "https://localhost:8080/"
326 | },
327 | "id": "c-w5GU2IxkWv",
328 | "outputId": "29c84da5-4cdd-47ca-e575-77bcff453233"
329 | },
330 | "outputs": [],
331 | "source": [
332 | "print('입력 토큰 인덱스: ', train_data_sents[0])\n",
333 | "print('어텐션 마스크: ', train_attn_mask[0])\n",
334 | "print('정답 라벨: ', train_data_labels[0])"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {
340 | "id": "12MlbiqIXrcj"
341 | },
342 | "source": [
343 | "## 네이버 영화 리뷰 감정 분석을 위한 피-튜닝 학습 "
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": null,
349 | "metadata": {
350 | "colab": {
351 | "base_uri": "https://localhost:8080/"
352 | },
353 | "id": "9J5VOzCwXrcj",
354 | "outputId": "c8eecfd7-6e68-4b14-f939-a5c0934ebd04"
355 | },
356 | "outputs": [],
357 | "source": [
358 | "BASE_MODEL_PATH = './gpt_ckpt'\n",
359 | "cls_model = TFGPT2PtuningClassifier(dir_path=BASE_MODEL_PATH)"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {
366 | "id": "FAKyQBJ_Xrck"
367 | },
368 | "outputs": [],
369 | "source": [
370 | "optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)\n",
371 | "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
372 | "metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')\n",
373 | "cls_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": null,
379 | "metadata": {
380 | "colab": {
381 | "base_uri": "https://localhost:8080/"
382 | },
383 | "id": "YCNdkkALXrck",
384 | "outputId": "23f2964e-5793-4518-fc7d-f53e0c056d52"
385 | },
386 | "outputs": [],
387 | "source": [
388 | "model_name = \"tf2_gpt2_ptuning_naver_movie\"\n",
389 | "\n",
390 | "earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=2)\n",
391 | "\n",
392 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
393 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
394 | "\n",
395 | "if os.path.exists(checkpoint_dir):\n",
396 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
397 | "else:\n",
398 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
399 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
400 | " \n",
401 | "cp_callback = ModelCheckpoint(\n",
402 | " checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)\n",
403 | "\n",
404 | "history = cls_model.fit((train_data_sents, train_attn_mask), train_data_labels, \n",
405 | " epochs=NUM_EPOCHS, \n",
406 | " batch_size=BATCH_SIZE,\n",
407 | " validation_split=VALID_SPLIT, \n",
408 | " callbacks=[earlystop_callback, cp_callback])"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": null,
414 | "metadata": {
415 | "colab": {
416 | "base_uri": "https://localhost:8080/",
417 | "height": 279
418 | },
419 | "id": "J8s2xkMcXrck",
420 | "outputId": "07b8d787-7bd0-46cc-e1ee-8a1ce00dea70"
421 | },
422 | "outputs": [],
423 | "source": [
424 | "plot_graphs(history, 'accuracy')"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": null,
430 | "metadata": {
431 | "colab": {
432 | "base_uri": "https://localhost:8080/",
433 | "height": 279
434 | },
435 | "id": "sWVxJEbRXrcl",
436 | "outputId": "f71c7219-b11b-4bd7-bffe-c624b7736279"
437 | },
438 | "outputs": [],
439 | "source": [
440 | "plot_graphs(history, 'loss')"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {
446 | "id": "4jFe7XMeXrcl"
447 | },
448 | "source": [
449 | "## 네이버 영화 리뷰 모델 피-튜닝 테스트\n"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {
456 | "colab": {
457 | "base_uri": "https://localhost:8080/",
458 | "height": 206
459 | },
460 | "id": "za_BFNJsXrcl",
461 | "outputId": "16cdfa32-acd1-48be-88a6-66225338f537"
462 | },
463 | "outputs": [],
464 | "source": [
465 | "test_data = pd.read_csv(DATA_TEST_PATH, header=0, delimiter='\\t', quoting=3)\n",
466 | "test_data = test_data.dropna()\n",
467 | "test_data.head()"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "metadata": {
474 | "id": "ZvJV4mOcXrcl"
475 | },
476 | "outputs": [],
477 | "source": [
478 | "# test_data = test_data[:50] # for test\n",
479 | "\n",
480 | "test_data_sents = []\n",
481 | "test_data_labels = []\n",
482 | "test_attn_mask = []\n",
483 | "\n",
484 | "pred_tokens = []\n",
485 | "\n",
486 | "\n",
487 | "for test_sent, test_label in test_data[['document', 'label']].values:\n",
488 | " test_tokenized_text = vocab[tokenizer(clean_text(test_sent))]\n",
489 | "\n",
490 | " tokens = []\n",
491 | " tokens += pad_sequences([test_tokenized_text], \n",
492 | " SENT_MAX_LEN, \n",
493 | " value=vocab[vocab.padding_token], \n",
494 | " padding='post').tolist()[0] \n",
495 | " tokens = add_prompt_token(tokens)\n",
496 | " test_data_sents.append(tokens)\n",
497 | " mask = [1 if t != 3 else 0 for t in tokens]\n",
498 | " test_attn_mask.append(mask)\n",
499 | "\n",
500 | " label = vocab[tokenizer('긍정')] if test_label == 1 else vocab[tokenizer('부정')]\n",
501 | " test_data_labels.append(label)\n",
502 | " \n",
503 | "test_attn_mask = np.array(test_attn_mask, dtype=np.int64)\n",
504 | "test_data_sents = np.array(test_data_sents, dtype=np.int64)\n",
505 | "test_data_labels = np.array(test_data_labels, dtype=np.int64)"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "metadata": {
512 | "colab": {
513 | "base_uri": "https://localhost:8080/"
514 | },
515 | "id": "lrHok3-CXrcl",
516 | "outputId": "908bef9c-133b-4cb0-a9e1-baa7cbc221e7"
517 | },
518 | "outputs": [],
519 | "source": [
520 | "print(\"num sents, labels {}, {}\".format(len(test_data_sents), len(test_data_labels)))"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": null,
526 | "metadata": {
527 | "colab": {
528 | "base_uri": "https://localhost:8080/"
529 | },
530 | "id": "hAHf4b0JXrcm",
531 | "outputId": "9b26654e-c5ea-439e-ed33-4508c9add548"
532 | },
533 | "outputs": [],
534 | "source": [
535 | "cls_model.load_weights(checkpoint_path)\n",
536 | "\n",
537 | "results = cls_model.evaluate((test_data_sents, test_attn_mask), test_data_labels, batch_size=1024)\n",
538 | "print(\"test loss, test acc: \", results)"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": null,
544 | "metadata": {
545 | "id": "Ns83PcVeDGq3"
546 | },
547 | "outputs": [],
548 | "source": []
549 | }
550 | ],
551 | "metadata": {
552 | "accelerator": "GPU",
553 | "colab": {
554 | "collapsed_sections": [],
555 | "machine_shape": "hm",
556 | "name": "7.4.2.gpt2_ptune_w_mask_NSMC.ipynb",
557 | "provenance": []
558 | },
559 | "kernelspec": {
560 | "display_name": "Python 3",
561 | "language": "python",
562 | "name": "python3"
563 | },
564 | "language_info": {
565 | "codemirror_mode": {
566 | "name": "ipython",
567 | "version": 3
568 | },
569 | "file_extension": ".py",
570 | "mimetype": "text/x-python",
571 | "name": "python",
572 | "nbconvert_exporter": "python",
573 | "pygments_lexer": "ipython3",
574 | "version": "3.8.3"
575 | }
576 | },
577 | "nbformat": 4,
578 | "nbformat_minor": 4
579 | }
580 |
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.4.4.gpt2_finetune_KorSTS.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 환경 준비"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## 데이터 다운로드"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "!mkdir -p data_in/KOR/KorSTS\n",
35 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-dev.tsv \\\n",
36 | " -O data_in/KOR/KorSTS/sts-dev.tsv\n",
37 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-test.tsv \\\n",
38 | " -O data_in/KOR/KorSTS/sts-test.tsv\n",
39 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/7.PRETRAIN_METHOD/data_in/KOR/KorSTS/sts-train.tsv \\\n",
40 | " -O data_in/KOR/KorSTS/sts-train.tsv"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "import os\n",
50 | "import tensorflow as tf\n",
51 | "from transformers import *\n",
52 | "\n",
53 | "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
54 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
55 | "\n",
56 | "import gluonnlp as nlp\n",
57 | "from gluonnlp.data import SentencepieceTokenizer\n",
58 | "\n",
59 | "import pandas as pd\n",
60 | "import matplotlib.pyplot as plt\n",
61 | "\n",
62 | "from tqdm import tqdm\n",
63 | "import numpy as np\n",
64 | "import re"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "아레 실행 커멘드는 gpt_ckpt 폴더가 있지 않은 경우에만 실행해주세요."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "!wget https://www.dropbox.com/s/nzfa9xpzm4edp6o/gpt_ckpt.zip -O gpt_ckpt.zip\n",
81 | "!unzip -o gpt_ckpt.zip"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "# 시각화\n",
91 | "\n",
92 | "def plot_graphs(history, string):\n",
93 | " plt.plot(history.history[string])\n",
94 | " plt.plot(history.history['val_'+string], '')\n",
95 | " plt.xlabel(\"Epochs\")\n",
96 | " plt.ylabel(string)\n",
97 | " plt.legend([string, 'val_'+string])\n",
98 | " plt.show()"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "SEED_NUM = 1234\n",
108 | "tf.random.set_seed(SEED_NUM)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "TOKENIZER_PATH = './gpt_ckpt/gpt2_kor_tokenizer.spiece'\n",
118 | "\n",
119 | "tokenizer = SentencepieceTokenizer(TOKENIZER_PATH, alpha=0)\n",
120 | "vocab = nlp.vocab.BERTVocab.from_sentencepiece(TOKENIZER_PATH,\n",
121 | " mask_token=None,\n",
122 | " sep_token='',\n",
123 | " cls_token=None,\n",
124 | " unknown_token='',\n",
125 | " padding_token='',\n",
126 | " bos_token='',\n",
127 | " eos_token='')"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "# KoSTS Simliarity "
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "tf.random.set_seed(0)\n",
144 | "np.random.seed(0)\n",
145 | "\n",
146 | "BATCH_SIZE = 10\n",
147 | "NUM_EPOCHS = 3\n",
148 | "VALID_SPLIT = 0.2\n",
149 | "SENT_MAX_LEN = 14\n",
150 | "\n",
151 | "DATA_IN_PATH = 'data_in/KOR'\n",
152 | "DATA_OUT_PATH = \"data_out/KOR\""
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "# Load Train dataset\n",
162 | "\n",
163 | "TRAIN_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-train.tsv')\n",
164 | "DEV_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-dev.tsv')\n",
165 | "\n",
166 | "train_data = pd.read_csv(TRAIN_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n",
167 | "dev_data = pd.read_csv(DEV_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n",
168 | "\n",
169 | "train_data = train_data.dropna()\n",
170 | "\n",
171 | "dev_data = dev_data.dropna()\n",
172 | "\n",
173 | "print(\"Total # dataset: train - {}, dev - {}\".format(len(train_data), len(dev_data)))"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "# 토크나이저를 제외하고는 5장에서 처리한 방식과 유사하게 접근\n",
183 | "def clean_text(sent):\n",
184 | " sent_clean = re.sub(\"[^가-힣ㄱ-ㅎㅏ-ㅣ\\\\s]\", \" \", sent)\n",
185 | " return sent_clean\n",
186 | "\n",
187 | "train_data_sents1 = []\n",
188 | "train_data_sents2 = []\n",
189 | "train_labels = []\n",
190 | "\n",
191 | "\n",
192 | "for sent1, sent2, score in train_data[['sentence1', 'sentence2', 'score']].values:\n",
193 | " train_tokenized_sent_1 = vocab[tokenizer(clean_text(sent1))]\n",
194 | " train_tokenized_sent_2 = vocab[tokenizer(clean_text(sent2))]\n",
195 | " tokens1 = [vocab[vocab.bos_token]] \n",
196 | " tokens1 += pad_sequences([train_tokenized_sent_1], \n",
197 | " SENT_MAX_LEN, \n",
198 | " value=vocab[vocab.padding_token], \n",
199 | " padding='post').tolist()[0] \n",
200 | " tokens1 += [vocab[vocab.sep_token]] \n",
201 | " tokens1 += pad_sequences([train_tokenized_sent_2], \n",
202 | " SENT_MAX_LEN, \n",
203 | " value=vocab[vocab.padding_token], \n",
204 | " padding='post').tolist()[0] \n",
205 | " tokens1 += [vocab[vocab.eos_token]]\n",
206 | " tokens2 = [vocab[vocab.bos_token]] \n",
207 | " tokens2 += pad_sequences([train_tokenized_sent_2], \n",
208 | " SENT_MAX_LEN, \n",
209 | " value=vocab[vocab.padding_token], \n",
210 | " padding='post').tolist()[0] \n",
211 | " tokens2 += [vocab[vocab.sep_token]] \n",
212 | " tokens2 += pad_sequences([train_tokenized_sent_1], \n",
213 | " SENT_MAX_LEN, \n",
214 | " value=vocab[vocab.padding_token], \n",
215 | " padding='post').tolist()[0] \n",
216 | " tokens2 += [vocab[vocab.eos_token]]\n",
217 | " \n",
218 | " train_data_sents1.append(tokens1)\n",
219 | " train_data_sents2.append(tokens2)\n",
220 | " train_labels.append(score)\n",
221 | "\n",
222 | "train_data_sents1 = np.array(train_data_sents1, dtype=np.int64)\n",
223 | "train_data_sents2 = np.array(train_data_sents2, dtype=np.int64)\n",
224 | "train_data_sents = (train_data_sents1, train_data_sents2)\n",
225 | "train_data_labels = np.array(train_labels)"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": null,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "dev_data_sents1 = []\n",
235 | "dev_data_sents2 = []\n",
236 | "dev_labels = []\n",
237 | "\n",
238 | "\n",
239 | "for sent1, sent2, score in dev_data[['sentence1', 'sentence2', 'score']].values:\n",
240 | " dev_tokenized_sent_1 = vocab[tokenizer(clean_text(sent1))]\n",
241 | " dev_tokenized_sent_2 = vocab[tokenizer(clean_text(sent2))]\n",
242 | " tokens1 = [vocab[vocab.bos_token]] \n",
243 | " tokens1 += pad_sequences([dev_tokenized_sent_1], \n",
244 | " SENT_MAX_LEN, \n",
245 | " value=vocab[vocab.padding_token], \n",
246 | " padding='post').tolist()[0] \n",
247 | " tokens1 += [vocab[vocab.sep_token]] \n",
248 | " tokens1 += pad_sequences([dev_tokenized_sent_2], \n",
249 | " SENT_MAX_LEN, \n",
250 | " value=vocab[vocab.padding_token], \n",
251 | " padding='post').tolist()[0] \n",
252 | " tokens1 += [vocab[vocab.eos_token]]\n",
253 | " tokens2 = [vocab[vocab.bos_token]] \n",
254 | " tokens2 += pad_sequences([dev_tokenized_sent_2], \n",
255 | " SENT_MAX_LEN, \n",
256 | " value=vocab[vocab.padding_token], \n",
257 | " padding='post').tolist()[0] \n",
258 | " tokens2 += [vocab[vocab.sep_token]] \n",
259 | " tokens2 += pad_sequences([dev_tokenized_sent_1], \n",
260 | " SENT_MAX_LEN, \n",
261 | " value=vocab[vocab.padding_token], \n",
262 | " padding='post').tolist()[0] \n",
263 | " tokens2 += [vocab[vocab.eos_token]]\n",
264 | " \n",
265 | " dev_data_sents1.append(tokens1)\n",
266 | " dev_data_sents2.append(tokens2)\n",
267 | " dev_labels.append(score)\n",
268 | "\n",
269 | "dev_data_sents1 = np.array(dev_data_sents1, dtype=np.int64)\n",
270 | "dev_data_sents2 = np.array(dev_data_sents2, dtype=np.int64)\n",
271 | "dev_data_sents = (dev_data_sents1, dev_data_sents2)\n",
272 | "dev_data_labels = np.array(dev_labels)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "print(\"Shape of dataset: train - ({}, {}), dev - ({}, {})\".format(train_data_sents[0].shape, train_data_sents[1].shape, dev_data_sents[0].shape, dev_data_sents[1].shape))"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": null,
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "class TFGPT2Regressor(tf.keras.Model):\n",
291 | " def __init__(self, dir_path, num_class):\n",
292 | " super(TFGPT2Regressor, self).__init__()\n",
293 | " \n",
294 | " self.gpt2 = TFGPT2Model.from_pretrained(dir_path)\n",
295 | " self.num_class = num_class\n",
296 | " self.dropout = tf.keras.layers.Dropout(self.gpt2.config.summary_first_dropout)\n",
297 | " self.regressor = tf.keras.layers.Dense(self.num_class, \n",
298 | " kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=self.gpt2.config.initializer_range), \n",
299 | " name=\"regressior\")\n",
300 | " \n",
301 | " def call(self, inputs):\n",
302 | " outputs1 = self.gpt2(inputs[0])\n",
303 | " outputs2 = self.gpt2(inputs[1])\n",
304 | " outputs = outputs1[0] + outputs2[0]\n",
305 | " pooled_output = outputs[:, -1, :]\n",
306 | "\n",
307 | " pooled_output = self.dropout(pooled_output)\n",
308 | " logits = self.regressor(pooled_output)\n",
309 | "\n",
310 | " return logits"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "regression_model = TFGPT2Regressor('./gpt_ckpt', 1)"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "class PearsonCorrelationMetric(tf.keras.metrics.Metric):\n",
329 | " def __init__(self, name=\"pearson_correlation\", **kwargs):\n",
330 | " super(PearsonCorrelationMetric, self).__init__(name=name, **kwargs)\n",
331 | " self.y_true_list = []\n",
332 | " self.y_pred_list = []\n",
333 | "\n",
334 | " def update_state(self, y_true, y_pred, sample_weight=None):\n",
335 | " y_true = tf.reshape(y_true, shape=[-1])\n",
336 | " y_pred = tf.reshape(y_pred, shape=[-1])\n",
337 | " self.y_true_list.append(y_true)\n",
338 | " self.y_pred_list.append(y_pred)\n",
339 | "\n",
340 | " def result(self):\n",
341 | " y_true = tf.concat(self.y_true_list, -1)\n",
342 | " y_pred = tf.concat(self.y_pred_list, -1)\n",
343 | " pearson_correlation = self.pearson(y_true, y_pred)\n",
344 | " \n",
345 | " return pearson_correlation\n",
346 | "\n",
347 | " def reset_states(self):\n",
348 | " self.y_true_list = []\n",
349 | " self.y_pred_list = []\n",
350 | " \n",
351 | "\n",
352 | " def pearson(self, true, pred):\n",
353 | " m_true = tf.reduce_mean(true)\n",
354 | " m_pred = tf.reduce_mean(pred)\n",
355 | " m_true, m_pred = true-m_true, pred-m_pred\n",
356 | " r_num = tf.reduce_sum(tf.multiply(m_true, m_pred))\n",
357 | " r_den = tf.sqrt(tf.multiply(tf.reduce_sum(tf.square(m_true)), tf.reduce_sum(tf.square(m_pred)))) + 1e-12\n",
358 | " return r_num / r_den"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {},
365 | "outputs": [],
366 | "source": [
367 | "optimizer = tf.keras.optimizers.Adam(6.25e-5)\n",
368 | "loss = tf.keras.losses.MeanSquaredError()\n",
369 | "metric = PearsonCorrelationMetric()\n",
370 | "regression_model.compile(optimizer=optimizer, loss=loss, metrics=[metric], run_eagerly=True)"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {
377 | "scrolled": true
378 | },
379 | "outputs": [],
380 | "source": [
381 | "model_name = \"tf2_gpt_korsts\"\n",
382 | "\n",
383 | "earlystop_callback = EarlyStopping(monitor='val_pearson_correlation', min_delta=0.0001,patience=3,mode='max')\n",
384 | "\n",
385 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
386 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
387 | "\n",
388 | "if os.path.exists(checkpoint_dir):\n",
389 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
390 | "else:\n",
391 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
392 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
393 | " \n",
394 | "cp_callback = ModelCheckpoint(\n",
395 | " checkpoint_path, monitor='val_pearson_correlation', verbose=1, save_best_only=True, save_weights_only=True,mode='max')\n",
396 | "\n",
397 | "history = regression_model.fit(train_data_sents, train_data_labels, epochs=NUM_EPOCHS,\n",
398 | " validation_data = (dev_data_sents, dev_data_labels),\n",
399 | " batch_size=BATCH_SIZE, callbacks=[earlystop_callback, cp_callback])"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": null,
405 | "metadata": {},
406 | "outputs": [],
407 | "source": [
408 | "plot_graphs(history, 'pearson_correlation')"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": null,
414 | "metadata": {
415 | "scrolled": false
416 | },
417 | "outputs": [],
418 | "source": [
419 | "plot_graphs(history, 'loss')"
420 | ]
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | "# KorSTSTEST"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {
433 | "scrolled": true
434 | },
435 | "outputs": [],
436 | "source": [
437 | "# Load Test dataset\n",
438 | "TEST_STS_DF = os.path.join(DATA_IN_PATH, 'KorSTS', 'sts-test.tsv')\n",
439 | "\n",
440 | "test_data = pd.read_csv(TEST_STS_DF, header=0, delimiter = '\\t', quoting = 3)\n",
441 | "test_data = test_data.dropna()\n",
442 | "test_data.head()"
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "metadata": {},
449 | "outputs": [],
450 | "source": [
451 | "test_data_sents1 = []\n",
452 | "test_data_sents2 = []\n",
453 | "test_labels = []\n",
454 | "\n",
455 | "\n",
456 | "for sent1, sent2, score in test_data[['sentence1', 'sentence2', 'score']].values:\n",
457 | " test_tokenized_sent_1 = vocab[tokenizer(clean_text(sent1))]\n",
458 | " test_tokenized_sent_2 = vocab[tokenizer(clean_text(sent2))]\n",
459 | " tokens1 = [vocab[vocab.bos_token]] \n",
460 | " tokens1 += pad_sequences([test_tokenized_sent_1], \n",
461 | " SENT_MAX_LEN, \n",
462 | " value=vocab[vocab.padding_token], \n",
463 | " padding='post').tolist()[0] \n",
464 | " tokens1 += [vocab[vocab.sep_token]] \n",
465 | " tokens1 += pad_sequences([test_tokenized_sent_2], \n",
466 | " SENT_MAX_LEN, \n",
467 | " value=vocab[vocab.padding_token], \n",
468 | " padding='post').tolist()[0] \n",
469 | " tokens1 += [vocab[vocab.eos_token]]\n",
470 | " tokens2 = [vocab[vocab.bos_token]] \n",
471 | " tokens2 += pad_sequences([test_tokenized_sent_2], \n",
472 | " SENT_MAX_LEN, \n",
473 | " value=vocab[vocab.padding_token], \n",
474 | " padding='post').tolist()[0] \n",
475 | " tokens2 += [vocab[vocab.sep_token]] \n",
476 | " tokens2 += pad_sequences([test_tokenized_sent_1], \n",
477 | " SENT_MAX_LEN, \n",
478 | " value=vocab[vocab.padding_token], \n",
479 | " padding='post').tolist()[0] \n",
480 | " tokens2 += [vocab[vocab.eos_token]]\n",
481 | " \n",
482 | " test_data_sents1.append(tokens1)\n",
483 | " test_data_sents2.append(tokens2)\n",
484 | " test_labels.append(score)\n",
485 | "\n",
486 | "test_data_sents1 = np.array(test_data_sents1, dtype=np.int64)\n",
487 | "test_data_sents2 = np.array(test_data_sents2, dtype=np.int64)\n",
488 | "test_data_sents = (test_data_sents1, test_data_sents2)\n",
489 | "test_data_labels = np.array(test_labels)"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "metadata": {},
496 | "outputs": [],
497 | "source": [
498 | "print(\"# sents: {}, # labels: {}\".format(len(test_data_sents), len(test_data_labels)))"
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": null,
504 | "metadata": {},
505 | "outputs": [],
506 | "source": [
507 | "regression_model.load_weights(checkpoint_path)\n",
508 | "\n",
509 | "results = regression_model.evaluate(test_data_sents, test_data_labels, batch_size=512)\n",
510 | "print(\"test loss, test pearson correlation: \", results)"
511 | ]
512 | },
513 | {
514 | "cell_type": "code",
515 | "execution_count": null,
516 | "metadata": {},
517 | "outputs": [],
518 | "source": []
519 | }
520 | ],
521 | "metadata": {
522 | "kernelspec": {
523 | "display_name": "Python 3",
524 | "language": "python",
525 | "name": "python3"
526 | },
527 | "language_info": {
528 | "codemirror_mode": {
529 | "name": "ipython",
530 | "version": 3
531 | },
532 | "file_extension": ".py",
533 | "mimetype": "text/x-python",
534 | "name": "python",
535 | "nbconvert_exporter": "python",
536 | "pygments_lexer": "ipython3",
537 | "version": "3.7.4"
538 | }
539 | },
540 | "nbformat": 4,
541 | "nbformat_minor": 2
542 | }
543 |
--------------------------------------------------------------------------------
/7.PRETRAIN_METHOD/7.2.5.bert_finetune_KorQuAD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 환경 준비"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "!wget https://raw.githubusercontent.com/NLP-kr/tensorflow-ml-nlp-tf2/master/requirements.txt -O requirements.txt\n",
17 | "!pip install -r requirements.txt\n",
18 | "!pip install tensorflow==2.2.0"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "colab": {},
26 | "colab_type": "code",
27 | "executionInfo": {
28 | "elapsed": 12607,
29 | "status": "ok",
30 | "timestamp": 1594010753269,
31 | "user": {
32 | "displayName": "ChangWook Jun",
33 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
34 | "userId": "00685987924881157185"
35 | },
36 | "user_tz": -540
37 | },
38 | "id": "B9WLyWEWgdDR"
39 | },
40 | "outputs": [],
41 | "source": [
42 | "import os\n",
43 | "import re\n",
44 | "import json\n",
45 | "import string\n",
46 | "import numpy as np\n",
47 | "import tensorflow as tf\n",
48 | "from tensorflow import keras\n",
49 | "from tensorflow.keras import layers\n",
50 | "from tokenizers import BertWordPieceTokenizer\n",
51 | "from transformers import BertTokenizer, TFBertModel\n",
52 | "\n",
53 | "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
54 | "import matplotlib.pyplot as plt\n",
55 | "import urllib\n",
56 | "\n",
57 | "MAX_LEN = 384\n",
58 | "EPOCHS = 3\n",
59 | "VERBOSE = 2\n",
60 | "BATCH_SIZE = 16"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {
67 | "colab": {},
68 | "colab_type": "code",
69 | "executionInfo": {
70 | "elapsed": 556,
71 | "status": "ok",
72 | "timestamp": 1594010762115,
73 | "user": {
74 | "displayName": "ChangWook Jun",
75 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
76 | "userId": "00685987924881157185"
77 | },
78 | "user_tz": -540
79 | },
80 | "id": "68HVB3dYgi0w"
81 | },
82 | "outputs": [],
83 | "source": [
84 | "DATA_OUT_PATH = './data_out/KOR'"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "colab": {},
92 | "colab_type": "code",
93 | "executionInfo": {
94 | "elapsed": 639,
95 | "status": "ok",
96 | "timestamp": 1594010763471,
97 | "user": {
98 | "displayName": "ChangWook Jun",
99 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
100 | "userId": "00685987924881157185"
101 | },
102 | "user_tz": -540
103 | },
104 | "id": "zvoswBdyglTQ"
105 | },
106 | "outputs": [],
107 | "source": [
108 | "def plot_graphs(history, string, string_1, string_2):\n",
109 | " # loss \n",
110 | " plt.plot(history.history[string])\n",
111 | " plt.plot(history.history[string_1])\n",
112 | " plt.plot(history.history[string_2])\n",
113 | " plt.xlabel(\"Epochs\")\n",
114 | " plt.ylabel(string)\n",
115 | " plt.legend([string, string_1, string_2])\n",
116 | " plt.show()"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {},
123 | "outputs": [],
124 | "source": [
125 | "SEED_NUM = 1234\n",
126 | "tf.random.set_seed(SEED_NUM)\n",
127 | "np.random.seed(SEED_NUM)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {
134 | "colab": {
135 | "base_uri": "https://localhost:8080/",
136 | "height": 65,
137 | "referenced_widgets": [
138 | "bc7f3c579a324f77811bdd6ad6dd7dc0",
139 | "e31de13423d743e68d6c451d23c93cdf",
140 | "f8f80478dfca4894ac1ff8c2a082f734",
141 | "3be3c9704e934fb5a3d5847749d398ce",
142 | "2c0ecef646d44a0580cacefa5c3fd9f2",
143 | "1fde406732df4b5b90b7701dc7e4981e",
144 | "f58154a65f974e04bcf8af24b2884fdd",
145 | "a7d4d0c48cda4abdb106a6bcfb24359e"
146 | ]
147 | },
148 | "colab_type": "code",
149 | "executionInfo": {
150 | "elapsed": 1217,
151 | "status": "ok",
152 | "timestamp": 1594010812799,
153 | "user": {
154 | "displayName": "ChangWook Jun",
155 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
156 | "userId": "00685987924881157185"
157 | },
158 | "user_tz": -540
159 | },
160 | "id": "HDI_cm3sgm6N",
161 | "outputId": "33078a97-0007-428b-9439-b67bd53cd994"
162 | },
163 | "outputs": [],
164 | "source": [
165 | "# Save the slow pretrained tokenizer\n",
166 | "slow_tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\", lowercase=False)\n",
167 | "save_path = \"bert-base-multilingual-cased/\"\n",
168 | "if not os.path.exists(save_path):\n",
169 | " os.makedirs(save_path)\n",
170 | "slow_tokenizer.save_pretrained(save_path)\n",
171 | "\n",
172 | "# Load the fast tokenizer from saved file\n",
173 | "tokenizer = BertWordPieceTokenizer(\"bert-base-multilingual-cased/vocab.txt\", lowercase=False)"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {
180 | "colab": {
181 | "base_uri": "https://localhost:8080/",
182 | "height": 83
183 | },
184 | "colab_type": "code",
185 | "executionInfo": {
186 | "elapsed": 1750,
187 | "status": "ok",
188 | "timestamp": 1594010820826,
189 | "user": {
190 | "displayName": "ChangWook Jun",
191 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
192 | "userId": "00685987924881157185"
193 | },
194 | "user_tz": -540
195 | },
196 | "id": "an5cGi-GgpG4",
197 | "outputId": "c7753a24-f338-4a6d-8701-f78753f9b718"
198 | },
199 | "outputs": [],
200 | "source": [
201 | "train_data_url = \"https://korquad.github.io/dataset/KorQuAD_v1.0_train.json\"\n",
202 | "train_path = keras.utils.get_file(\"train.json\", train_data_url)\n",
203 | "eval_data_url = \"https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json\"\n",
204 | "eval_path = keras.utils.get_file(\"eval.json\", eval_data_url)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "!wget -P ./bert-base-multilingual-cased/ https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "!mv ./bert-base-multilingual-cased/bert-base-multilingual-cased-config.json ./bert-base-multilingual-cased/config.json"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "!wget -P ./bert-base-multilingual-cased/ https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "!mv ./bert-base-multilingual-cased/bert-base-multilingual-cased-tf_model.h5 ./bert-base-multilingual-cased/tf_model.h5"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {
247 | "colab": {
248 | "base_uri": "https://localhost:8080/",
249 | "height": 50
250 | },
251 | "colab_type": "code",
252 | "executionInfo": {
253 | "elapsed": 99893,
254 | "status": "ok",
255 | "timestamp": 1594011009085,
256 | "user": {
257 | "displayName": "ChangWook Jun",
258 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
259 | "userId": "00685987924881157185"
260 | },
261 | "user_tz": -540
262 | },
263 | "id": "PkuK7N_ngrMd",
264 | "outputId": "48275df3-52de-4623-dfc3-db6be9a54dfa"
265 | },
266 | "outputs": [],
267 | "source": [
268 | "class SquadExample:\n",
269 | " def __init__(self, question, context, start_char_idx, answer_text):\n",
270 | " self.question = question\n",
271 | " self.context = context\n",
272 | " self.start_char_idx = start_char_idx\n",
273 | " self.answer_text = answer_text\n",
274 | " self.skip = False\n",
275 | "\n",
276 | " def preprocess(self):\n",
277 | " context = self.context\n",
278 | " question = self.question\n",
279 | " answer_text = self.answer_text\n",
280 | " start_char_idx = self.start_char_idx\n",
281 | "\n",
282 | " # Clean context, answer and question\n",
283 | " context = \" \".join(str(context).split())\n",
284 | " question = \" \".join(str(question).split())\n",
285 | " answer = \" \".join(str(answer_text).split())\n",
286 | "\n",
287 | " # Find end character index of answer in context\n",
288 | " end_char_idx = start_char_idx + len(answer)\n",
289 | " if end_char_idx >= len(context):\n",
290 | " self.skip = True\n",
291 | " return\n",
292 | "\n",
293 | " # Mark the character indexes in context that are in answer\n",
294 | " is_char_in_ans = [0] * len(context)\n",
295 | " for idx in range(start_char_idx, end_char_idx):\n",
296 | " is_char_in_ans[idx] = 1\n",
297 | "\n",
298 | " # Tokenize context\n",
299 | " tokenized_context = tokenizer.encode(context)\n",
300 | "\n",
301 | " # Find tokens that were created from answer characters\n",
302 | " ans_token_idx = []\n",
303 | " for idx, (start, end) in enumerate(tokenized_context.offsets):\n",
304 | " if sum(is_char_in_ans[start:end]) > 0:\n",
305 | " ans_token_idx.append(idx)\n",
306 | "\n",
307 | " if len(ans_token_idx) == 0:\n",
308 | " self.skip = True\n",
309 | " return\n",
310 | "\n",
311 | " # Find start and end token index for tokens from answer\n",
312 | " start_token_idx = ans_token_idx[0]\n",
313 | " end_token_idx = ans_token_idx[-1]\n",
314 | "\n",
315 | " # Tokenize question\n",
316 | " tokenized_question = tokenizer.encode(question)\n",
317 | "\n",
318 | " # Create inputs\n",
319 | " input_ids = tokenized_context.ids + tokenized_question.ids[1:]\n",
320 | " token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(\n",
321 | " tokenized_question.ids[1:]\n",
322 | " )\n",
323 | " attention_mask = [1] * len(input_ids)\n",
324 | "\n",
325 | " # Pad and create attention masks.\n",
326 | " # Skip if truncation is needed\n",
327 | " padding_length = MAX_LEN - len(input_ids)\n",
328 | " if padding_length > 0: # pad\n",
329 | " input_ids = input_ids + ([0] * padding_length)\n",
330 | " attention_mask = attention_mask + ([0] * padding_length)\n",
331 | " token_type_ids = token_type_ids + ([0] * padding_length)\n",
332 | " elif padding_length < 0: # skip\n",
333 | " self.skip = True\n",
334 | " return\n",
335 | "\n",
336 | " self.input_ids = input_ids\n",
337 | " self.token_type_ids = token_type_ids\n",
338 | " self.attention_mask = attention_mask\n",
339 | " self.start_token_idx = start_token_idx\n",
340 | " self.end_token_idx = end_token_idx\n",
341 | " self.context_token_to_char = tokenized_context.offsets\n",
342 | "\n",
343 | "\n",
344 | "def create_squad_examples(raw_data):\n",
345 | " squad_examples = []\n",
346 | " for item in raw_data[\"data\"]:\n",
347 | " for para in item[\"paragraphs\"]:\n",
348 | " context = para[\"context\"]\n",
349 | " for qa in para[\"qas\"]:\n",
350 | " question = qa[\"question\"]\n",
351 | " answer_text = qa[\"answers\"][0][\"text\"]\n",
352 | " start_char_idx = qa[\"answers\"][0][\"answer_start\"]\n",
353 | " squad_eg = SquadExample(\n",
354 | " question, context, start_char_idx, answer_text\n",
355 | " )\n",
356 | " squad_eg.preprocess()\n",
357 | " squad_examples.append(squad_eg)\n",
358 | " return squad_examples\n",
359 | "\n",
360 | "\n",
361 | "def create_inputs_targets(squad_examples):\n",
362 | " dataset_dict = {\n",
363 | " \"input_ids\": [],\n",
364 | " \"token_type_ids\": [],\n",
365 | " \"attention_mask\": [],\n",
366 | " \"start_token_idx\": [],\n",
367 | " \"end_token_idx\": [],\n",
368 | " }\n",
369 | " for item in squad_examples:\n",
370 | " if item.skip == False:\n",
371 | " for key in dataset_dict:\n",
372 | " dataset_dict[key].append(getattr(item, key))\n",
373 | " for key in dataset_dict:\n",
374 | " dataset_dict[key] = np.array(dataset_dict[key])\n",
375 | "\n",
376 | " x = [\n",
377 | " dataset_dict[\"input_ids\"],\n",
378 | " dataset_dict[\"token_type_ids\"],\n",
379 | " dataset_dict[\"attention_mask\"],\n",
380 | " ]\n",
381 | " y = [dataset_dict[\"start_token_idx\"], dataset_dict[\"end_token_idx\"]]\n",
382 | " return x, y\n"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "with open(train_path) as f:\n",
392 | " raw_train_data = json.load(f)\n",
393 | "\n",
394 | "with open(eval_path) as f:\n",
395 | " raw_eval_data = json.load(f)\n",
396 | "\n",
397 | "\n",
398 | "train_squad_examples = create_squad_examples(raw_train_data)\n",
399 | "x_train, y_train = create_inputs_targets(train_squad_examples)\n",
400 | "print(f\"{len(train_squad_examples)} training points created.\")\n",
401 | "\n",
402 | "eval_squad_examples = create_squad_examples(raw_eval_data)\n",
403 | "x_eval, y_eval = create_inputs_targets(eval_squad_examples)\n",
404 | "print(f\"{len(eval_squad_examples)} evaluation points created.\")"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": null,
410 | "metadata": {
411 | "colab": {},
412 | "colab_type": "code",
413 | "executionInfo": {
414 | "elapsed": 690,
415 | "status": "ok",
416 | "timestamp": 1594011009787,
417 | "user": {
418 | "displayName": "ChangWook Jun",
419 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
420 | "userId": "00685987924881157185"
421 | },
422 | "user_tz": -540
423 | },
424 | "id": "mIjk3_XeguBj"
425 | },
426 | "outputs": [],
427 | "source": [
428 | "class TFBERTQuestionAnswering(tf.keras.Model):\n",
429 | " def __init__(self, model_name, dir_path, num_class):\n",
430 | " super(TFBERTQuestionAnswering, self).__init__()\n",
431 | " \n",
432 | " self.encoder = TFBertModel.from_pretrained(model_name, cache_dir=dir_path)\n",
433 | " self.start_logit = tf.keras.layers.Dense(num_class, name=\"start_logit\", use_bias=False)\n",
434 | " self.end_logit = tf.keras.layers.Dense(num_class, name=\"end_logit\", use_bias=False)\n",
435 | " self.flatten = tf.keras.layers.Flatten() \n",
436 | " self.softmax = tf.keras.layers.Activation(tf.keras.activations.softmax)\n",
437 | " \n",
438 | " def call(self, inputs):\n",
439 | " input_ids, token_type_ids, attention_mask = inputs\n",
440 | " embedding = self.encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]\n",
441 | " start_logits = self.start_logit(embedding)\n",
442 | " start_logits = self.flatten(start_logits)\n",
443 | " \n",
444 | " end_logits = self.end_logit(embedding)\n",
445 | " end_logits = self.flatten(end_logits)\n",
446 | " \n",
447 | " start_probs = self.softmax(start_logits)\n",
448 | " end_probs = self.softmax(end_logits)\n",
449 | " \n",
450 | " return start_probs, end_probs"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "metadata": {
457 | "colab": {
458 | "base_uri": "https://localhost:8080/",
459 | "height": 120
460 | },
461 | "colab_type": "code",
462 | "executionInfo": {
463 | "elapsed": 11135,
464 | "status": "ok",
465 | "timestamp": 1594011020239,
466 | "user": {
467 | "displayName": "ChangWook Jun",
468 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
469 | "userId": "00685987924881157185"
470 | },
471 | "user_tz": -540
472 | },
473 | "id": "k4t_2T7vgwOu",
474 | "outputId": "fd7dcb5d-bf36-496c-b53d-53e89962360a"
475 | },
476 | "outputs": [],
477 | "source": [
478 | "korquad_model = TFBERTQuestionAnswering(model_name='./bert-base-multilingual-cased/',dir_path='bert_ckpt', num_class=1)\n",
479 | "optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)\n",
480 | "loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": null,
486 | "metadata": {
487 | "colab": {},
488 | "colab_type": "code",
489 | "executionInfo": {
490 | "elapsed": 590,
491 | "status": "ok",
492 | "timestamp": 1594011103474,
493 | "user": {
494 | "displayName": "ChangWook Jun",
495 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
496 | "userId": "00685987924881157185"
497 | },
498 | "user_tz": -540
499 | },
500 | "id": "YZtVFA3PgyL0"
501 | },
502 | "outputs": [],
503 | "source": [
504 | "def normalized_answer(s): \n",
505 | " def remove_(text):\n",
506 | " ''' 불필요한 기호 제거 '''\n",
507 | " text = re.sub(\"'\", \" \", text)\n",
508 | " text = re.sub('\"', \" \", text)\n",
509 | " text = re.sub('《', \" \", text)\n",
510 | " text = re.sub('》', \" \", text)\n",
511 | " text = re.sub('<', \" \", text)\n",
512 | " text = re.sub('>', \" \", text) \n",
513 | " text = re.sub('〈', \" \", text)\n",
514 | " text = re.sub('〉', \" \", text) \n",
515 | " text = re.sub(\"\\(\", \" \", text)\n",
516 | " text = re.sub(\"\\)\", \" \", text)\n",
517 | " text = re.sub(\"‘\", \" \", text)\n",
518 | " text = re.sub(\"’\", \" \", text) \n",
519 | " return text\n",
520 | "\n",
521 | " def white_space_fix(text):\n",
522 | " return ' '.join(text.split())\n",
523 | "\n",
524 | " def remove_punc(text):\n",
525 | " exclude = set(string.punctuation)\n",
526 | " return ''.join(ch for ch in text if ch not in exclude)\n",
527 | "\n",
528 | " def lower(text):\n",
529 | " return text.lower()\n",
530 | "\n",
531 | " return white_space_fix(remove_punc(lower(remove_(s))))"
532 | ]
533 | },
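534 | {
535 | "cell_type": "markdown",
536 | "metadata": {},
537 | "source": [
538 | "A quick sanity check of the normalizer can make the evaluation easier to trust. The cell below is a minimal sketch (it assumes `re` and `string` were already imported in the setup cells, as `normalized_answer` itself requires): brackets and quotes are replaced by spaces, punctuation is dropped, and whitespace is collapsed before predictions are compared with the reference answers."
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": null,
544 | "metadata": {},
545 | "outputs": [],
546 | "source": [
547 | "# Illustrative example: symbols are stripped and whitespace collapsed before comparison.\n",
548 | "print(normalized_answer(\"《대한민국》의 수도는 '서울'이다.\"))"
549 | ]
550 | },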
534 | {
535 | "cell_type": "code",
536 | "execution_count": null,
537 | "metadata": {
538 | "colab": {},
539 | "colab_type": "code",
540 | "executionInfo": {
541 | "elapsed": 720,
542 | "status": "ok",
543 | "timestamp": 1594011104061,
544 | "user": {
545 | "displayName": "ChangWook Jun",
546 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
547 | "userId": "00685987924881157185"
548 | },
549 | "user_tz": -540
550 | },
551 | "id": "rVTh1qKng1p8"
552 | },
553 | "outputs": [],
554 | "source": [
555 | "class ExactMatch(keras.callbacks.Callback):\n",
556 | " def __init__(self, x_eval, y_eval):\n",
557 | " self.x_eval = x_eval\n",
558 | " self.y_eval = y_eval\n",
559 | "\n",
560 | " def on_epoch_end(self, epoch, logs=None):\n",
561 | " pred_start, pred_end = self.model.predict(self.x_eval)\n",
562 | " count = 0\n",
563 | " eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False]\n",
564 | " for idx, (start, end) in enumerate(zip(pred_start, pred_end)):\n",
565 | " squad_eg = eval_examples_no_skip[idx]\n",
566 | " offsets = squad_eg.context_token_to_char\n",
567 | " start = np.argmax(start)\n",
568 | " end = np.argmax(end)\n",
569 | " if start >= len(offsets):\n",
570 | " continue\n",
571 | " pred_char_start = offsets[start][0]\n",
572 | " if end < len(offsets):\n",
573 | " pred_char_end = offsets[end][1]\n",
574 | " pred_ans = squad_eg.context[pred_char_start:pred_char_end]\n",
575 | " else:\n",
576 | " pred_ans = squad_eg.context[pred_char_start:]\n",
577 | "\n",
578 | " normalized_pred_ans = normalized_answer(pred_ans)\n",
579 | " normalized_true_ans = normalized_answer(squad_eg.answer_text)\n",
580 | " if normalized_pred_ans in normalized_true_ans:\n",
581 | " count += 1\n",
582 | " acc = count / len(self.y_eval[0])\n",
583 | " print(f\"\\nepoch={epoch+1}, exact match score={acc:.2f}\")"
584 | ]
585 | },
586 | {
587 | "cell_type": "code",
588 | "execution_count": null,
589 | "metadata": {
590 | "colab": {},
591 | "colab_type": "code",
592 | "executionInfo": {
593 | "elapsed": 399,
594 | "status": "ok",
595 | "timestamp": 1594011104303,
596 | "user": {
597 | "displayName": "ChangWook Jun",
598 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
599 | "userId": "00685987924881157185"
600 | },
601 | "user_tz": -540
602 | },
603 | "id": "sTgvtk0og4Ow"
604 | },
605 | "outputs": [],
606 | "source": [
607 | "exact_match_callback = ExactMatch(x_eval, y_eval)"
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": null,
613 | "metadata": {
614 | "colab": {},
615 | "colab_type": "code",
616 | "executionInfo": {
617 | "elapsed": 599,
618 | "status": "ok",
619 | "timestamp": 1594011105561,
620 | "user": {
621 | "displayName": "ChangWook Jun",
622 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
623 | "userId": "00685987924881157185"
624 | },
625 | "user_tz": -540
626 | },
627 | "id": "7EuBYS58g6QZ"
628 | },
629 | "outputs": [],
630 | "source": [
631 | "korquad_model.compile(optimizer=optimizer, loss=[loss, loss])"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": null,
637 | "metadata": {
638 | "colab": {
639 | "base_uri": "https://localhost:8080/",
640 | "height": 50
641 | },
642 | "colab_type": "code",
643 | "executionInfo": {
644 | "elapsed": 714,
645 | "status": "ok",
646 | "timestamp": 1594011106252,
647 | "user": {
648 | "displayName": "ChangWook Jun",
649 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
650 | "userId": "00685987924881157185"
651 | },
652 | "user_tz": -540
653 | },
654 | "id": "ZehxFPSrg8Q2",
655 | "outputId": "6a33f8a1-84d0-48c4-ac1e-5843daf1f2fb"
656 | },
657 | "outputs": [],
658 | "source": [
659 | "model_name = \"tf2_bert_korquad\"\n",
660 | "\n",
661 | "checkpoint_path = os.path.join(DATA_OUT_PATH, model_name, 'weights.h5')\n",
662 | "checkpoint_dir = os.path.dirname(checkpoint_path)\n",
663 | "\n",
664 | "# Create path if exists\n",
665 | "if os.path.exists(checkpoint_dir):\n",
666 | " print(\"{} -- Folder already exists \\n\".format(checkpoint_dir))\n",
667 | "else:\n",
668 | " os.makedirs(checkpoint_dir, exist_ok=True)\n",
669 | " print(\"{} -- Folder create complete \\n\".format(checkpoint_dir))\n",
670 | " \n",
671 | "cp_callback = ModelCheckpoint(\n",
672 | " checkpoint_path, verbose=1, save_best_only=True, save_weights_only=True)"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": null,
678 | "metadata": {
679 | "colab": {
680 | "base_uri": "https://localhost:8080/",
681 | "height": 383
682 | },
683 | "colab_type": "code",
684 | "executionInfo": {
685 | "elapsed": 18126376,
686 | "status": "ok",
687 | "timestamp": 1594029233934,
688 | "user": {
689 | "displayName": "ChangWook Jun",
690 | "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14GjljUh9LMLCM8kMgWLaX2xHiw2Cej8KoaOlkKxE=s64",
691 | "userId": "00685987924881157185"
692 | },
693 | "user_tz": -540
694 | },
695 | "id": "2ljuajCLmyws",
696 | "outputId": "e89526e8-e795-48df-eead-1a00b28005bf"
697 | },
698 | "outputs": [],
699 | "source": [
700 | "history = korquad_model.fit(\n",
701 | " x_train,\n",
702 | " y_train,\n",
703 | " epochs=EPOCHS, # For demonstration, 3 epochs are recommended\n",
704 | " verbose=VERBOSE,\n",
705 | " batch_size=BATCH_SIZE,\n",
706 | " callbacks=[exact_match_callback, cp_callback]\n",
707 | ")"
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": null,
713 | "metadata": {},
714 | "outputs": [],
715 | "source": [
716 | "print(history.history)"
717 | ]
718 | },
719 | {
720 | "cell_type": "code",
721 | "execution_count": null,
722 | "metadata": {
723 | "colab": {},
724 | "colab_type": "code",
725 | "id": "QxaigHy2m4JB"
726 | },
727 | "outputs": [],
728 | "source": [
729 | "plot_graphs(history, 'loss', 'output_1_loss', 'output_2_loss')"
730 | ]
731 | },
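732 | {
733 | "cell_type": "markdown",
734 | "metadata": {},
735 | "source": [
736 | "As a quick qualitative check after training, the sketch below decodes the predicted span for a single evaluation example, mirroring the logic of the `ExactMatch` callback. It assumes that `x_eval` and `eval_squad_examples` from the earlier data-preparation cells are still in memory and that `x_eval` is a list of NumPy arrays (input ids, token type ids, attention mask); these names are reused here, not newly defined."
737 | ]
738 | },
739 | {
740 | "cell_type": "code",
741 | "execution_count": null,
742 | "metadata": {},
743 | "outputs": [],
744 | "source": [
745 | "# Qualitative spot check on one evaluation example (mirrors the ExactMatch decoding logic).\n",
746 | "# Optionally restore the checkpoint written by cp_callback first:\n",
747 | "# korquad_model.load_weights(checkpoint_path)\n",
748 | "sample_idx = 0\n",
749 | "sample_inputs = [feature[sample_idx:sample_idx + 1] for feature in x_eval]  # batch of one\n",
750 | "start_probs, end_probs = korquad_model.predict(sample_inputs)\n",
751 | "\n",
752 | "eval_examples_no_skip = [ex for ex in eval_squad_examples if not ex.skip]\n",
753 | "squad_eg = eval_examples_no_skip[sample_idx]\n",
754 | "offsets = squad_eg.context_token_to_char\n",
755 | "\n",
756 | "start = int(np.argmax(start_probs[0]))\n",
757 | "end = int(np.argmax(end_probs[0]))\n",
758 | "if start < len(offsets):\n",
759 | "    char_start = offsets[start][0]\n",
760 | "    char_end = offsets[end][1] if end < len(offsets) else None  # None slices to the end of the context\n",
761 | "    pred_answer = squad_eg.context[char_start:char_end]\n",
762 | "    print('predicted:', normalized_answer(pred_answer))\n",
763 | "    print('reference:', normalized_answer(squad_eg.answer_text))\n",
764 | "else:\n",
765 | "    print('predicted start position falls outside the context')"
766 | ]
767 | },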
732 | {
733 | "cell_type": "code",
734 | "execution_count": null,
735 | "metadata": {},
736 | "outputs": [],
737 | "source": []
738 | }
739 | ],
740 | "metadata": {
741 | "accelerator": "GPU",
742 | "colab": {
743 | "authorship_tag": "ABX9TyMn6I90a+EqoM9Ks6eBcRWt",
744 | "collapsed_sections": [],
745 | "name": "KorQuad_class.ipynb",
746 | "provenance": []
747 | },
748 | "kernelspec": {
749 | "display_name": "Python 3",
750 | "language": "python",
751 | "name": "python3"
752 | },
753 | "language_info": {
754 | "codemirror_mode": {
755 | "name": "ipython",
756 | "version": 3
757 | },
758 | "file_extension": ".py",
759 | "mimetype": "text/x-python",
760 | "name": "python",
761 | "nbconvert_exporter": "python",
762 | "pygments_lexer": "ipython3",
763 | "version": "3.7.4"
764 | }
1014 | },
1015 | "nbformat": 4,
1016 | "nbformat_minor": 1
1017 | }
1018 |
--------------------------------------------------------------------------------