├── 1DCNN.ipynb ├── BadWordDetectionByRegularExpression.ipynb ├── BadWordDetectionByRegularExpression.py ├── EDA.ipynb ├── FastTextVocab.ipynb ├── JamoSplit.py ├── README.md ├── RandomForest.ipynb ├── Test.ipynb ├── TrigramVectorize.ipynb ├── cnn_model ├── rf_model └── screenshot ├── 1DCNN reuslt.png ├── 1DCNN_model_summay.png ├── randomforest_result.png ├── word_embedding_2dim.png └── word_embedding_most_simmilar.png /1DCNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ==== 1D CNN 모델링 ====" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 24, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "\n", 19 | "# 벡터화된 데이터 불러오기\n", 20 | "data = pd.read_json(\"./labeled_data.json\")\n", 21 | "data.columns = [\"label\", \"trigram\"]" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 25, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# 3 x 50 -> 150 차원으로 flastten해주는 과정\n", 31 | "data['trigram'] = data['trigram'].apply(lambda x: (np.array(x).reshape(-1)))" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 26, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# train/test 데이터 분리\n", 41 | "from sklearn.model_selection import train_test_split\n", 42 | "\n", 43 | "y = data.pop('label')\n", 44 | "X = data\n", 45 | "\n", 46 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 27, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# keras 데이터 형식에 맞게 변환\n", 56 | "X_train = np.array(X_train['trigram'].tolist())\n", 57 | "X_test = np.array(X_test['trigram'].tolist())\n", 58 | "y_train = y_train.values\n", 59 | "y_test = y_test.values" 60 | ] 61 | }, 62 | { 63 
| "cell_type": "code", 64 | "execution_count": 28, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "(9779, 150)" 71 | ] 72 | }, 73 | "execution_count": 28, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "X_train.shape # row개수 x embeedding차원" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 29, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "(9779,)" 91 | ] 92 | }, 93 | "execution_count": 29, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "y_train.shape # row개수 x 1" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 30, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# keras 모델에 넣기 위한 reshape 과정\n", 109 | "def reshape(df, dim):\n", 110 | " return df.reshape(df.shape[0], dim, 1)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 31, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "(9779, 150, 1)" 122 | ] 123 | }, 124 | "execution_count": 31, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "X_train = reshape(X_train, 150)\n", 131 | "X_test = reshape(X_test, 150)\n", 132 | "X_train.shape" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# ==== CNN ====" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 35, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# f1 score를 알기 위해 정의한 함수 \n", 149 | "# 출처: https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model\n", 150 | "from keras import backend as K\n", 151 | "\n", 152 | "def recall_m(y_true, y_pred):\n", 153 | " true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))\n", 154 | " possible_positives = 
K.sum(K.round(K.clip(y_true, 0, 1)))\n", 155 | " recall = true_positives / (possible_positives + K.epsilon())\n", 156 | " return recall\n", 157 | "\n", 158 | "def precision_m(y_true, y_pred):\n", 159 | " true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))\n", 160 | " predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))\n", 161 | " precision = true_positives / (predicted_positives + K.epsilon())\n", 162 | " return precision\n", 163 | "\n", 164 | "def f1_m(y_true, y_pred):\n", 165 | " precision = precision_m(y_true, y_pred)\n", 166 | " recall = recall_m(y_true, y_pred)\n", 167 | " return 2*((precision*recall)/(precision+recall+K.epsilon()))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 54, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "Model: \"sequential_1\"\n", 180 | "_________________________________________________________________\n", 181 | "Layer (type) Output Shape Param # \n", 182 | "=================================================================\n", 183 | "dense_1 (Dense) (None, 150, 150) 300 \n", 184 | "_________________________________________________________________\n", 185 | "batch_normalization_1 (Batch (None, 150, 150) 600 \n", 186 | "_________________________________________________________________\n", 187 | "conv1d_1 (Conv1D) (None, 146, 100) 75100 \n", 188 | "_________________________________________________________________\n", 189 | "conv1d_2 (Conv1D) (None, 144, 100) 30100 \n", 190 | "_________________________________________________________________\n", 191 | "max_pooling1d_1 (MaxPooling1 (None, 48, 100) 0 \n", 192 | "_________________________________________________________________\n", 193 | "dropout_1 (Dropout) (None, 48, 100) 0 \n", 194 | "_________________________________________________________________\n", 195 | "flatten_1 (Flatten) (None, 4800) 0 \n", 196 | 
"_________________________________________________________________\n", 197 | "dense_2 (Dense) (None, 50) 240050 \n", 198 | "_________________________________________________________________\n", 199 | "dense_3 (Dense) (None, 1) 51 \n", 200 | "=================================================================\n", 201 | "Total params: 346,201\n", 202 | "Trainable params: 345,901\n", 203 | "Non-trainable params: 300\n", 204 | "_________________________________________________________________\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "from keras.models import Sequential\n", 210 | "from keras.layers import Conv2D, Conv1D\n", 211 | "from keras.layers import MaxPooling1D\n", 212 | "from keras.layers import Activation, Dropout, Flatten, Dense\n", 213 | "from keras.callbacks import ModelCheckpoint\n", 214 | "from keras.layers.normalization import BatchNormalization\n", 215 | "from keras import backend as K\n", 216 | "\n", 217 | "K.clear_session() # session 초기화\n", 218 | "\n", 219 | "model = Sequential()\n", 220 | "model.add(Dense(150, input_shape=(X_train.shape[1], 1), activation=\"relu\"))\n", 221 | "model.add(BatchNormalization()) \n", 222 | "model.add(Conv1D(filters=100, kernel_size=5, activation='relu'))\n", 223 | "model.add(Conv1D(filters=100, kernel_size=3, activation='sigmoid'))\n", 224 | "model.add(MaxPooling1D(pool_size=3))\n", 225 | "model.add(Dropout(0.4))\n", 226 | "model.add(Flatten())\n", 227 | "model.add(Dense(50, activation='relu'))\n", 228 | "model.add(Dense(1, activation='sigmoid'))\n", 229 | "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_m])\n", 230 | "model.summary()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 55, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Train on 6845 samples, validate on 2934 samples\n", 243 | "Epoch 1/50\n", 244 | "6845/6845 [==============================] - 42s 6ms/step 
- loss: 0.7094 - acc: 0.6936 - f1_m: 0.8004 - val_loss: 0.5816 - val_acc: 0.7035 - val_f1_m: 0.8249\n", 245 | "Epoch 2/50\n", 246 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.4939 - acc: 0.7157 - f1_m: 0.8330 - val_loss: 0.4598 - val_acc: 0.7035 - val_f1_m: 0.8249\n", 247 | "Epoch 3/50\n", 248 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.4410 - acc: 0.7234 - f1_m: 0.8367 - val_loss: 0.4408 - val_acc: 0.8054 - val_f1_m: 0.8685\n", 249 | "Epoch 4/50\n", 250 | "6845/6845 [==============================] - 40s 6ms/step - loss: 0.4249 - acc: 0.8183 - f1_m: 0.8788 - val_loss: 0.4292 - val_acc: 0.8030 - val_f1_m: 0.8674\n", 251 | "Epoch 5/50\n", 252 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.4133 - acc: 0.8250 - f1_m: 0.8824 - val_loss: 0.4192 - val_acc: 0.8108 - val_f1_m: 0.8719\n", 253 | "Epoch 6/50\n", 254 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.4040 - acc: 0.8324 - f1_m: 0.8864 - val_loss: 0.4098 - val_acc: 0.8211 - val_f1_m: 0.8746\n", 255 | "Epoch 7/50\n", 256 | "6845/6845 [==============================] - 40s 6ms/step - loss: 0.3929 - acc: 0.8365 - f1_m: 0.8888 - val_loss: 0.4214 - val_acc: 0.8132 - val_f1_m: 0.8604\n", 257 | "Epoch 8/50\n", 258 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.3864 - acc: 0.8375 - f1_m: 0.8888 - val_loss: 0.4015 - val_acc: 0.8248 - val_f1_m: 0.8785\n", 259 | "Epoch 9/50\n", 260 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.3757 - acc: 0.8405 - f1_m: 0.8901 - val_loss: 0.3932 - val_acc: 0.8306 - val_f1_m: 0.8804\n", 261 | "Epoch 10/50\n", 262 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.3698 - acc: 0.8437 - f1_m: 0.8924 - val_loss: 0.4015 - val_acc: 0.8255 - val_f1_m: 0.8816\n", 263 | "Epoch 11/50\n", 264 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.3666 - acc: 0.8504 - f1_m: 0.8962 - val_loss: 0.3895 - val_acc: 0.8330 - 
val_f1_m: 0.8858\n", 265 | "Epoch 12/50\n", 266 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.3523 - acc: 0.8545 - f1_m: 0.8993 - val_loss: 0.3833 - val_acc: 0.8265 - val_f1_m: 0.8760\n", 267 | "Epoch 13/50\n", 268 | "6845/6845 [==============================] - 37s 5ms/step - loss: 0.3490 - acc: 0.8583 - f1_m: 0.9019 - val_loss: 0.3811 - val_acc: 0.8378 - val_f1_m: 0.8841\n", 269 | "Epoch 14/50\n", 270 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.3513 - acc: 0.8581 - f1_m: 0.9019 - val_loss: 0.3754 - val_acc: 0.8354 - val_f1_m: 0.8853\n", 271 | "Epoch 15/50\n", 272 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.3403 - acc: 0.8587 - f1_m: 0.9029 - val_loss: 0.3722 - val_acc: 0.8364 - val_f1_m: 0.8826\n", 273 | "Epoch 16/50\n", 274 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.3386 - acc: 0.8600 - f1_m: 0.9029 - val_loss: 0.3734 - val_acc: 0.8371 - val_f1_m: 0.8876\n", 275 | "Epoch 17/50\n", 276 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.3316 - acc: 0.8659 - f1_m: 0.9067 - val_loss: 0.3803 - val_acc: 0.8323 - val_f1_m: 0.8858\n", 277 | "Epoch 18/50\n", 278 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.3252 - acc: 0.8675 - f1_m: 0.9075 - val_loss: 0.3826 - val_acc: 0.8374 - val_f1_m: 0.8892\n", 279 | "Epoch 19/50\n", 280 | "6845/6845 [==============================] - 37s 5ms/step - loss: 0.3212 - acc: 0.8647 - f1_m: 0.9064 - val_loss: 0.3712 - val_acc: 0.8436 - val_f1_m: 0.8912\n", 281 | "Epoch 20/50\n", 282 | "6845/6845 [==============================] - 40s 6ms/step - loss: 0.3176 - acc: 0.8691 - f1_m: 0.9082 - val_loss: 0.3647 - val_acc: 0.8412 - val_f1_m: 0.8853\n", 283 | "Epoch 21/50\n", 284 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.3157 - acc: 0.8684 - f1_m: 0.9079 - val_loss: 0.3687 - val_acc: 0.8449 - val_f1_m: 0.8922\n", 285 | "Epoch 22/50\n", 286 | "6845/6845 
[==============================] - 39s 6ms/step - loss: 0.3114 - acc: 0.8730 - f1_m: 0.9112 - val_loss: 0.3650 - val_acc: 0.8436 - val_f1_m: 0.8876\n", 287 | "Epoch 23/50\n", 288 | "6845/6845 [==============================] - 40s 6ms/step - loss: 0.3041 - acc: 0.8774 - f1_m: 0.9147 - val_loss: 0.3630 - val_acc: 0.8432 - val_f1_m: 0.8888\n", 289 | "Epoch 24/50\n", 290 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.3020 - acc: 0.8764 - f1_m: 0.9139 - val_loss: 0.3635 - val_acc: 0.8456 - val_f1_m: 0.8919\n", 291 | "Epoch 25/50\n", 292 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.3023 - acc: 0.8777 - f1_m: 0.9144 - val_loss: 0.3614 - val_acc: 0.8446 - val_f1_m: 0.8899\n", 293 | "Epoch 26/50\n", 294 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.3011 - acc: 0.8771 - f1_m: 0.9139 - val_loss: 0.3597 - val_acc: 0.8449 - val_f1_m: 0.8875\n", 295 | "Epoch 27/50\n", 296 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2972 - acc: 0.8792 - f1_m: 0.9154 - val_loss: 0.3562 - val_acc: 0.8449 - val_f1_m: 0.8865\n", 297 | "Epoch 28/50\n", 298 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.2857 - acc: 0.8834 - f1_m: 0.9182 - val_loss: 0.3552 - val_acc: 0.8476 - val_f1_m: 0.8913\n", 299 | "Epoch 29/50\n", 300 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.2885 - acc: 0.8827 - f1_m: 0.9175 - val_loss: 0.3692 - val_acc: 0.8371 - val_f1_m: 0.8869\n", 301 | "Epoch 30/50\n", 302 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2931 - acc: 0.8801 - f1_m: 0.9164 - val_loss: 0.3617 - val_acc: 0.8449 - val_f1_m: 0.8886\n", 303 | "Epoch 31/50\n", 304 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2857 - acc: 0.8811 - f1_m: 0.9163 - val_loss: 0.3647 - val_acc: 0.8476 - val_f1_m: 0.8923\n", 305 | "Epoch 32/50\n", 306 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2800 - acc: 0.8871 - 
f1_m: 0.9199 - val_loss: 0.3956 - val_acc: 0.8357 - val_f1_m: 0.8888\n", 307 | "Epoch 33/50\n", 308 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2806 - acc: 0.8858 - f1_m: 0.9200 - val_loss: 0.3623 - val_acc: 0.8408 - val_f1_m: 0.8827\n", 309 | "Epoch 34/50\n", 310 | "6845/6845 [==============================] - 38s 5ms/step - loss: 0.2854 - acc: 0.8846 - f1_m: 0.9191 - val_loss: 0.3729 - val_acc: 0.8459 - val_f1_m: 0.8925\n", 311 | "Epoch 35/50\n", 312 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.2783 - acc: 0.8866 - f1_m: 0.9209 - val_loss: 0.3636 - val_acc: 0.8480 - val_f1_m: 0.8913\n", 313 | "Epoch 36/50\n", 314 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2770 - acc: 0.8877 - f1_m: 0.9213 - val_loss: 0.3598 - val_acc: 0.8507 - val_f1_m: 0.8937\n", 315 | "Epoch 37/50\n", 316 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.2695 - acc: 0.8903 - f1_m: 0.9231 - val_loss: 0.3582 - val_acc: 0.8490 - val_f1_m: 0.8919\n", 317 | "Epoch 38/50\n", 318 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2660 - acc: 0.8936 - f1_m: 0.9258 - val_loss: 0.3578 - val_acc: 0.8497 - val_f1_m: 0.8945\n", 319 | "Epoch 39/50\n", 320 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.2736 - acc: 0.8897 - f1_m: 0.9226 - val_loss: 0.3613 - val_acc: 0.8490 - val_f1_m: 0.8914\n", 321 | "Epoch 40/50\n", 322 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2655 - acc: 0.8916 - f1_m: 0.9236 - val_loss: 0.3636 - val_acc: 0.8436 - val_f1_m: 0.8852\n", 323 | "Epoch 41/50\n", 324 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2612 - acc: 0.8960 - f1_m: 0.9271 - val_loss: 0.3805 - val_acc: 0.8490 - val_f1_m: 0.8957\n", 325 | "Epoch 42/50\n", 326 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2553 - acc: 0.8991 - f1_m: 0.9292 - val_loss: 0.3678 - val_acc: 0.8483 - val_f1_m: 0.8930\n", 327 | 
"Epoch 43/50\n", 328 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2566 - acc: 0.8972 - f1_m: 0.9275 - val_loss: 0.3662 - val_acc: 0.8497 - val_f1_m: 0.8950\n", 329 | "Epoch 44/50\n", 330 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.2638 - acc: 0.8951 - f1_m: 0.9258 - val_loss: 0.3665 - val_acc: 0.8480 - val_f1_m: 0.8926\n", 331 | "Epoch 45/50\n", 332 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.2535 - acc: 0.8969 - f1_m: 0.9275 - val_loss: 0.3712 - val_acc: 0.8494 - val_f1_m: 0.8944\n", 333 | "Epoch 46/50\n", 334 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2553 - acc: 0.8941 - f1_m: 0.9259 - val_loss: 0.3795 - val_acc: 0.8398 - val_f1_m: 0.8891\n", 335 | "Epoch 47/50\n", 336 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2480 - acc: 0.9002 - f1_m: 0.9303 - val_loss: 0.3641 - val_acc: 0.8538 - val_f1_m: 0.8952\n", 337 | "Epoch 48/50\n", 338 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2496 - acc: 0.8977 - f1_m: 0.9277 - val_loss: 0.3657 - val_acc: 0.8504 - val_f1_m: 0.8942\n", 339 | "Epoch 49/50\n", 340 | "6845/6845 [==============================] - 39s 6ms/step - loss: 0.2513 - acc: 0.9005 - f1_m: 0.9299 - val_loss: 0.3588 - val_acc: 0.8463 - val_f1_m: 0.8896\n", 341 | "Epoch 50/50\n", 342 | "6845/6845 [==============================] - 38s 6ms/step - loss: 0.2577 - acc: 0.8960 - f1_m: 0.9259 - val_loss: 0.3680 - val_acc: 0.8517 - val_f1_m: 0.8961\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.3)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 64, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "4191/4191 [==============================] - 7s 2ms/step\n" 360 | ] 361 | }, 362 | { 363 | "data": { 364 | "text/plain": [ 
365 | "[0.3625960384382891, 0.857790503502461, 0.9014614264113441]" 366 | ] 367 | }, 368 | "execution_count": 64, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "model.evaluate(X_test, y_test)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 62, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "4191/4191 [==============================] - 7s 2ms/step\n" 387 | ] 388 | }, 389 | { 390 | "ename": "TypeError", 391 | "evalue": "must be real number, not list", 392 | "output_type": "error", 393 | "traceback": [ 394 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 395 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 396 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n 테스트 정확도: %.4f\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 397 | "\u001b[0;31mTypeError\u001b[0m: must be real number, not list" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "print(\"\\n 테스트 정확도: %.4f\" % (model.evaluate(X_test, y_test)[1]))" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 61, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOydZ3hVxdaA30nvHUIKKRAglAChg6L0IgKKBSxXRQWx9wu2C9f6edVrF0VUBClyRQEREZAqVUIvgRQS0kglPSc5Zb4fcwgB0oCEOu/znCeHvffMrL1JZu1Za81aQkqJRqPRaDRnYnOpBdBoNBrN5YlWEBqNRqOpFq0gNBqNRlMtWkFoNBqNplq0gtBoNBpNtWgFodFoNJpq0QpCowGEELOEEG/W89okIcSgxpZJo7nUaAWh0Wg0mmrRCkKjuYoQQthdahk0Vw9aQWiuGKymnReFEHuFECVCiG+EEP5CiN+FEEVCiNVCCO8q148SQhwQQuQLIdYJIdpWORcthNhpbfcj4HTGWDcLIXZb224WQnSsp4wjhBC7hBCFQogUIcS0M85fb+0v33r+AetxZyHEB0KIZCFEgRDiL+uxfkKI1GqewyDr92lCiJ+EED8IIQqBB4QQPYQQW6xjZAghPhNCOFRp314IsUoIkSeEyBRCvCyEaCaEKBVC+Fa5rqsQIlsIYV+fe9dcfWgFobnSuA0YDLQGRgK/Ay8Dfqjf56cAhBCtgfnAM0ATYDnwqxDCwTpZLgbmAD7A/6z9Ym3bBfgWeATwBb4ClgohHOshXwlwH+AFjAAeFULcYu03xCrvp1aZOgO7re3eB7oCfawy/ROw1POZjAZ+so45FzADz1qfSW9gIPCYVQZ3YDWwAggEIoA/pZTHgXXAnVX6vRdYIKU01lMOzVWGVhCaK41PpZSZUso0YCOwTUq5S0pZDvwCRFuvGwv8JqVcZZ3g3gecURNwL8Ae+EhKaZRS/gT8XWWMCcBXUsptUkqzlPJ7oNzarlaklOuklPuklBYp5V6UkrrRevoeYLWUcr513Fwp5W4hhA3wIPC0lDLNOuZm6z3Vhy1SysXWMcuklDFSyq1SSpOUMgml4E7KcDNwXEr5gZTSIKUsklJus577HqUUEELYAnehlKjmGkUrCM2VRmaV72XV/NvN+j0QSD55QkppAVKAIOu5NHl6psrkKt9DgeetJpp8IUQ+0NzarlaEED2FEGutppkCYBLqTR5rHwnVNPNDmbiqO1cfUs6QobUQYpkQ4rjV7PR2PWQAWAK0E0K0QK3SCqSU289TJs1VgFYQmquVdNRED4AQQqAmxzQgAwiyHjtJSJXvKcBbUkqvKh8XKeX8eow7D1gKNJdSegJfAifHSQFaVtMmBzDUcK4EcKlyH7Yo81RVzkzJPB2IBVpJKT1QJri6ZEBKaQAWolY6/0CvHq55tILQXK0sBEYIIQZanazPo8xEm4EtgAl4SghhJ4QYA/So0vZrYJJ1NSCEEK5W57N7PcZ1B/KklAYhRA/g7irn5gKDhBB3Wsf1FUJ0tq5uvgX+K4QIFELYCiF6W30eRwAn6/j2wKtAXb4Qd6AQKBZCRAKPVjm3DGgmhHhGCOEohHAXQvSscn428AAwCvihHveruYrRCkJzVSKlPIyyp3+KekMfCYyUUlZIKSuAMaiJ8ATKX/FzlbY7UH6Iz6zn463X1ofHgNeFEEXAv1CK6mS/x4CbUMoqD+Wg7mQ9/QKwD+ULyQPeBWyklAXWPmeiVj8lwGlRTdXwAkoxFaGU3Y9VZChCmY9GAseBOKB/lfObUM7xnVb/heYaRuiCQRqNpipCiDXAPCnlzEsti+bSohWERqOpRAjRHViF8qEUXWp5NJcWbWLSaDQACCG+R+2ReEYrBw3oFYRGo9FoakCvIDQajUZTLVdNYi8/Pz8ZFhZ2qcXQaDSaK4qYmJgcKeWZe2uAq0hBhIWFsWPHjksthkaj0VxRCCGSazq
nTUwajUajqRatIDQajUZTLVpBaDQajaZarhofRHUYjUZSU1MxGAyXWpQrFicnJ4KDg7G31zVjNJprjataQaSmpuLu7k5YWBinJ+7U1AcpJbm5uaSmphIeHn6pxdFoNBeZq9rEZDAY8PX11crhPBFC4Ovrq1dgGs01SqMqCCHEMCHEYSFEvBBiSjXnQ4UQfwpVY3idECK4yrn7hRBx1s/9FyDD+TbVoJ+fRnMt02gKwlrY5HNgONAOuEsI0e6My94HZkspOwKvA+9Y2/oAU4GeqDz9U0WVYvQajUZzJWOxSOKziqhvqqNCg5GFO1LYn1bQyJKdTmP6IHoA8VLKRAAhxAJUcfWDVa5phyquDrAWVUgeYCiwSkqZZ227ChiGqu+r0Wg0F4yUktIKM66OF9cVW2Gy8NzC3Szbm0Gorwt3dA3mtq7BBHg6n3VtXGYR329J4uedaZRWmLG1ETzRP4InBkRgb9v4HoLGHCGI02vlplqPVWUPcJv1+62AuxDCt55tEUJMFELsEELsyM7ObjDBG5L8/Hy++OKLc2530003kZ+f3wgSaTTXNmaLZPm+DEZ9ton2U//gpo838v4fh4lJPoHZ0rjJSw1GM5N+iGHZ3gz+0SuUAE8n3l95hOv+bw33f7ud3/ZmYDCaWXUwk3tnbmPwhxtYuCOV4R0CWPhIb0Z3CuTjP+O4ffpmErKLG1VWaNwVRHXG6zOf/gvAZ0KIB4ANqIpZpnq2RUo5A5gB0K1bt8syLe1JBfHYY4+ddtxsNmNra1tju+XLlze2aBrNNUWFycLiXWl8uSGBxOwSwnxdeLRfS2KSTjB9fQKfrY3Hx9WBG1s3oX9kU/q1aYKHU8OFdxeXm3j4+7/ZdjSPt27twD09Vcn05NwSfopJ5aeYVB6ftxNbG4HZIgnwdOLFoW0Y1705vm6qymyPcB8GtvXnlcX7GPHJRl4Z0Y57e4Y0mq+wMRVEKqpI/EmCUYXkK5FSpqNKPyKEcANuk1IWCCFSgX5ntF13IcL8+9cDHEwvvJAuzqJdoAdTR7av9ZopU6aQkJBA586dsbe3x83NjYCAAHbv3s3Bgwe55ZZbSElJwWAw8PTTTzNx4kTgVG6p4uJihg8fzvXXX8/mzZsJCgpiyZIlODufvRwF+Prrr5kxYwYVFRVEREQwZ84cXFxcyMzMZNKkSSQmJgIwffp0+vTpw+zZs3n//fcRQtCxY0fmzNF16jVXF2UVZuZtP8bMjYlkFBhoF+DBZ3dHM7xDALY2amItKDWyPi6btbFZrDucxS+70rC3FfRu6ceQdv4MaedPUw+n0/pNzy/jr7gcNsbnsCUhBw8ne+7tFcrt3YLPUiz5pRXc/93f7E8r4KOxnRnd+ZRBJNTXleeHtOGZQa3ZFJ/DmtgseoT7MKSdP3bVmJFGdAygW5g3L/xvD68t3s9vh/bw1MBQ+oR0OuvaC6XR6kEIIexQBdcHolYGfwN3SykPVLnGD1Xg3SKEeAswSyn/ZXVSxwBdrJfuBLqe9ElUR7du3eSZyfoOHTpE27ZtgUunIJKSkrj55pvZv38/69atY8SIEezfv79yX0FeXh4+Pj6UlZXRvXt31q9fj6+v72kKIiIigh07dtC5c2fuvPNORo0axb333lvteLm5ufj6+gLw6quv4u/vz5NPPsnYsWPp3bs3zzzzDGazmeLiYlJTUxkzZgybNm3Cz8+vUpYzqfocNZoria2JuTy5/L+U2O4hyP4GXugzliFta3/jLjOWs/zwTg4lu7L6YC5JuaUARId4MaitP9lF5WyMyyYhuwQAPzdHrovwJSWvlJ3H8nF1sOW2rsHc1zuMiKZuZBUZ+MfM7RzNLeHzu7swuJ0/BeUFrEpehZ+zH2EeYQS7B2Nnc27v61JKPt6wiZnx/8QOV/6+/3fs7c79nV8IESOl7FbduUZbQUgpTUKIJ4A/AFvgWynlASHE68AOKeVS1CrhHSGERJmYHre2zRNCvIFSKgCv16Yc6kNdE/n
FokePHqdtOvvkk0/45ZdfAEhJSSEuLq5ygj9JeHg4nTt3BqBr164kJSXV2P/+/ft59dVXyc/Pp7i4mKFDhwKwZs0aZs+eDYCtrS2enp7Mnj2b22+/HT8/P4BqlYNGc7Eprihm+dHl/H38bwaEDGBw6OBznjzLKsy8uyKW2TFbcQ1fhpudM8fNc/j37sXsLBrJnW3upKVXy8rrC8oL2JC6gbUpa9mUtolSUykeDh4M7zucLj6DiT/mzapDWbz3x2Gc7G3oEe7LuO4hdG/hRiGxbMtYTd8u4Ux1Hsj3W5JZsD2F2VuS6dvKj5S8UrKKyvnuge5cF+FHdmk2E1dNJD4/vnJ8Oxs7mrs3J8wjjJZeLRnbZizNXJvVeo9JhUkszfwXXi52PNvh/85LOdRFo7rvpZTLgeVnHPtXle8/AT/V0PZb4NvGlO9S4OrqWvl93bp1rF69mi1btuDi4kK/fv2q3ZTm6OhY+d3W1paysrIa+3/ggQdYvHgxnTp1YtasWaxbt67Ga6WUep+DplFJL05nWeIyliUuo7C8kF6BvegT2IfeAb1p4nKqBIGUkj3Ze1gUt4gVR1dgMBsQFmdWJK0gyC2Y8e0fYHTEaJzsnGoZTbEjKY8X/reHpNxiQjv8hsXOg19v+ZXEgkR+PPwj/zvyP+bFzqOrf1d6B/Rm+/HtxGTGYJZm/Jz9GB4+nOim0WxK38Ti+MX8aP6Rlp4tubX/KN7zH4JZlLAjcyt/pf3Fl2t3UmGpwFbYYpZmRrTYwdtjpvHyTW1ZsP0YP2w9RmmFiTkP9aRrqDepRalMWDmBXEMunw34DC8nL44WHCWpIImkwiSSCpLYmLqRhYcXMq3PNAaHDq72Ho8WHOWhPx7CLM3MGvYtEd4RDfZ/VpWrOtXG5YC7uztFRdWX9y0oKMDb2xsXFxdiY2PZunXrBY9XVFREQEAARqORuXPnEhSkbJ0DBw5k+vTplSamkpISBg4cyK233sqzzz6Lr69vjSamq42VSSvxcPSgV0CvC+rHaDayO3s3rb1b4+no2UDSNQ6lxlLmHpqLSZq4p+09eDh4nHdfeYY8nlrzFC52LrT2bk0bnza09m5NC88W2NvaU1RRxKrkVSxNWEpMZgwAXf27EukTyZb0LfyW+BsArbxb0SegD95O3ixNWEpiQSL2wgljQSdK87rR2qstccVbyfTfyJvb3uSLPV9wb9t7ubPNndU+b4PRzAcrDzPzr6MEejozaWQOc+PjeLPXm3g5edHFqQtd/Lsw2TCZxfGLWXh4IZ/t/oyWni0Z32E8/Zv3p4NfB2yEsvuPjhhNUUURfyT9wZL4JXwY8yEf8mHleC09WzIuchzXBV5HtH80cw7O4dNdn3K04Cgf9/+YJwa0YtKNLTGYLLg52pGQn8DElRMxmA3MHDKTjk06AtCpyem+g2OFx/jnhn/y3LrnuK3Vbfyz+z9xsXepPJ9UkFSpHL4Z8k2jKQfQCqLR8fX15brrrqNDhw44Ozvj7+9feW7YsGF8+eWXdOzYkTZt2tCr14VNWABvvPEGPXv2JDQ0lKioqErl9PHHHzNx4kS++eYbbG1tmT59Or179+aVV17hxhtvxNbWlujoaGbNmnXBMlwsTpoFjhYc5e62d+Pn7Fdnm4WHF/LG1jcAGNtmLM91fe60P776UG4u55e4X/hm/zccLzmOnbCje7PuDAodxICQAfWSo77sztrNssRljO8wniC3syK968QiLSxLXMbHMR+TVZYFwNxDc3mk4yOMbTMWB1uHc+7z7W1vczD3IBFeEcyPnU+FpQJQZpIwjzBSilIoN5cT5hHGk9FPMqLFiErZLdLC4bzDbE7fzJb0LcyLnYfRYsTfoQ0i907ystszrF0oz97RmjbN3FkTG8m/lnYjw3AAn7BtfLLrE2bum8krvV5hVMtRFJeb+CsumzWxWayJzSanuJy7eoTw6MCmjFv+Kt2bdWdUy1Gnye/j5MODHR7kgfYPcMJwAl9n37Pu8ST
uDu7c3vp2bm99O0kFSaxKXoWvsy99AvucZQKa2HEirbxa8dJfLzF22Vg+6v8R0U2jcbO14UDuASatmoStsOW7Yd/R2rt1jWOGeIQwZ/gcPt/9Od/u/5aYzBj+c8N/aOvblqSCJB7848GLohygEZ3UF5u6nNSa8+dyeo4phSmsTVnLutR17MzciVmaAWju3pyvBn9Fc/fmNbZdlriMlze+TN/gvoR5hDH74GzCPMJ4+/q3iWoSVefYZaYyFh1ZxHf7vyOrLIvOTTpzV+RdHDlxhNXHVpNcmIxAEN00moEhAxkSNqROO3JNmC1mvt3/LZ/v/hyzNONi58Lz3Z7n9ta3V77h1sWurF28u/1dDuQeoINvByb3mIyjrSMfxnzIlowtBLkF8WT0kwwPH17vPlcnr+bZdc/yZPSTTOw4EZPFRHJhMkdOHOFw3mHi8+MJdAtkZIuRdPDrUKsJM6vQwNztcfzwdyy5BS70a9OE5we3ISr49NWBwWjmi7XxfLk+EUeX4wRHrCTVsJ9A890kJnTCaJa4O9lxQ6sm3NUjhOtb+TF5w2RWJq9k0ahFtPBsUa97aygS8xN5cs2TpJek80rPVwjzCOOJNU/g6eDJ10O+JsQjpN59bc/Yzkt/vUSeIY+HOjzEL3G/YJImZg6ZSSvvVg0ib21Oaq0gNHVyqZ9jhbmCRXGL+DH2RxIKEgCI8Iqgf/P+9G/eH7M088SaJ7C3sefLQV/SxqfNWX2sObaG59Y9R1f/rnw+8HOc7JzYlrGNVze9SnZpNo90eoQJURPOcoaaLWaOFR1jXco6Zh2YRZ4hj+7NuvNIx0fo0axH5QQopSQhP4FVx1bxZ/KfHD5xGIGgR7MejGgxgsGhg3FzcKvX/eaU5TBl4xS2ZWxjWNgwJnScwPt/v8+WjC30bNaTaX2mEeweXGP7tOI0Po75mN+Tfqepc1Oe6foMI1qMOE0JbE7bzH9j/svhE4dp69OWF7u/SPdm3WuVK9+Qz+glo/F38WfuiLnY25z7HgGLRbI5IZe525JZdTATk0VyfYQfTw9qRfew2s2bR3NKmLr0ABvi0nEOmoed+yGiXe/lsS4P0zXUu3Jn8eb0zTyy6hEmdZrE450fP2cZG4KC8gImb5jMpvRN2Ak7mns0Z8bgGef1wpBvyGfq5qmsSVmDt6M33wz9psGUA2gFcYkkalwef/xxNm3adNqxp59+mvHjxzf4WI3xHIsrirERNrWad0wWE78m/Mr0PdPJKMmgY5OODAsbRr/m/c5aKSTmJzJx1URKjCV8OuBTujU79fu+NWMrj61+jEifSL4e8jWu9qcCBQorCnl729v8lvgbUX5RjO8wnpSiFOJPxBOfH09CfkKlCaV3QG8e6fQIXf271nl/xwqP8VvibyxLXMaxomM42jrSv3l/bm5xM70De9do2tmUtomX/3qZUmMpL/V8iVsjbkUIgZSSRXGLeH/H+1ikhee7Ps8dbe7ARthQUF7AjswdbM/Yzvbj24nPj8fR1pHxHcYzvv34Gp+xRVr4LfE3Pt31KcdLjjOtzzTGtBpT4z29tPElVhxdwYKbF1SrhGvCZLaQcqKMlQeOM2/7MZJzS/F2seeObs25q0cI4X6udXdiRUrJ7pR8vFxs+eLAG6xIWsGjnR7l0U6PIoTAYDIwZukYbIQNi0YtwtHWse5OGwmzxcznuz/nYO5B3u77Nj5O5+/fk1KyJmUNrbxandMKpD5oBaG5IBr6OS5PXM60LdMwmo10atqJ3gG96RPYh3a+7bC1scUiLfyR9Adf7P6CpMIkOvh24MnoJ+kd2LtWk8XxkuNMXDWRtKI03rvxPQaEDGB31m4mrppIkFsQs4bNqtGZvOLoCt7Y+gaFFWqvjL+LPxHeEbTyakWEVwTtfNud11ublJJ9OftYlriMFUdXcKL8BLbClmD3YMI9wgn3PPVZk7KG7/Z/R4RXBO/f+P5pYZgnySjOYOrmqWzJ2ELHJh0xWUwcyj2EROJk60Q
X/y50b9adEeEjCHALqJeMa2JTeXPHFLLNe7ir5RM82+NhnB1O3+W/PmU9T6x5gkc7PcpjnR+rth8pJfFZxRxILyQhu5j4rGISsotJyimlwmwB1E7ge3qGMLR9M5zsa84kUB/MFjNTN09lScISHmj/AM91fY7Pdn/GjL0zmDF4Br0De19Q/9cKWkFozkJKSVFFEc52ztjb1m4qaKjnaDQbeW/He8yPnU9002iim0azJX0Lh/IOAeDh4EHPgJ6VNu0IrwiejH6S/s371zscN9+Qz+N/Ps7+3P1MiJrAvNh5eDt68/3w7+t0HucZ8jhWeIwWXi0uKMqnJowWI1vSt7A3ey9HC45ytPAoyQXJlSsUgDtb38mL3V+sNZxTSsnPcT/z1d6vCHANoGdAT3oG9CTKL+qcnc6Ld6Xx/P/2YCvM2DSbh73HfipyBhFmewudgr2ICvIktIlg6s7xeDt58eOIH0/7fbFYJDuPnWDlwUz+OHCcZOumMlsbQaiPCy2auNGyqSsRTdyIDvEmomn9zGz1xSItvLPtHRYcXsCwsGGsPraaoWFD+b++/9eg41zNaAWhOY1yUznpJemUGktxsnMi3DO8Vidl1ecopSS7LJvYvNjKz+G8w2SXZTM4dDB3R95Ne7+zNyVmFGfwwvoX2Juzl/va3cczXZ+ptGHnGfLYlrGNLelb2JKxBRc7Fx7p+AjDwofV23lalVJjKc+te45N6Zvwd/Fn9vDZBLoFnnM/FwOzxUx6STpHC47iZu9GF/8udTdqIL7bdJR//3qQXi18+Pq+bhSUGXht0zR25K6kqWUIealDOVFixCngf9h57sI99zna+7WnbYA7IT4u7Dx2glUHs8gpLj8tLUXPcB9CfV1xsLs49ciklHy480O+2/8d7g7uLL1laYNGkl3taAWhAdTbVm5ZLtll2QgEXo5e5BnyaOrS9LRNS2dy8jn+EvcLH+38iDzDqU3tzd2bE+kTiYudCyuTV1JmKqOjX0fGRY5jaNhQHGwd2JS2iSkbp2C0GHnzujcZFDqo0e/VaDay4PAC+gX3o7lHzZFNlztSSlJPlOHlYo97AyWOk1Ly4aojfLImniHt/PnkruhKc49FWvjP3/9h7qG5jIkYQ7RfX17b+izRHrfhaRjNoYxCErOLsUhwdbClX5umDGnvT//Ipg2a2O587mlJwhICXQPpEdDjkslxJaIVhIYyUxnpxekYTAY8HD1o5toMext7UopSKKooooVnixrNGocOHaLYu5gJKyfQsUlHhoYNJdInkjbebU6LzCmqKGJpwlIWxC4gqTAJHycfejTrwR9JfxDhHcGH/T4k1CP0Yt3yFU9ybgmTF+1la6JSyO6OdgR4OdHM05lATyeCvJwZ0r4ZbZq517tPi0UydekB5mxN5o6uwbwzJuqshHBSykpbvq2wJdQjlIUjF1Y6fA1GM8fySgnxcblgP4Lm0qMVxBWEm5sbxcUNk+fdbDFjMBsoqigitywXOxs7AlwD8HA8ZV83WUzE58djb2NPC88W1dr69x/Yz2P7HsPbyZu5N82tM1zTIi1szdjKgtgFbEzdyE0tbuLVXq/ibFd9BlrN6Zgtku82HeX9lYext7Hhsf4R2AjIKDCQUVBGRoGB9HwDOcXlAHQP8+buniEM7xBQ64RdYbLw/P/28OuedB65oQVThkfW6tv5bv93zNw3ky8GfXHWbl/N1YNWEFcQ56sgTBYTpcZSDGYDBpP6GC3GyvNeTl74u/hXm/SssLyQlKKUak1NFmnhr51/MeXwFOaNmEeYZ9g5yWW2mLG10W+Z9eVIZhEv/rSXPSn5DGrblDdviaKZZ/Uru7ySChbFpDJ3WzJJuaV4udhze5dg7u4ZgqezvTWKqISEbBVNFJtRxPFCA1OGRzLpxrMjpKrDIi3n5QfSXDloBQHw+xQ4vq9hB20WBcNrj5aYPHkyoaGhlQWDpk2bhhCCDRs2cOLECYxGI2+++SajR48GalcQxcXFjB49urL
da/9+jX7D+lFsLGbBDwuY9cUshBC0bd+Wz2Z+RkFOAVOemUJyktrhe7IGRHVUZ2qSUpJWnMbh2MPYB9hzQ/AN5/ukNHVQYbIwfV0Cn62Nw93Jnqkj2zGqU2C9orcsFsmWRLX5bOUBtfmsKk72NrRs4kbLJm4M79CM4VH1C3/VXBtoBQGXTEHs2rWLZ555hvXr1wPQrl07VqxYgZeXFx4eHuTk5NCrVy/i4uIQQtSqIEwmEyeKTlBhV8Gx48cYO2Qsy7cvJzU+lSfuf4I169cQ3CyY/BP5+Pj4VFsDwtOz+n0A1ZmacspyyCzJpCSthB6dtOOvoSmtMLH+cDZ/HDjOmtgsCg0mRnYKZNrIdpUVxM6VrCIDS3enYyMELZu60bKJK4GeztjY6Ky9muq5JPUgLjvqmMgbi+joaLKyskhPTyc7Oxtvb28CAgJ49tln2bBhAzY2NqSlpZGZmUmzZrVvw5dS8vzk59m2aRt2tnZkH8/Gy+jF2h1rGXvHWEIC1A7LkxlZq6sBURMn/ROpRanklOXgbOdMZkkmHo4e2Dhc2yaGjIIy1sZmE+TtTERTNwI8nKqdcLOLytmXls/e1AIOphdiZyvwc3Os8nHAz92R+KxiVh44zsa4HMpNFrxd7BnSvhm3RgdxXcSFhWc2dXfi4b4XN/eQ5url2lEQl5Dbb7+dn376iePHjzNu3Djmzp1LdnY2MTEx2NvbExYWVm0diDOZPWc2WVlZrNm8hiDPIMLCwjBVmBqsroOnoyeFFYVkl2Vjgw2Odo4EugZyhCMX3PeViJSS/8Wk8savBykqN1Ued7a3pUUTV1o2cSPQy5mE7GL2pxWQUaD+D4WAcD9XBPBXUQ6FBtNZfQd5OXNXD7WjuHuYd7WlJTWaS41WEBeBcePGMWHCBHJycli/fj0LFy6kadOm2Nvbs3btWpKTk+vVz/G84/j6+eLn5ndau5rqOlRXA8LDo/YdwgGuAZQYS5BSEuIecs06mDMLDbz08z7WxGbRM9yH125uR0m5ifjsYhKylOM3JvkEy/amE+bnSs9wHzoEedIx2Iv2gR64Op760yo3mcktriCnuJyc4nKaujvRPtBDF2vSXPZoBXERaN++PUVFRTTBgZ0AACAASURBVAQFBREQEMA999zDyJEjie4aTWSHSFq3qTk3fFUG3zqYRXct4rqe19G5c2ciIyMr+6+urkNNNSBqw87GjnAPVRL1fGoFXOlIKVm8O42pSw5QYbYwdWQ77u8dVmlS6tni9NoBFous077vaGdLoJczgV46zFdzZXHtOKkvE0wWE/nl+eQZ8jCaVRiqs50z4Z7htb5RGkwGEvITaObarNYCJ43B5fgcG4PsonJe+WUfKw9m0iXEi/fv6ESLJg2bO0ijudzQTurLAIPJQK4hl4LyAqSUuNi74O/ij8li4njJcUpNpaeloT6T/PJ8gMu+tOWVytrYLF78aQ+FBhMvDY/k4b4tsNWRP5prHK0gGhmzxUxmaSYnDCcQQuU/8nHyqdxrYJEWssuyyS3LrVQQ+/bt4x//+Mdp/UhbydK1S6vd6KY5fwxGM++uiOW7TUm08Xdn7sO9zil1hUZzNaNnm0akxFhCWnEaRrMRX2df/Jz9zprgbYQNPk4+ZJdmYzAZcLJzIioqit27d5/WT1JBEl6OXhf7Fq5q4jKLeGrBbg5lFPJAnzCmDI/UuYU0mipoBdEIWKSFrNIscstysbe1J8wzrFbzkY+TDzllOeQacqstTJ9fno+NsMHdQb/Z1pdyk5lFMWmYpSTQ04kAT2cCvZzwdFYZR+dtP8Ybyw7i6mDHtw90Y0Ck/yWWWKO5/NAK4jzILs3mRPkJHG0dKz9Otk442DpQYa4grTiNcnM53k7e+Lv41xkqamdjh5ejF/nl+TR1bnp6QRZpobC8EA8HD50Tp57sTyvg+YV7OJxZdNY5Z3tbvF3sSS8w0LeVHx/c2Ymm7jUX59FormW0gjgPioxFWKQFk8VUuWegKnY
2doR6hNa7SD2An7MfJwwnyDPk4e966m22qEKNdS06pwvKjOxIymNrYi7bjuaRV1LBbV2CuadnCE09zp7UK0wWPlsbz+dr4/F1deCb+7vRPtCzMgNqRoGBjPwyMgoNTAr34d6eoToFhUZTC1pBnAcV5go8HDwIdAtESkmFuYJyczkGs9pJ6+Pkc5qv4ZNPPmH69Om0a9eO9PR0du7cyVtvvcULL7xQeY2DrQMejh7kGfLwc/arXHUUlBdgZ2NXq4nqauJAegGLYtLYdjSXgxmFSAkOtjZ0DvEi3M+Vj/+M44t18dwUFcD9fcKIbu6FEIJDGYU8v3APBzMKuTU6iGkj2+PpolZizTydiL7E96XRXIloBXGOmCwmzBZz5SYyIQSOdo442jniQfW7lL/44gt+//13XF1dSU5OZvHixdVe5+fkR2F5ISfKT+Dn7IfJYqLYWIyPk8857bo1mUzY2V15/7WrDmbyxLydAHQJ8ebpga3oGe5LdIhXpfM4MbuY2VuS+SkmlSW70+kY7EmXEG/mbkvG09mer/7RlaHta89ppdFo6oc2ap8jFWZVYP5kda26mDRpEomJiYwaNYq5c+fSvXt37O1PL81YUlLCiBEj6NWtF2NuGMMP837AIi2s27yOu4ffzZDeQ+jRowdFRUUYDAbGjx9PVFQU0dHRrF27FoBZs2Zxxx13MHLkSIYMGQLAe++9R/fu3enYsSNTp05tuIfQCCzYfoxH5uwgspk7m6cMYP7EXjwzqDW9W/qeFlnUookb00a1Z+vLA3l9dHuKy03M2pzE0PbNWPnsjVo5aDQNyJX3mnmevLv9XWLzYi+4H5PFRLm5HGc7Z9r5tmNyj8m1Xv/ll1+yYsUK1q5di59f9Zk6V6xYQWBgIL/99htFFUUcSD1ATlEOD977IB998xG3DriVoqIinJ2d+fjjjwG1VyI2NpYhQ4Zw5IhKprdlyxb27t2Lj48PK1euJC4uju3btyOlZNSoUWzYsIEbbri8ajpIKfl0TTz/XXWEfm2a8PndXU7LY1QTbo523Nc7jHt7hpJTUq4dzRpNI6BXEOeIRVoQiAaNKIqKimL16tVMnjyZXVt34eftx7Z92/Bt6kvf3n0RQuDh4YGdnR1//fVX5Sa6yMhIQkNDKxXE4MGDK1N9r1y5kpUrVxIdHU2XLl2IjY0lLi6uwWRuCMwWyauL9/PfVUe4rUswX9/XrV7KoSo2NkIrB42mkbhmVhB1venXl5SiFAwmA628WzVIfwCtW7cmJiaG5cuX8/LLL9N3QF+6D+yOEOKs6KXacme5urqedt1LL73EI4880mByNiQGo5mn5u9i5cFMHu3Xkn8ObaOzm2o0lxl6BXGOlJvL6+1/qC/p6em4uLhw77338sILL3Bw70FatWlFTmYOe3buAaCoqAiTycQNN9zA3LlzAThy5AjHjh2jTZs2Z/U5dOhQvv3228rqdGlpaWRlZTWo3OeCyWzhUEYhC/9O4bXF+7npk42sOpTJ1JHtmDwsUisHjeYy5JpZQTQEJ0Na3ezPL8Pn8ePH6datG4WFhdjY2PDRRx9x8OBB9u3bx4svvoiNjQ329vZMnz6dNk3a8OOPP/Lkk09SVlaGs7Mzq1ev5rHHHmPSpElERUVhZ2fHrFmzcHQ8W2ENGTKEQ4cOVab3dnNz44cffqBp06bnff9Gs4WUvFLsbGywsxXY29rgYKu+A+SVVJBdXE5OUTk51voHmYUGDmUUcjCjEIPRomRxtKN9oAeTh0Vqp7JGcxmj032fA0azkSMnjhDgFoCPk0+D9Xu5c+jQIY4LX95YdpDEnJJzauvpbE+bZu5EBXnSMdiTqCBPwnxd9QY1jeYyQaf7biDKzeUAONo0rInpcqbcaCa3uJzxi/6mhZ8r74yJwsHWBqPZgtEiMZosGM0WJODj4oCfu0NlDWZfNwcc7XTyO43mSkUriHPg5B6Ia6HSmtkiySoykFNcQbnJwss3RfJAn3Ac7LTbSqO5Vrj
qFYSUssEcoOWWcmyEzVVdk0FKSX6ZkYwCAyazBS9ne4SHEwPbt7zUomk0movMVf066OTkRG5ubq2hoedChbkCB1uHqzbiptxo5mhOCSl5pTjYClr6ueIiy3B10bWUNZoGJX0XfDMUds0Fs+lSS1MjjfoqLIQYBnwM2AIzpZT/d8b5EOB7wMt6zRQp5XIhRBhwCDhsvXSrlHLSuY4fHBxMamoq2dnZ538TVcgqzcLexp4Kp4oG6e9yQUpJcbmJQoMJAXg422PnYMexXKVkg4ODL7WIGs3VxZo3IWWr+mz6CPq/Am1Hgc15vLNbLGAygINLg4vZaApCCGELfA4MBlKBv4UQS6WUB6tc9iqwUEo5XQjRDlgOhFnPJUgpO1+IDPb29oSHh19IF5UYzUbGzR3HhKgJPNH2iQbp83IgJvkEL/+8j8OZRQzv0Ixpo9rjX00qbY1G00Ac3wfxq6H/q9CkjVIW/7sfAjrBwH9By4FQXytFRQn8PBFM5XD3j1BH7ZlzpTFXED2AeCllIoAQYgEwGqiqICRUpkD1BNIbUZ4LIqU4BYu0EOoReqlFaRDMFskbyw7y/ZYkAjycmHlfNwa101XVNA2M2Qixy+Dvb6AkB8KuhxY3qp/O3pdaukvDpo/BwQ16PKyeQeQI2LsQ1r0NP9wGodfBsHeUwqiNgjSYPxYyD8DQt6ERCoo1poIIAlKq/DsV6HnGNdOAlUKIJwFXYFCVc+FCiF1AIfCqlHJjI8paJ0kFSQCEeYRdSjEaBItFMnnRXn6KSeX+3qH8c1jkOedA0mhqpeg4xHwPMd9BUQZ4hYBvBOyeC39/rSazgM5KWfh3UMqjKB0KM9T1RRlQmgsuvuAeoD4eAeAeCJ5B6i27viaV0jz11u7eTPXjdEZafimhIBXSd0JaDKTthJwj0LSdki/8RjVZn/l2LiUUpJxqE9wN2o2uXZYTybD/Z+j16CkFaWMLne+CDrfBzu9h/bswox/0egz6vQSO1WzMTYuB+XdBRSncvRBaDa7fszhHGnNWqG6NdKa3+C5glpTyAyFEb2COEKIDkAGESClzhRBdgcVCiPZSysLTBhBiIjARICQkpOHvoArJhckAhHpe2SsIKSWvLdnPTzGpPDuoNU8ParicUpprHIsZkjdBzCw4uAQsJjWR3/yRmsBsbMFUAWk7IHE9HF0Pmz9V1wHYOlgn8UBoFgXOPkpJFGUoW33RcbCGmuMfBXfNB6/mtcuUtlNNpMXHTx1zcDulcGwdIWMPlFjT0NjYq7Fb9FfHV09Tx5281KonrC8YCqxKIQZKc071a+sAfq2haS2bc7d8ppRj78fPPmfnAD0mQNTtatwtn6nnOOIDaD301HX7f4bFj4KbP9y3pPbxLpDGVBCpQNX/vWDONiE9BAwDkFJuEUI4AX5Syiyg3Ho8RgiRALQGTtsqLaWcAcwAtZO6MW7iJMmFyfg4+eDhUH1RoCsBKSWvLzvI3G3HeLRfS54aGHGpRdLUhMUCiWvh75mQvhvG/wY+Lc6vL0Mh7JkPO75TNu87v284OY1lkLgODi2DI7+rCd3RE3o8At0fAt8zwqPtHCC0j/r0fwnKiyH/mJrsXHxqt71LqfpP3gxLHoevB8C4edC8e/XXH1gMv0wC1yYwdq6StXKVkq4UTukJiBgIQV0hqItazdhV2QhblAlHN8DRdZC4QZnLEOo5th6q2gR2UfJ/1VeN9/BqsLU/W56SHNg5BzqOBY/Amu/T2RtGfgwdx8GvT8O8O6H9rTDs/5TyXfcOhPSGsT+Aa/UlBBqKxlQQfwOthBDhQBowDrj7jGuOAQOBWUKItoATkC2EaALkSSnNQogWQCsgsRFlrZOkwqQr2rwkpeTdFYf5blMSD14XrrOnNhQWy/lFntSEoQB2z1OKITceXPzUxLb8Rbjnp/o7LwGyDsH2r2Hvj1BRDJ7N4eBiiP9TTYrni5Sqn/0/q76MJUoptB6i7OmthoBDPUvkOrqBf7v6XSuEmhD
bjVJv6vPHwqwRcMsX6q27qnwb3oe1b0Lznko5uDU59/sEcPeHjneoz0lTlLMXOLqffe3NH8HCf8DGD6DflLPPb58BpjK47qn6jR3aGyZtVD6LDe8rJWwxQqe7YeRHpyuyRqLRFISU0iSEeAL4AxXC+q2U8oAQ4nVgh5RyKfA88LUQ4lmU+ekBKaUUQtwAvC6EMAFmYJKUMq+xZK0PyYXJ9A3qeylFuCA+/jOOL9cncE/PEF67ue21rRyObYPyIvX253IeObWMZRD7G+xZAAlr1BuqXys1afm1hiatwbeVMqmUF6m35IqiU98txur7TdupnJXGEgjuDrfOgPa3wI5vYcUUZW5of0vd8sWtVqGTSRuVCaXDbcoh6t8BPusGq6cqE8r5KLbyIvX2fnAJuDWDTuOUUgjrq1YHF4umkfDwGvjxXlj0kPIZ3DhFmaCWPgH7/qfe1Ed+AvYNFJUnRO0mrXajIOpO2PCeWl0EVqmEXl4M276CNiPU6qO+2DnCjf+E9mPU/1tIb2Weukh/v1d1sr6GoriimN7ze/NMl2d4KOqhRhmjsZBS8uX6RN5dEcvtXYP5z20dr75EeUZD/SaBnHhY+QocWXHqmHe4UhRBXZWpoEkb9XZ4ponAYoFjW5Sp5uASKC8Ej2Boe7OaNLMPQ04clBec/33YOUGH29VkXnVyMZvg6/5Qkg2Pbz/byVqV2OWw4G61Wuj+IETfB66+p87v+0lNqLfOgE5jz02+nDhYcA/kxsGgf0PvJxp29XQ+mMph2XOw+wflIC7MgNTtMOA16Pv8RZtIKyk7AV/0BidPmLj+1O/lli/gj5fgoVXQvMfFlakOdLK+CyS5SDmoryQTk5SSv+Jz+OTPOP5OOsGoToG8ezUqh82fwarXoHkv9Tbb/hb1x1kVQwGs/496g7NzgsGvqwiak1Erx7bB/kWnt7FzUorCwU39LDuhIlYc3NRE1GkchF5/+gQpJRRnqbfZ3DhAqLaV/bipnzWZBpy8qo9YsbVT5ouZA5X9edg71bfPPAA/T1ARN+N/rz7Kp/0Y5Rhe84a6j/q+XR/6FX55VMl+3xIIv0xK19o5wujP1Kpt1VSwd4Y756i3+UuBszeM+gzm3gZr34Ihb6hQ3y2fQ0ify0451IVWEPXgZIjrlbAHQkrJ+iPZfPJnHDuP5RPg6cTro9tzd48QbK8m5SClmizXv6vMG0XH4denlK0+cgR0ukuFKO5ZAH++rpyb0feqjUhu1poYLW481V9RplIWJ5KUvb7cahKqKFbmAa8Q1TZyRM32dSGUzdrdH8Ib2BwZ3BW6PQjbvlTK6cwY+ZIcmD9OKaC75tccAmpjoxTk7FEq3LTPk7WPazErZfLXh2qVdeds8LzMdtYLAdc9rV4SnL3OzYTTGLQaBF0fUIo4cgTkHYXCVLj5v5dWrvNAK4h6kFyYjEAQ4tG4obQXgpSSdYez+fjPOHan5BPk5cybt3Tgjm7BV1/KbSnhj5dh6xdq0h/5iQodTN+pFMK+n+DAz8r+bi5XdtthiyCwlo357v4QedPFu4fzYeC/4NBSWPasMlWcjMs3VcCP/1CrlweW1x4hA0oxRgxSjs/oe2vesFaSq8xRiWvVhDf8PxfFMXrehJy5zeoSMuRNSFiroppsHdSeilZDLrVU54xWEPUgqTCJQLfAyzLNd4XJwtI96czcmEjs8SKCvZ15Z0wUt3UJvjpTc1vMaqWw6wfo+ajaQXrSzBPUVX2GvKVSGRxZoSbD9mMuvi26MXD2Uvf78wQV7tj9IaUsf3sOjm2G275RK436MOjf8OX1sPG/ygxyJum7rEonE0Z9Cl3ua9BbuepxdFfRVbNuBqTy+VyBv4NaQdSD5MLky87/UFBq5IdtyXy/OYmsonLa+Lvz3u0duSU6CHvbq1AxgHpT/nmCCrG8cbLaZVrdH52dg1oNXO4rgvMh6g7YNQf+/De0Hal8J7vmQN8XTg/1rItmHZQZbttX0GPi6dE5u35Qjl+
3pvDgH8qJrzl3wq5Xv6MJf0KHMZdamvNCK4g6kFKSXJhM55YXlDewQeTIKa4gIbuYFfuPs3BHCqUVZvq28uO9OzpxQyu/xg9dzTqk3szbjgTvsMYd60yMZbDwPohbqZbvddnOr1aEgBH/hel91A7h9J0QebPKBnquDHhFKZi1b8GtX6qIoN8nq/QY4TfC7d+dHgGlOXf6TVafKxStIOog15BLibHkojuo96UWsCkhh4SsYhKyi4nPKqbQoFIS2NsKRnUK4uG+4bQNuEg7u80m+OkhyDoAK19VCcU6jVORMGdGDZ2kJFdF8xSkWvPrHIfCdPW9OFPZwQe/riJPaiP/mDJ3ZOxR0Tzdxjf8/V1J+LWC656BDf9RKSdu/er8wk09g6HXJNj0iTLDrX9XpcG47hkVJmqrp4drHf0bUAdHC44CFy/EtaDMyP/9Hsv87ccAaOruSMsmbozqHEjLJm5ENHWjXYAHvm4X2Vm4c5ZSDje9r8JG9yyApU+qqKE2N0Gb4WrSzzkC2UfUz7Iz9jbau1hz4ASqDWXbZ0DSX3D7tzXnk0lYoxSTxayic9oMb/RbvSLo+7xSrB3HVh8aW1+ufw52zoZ5d6gIqDtn151wTnPNoBVEHZxM0hfmGdao40gpWbY3g3//epC8knIm9A3nsX4ReLteBo7x0jxY85YKJ+3+sDJz9H1e7fzdMx/2W6OGwLqruPWpdAi+rZR92z1ArTSqmsHiV6sojxn9Yfi7yhF68ryUKrRyzRvQJFLlnTkzr8+1jL0T9H3uwvtx9oKh76gsoiM/vvQhoprLCq0g6iC5MBkHGweauTZrtDFST5Ty2uL9rD2cTVSQJ7PGd6dDUA1mm0vB+nfBkK+ShZ2cwIVQETPBXVVkTfYhtXv3XFJXRAyCSZvgl4kqMilxncoxg1DZKmOXqTQRoz6tf24fzbnT+S710WjOQCuIOkgqTCLEIwSbRijGIaXku01JvPfHYYSAf93cjvv7hF2cDW2F6TBnDLQcAEPfqjkELytWJXzrOl5FvlSHnUPdxU1qwt0f7v0FNn2oVilpMSpuPC9RKZ5ej12R4YEazdWAVhB1kFyYTEvPxjFtzNx4lLeWH2JAZFPeuKUDQV51OGsbivIilUI454h68zeXK9/CmROxlCpJnKPb+UXJ1BcbG2WyCr0eFj2sViv3L1VhghqN5pKhFUQtmCwmUopSGNB8QIP3/dveDN5afogRUQF8elf0hedIOrZVxa/3e0lV3KoJswl+ehAyD6oatkl/qcyfUiolUTUa5sgKtYt22LsXJ9wxpCc8sV0VkKkunbJGo7moaAVRCxnFGZgspgYPcd2RlMezC3fTLdSbD+7sdOHK4eASWDRBrQSOrFDx69XlApISfn9R7SU4WeUrwlrlddNHgISbPlBKwlSu0ln4tVE7di8WdYW8ajSai8ZVuuW2YThaqEJcwz3DG6zPxOxiJszeQZCXM1/f1w0n+wvMk7RtBiy8HwI6wvgVKq/O7NEqy+mZqdw3f6pqC1z3zKm9BELAoGlw/bPq3G/PqdTW275UfoBhb1dfHUuj0Vz16BVELVTWoW6gFURucTkPfPc3Qghmje9+YSGsUqp0C399qPYh3PaNyuD58J8qAmjlK8rhO+pT5UM48ItKi93+Vhg49fS+hLAeE/DXf8FYquoKtB5+aoWh0WiuObSCqIXkwmQ8HDzwcvS64L4MRjMPz95BZqGB+RN7Eep7AWGbpgq1SW3vApVl86YPTu16dfJQewY2faTSXGcdUvHyS55Q5Rdv+bL6XbdCqGyhQqiSiTb2KrpJo9Fcs1zzCsJoNrL9+PZqz+3P2U+YR9gF5zgyWyTPLNjN7pR8pt/TlS4hNaRXrg/lRSonUcIa6P8q3PDC2dFHQiiTUUBn5ZD+eYIqeD9ufu0FYoRQKRbcmql9B3pjmkZzTXPNK4giYxGTVk+q8fxtrW67oP4tFsnLP+9jxYHjvHZzO4Z1uMANd0uegMT1MPpzlcu
/Nlr2h0fWq1w7vR6tXySSENBz4oXJqNForgqueQXh7uDOnOFzajzf2rv1efctpWTarwf4cUcKTw2I4KHrL9DZnbxFpbru91LdyuEkXiEw4v0LG1ej0VyTXPMKwt7Gns5NGz6Vt5SSt347xOwtyUy8oQXPDj5/RQOoyKI/Xgb3wGs31bVGo7mo6DDXRuKDlUeY+ddR7u8dykvDI6v3Y0ipCrZkH667w/2LVO7/gf/SeYk0Gs1FQSuIRuDTP+P4bG08d/VoztSR7Wt2cu/6AX7/J8y5VdVKqAljGayepvIddRzbKDJrNBrNmWgF0cDM2JDAB6uOMCY6iLduiap5l3Rxliq84x8FZSdgwT1gNFR/7dYvoDBV1Vo+n8IwGo1Gcx7o2aYBWRubxdvLYxnRMYD/3N6x9hQaK6aoDWm3f6sqgqXtUCmvz9z9XJylCsu3GVF9+gyNRqNpJLSCaED2pOYD8N87O2FnW8ujPbJS+RT6vgBNrMV1+r8Ke39UO6OrsvYtMBlUaU6NRqO5iFzzUUwNSX6pEXcnOxztasmvVF6s8h01iVSb2U5ywwsq9fafr6uqXpEjVMbVnbOhxyPgF9H4N6DRaDRV0CuIBqSwzIincx2J7da8CQWpMPITVWjnJEKozW+BnVVm1uP7lY/C0R1u/GfjCq7RaDTVoBVEA5JfZsTLpRYFkRqjsqR2f0jVPjgTe2eVDsPJA76/GRL+hBsnn1sZT41Go2kgtIJoQApqW0GYjcoJ7d5M7WWoCY8AGDdPhbb6tIDuExpHWI1Go6kD7YNoQPJLK4hs5lH9yc2fQuZ+GDsXnDxr7yioC0xYAw5up5uhNBqN5iKiFUQDUlBmxKO6FURWLKx/F9qOhLY3168z//YNK5xGo9GcI9rE1EBIKSmozgdRmgfzx4KjBwx/79IIp9FoNOeBXkE0EKUVZoxmiVfVFYSpQtVuKMyA8cuVf0Gj0WiuEOq9ghBCXC+EGG/93kQI0XCFmq8CCsqMAKec1FLC7y9C0kZV9jO42yWUTqPRaM6deikIIcRUYDLwkvWQPfBDYwl1JZJfqhREpYlp+9cQM0tthuukE+xpNJorj/quIG4FRgElAFLKdMC9sYS67Dm2FTa8B7kJlYdOriA8nO1VOdAVU6DNTTCglpBWjUajuYyprw+iQkophRASQAhxbRck2PwpxC5Tu6KDe0CncZTa9gGgaXkKLHlApdIYM0NnX9VoNFcs9VUQC4UQXwFeQogJwIPA140n1mVOWT40i4KoO2D3fPjtOfrZ2POFfTRhK3PA1g7umq/SZGg0Gs0VSr1eb6WU7wM/AYuANsC/pJSf1tVOCDFMCHFYCBEvhJhSzfkQIcRaIcQuIcReIcRNVc69ZG13WAgxtP63dBEwFIBnc7juaXhsCzyygQOBd9DdJhbbolQY+wN4h15qKTUajeaCqHMFIYSwBf6QUg4CVtW3Y2u7z4HBQCrwtxBiqZTyYJXLXgUWSimnCyHaAcuBMOv3cUB7IBBYLYRoLaU013f8RsVQAE4d1HchIKATvwc/zXeJwzn0Yi9wa3pp5dNoNJoGoM4VhHVSLhVC1JEf4ix6APFSykQpZQWwABh9ZvfAydwUnkC69ftoYIGUslxKeRSIt/Z3eWDIPytdRkGZETcXZ4RWDhqN5iqhvj4IA7BPCLEKayQTgJTyqVraBAEpVf6dCpyZwnQasFII8STgCgyq0nbrGW2DzhxACDERmAgQEhJSn/u4cCxmKC8EJ6/TDheU1pBmQ6PRaK5Q6qsgfrN+zoXq6m2eUU+Tu4BZUsoPhBC9gTlCiA71bIuUcgYwA6Bbt25nnW8UygvVz2pWEF5aQWg0mquIeikIKeX3QggHoLX10GEppbGOZqlA8yr/DuaUCekkDwHDrGNsEUI4AX71bHtpKFNlRc9UEPllFTR1d7oEAmk0Gk3jUN+d1P2AOJTT+QvgiBDihjqa/Q20EkKEW5XLOGDpGdccAwZax2gLOAHZ1uvGCSE
crSk9WgHb63VHjY2hQP10PsPEVJ9qchqNRnMFUV8T0wfAECnlYQAhRGtgvGJFpQAAED5JREFUPtC1pgZSSpMQ4gngD8AW+FZKeUAI8TqwQ0q5FHge+FoI8SzKhPSAlFICB4QQC4GDgAl4/LKKYIKzVxClWkFoNJqri/oqCPuTygH4//buPrau+r7j+PsTx44NIU+QlJAECJCxgAYJpAgVNvHQQraigqbSJQWEpkloE+2g2hPdWtplQ2qlrrSq0IC10VhHoYXBGk3ZOppQ6CMkoVkLJNA0oxCS4qBg58F2/PTdH+c4udjH9vHi4+uc83lJke/5+Vz790tu7uee3znn+yMiXpU06rthRKwnuXS1tu3umscvA5cN89x7gHty9m/idA2dYurrDw509TogzKxU8gbEZklfA76ebt8EbCmmS5PckSOIo1NM+zsHFeozMyuBvAHxJ8DtwJ+SXGH0LMm5iOrJmGJqG1zq28ysBPIGxFTgyxHxRThyl/S0wno1mXW2gaYk60Wn2n0EYWYllLfU6AagpWa7Bfju+HfnONDVnhw91FRpbevoBnwEYWblkjcgmiPi4MBG+viEYro0yQ0ERI2jq8k11aNHZmaFyBsQhyRdNLAhaQXQWUyXJrlh6jCBp5jMrFzynoO4A3hM0m6S+xVOA6q5jmZXe2YdJvAUk5mVS96AWAwsB04nWX70UjJqI1VCVzuc8p53NbV19nBiUwONDV49zszKI+872qcjYj8wi2R9hweBfyysV5NZZ/YUk48ezKxs8gbEQJmLDwL3R8S3gWqeke1qH1KHqa2jh5knVPOvw8zKK29AvJmuSf0RYL2kaWN4bnn0HobeziFHEPs7e5jZkne2zszs+JD3Tf4jJEX3VkZEGzAH+IvCejVZZZTZgKTU9yxf4mpmJZN3PYgO4Ima7T3AnqI6NWkNFxCu5GpmJVS9aaJjMUyp7/bOHt8DYWal44AYi4zV5Lp6+jjc2+/1qM2sdBwQYzGwFkTNVUy+i9rMysoBMRZZpb7Tu6h9ktrMysYBMRYZq8m1ey0IMyspB8RYdLVDwzRoPFr5fKDUt6eYzKxsHBBjMWKpbweEmZWLA2IshqnDBDDTRxBmVjIOiLEYpg7TFMH0JpfaMLNycUCMxTBTTDNbGpkyRXXqlJlZMRwQY5GxmlybS32bWUk5IMYiazW5Tpf6NrNyckDkFZE9xdTR7SMIMyslB0Re3Yegvze7UJ8DwsxKyAGR10CZjcFXMbmSq5mVlAMir4w6TP39ka4m54Aws/JxQOSVUYfpwOFe+sN3UZtZOTkg8spYTa69w2U2zKy8HBB5ZUwxHV0Lwpe5mln5OCDyOrKa3NEjiLbOpJKrjyDMrIwcEHmNeAThgDCz8nFA5NXVDk3ToeFoUb42n4MwsxJzQOSVUYfJa0GYWZk5IPIapg7TtKlTaG5sqFOnzMyK44DIK7MOk++iNrPyckDklbGaXFtnN7NafImrmZVToQEhaaWkVyTtkHRXxvfvlbQ1/fOqpLaa7/XVfG9dkf3MZZjV5Hz+wczKqrB1MiU1APcBHwB2AZskrYuIlwf2iYhP1Oz/cWB5zY/ojIhlRfVvzIZZTW7RnBPq1CEzs2IVeQRxCbAjInZGRDfwKHD9CPuvBh4psD//f/19cHj45UbNzMqoyIBYALxRs70rbRtC0hnAYmBjTXOzpM2SfiLphmGed1u6z+a9e/eOV7+HOrw/7dHQq5i8FoSZlVWRAaGMthhm31XA4xHRV9N2ekSsAD4KfEnS2UN+WMSDEbEiIlbMnTv32Hs8nIy7qLt7++no7vMRhJmVVpEBsQtYVLO9ENg9zL6rGDS9FBG70687ge/x7vMTE6tzaKlvl9kws7IrMiA2AUskLZbURBICQ65GknQuMBv4cU3bbEnT0senAJcBLw9+7oTJWE2uPS3UN8NHEGZWUoVdxRQRvZI+BnwHaADWRsRLktYAmyNiICxWA49GRO3001LgAUn9JCH2udq
rnyacS32bWQUVFhAAEbEeWD+o7e5B25/NeN6PgN8qsm9jkrGa3EChPp+kNrOy8p3UeWStJudCfWZWcg6IPLraQVOSct+pI0cQPkltZiXlgMijsw2mzYApR/+62tIjiJOaHRBmVk4OiDwy6jDt7+xhRvNUGqZk3e5hZnb8c0DkkVGHqa2jm5meXjKzEnNA5DHManIu9W1mZeaAyCNjNbk2F+ozs5JzQOQxTKlvTzGZWZk5IPLIWE2u3YsFmVnJOSBG03sYejvfdRVTRLjUt5mVngNiNF1D14I41N1Hb3/4JjkzKzUHxGgy6zAllVw9xWRmZeaAGM2IdZh8mauZlZcDYjQZRxDtHS7UZ2bl54AYjVeTM7OKckCMJmM1uTaX+jazCnBAjGbE1eQcEGZWXg6I0XS1QUMTTG0+0tTW0UNjg2hpbKhjx8zMiuWAGM1AHSYdLevd3tnDzJYmJJf6NrPyckCMZlAdpn2Huvn+L/Yyf2bzCE8yMzv+Ta13Bya9mjpMh3v7+OOvb6H1wGG+snp5nTtmZlYsH0GMJl1NLiL4mydf5PnX9vGFGy9k+emz690zM7NCOSBGk04xPfDsTh7fsos7rl7Chy48rd69MjMrnANiNF1tvN7RyOf/azvXXTCfO9+/pN49MjObEA6IkUQQne38544uLlg4iy/ceKGvXDKzynBAjKB13z4UvfQ0nsQ/3XIxzb7vwcwqxAExgjWP/QiA33/fecyb4ctazaxaHBDD+HV7F6/+6k0ATjt1fp17Y2Y28RwQw3j6lVZmcCjZGLQetZlZFTgghrFhWytnTe9NNhwQZlZBDogMXT19/HDH21wyPz0pXbOanJlZVTggMjz3v/vo7OnjgrlpgwPCzCrIAZFh47a3aG6cwuITk3UfaJ5R3w6ZmdWBA2KQiGDjK61cfs4pNHYfgKbp0OCFgcysehwQg+xoPcgb+zq58jfnDSn1bWZWJQ6IQTZubwXgynPnJavJOSDMrKIcEINs2N7K0vkzOG1Wy9HV5MzMKsgLBvX1wJsvAHDwcC/x+iZuXn4avP4cHNgDJ7t6q5lVkwOiqx3WXgPAdOCxRuDF9A/AmZfXqWNmZvVVaEBIWgl8GWgAvhoRnxv0/XuBK9PNE4B5ETEr/d6twKfS7/19RDxUSCenzYCbnwDg/md28rM32/jK6otoGKjqveDiQn6tmdlkV1hASGoA7gM+AOwCNklaFxEvD+wTEZ+o2f/jwPL08RzgM8AKIIAt6XPfGfeOTm2Cc66mrz944Bt9XHnue2lYsmzcf42Z2fGmyJPUlwA7ImJnRHQDjwLXj7D/auCR9PG1wFMRsS8NhaeAlQX2la1vvMM7HT3J5a1mZlZoQCwA3qjZ3pW2DSHpDGAxsHEsz5V0m6TNkjbv3bv3mDq7cXsrDVPE7/zG3NF3NjOrgCIDImttzhhm31XA4xHRN5bnRsSDEbEiIlbMnXtsb+wbtrWy4ozZzGzxXdNmZlBsQOwCFtVsLwR2D7PvKo5OL431ucdsd1sn2399gKs8vWRmdkSRAbEJWCJpsaQmkhBYN3gnSecCs4Ef1zR/B7hG0mxJs4Fr0rZCPP1Kcvf01UsdEGZmAwq7iikieiV9jOSNvQFYGxEvSVoDbI6IgbBYDTwaEVHz3H2S/o4kZADWRMS+ovq6cVsri+a0cPbc6UX9CjOz406h90FExHpg/aC2uwdtf3aY564F1hbWuVRXTx8//OXbrHrv6UhZpz7MzKqp8rWY9nf2cO35p3Lt+afWuytmZpNK5UttzJvRzJdXLa93N8zMJp3KH0GYmVk2B4SZmWVyQJiZWSYHhJmZZXJAmJlZJgeEmZllckCYmVkmB4SZmWVSTQmk45qkvcCvRtntFODtCejOZFTVsXvc1eJxj90ZEZG5XkJpAiIPSZsjYkW9+1EPVR27x10tHvf48hSTmZllckCYmVmmqgXEg/XuQB1Vdewed7V43OOoUucgzMwsv6odQZiZWU4OCDMzy1S
ZgJC0UtIrknZIuqve/SmKpLWSWiW9WNM2R9JTkn6Rfp1dzz4WQdIiSU9L2ibpJUl3pO2lHrukZknPS/qfdNx/m7YvlvRcOu5vSmqqd1+LIKlB0k8l/Ue6XZVxvybp55K2Stqcto37a70SASGpAbgP+F3gPGC1pPPq26vC/DOwclDbXcCGiFgCbEi3y6YX+LOIWApcCtye/huXfeyHgasi4kJgGbBS0qXA54F703G/A/xRHftYpDuAbTXbVRk3wJURsazm/odxf61XIiCAS4AdEbEzIrqBR4Hr69ynQkTEs8C+Qc3XAw+ljx8CbpjQTk2AiNgTES+kjw+QvGksoORjj8TBdLMx/RPAVcDjaXvpxg0gaSHwQeCr6baowLhHMO6v9aoExALgjZrtXWlbVbwnIvZA8kYKzKtzfwol6UxgOfAcFRh7Os2yFWgFngJ+CbRFRG+6S1lf718C/hLoT7dPphrjhuRDwH9L2iLptrRt3F/rU4/1BxwnlNHm63tLSNJ04N+AOyNif/Khstwiog9YJmkW8CSwNGu3ie1VsSRdB7RGxBZJVww0Z+xaqnHXuCwidkuaBzwlaXsRv6QqRxC7gEU12wuB3XXqSz28JWk+QPq1tc79KYSkRpJweDginkibKzF2gIhoA75Hcg5mlqSBD4BlfL1fBnxI0mskU8ZXkRxRlH3cAETE7vRrK8mHgkso4LVelYDYBCxJr3BoAlYB6+rcp4m0Drg1fXwr8O069qUQ6fzz14BtEfHFmm+VeuyS5qZHDkhqAd5Pcv7laeDD6W6lG3dEfDIiFkbEmST/nzdGxE2UfNwAkk6UdNLAY+Aa4EUKeK1X5k5qSb9H8gmjAVgbEffUuUuFkPQIcAVJ+d+3gM8A/w58CzgdeB24MSIGn8g+rkm6HPg+8HOOzkn/Ncl5iNKOXdIFJCckG0g+8H0rItZIOovkk/Uc4KfAzRFxuH49LU46xfTnEXFdFcadjvHJdHMq8I2IuEfSyYzza70yAWFmZmNTlSkmMzMbIweEmZllckCYmVkmB4SZmWVyQJiZWSYHhNkkIOmKgYqkZpOFA8LMzDI5IMzGQNLN6foLWyU9kBbKOyjpHyS9IGmDpLnpvssk/UTSzyQ9OVCfX9I5kr6bruHwgqSz0x8/XdLjkrZLelhVKCRlk5oDwiwnSUuBPyAplLYM6ANuAk4EXoiIi4BnSO5eB/gX4K8i4gKSO7wH2h8G7kvXcHgfsCdtXw7cSbJmyVkk9YbM6qYq1VzNxsPVwMXApvTDfQtJQbR+4JvpPv8KPCFpJjArIp5J2x8CHktr6CyIiCcBIqILIP15z0fErnR7K3Am8IPih2WWzQFhlp+AhyLik+9qlD49aL+R6teMNG1UWzOoD///tDrzFJNZfhuAD6c1+AfWAD6D5P/RQAXRjwI/iIh24B1Jv5223wI8ExH7gV2Sbkh/xjRJJ0zoKMxy8icUs5wi4mVJnyJZyWsK0APcDhwCzpe0BWgnOU8BScnl+9MA2An8Ydp+C/CApDXpz7hxAodhlpuruZodI0kHI2J6vfthNt48xWRmZpl8BGFmZpl8BGFmZpkcEGZmlskBYWZmmRwQZmaWyQFhZmaZ/g8ZlRDqba1GEgAAAABJRU5ErkJggg==\n", 413 | "text/plain": [ 414 | "
" 415 | ] 416 | }, 417 | "metadata": { 418 | "needs_background": "light" 419 | }, 420 | "output_type": "display_data" 421 | } 422 | ], 423 | "source": [ 424 | "import matplotlib.pyplot as plt\n", 425 | "\n", 426 | "epochs = range(1, len(history.history['acc']) + 1)\n", 427 | "plt.plot(epochs, history.history['acc'])\n", 428 | "plt.plot(epochs, history.history['val_acc'])\n", 429 | "plt.plot(epochs, history.history['val_f1_m'])\n", 430 | "plt.title('model accuracy')\n", 431 | "plt.ylabel('score')\n", 432 | "plt.xlabel('epoch')\n", 433 | "plt.legend(['train_acc', 'val_acc', 'f1score'], loc='upper left')\n", 434 | "plt.savefig(\"./1DCNN reuslt.png\")\n", 435 | "plt.show()\n", 436 | "# train에 대해서 오버피팅 된다. val은 조금 증가하고 왔다갔다한다. epoch은 20정도만해도 충분한 것 같다." 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 58, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "model.save(\"./cnn_model\")" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [] 475 | } 476 | ], 477 | "metadata": { 478 | "kernelspec": { 479 | "display_name": "Python 3", 480 | "language": "python", 481 | "name": "python3" 482 | }, 483 | "language_info": { 484 | "codemirror_mode": { 485 | "name": "ipython", 486 | "version": 3 487 | }, 488 | "file_extension": ".py", 489 | "mimetype": "text/x-python", 490 | "name": "python", 491 | "nbconvert_exporter": "python", 492 | "pygments_lexer": "ipython3", 493 | "version": "3.7.4" 494 | } 495 | }, 496 | "nbformat": 4, 497 | 
"nbformat_minor": 4 498 | } 499 | -------------------------------------------------------------------------------- /BadWordDetectionByRegularExpression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# RegularExpression 사용하여 비속어 탐지\n", 8 | "\n", 9 | "- 1차적으로 문장에서 비속어가 있는 부분을 찾기 위함\n", 10 | "\n", 11 | "- 정규식표현을 사용하여 중간에 글자가 들어오는 부분도 고려\n", 12 | "\n", 13 | "- 크롤했던 키워드들 전부 추가함\n", 14 | "\n", 15 | "- 비속어가 있는 위치와 좌우 어절을 포함한 Trigram 반환" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import re\n", 25 | "\n", 26 | "# |는 OR 입니다. => (씨 or 시) AND (벌 or 빨 or 발 or 바)\n", 27 | "# 예) 쌍1년, 씨12발, 꼬a추\n", 28 | "# 중간에 2글자 까지 들어갈 수 있도록 허용하였습니다.\n", 29 | "\n", 30 | "patternList = [\n", 31 | " re.compile('((쌍|썅).{0,2}(놈|년))'),\n", 32 | " re.compile('(씨|시).{0,2}(벌|빨|발|바)'),\n", 33 | " re.compile('(병|븅).{0,2}(신|쉰|싄)'),\n", 34 | " re.compile('(좆|존|좃).{0,2}(같|되|는|나|돼)'),\n", 35 | " re.compile('(개|게).{0,2}(같|갓|새|세|쉐|끼)'),\n", 36 | " re.compile('(걸|느).{0,2}(레|금)'),\n", 37 | " re.compile('(꼬|꽂|고).{0,2}(추|츄)'),\n", 38 | " re.compile('(니|너).{0,2}(엄|엠|애|m|M)'),\n", 39 | " re.compile('(애|에).{0,2}(미)'),\n", 40 | " re.compile('(노).{0,2}(애미|앰|엠)'),\n", 41 | " re.compile('(섹|쎅).{0,2}(스|쓰)'),\n", 42 | " re.compile('(ㅅㅂ|ㅄ|ㄷㅊ)'),\n", 43 | " re.compile('(s|S)(e|E)(x|X)'),\n", 44 | " re.compile('(미|뮈|믜).{0,2}(친|췬|칀)'),\n", 45 | " # 마지막에 추가해서 넣은 키워드들 \n", 46 | " re.compile('자지|꼴깝|새끼들|애미|짜식|빠굴|씹년|미친넘|18년|폐녀자|미틴|이놈\\\n", 47 | " |조센징|미시촌|주접|붕가|패티쉬|쳐먹|뒤질래|쉐리|호로자식|개좌식|뭥미\\\n", 48 | " |별창|망나니|딸딸이|니에미|좃|십새|싸보이다|미췬|씨댕|새꺄|쎅스|10세|\\\n", 49 | " 상넘|꼰대|개놈|꼴갑|시벌탱|씨방새|발기|새끼|10새끼|꼴리|옘병|아구창|\\\n", 50 | " 개좆|아갈|창녀|염병|포르노|미친놈|음탕|또라이|좃나|한남충|조지다|호로|\\\n", 51 | " 후빨|조또|지랄|오지구|세끼|슨상님|병쉰|싸가지|빠큐|엠생|시궁창|꼬라지|\\\n", 52 | " 우라질|혼음|개빡|뒈진|멍청이|뒤진다|어미|듣보|꼴값|광녀|따먹기|양키|\\\n", 53 | " 
잡종|상놈|넌씨눈|떡치기|개년|꼬추|쎄엑|개지랄|18|시부랄|느개비|오짐|\\\n", 54 | " 보지|부랄|고인물|찌질|정박아|뒤질|개쓰래기|좇같|후려|시키|육갑|씹새|\\\n", 55 | " 씝창|미쳤나|호모|조온나|씨파|쉬발|십세|병자|게새끼|개새끼|시부럴|개시키|\\\n", 56 | " 개민폐|언년|쓰발|sex|눈알|뽄세|씹새기|씨팔|앰창|놈|개수작|아가리|무뇌\\\n", 57 | " 오진다|창놈|좆같|병맛|로리타|그년|씨부럴|저능아|쌔끈|주뎅이|토끼다|대가리|\\\n", 58 | " 씹팔|디졌|대갈|엠창|트롤|개씹|썅넘|오졌|갈보|씨발|시발|개자식|극혐|개같은|\\\n", 59 | " 개짱|미친색|기레기|남혐|야설|이새끼|10창|18놈|섹스|씨불|성인체위|십팔|\\\n", 60 | " 벌레|빠가|운지|빙신|개돼지|장애인|씹창|썩을|꼬붕|매국노|18새끼|발놈|와꾸|\\\n", 61 | " 느금|허접|고추|미쳤니|노답|오져|같은년|좆까|돌았나|씨빨|새키|븅|좆만|존싫|\\\n", 62 | " 사이코|십새끼|섹수|조까|시끼|변태새끼|늬미|열폭|년|쥐랄|잡놈|존버|꼴리다|충|\\\n", 63 | " 자슥|모가지|씨벌탱|빠구리|니앰|싸가지없|쌍|개간|틀니|냄비|씨발년|시부리|쪽바리|\\\n", 64 | " 저년|씨부랄|씹탱|즤랄|골빈|샹년|젖탱|메갈|시팔|씨빠|쌍년|싸물|싸대기|스트립|\\\n", 65 | " 좆|씨볼|이씨|이년|이자식|오바|니미랄|새기|후레자식|호구|패드립|에로물|쌍욕|\\\n", 66 | " 호로놈|5지구|벼엉|찐따|간나|등신|애자|개같이|쓰레기|5지네|니미|뻑큐|좇|\\\n", 67 | " 개존|관종|빡촌|뒤져|좃밥|엿|귀두|좆나|개짜증|노무|놈현|개쩔|디질|싸죠|\\\n", 68 | " 씨부리|돌았네|개새|병신|씨바|양놈|쌍놈년|구라|머갈|불륜|성기|에로|년놈|\\\n", 69 | " 창년|낯짝|자위|불알|썅년|멍텅|오지네|왜놈|아닥|짱깨|이새키|색끼|주뎅|딜도|\\\n", 70 | " 대갈빡|정신병자|미친|한남|씨방|뻐큐|니미럴|사까시|존만한|꼴통|씨발놈|존나|\\\n", 71 | " |홍어|좆나게|후장|섹|놈들|개새키|븽신|개소리|미치|면상|시댕|갈레|돌아이\\\n", 72 | " |닥쳐|개같|쌉|정사|쒸벌|고자|좃또|조빠|씹|썅제기랄|버러지|십창|딴년|꺼져|\\\n", 73 | " |좇밥|뽄새|눈깔|쪼개|육봉|수간|틀딱|씹쉐|따까리|음란|씹덕|삥땅')]\n", 74 | "\n", 75 | "def return_bad_words_index(content, mode=0):\n", 76 | " # 정규식을 통하여 욕설있는 위치에 *표시 하여 리턴\n", 77 | " if mode == 0:\n", 78 | " for pattern in patternList:\n", 79 | " content = re.sub(pattern, \"**\", content)\n", 80 | " return content\n", 81 | " \n", 82 | " # 비속어 위치와 trigram 리턴\n", 83 | " else:\n", 84 | " # 문장의 음절과 어절간의 리스트 생성: 어절의 위치를 뽑기 위함\n", 85 | " token_position = []\n", 86 | " token_index = 0\n", 87 | " # 각 캐릭터 위치마다 어절 인덱스 저장\n", 88 | " for char in content:\n", 89 | " token_position.append(token_index)\n", 90 | " if char == \" \":\n", 91 | " token_index += 1\n", 92 | "\n", 93 | " # 정규식 표현을 통해 비속어 위치 찾기\n", 94 | " badwords = []\n", 95 | " for pattern in patternList:\n", 96 | " matchObjs = re.finditer(pattern, content)\n", 97 | " badwords += [token_position[obj.span()[0]] for obj in 
matchObjs] # 해당 단어가 속한 어절의 위치\n", 98 | " \n", 99 | " content = [\" \"]+ content.split(\" \") + [\" \"] # 어절을 반환하기 위한 스플릿 & 맨앞, 맨뒤 padding\n", 100 | " \n", 101 | " badwords = list(set(badwords)) # 중복제거\n", 102 | "\n", 103 | " return [(content[index], content[index+1], content[index+2], index) for index in badwords] # trigram(3어절 반환) & 단어 위치" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "# ==== 예시 테스트 ====" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 4, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "content = \"\"\"\n", 120 | "애미 지난해 12월 시발 미국의 자지 꼴갑들하고 헤지펀드 빠구리가 이렇게\n", 121 | "개새끼 엘리엇이 좃같네 시1발 한국 정부를 상대로 소송을 미친 제기했습니다 \n", 122 | "애미없네 삼성물산과 제일모직의 합병을 승인하는 과정에서 한국 정부가 부당하게 개입했고 \n", 123 | "이를 통해 자신들이 피해를 봤다는 건데요 법무부에 따르면 한국 정부는 증거와 증인이 없다며 \n", 124 | "조목조목 반박한 답변서를 엘리엇에게 보냈습니다 하지만 주진우 시사인 기자는 법무부의 답변서 \n", 125 | "내용이 수상하다고 지적합니다 향후 이재용 부회장의 대법원 판결에 영향을 줄 수 있는 내용이 담겨있다는 것인데요 \n", 126 | "tbs 라디오 김어준의 뉴스공장에 출연한 주진우 시사인 기자의 얘기를 들어보시죠 법무부 답변서 내용을 한마디로 \n", 127 | "정리하자면 삼성물산 합병과 관련해서는 모든 것이 합법적이다 이렇게 답변서가 말하고 있는 건가요 그러니까요 \n", 128 | "법무부가 주장하는 것은 삼성의 주장과 거의 동일하다 이렇게 볼 수 있습니다 아니 그런데 법무부가 삼성을 이렇게까지 \n", 129 | "대변할 이유가 없잖아요 물론 정부가 나서서 삼성을 도왔다 이렇게 하면 엘리엇 소송에서 불리하니까 이렇게 답변했다고 볼 수 있는데 \n", 130 | "이것은 굉장히 중요한 문제가 걸려 있습니다 그리고 이런 소송은 엘리엇이 8000억 원 넘는 피해를 봤다 \n", 131 | "이렇게 했을 때 너희가 어떻게 피해를 받았는지 구체적으로 입증해라 이렇게 소송을 대비하는 게 맞다고 합니다. 
개새끼\n", 132 | "\"\"\"" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "기존 내용 \n", 145 | "애미 지난해 12월 시발 미국의 자지 꼴갑들하고 헤지펀드 빠구리가 이렇게\n", 146 | "개새끼 엘리엇이 좃같네 시1발 한국 정부를 상대로 소송을 미친 제기했습니다 \n", 147 | "애미없네 삼성물산과 제일모직의 합병을 승인하는 과정에서 한국 정부가 부당하게 개입했고 \n", 148 | "이를 통해 자신들이 피해를 봤다는 건데요 법무부에 따르면 한국 정부는 증거와 증인이 없다며 \n", 149 | "조목조목 반박한 답변서를 엘리엇에게 보냈습니다 하지만 주진우 시사인 기자는 법무부의 답변서 \n", 150 | "내용이 수상하다고 지적합니다 향후 이재용 부회장의 대법원 판결에 영향을 줄 수 있는 내용이 담겨있다는 것인데요 \n", 151 | "tbs 라디오 김어준의 뉴스공장에 출연한 주진우 시사인 기자의 얘기를 들어보시죠 법무부 답변서 내용을 한마디로 \n", 152 | "정리하자면 삼성물산 합병과 관련해서는 모든 것이 합법적이다 이렇게 답변서가 말하고 있는 건가요 그러니까요 \n", 153 | "법무부가 주장하는 것은 삼성의 주장과 거의 동일하다 이렇게 볼 수 있습니다 아니 그런데 법무부가 삼성을 이렇게까지 \n", 154 | "대변할 이유가 없잖아요 물론 정부가 나서서 삼성을 도왔다 이렇게 하면 엘리엇 소송에서 불리하니까 이렇게 답변했다고 볼 수 있는데 \n", 155 | "이것은 굉장히 중요한 문제가 걸려 있습니다 그리고 이런 소송은 엘리엇이 8000억 원 넘는 피해를 봤다 \n", 156 | "이렇게 했을 때 너희가 어떻게 피해를 받았는지 구체적으로 입증해라 이렇게 소송을 대비하는 게 맞다고 합니다. 개새끼\n", 157 | "\n", 158 | "필터 적용 후 내용 \n", 159 | "** 지난해 12월 ** 미국의 ** **들하고 헤지펀드 **가 이렇게\n", 160 | "** 엘리엇이 **네 ** 한국 정부를 상대로 소송을 ** 제기했습니다 \n", 161 | "**없네 삼성물산과 제일모직의 합병을 승인하는 과정에서 한국 정부가 부당하게 개입했고 \n", 162 | "이를 통해 자신들이 피해를 봤다는 건데요 법무부에 따르면 한국 정부는 증거와 증인이 없다며 \n", 163 | "조목조목 반박한 답변서를 엘리엇에게 보냈습니다 하지만 주진우 시사인 기자는 법무부의 답변서 \n", 164 | "내용이 수상하다고 지적합니다 향후 이재용 부회장의 대법원 판결에 영향을 줄 수 있는 내용이 담겨있다는 것인데요 \n", 165 | "tbs 라디오 김어준의 뉴스공장에 출연한 주진우 시사인 기자의 얘기를 들어보시죠 법무부 답변서 내용을 한마디로 \n", 166 | "정리하자면 삼성물산 합병과 관련해서는 모든 것이 합법적이다 이렇게 답변서가 말하고 있는 건가요 그러니까요 \n", 167 | "법무부가 주장하는 것은 삼성의 주장과 거의 동일하다 이렇게 볼 수 있습니다 아니 그런데 법무부가 삼성을 이렇게까지 \n", 168 | "대변할 이유가 없잖아요 물론 정부가 나서서 삼성을 도왔다 이렇게 하면 엘리엇 소송에서 불리하니까 이렇게 답변했다고 볼 수 있는데 \n", 169 | "이것은 굉장히 중요한 문제가 걸려 있습니다 그리고 이런 소송은 엘리엇이 8000억 원 넘는 피해를 봤다 \n", 170 | "이렇게 했을 때 너희가 어떻게 피해를 받았는지 구체적으로 입증해라 이렇게 소송을 대비하는 게 맞다고 합니다. 
**\n", 171 | "\n", 172 | "욕설 위치 반환 [(' ', '\\n애미', '지난해', 0), ('12월', '시발', '미국의', 3), ('미국의', '자지', '꼴갑들하고', 5), ('자지', '꼴갑들하고', '헤지펀드', 6), ('헤지펀드', '빠구리가', '이렇게\\n개새끼', 8), ('빠구리가', '이렇게\\n개새끼', '엘리엇이', 9), ('엘리엇이', '좃같네', '시1발', 11), ('좃같네', '시1발', '한국', 12), ('소송을', '미친', '제기했습니다', 17), ('제기했습니다', '\\n애미없네', '삼성물산과', 19), ('합니다.', '개새끼\\n', ' ', 159)]\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "print(\"기존 내용 \", content)\n", 178 | "print(\"필터 적용 후 내용\",return_bad_words_index(content, mode=0))\n", 179 | "print(\"욕설 위치 반환\", return_bad_words_index(content, mode=1))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.7.4" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 4 218 | } 219 | -------------------------------------------------------------------------------- /BadWordDetectionByRegularExpression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # RegularExpression 사용하여 비속어 탐지 5 | # 6 | # - 1차적으로 문장에서 비속어가 있는 부분을 찾기 위함 7 | # 8 | # - 정규식표현을 사용하여 중간에 글자가 들어오는 부분도 고려 9 | # 10 | # - 크롤했던 키워드들 전부 추가함 11 | # 12 | # - 비속어가 있는 위치와 좌우 어절을 포함한 Trigram 반환 13 | 14 | import re 15 | 16 | # |는 OR 입니다. => (씨 or 시) AND (벌 or 빨 or 발 or 바) 17 | # 예) 쌍1년, 씨12발, 꼬a추 18 | # 중간에 2글자 까지 들어갈 수 있도록 허용하였습니다. 
19 | 20 | patternList = [ 21 | re.compile('((쌍|썅).{0,2}(놈|년))'), 22 | re.compile('(씨|시).{0,2}(벌|빨|발|바)'), 23 | re.compile('(병|븅).{0,2}(신|쉰|싄)'), 24 | re.compile('(좆|존|좃).{0,2}(같|되|는|나|돼)'), 25 | re.compile('(개|게).{0,2}(같|갓|새|세|쉐|끼)'), 26 | re.compile('(걸|느).{0,2}(레|금)'), 27 | re.compile('(꼬|꽂|고).{0,2}(추|츄)'), 28 | re.compile('(니|너).{0,2}(엄|엠|애|m|M)'), 29 | re.compile('(애|에).{0,2}(미)'), 30 | re.compile('(노).{0,2}(애미|앰|엠)'), 31 | re.compile('(섹|쎅).{0,2}(스|쓰)'), 32 | re.compile('(ㅅㅂ|ㅄ|ㄷㅊ)'), 33 | re.compile('(s|S)(e|E)(x|X)'), 34 | re.compile('(미|뮈|믜).{0,2}(친|췬|칀)'), 35 | # 마지막에 추가해서 넣은 키워드들 36 | re.compile('자지|꼴깝|새끼들|애미|짜식|빠굴|씹년|미친넘|18년|폐녀자|미틴|이놈\ 37 | |조센징|미시촌|주접|붕가|패티쉬|쳐먹|뒤질래|쉐리|호로자식|개좌식|뭥미\ 38 | |별창|망나니|딸딸이|니에미|좃|십새|싸보이다|미췬|씨댕|새꺄|쎅스|10세|\ 39 | 상넘|꼰대|개놈|꼴갑|시벌탱|씨방새|발기|새끼|10새끼|꼴리|옘병|아구창|\ 40 | 개좆|아갈|창녀|염병|포르노|미친놈|음탕|또라이|좃나|한남충|조지다|호로|\ 41 | 후빨|조또|지랄|오지구|세끼|슨상님|병쉰|싸가지|빠큐|엠생|시궁창|꼬라지|\ 42 | 우라질|혼음|개빡|뒈진|멍청이|뒤진다|어미|듣보|꼴값|광녀|따먹기|양키|\ 43 | 잡종|상놈|넌씨눈|떡치기|개년|꼬추|쎄엑|개지랄|18|시부랄|느개비|오짐|\ 44 | 보지|부랄|고인물|찌질|정박아|뒤질|개쓰래기|좇같|후려|시키|육갑|씹새|\ 45 | 씝창|미쳤나|호모|조온나|씨파|쉬발|십세|병자|게새끼|개새끼|시부럴|개시키|\ 46 | 개민폐|언년|쓰발|sex|눈알|뽄세|씹새기|씨팔|앰창|놈|개수작|아가리|무뇌\ 47 | 오진다|창놈|좆같|병맛|로리타|그년|씨부럴|저능아|쌔끈|주뎅이|토끼다|대가리|\ 48 | 씹팔|디졌|대갈|엠창|트롤|개씹|썅넘|오졌|갈보|씨발|시발|개자식|극혐|개같은|\ 49 | 개짱|미친색|기레기|남혐|야설|이새끼|10창|18놈|섹스|씨불|성인체위|십팔|\ 50 | 벌레|빠가|운지|빙신|개돼지|장애인|씹창|썩을|꼬붕|매국노|18새끼|발놈|와꾸|\ 51 | 느금|허접|고추|미쳤니|노답|오져|같은년|좆까|돌았나|씨빨|새키|븅|좆만|존싫|\ 52 | 사이코|십새끼|섹수|조까|시끼|변태새끼|늬미|열폭|년|쥐랄|잡놈|존버|꼴리다|충|\ 53 | 자슥|모가지|씨벌탱|빠구리|니앰|싸가지없|쌍|개간|틀니|냄비|씨발년|시부리|쪽바리|\ 54 | 저년|씨부랄|씹탱|즤랄|골빈|샹년|젖탱|메갈|시팔|씨빠|쌍년|싸물|싸대기|스트립|\ 55 | 좆|씨볼|이씨|이년|이자식|오바|니미랄|새기|후레자식|호구|패드립|에로물|쌍욕|\ 56 | 호로놈|5지구|벼엉|찐따|간나|등신|애자|개같이|쓰레기|5지네|니미|뻑큐|좇|\ 57 | 개존|관종|빡촌|뒤져|좃밥|엿|귀두|좆나|개짜증|노무|놈현|개쩔|디질|싸죠|\ 58 | 씨부리|돌았네|개새|병신|씨바|양놈|쌍놈년|구라|머갈|불륜|성기|에로|년놈|\ 59 | 창년|낯짝|자위|불알|썅년|멍텅|오지네|왜놈|아닥|짱깨|이새키|색끼|주뎅|딜도|\ 60 | 대갈빡|정신병자|미친|한남|씨방|뻐큐|니미럴|사까시|존만한|꼴통|씨발놈|존나|\ 61 | |홍어|좆나게|후장|섹|놈들|개새키|븽신|개소리|미치|면상|시댕|갈레|돌아이\ 62 | |닥쳐|개같|쌉|정사|쒸벌|고자|좃또|조빠|씹|썅제기랄|버러지|십창|딴년|꺼져|\ 63 | |좇밥|뽄새|눈깔|쪼개|육봉|수간|틀딱|씹쉐|따까리|음란|씹덕|삥땅')] 64 | 65 | 
def return_bad_words_index(content, mode=0):
    """Detect profanity in *content* via the module-level ``patternList``.

    mode == 0:
        Return *content* with every regex match masked as ``"**"``.
    mode != 0:
        Return a list of ``(left word, matched word, right word, word_index)``
        trigrams, one per distinct space-separated word that contains a match.
    """
    # Masking mode: substitute every pattern hit in place and return the text.
    if mode == 0:
        masked = content
        for regex in patternList:
            masked = re.sub(regex, "**", masked)
        return masked

    # Trigram mode.
    # Map each character offset to the index of the space-separated word it
    # belongs to, so a regex match position translates into a word position.
    char_to_word = []
    word_idx = 0
    for ch in content:
        char_to_word.append(word_idx)
        if ch == " ":
            word_idx += 1

    # Word index of every match start, de-duplicated as we go.
    hit_words = set()
    for regex in patternList:
        for m in regex.finditer(content):
            hit_words.add(char_to_word[m.start()])

    # Pad a blank word on both ends so the first/last word still has
    # left and right neighbours in its trigram.
    words = [" "] + content.split(" ") + [" "]

    return [(words[i], words[i + 1], words[i + 2], i) for i in hit_words]
# 한글 자모 분리/결합 유틸리티 (Hangul jamo split/combine helpers).
# A precomposed Hangul syllable occupies U+AC00..U+D7A3 and decomposes as
#   code = 0xAC00 + (chosung * 21 + joongsung) * 28 + jongsung.

CHOSUNGS = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ',
            'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']  # 초성: 19 leading consonants
JOONGSUNGS = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ',
              'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']  # 중성: 21 vowels
JONGSUNGS = ['_', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ',
             'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ',
             'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']  # 종성: 28 trailing consonants ('_' = none)
TOTAL = CHOSUNGS + JOONGSUNGS + JONGSUNGS


# 자모분리
def jamo_split(word, end_char="_"):
    """Decompose each Hangul syllable in *word* into cho/joong/jong jamo.

    Characters outside the Hangul syllable block pass through unchanged.
    A syllable with no trailing consonant gets *end_char* in the jong slot,
    so every syllable always expands to exactly three output characters.
    """
    result = []

    for char in word:
        character_code = ord(char)

        # Not a precomposed Hangul syllable: keep it as-is.
        if 0xD7A3 < character_code or character_code < 0xAC00:
            result.append(char)
            continue

        # Exact integer arithmetic (the original used float division and a
        # redundant % 19; // is precise and clearer).
        offset = character_code - 0xAC00
        chosung_index = offset // 28 // 21
        joongsung_index = (offset // 28) % 21
        jongsung_index = offset % 28

        result.append(CHOSUNGS[chosung_index])
        result.append(JOONGSUNGS[joongsung_index])
        # Index 0 means "no trailing consonant": emit the filler character.
        result.append(end_char if jongsung_index == 0 else JONGSUNGS[jongsung_index])

    return "".join(result)


# 자모결합
def jamo_combine(word):
    """Recompose a jamo string produced by :func:`jamo_split` into syllables.

    Reads three jamo at a time; when the triple is not a valid
    cho/joong/jong combination, one character is copied through instead.
    """
    result = ""
    index = 0

    while index < len(word):
        # Only catch the failures an invalid triple can produce -- the
        # original bare ``except:`` also swallowed unrelated errors
        # (e.g. KeyboardInterrupt).
        try:
            cho = CHOSUNGS.index(word[index]) * 21 * 28
            joong = JOONGSUNGS.index(word[index + 1]) * 28
            jong = JONGSUNGS.index(word[index + 2])
        except (ValueError, IndexError):
            result += word[index]
            index += 1
        else:
            result += chr(cho + joong + jong + 0xAC00)
            index += 3

    return result
55 | try: 56 | cho = CHOSUNGS.index(word[index]) * 21 * 28 57 | joong = JOONGSUNGS.index(word[index+1]) * 28 58 | jong = JONGSUNGS.index(word[index+2]) 59 | 60 | result += chr(cho + joong + jong + 0xAC00) 61 | index += 3 62 | 63 | except: 64 | result += word[index] 65 | index += 1 66 | 67 | return result -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BadWordDetection 2 | ## 비속어 탐지 모델 3 | 4 | > # Outline 5 | 6 | - 욕설 키워드 기반 크롤링(네이버 뉴스, 카페, 블로그, 디씨인사이드, 네이트 뉴스) 7 | - STT변환을 통해 나올 수 있는 단어(완전한 글자) 위주로 학습(약 14000개 라벨링) 8 | - 자모분리를 통한 fasttext word embedding vocab구성 9 | - RandomForest: accuracy: 약 86퍼, f1-score: 약85퍼 10 | - 1DCNN: accuracy: 약 86퍼, f1-score: 약89퍼 11 | - 학습 데이터가 커서 올라가지 않음 12 | 13 | --- 14 | 15 | > # Process 16 | 17 | - 문장에서 정규식표현으로 욕설이 나오는 부분 추출 18 | - 추출된 어절 중심으로 좌우 단어 trigram 반환 ex) (나는, 바보, 멍청이, 3) 3번째위치에 바보가 있고 좌우어절은 나는, 멍청이 이다 19 | - trigram을 fasttext embedding model을 활용하여 vectorize 20 | - vectorize된 데이터를 Random Forest or 1DCNN Model에 넣어 예측 21 | - EDA -> FastTextVocab -> TrigramVectorize -> 1DCNN or RandomForest -> Test 22 | --- 23 | 24 | > # Test 25 | 26 | - Pretrained 모델로 예측해보기 27 | - Test.ipynb 실행 28 | 29 | --- 30 | 31 | > # vocab 시각화 32 | 33 | ### vocab 2차원으로 임베딩 후 plot 34 | ![image](https://trello-attachments.s3.amazonaws.com/5d6cac86cbfe1b604908c66b/5da7f09503d7a77cb20ecdd2/3253a5efe850ba0f591326219ac3510f/word_embedding_2dim.png) 35 | 36 | ### 유사한 단어들 뽑아보기 37 | ![image](https://trello-attachments.s3.amazonaws.com/5d6cac86cbfe1b604908c66b/5da7f09503d7a77cb20ecdd2/247f9863ee368cb8a865dce8405c5f21/%EC%8A%A4%ED%81%AC%EB%A6%B0%EC%83%B7_2019-10-17_%EC%98%A4%ED%9B%84_1.42.00.png) 38 | 39 | --- 40 | 41 | > # 모델 결과 42 | 43 | ### 1DCNN 44 | ![image](https://trello-attachments.s3.amazonaws.com/5d831fcfe983994f027abcdf/432x288/81114955f146e917e23261cba4ea83d4/1DCNN_reuslt.png) 45 | 46 | ### Random Forest 47 | 
![image](https://trello-attachments.s3.amazonaws.com/5d831fcfe983994f027abcdf/428x172/3be62876f0ae2aefcd430c1e2b810a9f/%E1%84%89%E1%85%B3%E1%84%8F%E1%85%B3%E1%84%85%E1%85%B5%E1%86%AB%E1%84%89%E1%85%A3%E1%86%BA_2019-10-17_%E1%84%8B%E1%85%A9%E1%84%92%E1%85%AE_2.07.58.png) 48 | -------------------------------------------------------------------------------- /RandomForest.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Random Forest" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# ==== input 형태 만들기 ====" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import numpy as np\n", 25 | "\n", 26 | "data = pd.read_json(\"./labeled_data.json\")\n", 27 | "data.columns = [\"label\", \"trigram\"]" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# data flatten 해줌 3 x 50 = 150 feature\n", 37 | "data['trigram'] = data['trigram'].apply(lambda x: (np.array(x).reshape(-1)))\n", 38 | "\n", 39 | "# 훈련데이터 테스트데이터 분리\n", 40 | "from sklearn.model_selection import train_test_split\n", 41 | "\n", 42 | "y = data.pop('label')\n", 43 | "X = data\n", 44 | "\n", 45 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n", 46 | "\n", 47 | "X_train = np.array(X_train['trigram'].tolist())\n", 48 | "X_test = np.array(X_test['trigram'].tolist())" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# ==== 모델링 ====" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "RandomForestClassifier(bootstrap=True, class_weight=None, 
criterion='gini',\n", 67 | " max_depth=2, max_features='auto', max_leaf_nodes=None,\n", 68 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 69 | " min_samples_leaf=1, min_samples_split=2,\n", 70 | " min_weight_fraction_leaf=0.0, n_estimators=100,\n", 71 | " n_jobs=None, oob_score=False, random_state=1, verbose=0,\n", 72 | " warm_start=False)" 73 | ] 74 | }, 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "from sklearn.ensemble import RandomForestClassifier\n", 82 | "\n", 83 | "# 기본모델로 돌려보기\n", 84 | "rf = RandomForestClassifier(n_estimators=100, max_depth=2,\n", 85 | " random_state=1)\n", 86 | "\n", 87 | "rf.fit(X_train, y_train)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "0.7163309131813069 0.7174898592221427\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "# score 출력 역시 좋지않다!\n", 105 | "print(rf.score(X_train, y_train), rf.score(X_test, y_test))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "print(rf.get_params())" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 9, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],\n", 
134 | " 'max_depth': [10, 19, 28, 37, 46, 55, 64, 73, 82, 91, 100, None],\n", 135 | " 'min_samples_split': [2, 5, 10],\n", 136 | " 'min_samples_leaf': [1, 2, 4],\n", 137 | " 'bootstrap': [True, False]}" 138 | ] 139 | }, 140 | "execution_count": 9, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "# grid search를 위한 hyper parameter 범위 설정\n", 147 | "\n", 148 | "# Number of trees in random forest\n", 149 | "n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]\n", 150 | "# Maximum number of levels in tree\n", 151 | "max_depth = [int(x) for x in np.linspace(10, 100, num = 11)]\n", 152 | "max_depth.append(None)\n", 153 | "# Minimum number of samples required to split a node\n", 154 | "min_samples_split = [2, 5, 10]\n", 155 | "# Minimum number of samples required at each leaf node\n", 156 | "min_samples_leaf = [1, 2, 4]\n", 157 | "# Method of selecting samples for training each tree\n", 158 | "bootstrap = [True, False]\n", 159 | "# Create the random grid\n", 160 | "random_grid = {'n_estimators': n_estimators,\n", 161 | " 'max_depth': max_depth,\n", 162 | " 'min_samples_split': min_samples_split,\n", 163 | " 'min_samples_leaf': min_samples_leaf,\n", 164 | " 'bootstrap': bootstrap}\n", 165 | "random_grid" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 12, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "CPU times: user 18.4 s, sys: 1.64 s, total: 20.1 s\n", 178 | "Wall time: 56min 20s\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "from sklearn.model_selection import GridSearchCV\n", 184 | "from sklearn.pipeline import Pipeline\n", 185 | "from sklearn.preprocessing import StandardScaler\n", 186 | "\n", 187 | "# scaler한다음 rf grid search\n", 188 | "pipe_rf = Pipeline([('scl', StandardScaler()), ('rf', RandomForestClassifier(random_state=1))])\n", 189 | "\n", 190 | "param_grid = 
{'rf__n_estimators': n_estimators,\n", 191 | " 'rf__max_depth': max_depth,\n", 192 | " 'rf__min_samples_split': min_samples_split,\n", 193 | " 'rf__min_samples_leaf': min_samples_leaf}\n", 194 | "# 'rf__bootstrap': bootstrap}\n", 195 | "\n", 196 | "# 욕설아닌 것들을 잘 예측을 못하기 때문에 f1 score기준으로 search하였다.\n", 197 | "gs = GridSearchCV(estimator=pipe_rf, param_grid=param_grid,\n", 198 | " scoring='f1', cv=5, n_jobs=-1)\n", 199 | "%time gs = gs.fit(X_train, y_train)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 13, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "0.9041595093718033\n", 212 | "{'rf__max_depth': 28, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 80}\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "# grid search를 통해 나온 best score, parameters 보기\n", 218 | "print(gs.best_score_)\n", 219 | "print(gs.best_params_)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 20, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 231 | " max_depth=28, max_features='auto', max_leaf_nodes=None,\n", 232 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 233 | " min_samples_leaf=1, min_samples_split=2,\n", 234 | " min_weight_fraction_leaf=0.0, n_estimators=80,\n", 235 | " n_jobs=None, oob_score=False, random_state=None,\n", 236 | " verbose=0, warm_start=False)" 237 | ] 238 | }, 239 | "execution_count": 20, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "# 베스트 파라미터 적용시켜서 다시하기\n", 246 | "clf = RandomForestClassifier(max_depth=28, min_samples_leaf=1, min_samples_split=2, n_estimators=80)\n", 247 | "clf.fit(X_train, y_train)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 25, 253 | "metadata": {}, 254 | 
"outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "0.9988751406074241 0.8618468146027202\n", 260 | " precision recall f1-score support\n", 261 | "\n", 262 | " 0 0.83 0.65 0.73 1205\n", 263 | " 1 0.87 0.95 0.91 2986\n", 264 | "\n", 265 | " accuracy 0.86 4191\n", 266 | " macro avg 0.85 0.80 0.82 4191\n", 267 | "weighted avg 0.86 0.86 0.86 4191\n", 268 | "\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "from sklearn.metrics import classification_report, confusion_matrix \n", 274 | "# print prediction results \n", 275 | "predictions = clf.predict(X_test) \n", 276 | "print(clf.score(X_train, y_train), clf.score(X_test, y_test))\n", 277 | "print(classification_report(y_test, predictions)) " 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 22, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "[(74, 0.04833944969431489),\n", 289 | " (81, 0.024481502486178877),\n", 290 | " (73, 0.02281956474931012),\n", 291 | " (78, 0.022678854591804846),\n", 292 | " (61, 0.020070450843119064),\n", 293 | " (79, 0.01811390039696703),\n", 294 | " (82, 0.017959499259192598),\n", 295 | " (67, 0.01770153025298927),\n", 296 | " (85, 0.01736047291538577),\n", 297 | " (96, 0.01606220945314773),\n", 298 | " (98, 0.01531237403515211),\n", 299 | " (124, 0.014949087173775319),\n", 300 | " (71, 0.014630017385657945),\n", 301 | " (72, 0.013871216070973913),\n", 302 | " (93, 0.012456008243748231),\n", 303 | " (75, 0.012148444677806035),\n", 304 | " (83, 0.011277990090811298),\n", 305 | " (50, 0.010752187493707616),\n", 306 | " (62, 0.010748594436991686),\n", 307 | " (86, 0.009882246027366728)]" 308 | ] 309 | }, 310 | "execution_count": 22, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "tt = zip(list(range(len(clf.feature_importances_))), list(clf.feature_importances_))\n", 317 | "sorted(tt, key=lambda x: x[1], reverse=True)[:20] 
# 50 부터 100까지가 욕설의 형태를 띈 단어인데 그주변이 대부분이다." 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 26, 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "# save the model to disk\n", 327 | "import pickle\n", 328 | "filename = 'rf_model'\n", 329 | "pickle.dump(clf, open(filename, 'wb'))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 27, 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "[1]\n" 342 | ] 343 | }, 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "array([[0.05 , 0.95 ],\n", 348 | " [0.1625, 0.8375]])" 349 | ] 350 | }, 351 | "execution_count": 27, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "# load the model from disk\n", 358 | "from sklearn.externals import joblib\n", 359 | "loaded_model = joblib.load(filename)\n", 360 | "result = loaded_model.predict(X_train[0:1])\n", 361 | "print(result)\n", 362 | "clf.predict_proba(X_train[0:2])" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [] 371 | } 372 | ], 373 | "metadata": { 374 | "kernelspec": { 375 | "display_name": "Python 3", 376 | "language": "python", 377 | "name": "python3" 378 | }, 379 | "language_info": { 380 | "codemirror_mode": { 381 | "name": "ipython", 382 | "version": 3 383 | }, 384 | "file_extension": ".py", 385 | "mimetype": "text/x-python", 386 | "name": "python", 387 | "nbconvert_exporter": "python", 388 | "pygments_lexer": "ipython3", 389 | "version": "3.7.4" 390 | } 391 | }, 392 | "nbformat": 4, 393 | "nbformat_minor": 4 394 | } 395 | -------------------------------------------------------------------------------- /Test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | 
"metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n", 13 | "/Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/sklearn/externals/joblib/__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n", 14 | " warnings.warn(msg, category=DeprecationWarning)\n", 15 | "WARNING: Logging before flag parsing goes to stderr.\n", 16 | "W1018 17:35:39.563318 4502631872 deprecation_wrapper.py:119] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:541: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.\n", 17 | "\n", 18 | "W1018 17:35:39.586373 4502631872 deprecation_wrapper.py:119] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4432: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.\n", 19 | "\n", 20 | "W1018 17:35:39.658091 4502631872 deprecation_wrapper.py:119] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:66: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.\n", 21 | "\n", 22 | "W1018 17:35:39.658792 4502631872 deprecation_wrapper.py:119] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:148: The name tf.placeholder_with_default is deprecated. 
Please use tf.compat.v1.placeholder_with_default instead.\n", 23 | "\n", 24 | "W1018 17:35:39.698024 4502631872 deprecation_wrapper.py:119] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4267: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.\n", 25 | "\n", 26 | "W1018 17:35:39.703814 4502631872 deprecation.py:506] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3733: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 27 | "Instructions for updating:\n", 28 | "Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.\n", 29 | "W1018 17:35:39.828878 4502631872 deprecation_wrapper.py:119] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.\n", 30 | "\n", 31 | "W1018 17:35:40.030106 4502631872 deprecation_wrapper.py:119] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. 
Please use tf.compat.v1.train.Optimizer instead.\n", 32 | "\n", 33 | "W1018 17:35:40.036347 4502631872 deprecation.py:323] From /Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", 34 | "Instructions for updating:\n", 35 | "Use tf.where in 2.0, which has the same broadcast rule as np.where\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "from BadWordDetectionByRegularExpression import return_bad_words_index\n", 41 | "from JamoSplit import jamo_split\n", 42 | "\n", 43 | "# model load\n", 44 | "from gensim.models import FastText\n", 45 | "from keras.models import load_model\n", 46 | "from sklearn.externals import joblib\n", 47 | "\n", 48 | "import numpy as np\n", 49 | "\n", 50 | "embedding_model = FastText.load(\"./gensim_festtext.model\")\n", 51 | "cnn_model = load_model(\"./cnn_model\")\n", 52 | "rf_model = joblib.load(\"./rf_model\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# ==== 데이터를 input에 넣기 위한 전처리 ====" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "text=\"\"\"\n", 69 | "오늘은 2018년이다. 근데 저 18년이 나에게 욕을했다. 엠창 개새끼라고 이것들이 다 욕으로 처리될까요 완전 개새끼구만 오늘 아침에는 18년이 기분이 좋았다 올해 18년 계획은 지금 존나 졸리다 어떡하지 프로그램 미운우리새끼는 욕이 아니고 저 새끼는 욕이겠지 아마 삼시세끼도 똑같겠지? 또 뭐가 있을까요...... 
ㅋㅋㅋㅋㅋㅋㅋㅋ십알은 어떨까요\n", 70 | "\"\"\"" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "trigram_list = return_bad_words_index(text, mode=1) # 욕설의 형태를 띄는 곳에가서 좌우단어 포함하여 trigram으로 반환" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "def chunks(l, n, trigram_list):\n", 89 | " '''\n", 90 | " vectroize 할 때 필요한 리스트를 청크별로 나누는 함수\n", 91 | " input : list, n(청크 단위)\n", 92 | " output : (n개씩 묶어진 list, word_index(단어위치))\n", 93 | " '''\n", 94 | " for i in range(0, len(l), n):\n", 95 | " yield (l[i:i + n], trigram_list[i//n][-1])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "[('저', '새끼는', '욕이겠지', 32),\n", 107 | " ('\\n오늘은', '2018년이다.', '근데', 1),\n", 108 | " ('아마', '삼시세끼도', '똑같겠지?', 35),\n", 109 | " ('저', '18년이', '나에게', 4),\n", 110 | " ('욕을했다.', '엠창', '개새끼라고', 7),\n", 111 | " ('엠창', '개새끼라고', '이것들이', 8),\n", 112 | " ('완전', '개새끼구만', '오늘', 14),\n", 113 | " ('아침에는', '18년이', '기분이', 17),\n", 114 | " ('올해', '18년', '계획은', 21),\n", 115 | " ('지금', '존나', '졸리다', 24),\n", 116 | " ('프로그램', '미운우리새끼는', '욕이', 28)]" 117 | ] 118 | }, 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "trigram_list" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 6, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stderr", 135 | "output_type": "stream", 136 | "text": [ 137 | "/Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n", 138 | " \n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "# vectorize : trigram을 150차원의 벡터 + word index형태의 리스트로 
만들어주는 과정 \n", 144 | "trigram_vector = np.array([np.array(embedding_model[jamo_split(word)]) for trigram in trigram_list for word in trigram[:-1]])\n", 145 | "trigram_vector = np.array(list(chunks(trigram_vector, 3, trigram_list))) # 50차원의 3개의 vector가 1개의 trigram에 들어가기위해 나눠주는 과정\n", 146 | "trigram_vector = np.array([np.append(_[0].flatten(), _[1]) for _ in trigram_vector]) # 3 x 50 을 150차원으로 flatten + word index = 151dim" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "(11, 151)" 158 | ] 159 | }, 160 | "execution_count": 7, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "trigram_vector.shape" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 8, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "word_index = np.int8(trigram_vector[:, -1]) # word_index 단어위치를 뽑아내기\n", 176 | "trigram_vector = np.delete(trigram_vector, -1, axis=1) # word_index 지우기" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "# ==== 결과 확인 ====\n", 184 | "# ==== CNN ====" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 16, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "trigram_vector = trigram_vector.reshape(trigram_vector.shape[0], trigram_vector.shape[1], 1) # keras input 맞춰주기" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 17, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "(11, 150, 1)" 205 | ] 206 | }, 207 | "execution_count": 17, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "trigram_vector.shape" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 18, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": 
"stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "단어위치\n", 226 | " [32 1 35 4 7 8 14 17 21 24 28]\n", 227 | "예측 확률 값\n", 228 | " [[0.9957671 ]\n", 229 | " [0.62791127]\n", 230 | " [0.70563596]\n", 231 | " [0.98375434]\n", 232 | " [0.99994034]\n", 233 | " [0.9999865 ]\n", 234 | " [0.9995563 ]\n", 235 | " [0.9997824 ]\n", 236 | " [0.00102557]\n", 237 | " [0.9999993 ]\n", 238 | " [0.9139524 ]]\n", 239 | "Class와 단어 위치\n", 240 | " [(True, 32), (False, 1), (True, 35), (True, 4), (True, 7), (True, 8), (True, 14), (True, 17), (False, 21), (True, 24), (True, 28)]\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "# cnn\n", 246 | "print(\"단어위치\\n\", word_index)\n", 247 | "print(\"예측 확률 값\\n\", cnn_model.predict(trigram_vector))\n", 248 | "result = cnn_model.predict(trigram_vector) > 0.65 # 0.65보다 높으면 욕설\n", 249 | "result = result.reshape(-1).tolist()\n", 250 | "print(\"Class와 단어 위치\\n\", list(zip(result, word_index.tolist())))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 19, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "비속어\n", 263 | " [['저' '새끼는' '욕이겠지' '32']\n", 264 | " ['아마' '삼시세끼도' '똑같겠지?' '35']\n", 265 | " ['저' '18년이' '나에게' '4']\n", 266 | " ['욕을했다.' '엠창' '개새끼라고' '7']\n", 267 | " ['엠창' '개새끼라고' '이것들이' '8']\n", 268 | " ['완전' '개새끼구만' '오늘' '14']\n", 269 | " ['아침에는' '18년이' '기분이' '17']\n", 270 | " ['지금' '존나' '졸리다' '24']\n", 271 | " ['프로그램' '미운우리새끼는' '욕이' '28']]\n", 272 | "비속어 아닌것들\n", 273 | " [['\\n오늘은' '2018년이다.' 
'근데' '1']\n", 274 | " ['올해' '18년' '계획은' '21']]\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "# 결과 확인\n", 280 | "print(\"비속어\\n\", np.array(trigram_list)[np.array(result)])\n", 281 | "print(\"비속어 아닌것들\\n\", np.array(trigram_list)[np.array(result) == False])" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "# ==== RandomForest ====" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 20, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "trigram_vector = trigram_vector.reshape(trigram_vector.shape[0], trigram_vector.shape[1]) # random forest input 맞춰주기" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 21, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "단어위치\n", 310 | " [32 1 35 4 7 8 14 17 21 24 28]\n", 311 | "예측 확률 값\n", 312 | " [[0.3375 0.6625]\n", 313 | " [0.275 0.725 ]\n", 314 | " [0.35 0.65 ]\n", 315 | " [0.3125 0.6875]\n", 316 | " [0.125 0.875 ]\n", 317 | " [0.1125 0.8875]\n", 318 | " [0.2 0.8 ]\n", 319 | " [0.1875 0.8125]\n", 320 | " [0.6125 0.3875]\n", 321 | " [0.0875 0.9125]\n", 322 | " [0.325 0.675 ]]\n", 323 | "Class와 단어 위치\n", 324 | " [(True, 32), (True, 1), (False, 35), (True, 4), (True, 7), (True, 8), (True, 14), (True, 17), (False, 21), (True, 24), (True, 28)]\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "# randomforest\n", 330 | "print(\"단어위치\\n\", word_index)\n", 331 | "print(\"예측 확률 값\\n\", rf_model.predict_proba(trigram_vector))\n", 332 | "result = rf_model.predict_proba(trigram_vector)[:, 1] > 0.65 # 0.65보다 높으면 욕설\n", 333 | "result = result.tolist()\n", 334 | "# result = [_==1 for _ in result] # Boolean list로 만들기\n", 335 | "print(\"Class와 단어 위치\\n\", list(zip(result, word_index.tolist())))" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 22, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": 
"stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "비속어\n", 348 | " [['저' '새끼는' '욕이겠지' '32']\n", 349 | " ['\\n오늘은' '2018년이다.' '근데' '1']\n", 350 | " ['저' '18년이' '나에게' '4']\n", 351 | " ['욕을했다.' '엠창' '개새끼라고' '7']\n", 352 | " ['엠창' '개새끼라고' '이것들이' '8']\n", 353 | " ['완전' '개새끼구만' '오늘' '14']\n", 354 | " ['아침에는' '18년이' '기분이' '17']\n", 355 | " ['지금' '존나' '졸리다' '24']\n", 356 | " ['프로그램' '미운우리새끼는' '욕이' '28']]\n", 357 | "비속어 아닌것들\n", 358 | " [['아마' '삼시세끼도' '똑같겠지?' '35']\n", 359 | " ['올해' '18년' '계획은' '21']]\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "# 결과 확인\n", 365 | "print(\"비속어\\n\", np.array(trigram_list)[np.array(result)])\n", 366 | "print(\"비속어 아닌것들\\n\", np.array(trigram_list)[np.array(result) == False])" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [] 375 | } 376 | ], 377 | "metadata": { 378 | "kernelspec": { 379 | "display_name": "Python 3", 380 | "language": "python", 381 | "name": "python3" 382 | }, 383 | "language_info": { 384 | "codemirror_mode": { 385 | "name": "ipython", 386 | "version": 3 387 | }, 388 | "file_extension": ".py", 389 | "mimetype": "text/x-python", 390 | "name": "python", 391 | "nbconvert_exporter": "python", 392 | "pygments_lexer": "ipython3", 393 | "version": "3.7.4" 394 | } 395 | }, 396 | "nbformat": 4, 397 | "nbformat_minor": 4 398 | } 399 | -------------------------------------------------------------------------------- /TrigramVectorize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Trigram으로 나눠진 라벨링된 데이터를 불러와 벡터화 시키기" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 10, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "from JamoSplit import jamo_combine, jamo_split" 19 | ] 20 | }, 21 | { 22 | 
"cell_type": "code", 23 | "execution_count": 11, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "data = pd.read_csv(\"./preprocessing_labeled_data.csv\", header=None)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# ==== input에 맞게 데이터 수정 ====" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 12, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# int형으로 문제가 발생하여 전부 str 타입으로 설정\n", 44 | "data[0] = data[0].astype(\"str\")\n", 45 | "data[1] = data[1].astype(\"str\")\n", 46 | "data[2] = data[2].astype(\"str\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 13, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# 빈칸이 nan되는 문제 다시 공백으로 체인지\n", 56 | "data[0] = data[0].apply(lambda x: \" \" if x == 'nan' else x)\n", 57 | "data[1] = data[1].apply(lambda x: \" \" if x == 'nan' else x)\n", 58 | "data[2] = data[2].apply(lambda x: \" \" if x == 'nan' else x)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 14, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# 데이터 한 column으로 합치기\n", 68 | "data['trigram'] = data[0] + \"$\" + data[1] + \"$\"+ data[2]\n", 69 | "del data[0], data[1], data[2] # 합친후에 삭제\n", 70 | "\n", 71 | "# 자모분리\n", 72 | "data['trigram'] = data['trigram'].apply(lambda x: jamo_split(x))\n", 73 | "# ㅂㅏ_ㅂㅗ_ 가 한 word가 될 수 있도록 만들어주는 과정\n", 74 | "data['trigram'] = data['trigram'].apply(lambda x: x.split(\"$\"))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 15, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/html": [ 85 | "
\n", 86 | "\n", 99 | "\n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | "
3trigram
01[ㅁㅗㅅㅎㅏ_ㄴㅡㄴ, ㄱㅘㄴㅈㅗㅇ, ㅠㅠ]
11[ , ㅅㅟ_ㅂㅓㄹ, ]
20[ , ㅇㅠ_ㅎㅡㅇ, ㄱㅏ_ㅈㅣ_ㄱㅗ_]
30[ㅈㅓㅈㅂㅕㅇ, ㅈㅓㅈㄲㅗㄱㅈㅣ_, ]
41[ㅁㅣㄴㅈㅜ_ㅋㅟ_, ㄷㅗㄹㅇㅏㅆㄴㅏ_, ㅁㅣㄴㅈㅜ_ㅍㅐㄴㄷㅡㄹㅇㅡㄴ]
\n", 135 | "
" 136 | ], 137 | "text/plain": [ 138 | " 3 trigram\n", 139 | "0 1 [ㅁㅗㅅㅎㅏ_ㄴㅡㄴ, ㄱㅘㄴㅈㅗㅇ, ㅠㅠ]\n", 140 | "1 1 [ , ㅅㅟ_ㅂㅓㄹ, ]\n", 141 | "2 0 [ , ㅇㅠ_ㅎㅡㅇ, ㄱㅏ_ㅈㅣ_ㄱㅗ_]\n", 142 | "3 0 [ㅈㅓㅈㅂㅕㅇ, ㅈㅓㅈㄲㅗㄱㅈㅣ_, ]\n", 143 | "4 1 [ㅁㅣㄴㅈㅜ_ㅋㅟ_, ㄷㅗㄹㅇㅏㅆㄴㅏ_, ㅁㅣㄴㅈㅜ_ㅍㅐㄴㄷㅡㄹㅇㅡㄴ]" 144 | ] 145 | }, 146 | "execution_count": 15, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "data.head() # 3column이 label이다." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 16, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "# fasttext 모델 불러오기\n", 162 | "from gensim.models import FastText\n", 163 | "\n", 164 | "embedding_model = FastText.load(\"./gensim_festtext.model\")" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 17, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stderr", 174 | "output_type": "stream", 175 | "text": [ 176 | "/Users/daier/anaconda3/envs/testEnv/lib/python3.7/site-packages/ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n", 177 | " \n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "# 각 단어를 벡터화 시켜주는 과정 3 x 50(embedding dimension) \n", 183 | "data['trigram'] = data['trigram'].apply(lambda x: [embedding_model[_] for _ in x])" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 19, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/html": [ 194 | "
\n", 195 | "\n", 208 | "\n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | "
3trigram
01[[-1.953795, -1.1928164, -0.18564849, -0.40059...
11[[0.01287517, 0.013801623, 0.017847467, -0.012...
20[[0.01287517, 0.013801623, 0.017847467, -0.012...
30[[-0.41459364, 0.14193316, -0.7080049, 0.40626...
41[[-0.17480232, -0.062112387, -0.6108945, 0.764...
\n", 244 | "
" 245 | ], 246 | "text/plain": [ 247 | " 3 trigram\n", 248 | "0 1 [[-1.953795, -1.1928164, -0.18564849, -0.40059...\n", 249 | "1 1 [[0.01287517, 0.013801623, 0.017847467, -0.012...\n", 250 | "2 0 [[0.01287517, 0.013801623, 0.017847467, -0.012...\n", 251 | "3 0 [[-0.41459364, 0.14193316, -0.7080049, 0.40626...\n", 252 | "4 1 [[-0.17480232, -0.062112387, -0.6108945, 0.764..." 253 | ] 254 | }, 255 | "execution_count": 19, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "data.head()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 18, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "# 벡터화된 데이터로 저장\n", 271 | "data.to_json(\"./labeled_data.json\")" 272 | ] 273 | } 274 | ], 275 | "metadata": { 276 | "kernelspec": { 277 | "display_name": "Python 3", 278 | "language": "python", 279 | "name": "python3" 280 | }, 281 | "language_info": { 282 | "codemirror_mode": { 283 | "name": "ipython", 284 | "version": 3 285 | }, 286 | "file_extension": ".py", 287 | "mimetype": "text/x-python", 288 | "name": "python", 289 | "nbconvert_exporter": "python", 290 | "pygments_lexer": "ipython3", 291 | "version": "3.7.4" 292 | } 293 | }, 294 | "nbformat": 4, 295 | "nbformat_minor": 4 296 | } 297 | -------------------------------------------------------------------------------- /cnn_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smothly/bad-word-detection/6aba6a8186029099610ca0d3d4df50368f898ec3/cnn_model -------------------------------------------------------------------------------- /rf_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smothly/bad-word-detection/6aba6a8186029099610ca0d3d4df50368f898ec3/rf_model -------------------------------------------------------------------------------- /screenshot/1DCNN reuslt.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/smothly/bad-word-detection/6aba6a8186029099610ca0d3d4df50368f898ec3/screenshot/1DCNN reuslt.png -------------------------------------------------------------------------------- /screenshot/1DCNN_model_summay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smothly/bad-word-detection/6aba6a8186029099610ca0d3d4df50368f898ec3/screenshot/1DCNN_model_summay.png -------------------------------------------------------------------------------- /screenshot/randomforest_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smothly/bad-word-detection/6aba6a8186029099610ca0d3d4df50368f898ec3/screenshot/randomforest_result.png -------------------------------------------------------------------------------- /screenshot/word_embedding_2dim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smothly/bad-word-detection/6aba6a8186029099610ca0d3d4df50368f898ec3/screenshot/word_embedding_2dim.png -------------------------------------------------------------------------------- /screenshot/word_embedding_most_simmilar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smothly/bad-word-detection/6aba6a8186029099610ca0d3d4df50368f898ec3/screenshot/word_embedding_most_simmilar.png --------------------------------------------------------------------------------