├── README.md ├── .gitignore ├── submission.ipynb ├── [Analyse] LB Detect.ipynb ├── [Model] MPCNN.ipynb ├── [Feature] V1.ipynb ├── [Model] TextRNN.ipynb ├── [Model] TextRCNN.ipynb ├── [Model] CNN.ipynb ├── [Model] Multi LSTM CNN v0 word.ipynb ├── [Model] TextCNN.ipynb ├── [Model] Multi LSTM CNN v4.ipynb ├── [Model] Multi LSTM CNN v2.ipynb └── [Model] Multi LSTM CNN v5.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # paipaidai_comp 2 | 3 | [The 3rd PPDai Magic Mirror Cup (魔镜杯) competition](https://ai.ppdai.com/mirror/goToMirrorDetail?mirrorId=1): real data from an intelligent customer-service chatbot, used to improve the recognition ability and service quality of intelligent customer service 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # Jupyter Notebook 6 | .ipynb_checkpoints 7 | 8 | # folder 9 | data/ 10 | log/ 11 | result/ 12 | models/ -------------------------------------------------------------------------------- /submission.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import glob\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "word_pred = pd.read_csv(\"./result/20180715-101519-Multi_LSTM_CNN_v3_word_best.csv\")\n", 24 | "char_pred = pd.read_csv(\"./result/20180715-132714-Multi_LSTM_CNN_v5_char_best.csv\")\n", 25 | "final_pred = (word_pred + char_pred) / 2\n", 26 | "final_pred.to_csv(\"./result/prediction_v5.csv\", index=False)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [] 37 | } 38 | ], 39 | "metadata": { 40 | "kernelspec": { 41 | "display_name": "Python 3", 42 | "language": "python", 43 | "name": "python3" 44 | }, 45 | "language_info": { 46 | "codemirror_mode": { 47 | "name": "ipython", 48 | "version": 3 49 | }, 50 | "file_extension": ".py", 51 | "mimetype": "text/x-python", 52 | "name": "python", 53 | "nbconvert_exporter": "python", 54 | "pygments_lexer": "ipython3", 55 | "version": "3.6.2" 56 | } 57 | }, 58 | "nbformat": 4, 59 | "nbformat_minor": 2 60 | } 61 | -------------------------------------------------------------------------------- /[Analyse] LB Detect.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "DATA_PATH = \"./ppd_data/\"\n", 24 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 25 | "TEST_PATH = DATA_PATH + \"test.csv\"" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "train_data = pd.read_csv(TRAIN_PATH)\n", 37 | "test_data = pd.read_csv(TEST_PATH)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data":
{ 47 | "text/plain": [ 48 | "(254386, 172956)" 49 | ] 50 | }, 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "num_train, num_test = len(train_data), len(test_data)\n", 58 | "num_train, num_test" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "0.5191087559849992" 70 | ] 71 | }, 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "pos_rate = train_data[\"label\"].sum() / num_train\n", 79 | "pos_rate" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 6, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "pd.DataFrame(np.ones(shape=(num_test,)) * pos_rate, columns=[\"y_pre\"]).to_csv(\"./pred_const.csv\", index=False)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 7, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "0.5029553655657454" 102 | ] 103 | }, 104 | "execution_count": 7, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "log_loss = 0.693652\n", 111 | "\n", 112 | "r = (log_loss + np.log(1 - pos_rate)) / (np.log((1 - pos_rate) / pos_rate))\n", 113 | "r" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.6.2" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 2 138 | } 139 | -------------------------------------------------------------------------------- /[Model] MPCNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Multi-Perspective Sentence Similarity Modeling with Convolutional Neural Networks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import warnings\n", 19 | "warnings.filterwarnings(\"ignore\")\n", 20 | "\n", 21 | "import numpy as np\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "DATA_PATH = \"./data/\"\n", 34 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 35 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 36 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 37 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 38 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 39 | "\n", 40 | "train_data = pd.read_csv(TRAIN_PATH)\n", 41 | "test_data = pd.read_csv(TEST_PATH)\n", 42 | "question_data = pd.read_csv(QUEST_PATH)\n", 43 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 44 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 45 | "\n", 46 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 47 | 
"question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 48 | "\n", 49 | "label = train_data[\"label\"].values\n", 50 | "\n", 51 | "from keras.preprocessing.text import Tokenizer\n", 52 | "\n", 53 | "MAX_COUNT = 10000\n", 54 | "\n", 55 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 56 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 57 | "\n", 58 | "word_embedding_data = np.concatenate(\n", 59 | " (\n", 60 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 61 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 62 | " ),\n", 63 | " axis=0\n", 64 | ")\n", 65 | "\n", 66 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 67 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 68 | "\n", 69 | "char_embedding_data = np.concatenate(\n", 70 | " (\n", 71 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 72 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 73 | " ),\n", 74 | " axis=0\n", 75 | ")\n", 76 | "\n", 77 | "word_embedding_data.shape, char_embedding_data.shape" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "from keras.preprocessing.sequence import pad_sequences\n", 89 | "\n", 90 | "SEQ_LEN = 30\n", 91 | "\n", 92 | "def gen_word_data(data):\n", 93 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 94 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 95 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 96 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 97 | " \n", 98 | "def gen_char_data(data):\n", 99 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 100 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 101 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 102 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 103 | "\n", 104 | "word1, word2 = gen_word_data(train_data)\n", 105 | "char1, char2 = gen_char_data(train_data)\n", 106 | "test_word1, test_word2 = gen_word_data(test_data)\n", 107 | "test_char1, test_char2 = gen_char_data(test_data)\n", 108 | "\n", 109 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "from keras.models import Model\n", 121 | "from keras.layers.merge import concatenate\n", 122 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 123 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 124 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n", 125 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activationation" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | 
"collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "from sklearn.model_selection import train_test_split\n", 137 | "\n", 138 | "train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n", 139 | " word1, word2, train_data[\"label\"].values,\n", 140 | " test_size=0.2\n", 141 | ")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "embedding_layer = Embedding(\n", 153 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 154 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 155 | " weights=[word_embedding_data], # word/char switch\n", 156 | " input_length=SEQ_LEN,\n", 157 | " trainable=False\n", 158 | " )\n", 159 | "\n", 160 | "vector1 = embedding_layer(input1)\n", 161 | "vector2 = embedding_layer(input2)\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.6.2" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 2 187 | } 188 | -------------------------------------------------------------------------------- /[Feature] V1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 所有特征\n", 8 | "\n", 9 | "## 问题特征\n", 10 | "\n", 11 | "- 问题出现次数: 1\n", 12 | "- 问题单词数量: 2\n", 13 | "- 问题字符数量: 2\n", 14 | "- 问题Hash值: 2\n", 15 | "\n", 16 | "## 问题对特征\n", 17 | "\n", 18 | "- 问题对重复单词数量: 1\n", 19 | "- 问题对重复字符数量: 1\n", 20 | "\n", 21 | "## 图特征\n", 22 | "\n", 23 | "- Clique Size, 与此问题对相互毗邻结点组成的子图中结点的数量: 1\n", 24 | "- K-core, 每个点最大的K-core值: 2" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import networkx as nx\n", 36 | "import numpy as np\n", 37 | "import pandas as pd\n", 38 | "from itertools import combinations" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "DATA_PATH = \"./data/\"\n", 50 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 51 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 52 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 53 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 54 | "QUEST_PATH = DATA_PATH + \"question.csv\"" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "train_data = pd.read_csv(TRAIN_PATH)\n", 66 | "test_data = pd.read_csv(TEST_PATH)\n", 67 | "question_data = pd.read_csv(QUEST_PATH)\n", 68 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 69 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 70 | "\n", 71 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 72 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 73 | 
"\n", 74 | "label = train_data[\"label\"].values.copy()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "total_question = pd.concat([train_data[\"q1\"], train_data[\"q2\"], test_data[\"q1\"], test_data[\"q2\"]])\n", 86 | "question_feature = total_question.value_counts().reset_index()\n", 87 | "question_feature.columns = [\"qid\", \"q_count\"]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "unique_question = total_question.drop_duplicates().reset_index(drop=True)\n", 99 | "question_dict = pd.Series(unique_question.index, unique_question).to_dict()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "from keras.preprocessing.text import Tokenizer\n", 109 | "\n", 110 | "word_tokenizer = Tokenizer()\n", 111 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 112 | "char_tokenizer = Tokenizer()\n", 113 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "word_count = sorted(list(word_tokenizer.word_counts.items()), key=lambda x: x[1], reverse=True)\n", 125 | "word_count = pd.DataFrame(word_count, columns=[\"word\", \"word_times\"])\n", 126 | "char_count = sorted(list(char_tokenizer.word_counts.items()), key=lambda x: x[1], reverse=True)\n", 127 | "char_count = pd.DataFrame(char_count, columns=[\"cahr\", \"char_times\"])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "train = train_data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\") \\\n", 139 | " .drop([\"qid\", \"label\"], axis=1) \\\n", 140 | " .rename(columns={\"words\": \"words1\", \"chars\": \"chars1\"}) \\\n", 141 | " .merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\") \\\n", 142 | " .drop([\"qid\"], axis=1) \\\n", 143 | " .rename(columns={\"words\": \"words2\", \"chars\": \"chars2\"})\n", 144 | "test = test_data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\") \\\n", 145 | " .drop([\"qid\"], axis=1) \\\n", 146 | " .rename(columns={\"words\": \"words1\", \"chars\": \"chars1\"}) \\\n", 147 | " .merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\") \\\n", 148 | " .drop([\"qid\"], axis=1) \\\n", 149 | " .rename(columns={\"words\": \"words2\", \"chars\": \"chars2\"})" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "train1 = train.merge(question_feature, how=\"left\", left_on=\"q1\", right_on=\"qid\") \\\n", 161 | " .drop(\"qid\", axis=1) \\\n", 162 | " .rename(columns={\"q_count\": \"q1_count\"})\n", 163 | "train1 = train1.merge(question_feature, how=\"left\", left_on=\"q2\", right_on=\"qid\") \\\n", 164 | " .drop(\"qid\", axis=1) \\\n", 165 | " .rename(columns={\"q_count\": \"q2_count\"})" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | 
"test1 = test.merge(question_feature, how=\"left\", left_on=\"q1\", right_on=\"qid\") \\\n", 177 | " .drop(\"qid\", axis=1) \\\n", 178 | " .rename(columns={\"q_count\": \"q1_count\"})\n", 179 | "test1 = test1.merge(question_feature, how=\"left\", left_on=\"q2\", right_on=\"qid\") \\\n", 180 | " .drop(\"qid\", axis=1) \\\n", 181 | " .rename(columns={\"q_count\": \"q2_count\"})" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "def question_feature(data):\n", 193 | " data[\"word1_len\"], data[\"word2_len\"] = data[\"words1\"].map(len), data[\"words2\"].map(len)\n", 194 | " data[\"char1_len\"], data[\"char2_len\"] = data[\"chars1\"].map(len), data[\"chars2\"].map(len)\n", 195 | " data[\"word_same\"] = data.apply(lambda x: len(set(x[\"words1\"]).intersection(set(x[\"words2\"]))), axis=1)\n", 196 | " data[\"char_same\"] = data.apply(lambda x: len(set(x[\"chars1\"]).intersection(set(x[\"chars2\"]))), axis=1)\n", 197 | " data[\"q1_hash\"], data[\"q2_hash\"] = data[\"q1\"].map(question_dict), data[\"q2\"].map(question_dict)\n", 198 | " return data\n", 199 | " \n", 200 | "train2, test2 = question_feature(train1), question_feature(test1)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "train2.head()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "graph = networkx.Graph()\n", 221 | "edges = [tuple(pair) for pair in pd.concat([train_data[[\"q1\", \"q2\"]], test_data[[\"q1\", \"q2\"]]]).values]\n", 222 | "graph.add_edges_from(edges)\n", 223 | "\n", 224 | "cliques = sorted(list(networkx.find_cliques(graph)), key=lambda x: len(x), reverse=True)\n", 225 | "map_label = dict(((x[0], x[1]), 1) for x in pd.concat([train_data[[\"q1\", \"q2\"]], test_data[[\"q1\", \"q2\"]]]).values)\n", 226 | "\n", 227 | "map_clique_size = {}\n", 228 | "for c in cliques:\n", 229 | " for q1, q2 in combinations(c, 2):\n", 230 | " if (q1, q2) in map_label:\n", 231 | " map_clique_size[q1, q2] = len(c)\n", 232 | " elif (q2, q1) in map_label:\n", 233 | " map_clique_size[q2, q1] = len(c)\n", 234 | "\n", 235 | "train2['clique_size'] = train2.apply(lambda row: map_clique_size.get((row['q1'], row['q2']), -1), axis=1)\n", 236 | "test2['clique_size'] = test2.apply(lambda row: map_clique_size.get((row['q1'], row['q2']), -1), axis=1)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "max_kcore = pd.DataFrame(list(nx.core_number(graph).items()), columns=[\"qid\", \"kcore\"])\n", 246 | "train3 = train2.merge(max_kcore, how=\"left\", left_on=\"q1\", right_on=\"qid\").drop(\"qid\", axis=1).rename(columns={\"kcore\": \"q1_kcore\"}) \\\n", 247 | " .merge(max_kcore, how=\"left\", left_on=\"q2\", right_on=\"qid\").drop(\"qid\", axis=1).rename(columns={\"kcore\": \"q2_kcore\"})\n", 248 | "test3 = test2.merge(max_kcore, how=\"left\", left_on=\"q1\", right_on=\"qid\").drop(\"qid\", axis=1).rename(columns={\"kcore\": \"q1_kcore\"}) \\\n", 249 | " .merge(max_kcore, how=\"left\", left_on=\"q2\", right_on=\"qid\").drop(\"qid\", axis=1).rename(columns={\"kcore\": \"q2_kcore\"})" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | 
"outputs": [], 257 | "source": [ 258 | "train3.drop([\"q1\", \"q2\", \"words1\", \"chars1\", \"words2\", \"chars2\"], axis=1).to_csv(\"./data/train_feature.csv\", index=False)\n", 259 | "test3.drop([\"q1\", \"q2\", \"words1\", \"chars1\", \"words2\", \"chars2\"], axis=1).to_csv(\"./data/test_feature.csv\", index=False)" 260 | ] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.6.2" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 2 284 | } 285 | -------------------------------------------------------------------------------- /[Model] TextRNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings(\"ignore\")\n", 11 | "\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "from glob import glob\n", 15 | "from datetime import datetime\n", 16 | "\n", 17 | "DATA_PATH = \"./data/\"\n", 18 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 19 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 20 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 21 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 22 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 23 | "\n", 24 | "train_data = pd.read_csv(TRAIN_PATH)\n", 25 | "test_data = pd.read_csv(TEST_PATH)\n", 26 | "question_data = pd.read_csv(QUEST_PATH)\n", 27 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 28 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 29 | "\n", 30 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 31 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 32 | "\n", 33 | "label = train_data[\"label\"].values" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from keras.preprocessing.text import Tokenizer\n", 43 | "\n", 44 | "MAX_COUNT = 10000\n", 45 | "\n", 46 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 47 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 48 | "\n", 49 | "word_embedding_data = np.concatenate(\n", 50 | " (\n", 51 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 52 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 53 | " ),\n", 54 | " axis=0\n", 55 | ")\n", 56 | "\n", 57 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 58 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 59 | "\n", 60 | "char_embedding_data = np.concatenate(\n", 61 | " (\n", 62 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 63 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 64 | " ),\n", 65 | " axis=0\n", 66 | ")\n", 67 | "\n", 68 | "word_embedding_data.shape, char_embedding_data.shape" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 
| "from keras.preprocessing.sequence import pad_sequences\n", 78 | "\n", 79 | "SEQ_LEN = 25\n", 80 | "\n", 81 | "def gen_word_data(data):\n", 82 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 83 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 84 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 85 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 86 | " \n", 87 | "def gen_char_data(data):\n", 88 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 89 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 90 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 91 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 92 | "\n", 93 | "word1, word2 = gen_word_data(train_data)\n", 94 | "char1, char2 = gen_char_data(train_data)\n", 95 | "test_word1, test_word2 = gen_word_data(test_data)\n", 96 | "test_char1, test_char2 = gen_char_data(test_data)\n", 97 | "\n", 98 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "from keras.models import Model\n", 110 | "from keras.layers.merge import concatenate\n", 111 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 112 | "from keras.optimizers import Adam, Nadam, SGD\n", 113 | "from keras.layers import LSTM, Bidirectional, TimeDistributed, CuDNNLSTM\n", 114 | "from keras.layers import Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPool1D\n", 115 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "BATCH_SIZE = 1024\n", 127 | "NUM_EPOCHES = 50\n", 128 | "DROP_RATE = 0.3\n", 129 | "PATIENCE = 8\n", 130 | "\n", 131 | "LSTM_SIZE1 = 256\n", 132 | "LSTM_SIZE2 = 256\n", 133 | "\n", 134 | "DENSE_SIZE1 = 512\n", 135 | "DENSE_SIZE2 = 256" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "from sklearn.model_selection import StratifiedKFold\n", 145 | "\n", 146 | "best_results = []\n", 147 | "last_results = []\n", 148 | "\n", 149 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10, shuffle=True).split(X=char1, y=label)):\n", 150 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 151 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 152 | " \n", 153 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 154 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 155 | "\n", 156 | " embedding_layer = Embedding(\n", 157 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 158 | " 
output_dim=word_embedding_data.shape[1], # word/char switch\n", 159 | " weights=[word_embedding_data], # word/char switch\n", 160 | " input_length=SEQ_LEN,\n", 161 | " trainable=False\n", 162 | " )\n", 163 | " \n", 164 | " vector1 = embedding_layer(input1)\n", 165 | " vector2 = embedding_layer(input2)\n", 166 | " \n", 167 | " lstm_layer1 = Bidirectional(LSTM(LSTM_SIZE1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n", 168 | " layer1a = lstm_layer1(vector1)\n", 169 | " layer1b = lstm_layer1(vector2)\n", 170 | " lstm_layer2 = Bidirectional(LSTM(LSTM_SIZE2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n", 171 | " layer2a = lstm_layer2(layer1a)\n", 172 | " layer2b = lstm_layer2(layer1b)\n", 173 | " layer2a = GlobalMaxPool1D()(layer2a)\n", 174 | " layer2b = GlobalMaxPool1D()(layer2b)\n", 175 | " merge = concatenate([layer2a, layer2b])\n", 176 | " \n", 177 | " x = Dropout(DROP_RATE)(merge)\n", 178 | " x = BatchNormalization()(x)\n", 179 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 180 | " x = Dropout(DROP_RATE)(x)\n", 181 | " x = BatchNormalization()(x)\n", 182 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 183 | " x = Dropout(DROP_RATE)(x)\n", 184 | " x = BatchNormalization()(x)\n", 185 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 186 | " \n", 187 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 188 | " model.compile(\n", 189 | " optimizer=\"nadam\",\n", 190 | " loss=\"binary_crossentropy\",\n", 191 | " metrics=[\"acc\"]\n", 192 | " )\n", 193 | " \n", 194 | " early_stopping = EarlyStopping(\"val_loss\", patience=PATIENCE)\n", 195 | " check_point = ModelCheckpoint(\n", 196 | " \"./log/%s.TextRNN.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 197 | " monitor=\"val_loss\",\n", 198 | " save_best_only=True,\n", 199 | " )\n", 200 | " \n", 201 | " fit_res = model.fit(\n", 202 | " x=[train_x1, train_x2],\n", 203 | " y=train_y,\n", 204 | " batch_size=BATCH_SIZE,\n", 205 | " epochs=NUM_EPOCHES,\n", 206 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 207 | " shuffle=True,\n", 208 | " callbacks=[early_stopping, check_point]\n", 209 | " )\n", 210 | " \n", 211 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 212 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 213 | " \n", 214 | " print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 215 | " model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 216 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 217 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 218 | "\n", 219 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 220 | " \"./result/%s-TextRNN_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 221 | " index=False\n", 222 | ")\n", 223 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 224 | " \"./result/%s-TextRNN_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 225 | " index=False\n", 226 | ")" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 
3", 242 | "language": "python", 243 | "name": "python3" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.6.2" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /[Model] TextRCNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\")\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from glob import glob\n", 17 | "from datetime import datetime\n", 18 | "\n", 19 | "DATA_PATH = \"./data/\"\n", 20 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 21 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 22 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 23 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 24 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 25 | "\n", 26 | "train_data = pd.read_csv(TRAIN_PATH)\n", 27 | "test_data = pd.read_csv(TEST_PATH)\n", 28 | "question_data = pd.read_csv(QUEST_PATH)\n", 29 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 30 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 31 | "\n", 32 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 33 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 34 | "\n", 35 | "label = train_data[\"label\"].values" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "Using TensorFlow backend.\n" 48 | ] 49 | }, 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "((10001, 300), (3049, 300))" 54 | ] 55 | }, 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "from keras.preprocessing.text import Tokenizer\n", 63 | "\n", 64 | "MAX_COUNT = 10000\n", 65 | "\n", 66 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 67 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 68 | "\n", 69 | "word_embedding_data = np.concatenate(\n", 70 | " (\n", 71 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 72 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 73 | " ),\n", 74 | " axis=0\n", 75 | ")\n", 76 | "\n", 77 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 78 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 79 | "\n", 80 | "char_embedding_data = np.concatenate(\n", 81 | " (\n", 82 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 83 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 84 | " ),\n", 85 | " axis=0\n", 86 | ")\n", 87 | "\n", 88 | "word_embedding_data.shape, char_embedding_data.shape" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 3, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "((254386, 25),\n", 100 | " (254386, 25),\n", 101 | " 
(172956, 25),\n", 102 | " (172956, 25),\n", 103 | " (254386, 25),\n", 104 | " (254386, 25),\n", 105 | " (172956, 25),\n", 106 | " (172956, 25))" 107 | ] 108 | }, 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "from keras.preprocessing.sequence import pad_sequences\n", 116 | "\n", 117 | "SEQ_LEN = 25\n", 118 | "\n", 119 | "def gen_word_data(data):\n", 120 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 121 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 122 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 123 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 124 | " \n", 125 | "def gen_char_data(data):\n", 126 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 127 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 128 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 129 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 130 | "\n", 131 | "word1, word2 = gen_word_data(train_data)\n", 132 | "char1, char2 = gen_char_data(train_data)\n", 133 | "test_word1, test_word2 = gen_word_data(test_data)\n", 134 | "test_char1, test_char2 = gen_char_data(test_data)\n", 135 | "\n", 136 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "from keras.models import Model\n", 148 | "from keras.layers.merge import concatenate\n", 149 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 150 | "from keras.optimizers import Adam, Nadam, SGD\n", 151 | "from keras.layers import LSTM, Bidirectional, TimeDistributed, CuDNNLSTM\n", 152 | "from keras.layers import Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D, MaxPool1D\n", 153 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# general\n", 165 | "BATCH_SIZE = 512\n", 166 | "NUM_EPOCHES = 30\n", 167 | "DROP_RATE = 0.3\n", 168 | "PATIENCE = 8\n", 169 | "# cnn\n", 170 | "CONV_LEN_1 = 128\n", 171 | "CONV_LEN_2 = 128\n", 172 | "CONV_LEN_3 = 128\n", 173 | "CONV_LEN_4 = 128\n", 174 | "CONV_LEN_5 = 128\n", 175 | "CONV_LEN_6 = 128\n", 176 | "CONV_LEN = CONV_LEN_1 + CONV_LEN_2 + CONV_LEN_3 + CONV_LEN_4 + CONV_LEN_5 + CONV_LEN_6\n", 177 | "# lstm\n", 178 | "LSTM_SIZE1 = 256\n", 179 | "LSTM_SIZE2 = 256\n", 180 | "LSTM_DROP_RATE = 0.3\n", 181 | "# dense\n", 182 | "DENSE_SIZE1 = 512\n", 183 | "DENSE_SIZE2 = 256" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": true 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "def cnn_layer1(step_input, filters, kernel_size):\n", 195 | " conv = Conv1D(filters=filters, 
kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 196 | " conv_output = conv(step_input)\n", 197 | " conv_output = GlobalMaxPool1D()(conv_output)\n", 198 | " return conv_output\n", 199 | "\n", 200 | "def cnn_layer2(step_input, filters, kernel_size):\n", 201 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 202 | " conv_output = conv(step_input)\n", 203 | " conv_output = GlobalAveragePooling1D()(conv_output)\n", 204 | " return conv_output\n", 205 | "\n", 206 | "def cnn_layer3(step_input, filters, kernel_size):\n", 207 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 208 | " conv_output = conv(step_input)\n", 209 | " conv_output1 = GlobalMaxPool1D()(conv_output)\n", 210 | " conv_output2 = GlobalAveragePooling1D()(conv_output)\n", 211 | " conv_output = concatenate([conv_output1, conv_output2])\n", 212 | " return conv_output" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n", 225 | "Instructions for updating:\n", 226 | "`NHWC` for data_format is deprecated, use `NWC` instead\n", 227 | "Train on 228946 samples, validate on 25440 samples\n", 228 | "Epoch 1/30\n", 229 | "228946/228946 [==============================] - 304s 1ms/step - loss: 0.3771 - acc: 0.8290 - val_loss: 0.2841 - val_acc: 0.8757\n", 230 | "Epoch 2/30\n", 231 | "228946/228946 [==============================] - 299s 1ms/step - loss: 0.2587 - acc: 0.8887 - val_loss: 0.2429 - val_acc: 0.8976\n", 232 | "Epoch 3/30\n", 233 | "228946/228946 [==============================] - 299s 1ms/step - loss: 0.2155 - acc: 0.9091 - val_loss: 0.2256 - val_acc: 0.9086\n", 234 | "Epoch 4/30\n", 235 | "228946/228946 [==============================] - 297s 1ms/step - loss: 0.1888 - acc: 0.9217 - val_loss: 0.2054 - val_acc: 0.9134\n", 236 | "Epoch 5/30\n", 237 | "228946/228946 [==============================] - 296s 1ms/step - loss: 0.1684 - acc: 0.9307 - val_loss: 0.2144 - val_acc: 0.9143\n", 238 | "Epoch 6/30\n", 239 | "228946/228946 [==============================] - 295s 1ms/step - loss: 0.1524 - acc: 0.9382 - val_loss: 0.2035 - val_acc: 0.9181\n", 240 | "Epoch 7/30\n", 241 | "228946/228946 [==============================] - 296s 1ms/step - loss: 0.1361 - acc: 0.9441 - val_loss: 0.2061 - val_acc: 0.9204\n", 242 | "Epoch 8/30\n", 243 | "228946/228946 [==============================] - 297s 1ms/step - loss: 0.1254 - acc: 0.9490 - val_loss: 0.2181 - val_acc: 0.9162\n", 244 | "Epoch 9/30\n", 245 | " 70144/228946 [========>.....................] 
- ETA: 3:17 - loss: 0.1033 - acc: 0.9583" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "from sklearn.model_selection import StratifiedKFold\n", 251 | "\n", 252 | "best_results = []\n", 253 | "last_results = []\n", 254 | "\n", 255 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10, shuffle=True).split(X=word1, y=label)): # word/char switch\n", 256 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 257 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 258 | " \n", 259 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 260 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 261 | "\n", 262 | " embedding_layer = Embedding(\n", 263 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 264 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 265 | " weights=[word_embedding_data], # word/char switch\n", 266 | " input_length=SEQ_LEN,\n", 267 | " trainable=False\n", 268 | " )\n", 269 | " \n", 270 | " vector1 = embedding_layer(input1)\n", 271 | " vector2 = embedding_layer(input2)\n", 272 | " \n", 273 | " lstm_layer1 = Bidirectional(LSTM(LSTM_SIZE1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n", 274 | " layer1a = lstm_layer1(vector1)\n", 275 | " layer1b = lstm_layer1(vector2)\n", 276 | " lstm_layer2 = Bidirectional(LSTM(LSTM_SIZE2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True))\n", 277 | " layer2a = lstm_layer2(layer1a)\n", 278 | " layer2b = lstm_layer2(layer1b)\n", 279 | " layer2a = concatenate([vector1, layer2a])\n", 280 | " layer2b = concatenate([vector2, layer2b])\n", 281 | " \n", 282 | " # TODO: 这里还可以添加BatchNorm层\n", 283 | " \n", 284 | " conv1a, conv1b = cnn_layer1(layer2a, filters=CONV_LEN_1, kernel_size=1), cnn_layer1(layer2b, filters=CONV_LEN_1, kernel_size=1)\n", 285 | " conv2a, conv2b = cnn_layer1(layer2a, filters=CONV_LEN_2, kernel_size=2), cnn_layer1(layer2b, filters=CONV_LEN_2, kernel_size=2)\n", 286 | " conv3a, conv3b = cnn_layer1(layer2a, filters=CONV_LEN_3, kernel_size=3), cnn_layer1(layer2b, filters=CONV_LEN_3, kernel_size=3)\n", 287 | " conv4a, conv4b = cnn_layer1(layer2a, filters=CONV_LEN_4, kernel_size=4), cnn_layer1(layer2b, filters=CONV_LEN_4, kernel_size=4)\n", 288 | " conv5a, conv5b = cnn_layer1(layer2a, filters=CONV_LEN_5, kernel_size=5), cnn_layer1(layer2b, filters=CONV_LEN_5, kernel_size=5)\n", 289 | " conv6a, conv6b = cnn_layer1(layer2a, filters=CONV_LEN_6, kernel_size=6), cnn_layer1(layer2b, filters=CONV_LEN_6, kernel_size=6)\n", 290 | " \n", 291 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 292 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 293 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 294 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 295 | " merge = concatenate([diff, mult])\n", 296 | " \n", 297 | " x = Dropout(DROP_RATE)(merge)\n", 298 | " x = BatchNormalization()(x)\n", 299 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 300 | " x = Dropout(DROP_RATE)(x)\n", 301 | " x = BatchNormalization()(x)\n", 302 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 303 | " x = Dropout(DROP_RATE)(x)\n", 304 | " x = BatchNormalization()(x)\n", 305 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 306 | " \n", 307 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 308 | " model.compile(\n", 309 | " 
optimizer=\"nadam\",\n", 310 | " loss=\"binary_crossentropy\",\n", 311 | " metrics=[\"acc\"]\n", 312 | " )\n", 313 | "\n", 314 | " early_stopping = EarlyStopping(\"val_loss\", patience=PATIENCE)\n", 315 | " check_point = ModelCheckpoint(\n", 316 | " \"./log/%s.TextRCNN.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 317 | " monitor=\"val_loss\",\n", 318 | " save_best_only=True,\n", 319 | " )\n", 320 | " \n", 321 | " fit_res = model.fit(\n", 322 | " x=[train_x1, train_x2],\n", 323 | " y=train_y,\n", 324 | " batch_size=BATCH_SIZE,\n", 325 | " epochs=NUM_EPOCHES,\n", 326 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 327 | " shuffle=True,\n", 328 | " callbacks=[early_stopping, check_point]\n", 329 | " )\n", 330 | " \n", 331 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 332 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 333 | " \n", 334 | " print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 335 | " model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 336 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 337 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 338 | "\n", 339 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 340 | " \"./result/%s-TextRCNN_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 341 | " index=False\n", 342 | ")\n", 343 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 344 | " \"./result/%s-TextRCNN_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 345 | " index=False\n", 346 | ")" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": { 353 | "collapsed": true 354 | }, 355 | "outputs": [], 356 | "source": [] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "Python 3", 362 | "language": "python", 363 | "name": "python3" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.6.2" 376 | } 377 | }, 378 | "nbformat": 4, 379 | "nbformat_minor": 2 380 | } 381 | -------------------------------------------------------------------------------- /[Model] CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\")\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from glob import glob\n", 17 | "from datetime import datetime" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "DATA_PATH = \"./data/\"\n", 29 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 30 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 31 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 32 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 33 | "QUEST_PATH = DATA_PATH 
+ \"question.csv\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "train_data = pd.read_csv(TRAIN_PATH)\n", 45 | "test_data = pd.read_csv(TEST_PATH)\n", 46 | "question_data = pd.read_csv(QUEST_PATH)\n", 47 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 48 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 49 | "\n", 50 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 51 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from keras.preprocessing.text import Tokenizer\n", 61 | "\n", 62 | "MAX_WORD_NUMS = 10000\n", 63 | "\n", 64 | "word_tokenizer = Tokenizer(MAX_WORD_NUMS)\n", 65 | "word_tokenizer.fit_on_texts(question_data[\"words\"])" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "word_embedding_data = np.concatenate(\n", 75 | " (\n", 76 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 77 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_WORD_NUMS]].values\n", 78 | " ),\n", 79 | " axis=0\n", 80 | ")\n", 81 | "word_embedding_data.shape" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "from keras.preprocessing.sequence import pad_sequences\n", 91 | "\n", 92 | "WORD_SEQ_LEN = 30\n", 93 | "\n", 94 | "def gen_data(data):\n", 95 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 96 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 97 | " return pad_sequences(seq_word1, maxlen=WORD_SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 98 | " pad_sequences(seq_word2, maxlen=WORD_SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 99 | "\n", 100 | "word1, word2 = gen_data(train_data)\n", 101 | "test_word1, test_word2 = gen_data(test_data)\n", 102 | "\n", 103 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "label = train_data[\"label\"].values" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from keras.layers import Input, Embedding, Conv1D, GlobalAveragePooling1D, MaxPool1D, Lambda, Dropout, BatchNormalization, Dense, Flatten, K\n", 126 | "from keras.models import Model\n", 127 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 128 | "from keras.layers.merge import concatenate\n", 129 | "\n", 130 | "CONV_LEN_1 = 128\n", 131 | "CONV_LEN_2 = 128\n", 132 | "CONV_LEN_3 = 128\n", 133 | "CONV_LEN_4 = 128\n", 134 | "CONV_LEN_5 = 128\n", 135 | "CONV_LEN_6 = 128\n", 136 | "CONV_LEN = CONV_LEN_1 + CONV_LEN_2 + CONV_LEN_3 + CONV_LEN_4 + CONV_LEN_5 + CONV_LEN_6\n", 137 | "DROP_RATE = 0.6\n", 138 | "DENSE_SIZE = 300\n", 139 | "BATCH_SIZE = 2048\n", 140 | 
"NUM_EPOCHES = 50" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.model_selection import train_test_split\n", 150 | "\n", 151 | "train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n", 152 | " word1, word2, train_data[\"label\"].values,\n", 153 | " test_size=0.2\n", 154 | ")\n", 155 | "\n", 156 | "word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 157 | "word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 158 | "\n", 159 | "embedding_layer = Embedding(\n", 160 | " input_dim=word_embedding_data.shape[0],\n", 161 | " output_dim=word_embedding_data.shape[1],\n", 162 | " weights=[word_embedding_data],\n", 163 | " input_length=WORD_SEQ_LEN,\n", 164 | " trainable=False\n", 165 | ")\n", 166 | "\n", 167 | "word_vector1 = embedding_layer(word_input1)\n", 168 | "word_vector2 = embedding_layer(word_input2)\n", 169 | "\n", 170 | "def cnn_layer(input1, input2, kernel_size, filters):\n", 171 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 172 | " \n", 173 | " conv_a = conv(input1)\n", 174 | " conv_a = MaxPool1D(pool_size=WORD_SEQ_LEN, strides=WORD_SEQ_LEN, padding=\"same\")(conv_a)\n", 175 | " conv_a = Flatten()(conv_a)\n", 176 | " \n", 177 | " conv_b = conv(input2)\n", 178 | " conv_b = MaxPool1D(pool_size=WORD_SEQ_LEN, strides=WORD_SEQ_LEN, padding=\"same\")(conv_b)\n", 179 | " conv_b = Flatten()(conv_b)\n", 180 | " return conv_a, conv_b\n", 181 | "\n", 182 | "conv1a, conv1b = cnn_layer(word_vector1, word_vector2, kernel_size=1, filters=CONV_LEN_1)\n", 183 | "conv2a, conv2b = cnn_layer(word_vector1, word_vector2, kernel_size=2, filters=CONV_LEN_2)\n", 184 | "conv3a, conv3b = cnn_layer(word_vector1, word_vector2, kernel_size=3, filters=CONV_LEN_3)\n", 185 | "conv4a, conv4b = cnn_layer(word_vector1, word_vector2, kernel_size=4, filters=CONV_LEN_4)\n", 186 | "conv5a, conv5b = cnn_layer(word_vector1, word_vector2, kernel_size=5, filters=CONV_LEN_5)\n", 187 | "conv6a, conv6b = cnn_layer(word_vector1, word_vector2, kernel_size=6, filters=CONV_LEN_6)\n", 188 | "\n", 189 | "merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 190 | "merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 191 | "diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 192 | "mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 193 | "merge = concatenate([diff, mult])\n", 194 | "\n", 195 | "x = Dropout(DROP_RATE)(merge)\n", 196 | "x = BatchNormalization()(x)\n", 197 | "x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 198 | "\n", 199 | "x = Dropout(DROP_RATE)(x)\n", 200 | "x = BatchNormalization()(x)\n", 201 | "pred = Dense(1, activation=\"sigmoid\")(x)\n", 202 | "\n", 203 | "model = Model(\n", 204 | " inputs = [word_input1, word_input2],\n", 205 | " outputs = pred\n", 206 | ")\n", 207 | "model.compile(\n", 208 | " optimizer=\"adam\",\n", 209 | " loss=\"binary_crossentropy\",\n", 210 | " metrics=[\"acc\"]\n", 211 | ")\n", 212 | "\n", 213 | "early_stop = EarlyStopping(\"val_loss\", patience=10)\n", 214 | "check_point = ModelCheckpoint(\n", 215 | " \"./log/%s.cnn.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 216 | " monitor=\"val_loss\",\n", 217 | " save_best_only=True,\n", 218 | " save_weights_only=True\n", 219 | ")\n", 220 | "\n", 221 | "model_res = model.fit(\n", 222 | " x=[train_word1, train_word2],\n", 223 | " 
y=train_y,\n", 224 | " batch_size=BATCH_SIZE,\n", 225 | " epochs=NUM_EPOCHES,\n", 226 | " validation_data=([dev_word1, dev_word2], dev_y),\n", 227 | " shuffle=True,\n", 228 | " callbacks=[early_stop, check_point]\n", 229 | ")\n", 230 | "\n", 231 | "test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 232 | "pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\"./result/pred_last.csv\", index=False)\n", 233 | "\n", 234 | "print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 235 | "model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 236 | "test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 237 | "pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\"./result/pred_best.csv\", index=False)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "# from sklearn.model_selection import train_test_split\n", 249 | "\n", 250 | "# train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n", 251 | "# word1, word2, train_data[\"label\"].values,\n", 252 | "# test_size=0.2\n", 253 | "# )\n", 254 | "\n", 255 | "# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 256 | "# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 257 | "\n", 258 | "# embedding_layer = Embedding(\n", 259 | "# input_dim=word_embedding_data.shape[0],\n", 260 | "# output_dim=word_embedding_data.shape[1],\n", 261 | "# weights=[word_embedding_data],\n", 262 | "# input_length=WORD_SEQ_LEN,\n", 263 | "# trainable=False\n", 264 | "# )\n", 265 | "\n", 266 | "# word_vector1 = embedding_layer(word_input1)\n", 267 | "# word_vector2 = embedding_layer(word_input2)\n", 268 | "\n", 269 | "# conv1 = Conv1D(filters=CONV_LEN_1, kernel_size=1, padding=\"same\", activation=\"relu\")\n", 270 | "# conv1a = conv1(word_vector1)\n", 271 | "# conv1a = GlobalAveragePooling1D()(conv1a)\n", 272 | "# conv1b = conv1(word_vector2)\n", 273 | "# conv1b = GlobalAveragePooling1D()(conv1b)\n", 274 | "\n", 275 | "# conv2 = Conv1D(filters=CONV_LEN_2, kernel_size=2, padding=\"same\", activation=\"relu\")\n", 276 | "# conv2a = conv2(word_vector1)\n", 277 | "# conv2a = GlobalAveragePooling1D()(conv2a)\n", 278 | "# conv2b = conv2(word_vector2)\n", 279 | "# conv2b = GlobalAveragePooling1D()(conv2b)\n", 280 | "\n", 281 | "# conv3 = Conv1D(filters=CONV_LEN_3, kernel_size=3, padding=\"same\", activation=\"relu\")\n", 282 | "# conv3a = conv3(word_vector1)\n", 283 | "# conv3a = GlobalAveragePooling1D()(conv3a)\n", 284 | "# conv3b = conv3(word_vector2)\n", 285 | "# conv3b = GlobalAveragePooling1D()(conv3b)\n", 286 | "\n", 287 | "# conv4 = Conv1D(filters=CONV_LEN_4, kernel_size=4, padding=\"same\", activation=\"relu\")\n", 288 | "# conv4a = conv4(word_vector1)\n", 289 | "# conv4a = GlobalAveragePooling1D()(conv4a)\n", 290 | "# conv4b = conv4(word_vector2)\n", 291 | "# conv4b = GlobalAveragePooling1D()(conv4b)\n", 292 | "\n", 293 | "# conv5 = Conv1D(filters=CONV_LEN_5, kernel_size=5, padding=\"same\", activation=\"relu\")\n", 294 | "# conv5a = conv5(word_vector1)\n", 295 | "# conv5a = GlobalAveragePooling1D()(conv5a)\n", 296 | "# conv5b = conv5(word_vector2)\n", 297 | "# conv5b = GlobalAveragePooling1D()(conv5b)\n", 298 | "\n", 299 | "# conv6 = Conv1D(filters=CONV_LEN_6, kernel_size=6, padding=\"same\", activation=\"relu\")\n", 300 | "# conv6a = conv6(word_vector1)\n", 301 | "# conv6a = 
GlobalAveragePooling1D()(conv6a)\n", 302 | "# conv6b = conv6(word_vector2)\n", 303 | "# conv6b = GlobalAveragePooling1D()(conv6b)\n", 304 | "\n", 305 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 306 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 307 | "# # merge = concatenate([merge_a, merge_b])\n", 308 | "\n", 309 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(CONV_LEN,))([merge_a, merge_b])\n", 310 | "# mult = Lambda(lambda x: x[0] * x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])\n", 311 | "# merge = concatenate([diff, mult])\n", 312 | "\n", 313 | "# x = Dropout(DROP_RATE)(merge)\n", 314 | "# x = BatchNormalization()(x)\n", 315 | "# x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 316 | "\n", 317 | "# x = Dropout(DROP_RATE)(x)\n", 318 | "# x = BatchNormalization()(x)\n", 319 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 320 | "\n", 321 | "# model = Model(\n", 322 | "# inputs = [word_input1, word_input2],\n", 323 | "# outputs = pred\n", 324 | "# )\n", 325 | "# model.compile(\n", 326 | "# optimizer=\"adam\",\n", 327 | "# loss=\"binary_crossentropy\",\n", 328 | "# metrics=[\"acc\"]\n", 329 | "# )\n", 330 | "\n", 331 | "# early_stop = EarlyStopping(\"val_loss\", patience=10)\n", 332 | "# check_point = ModelCheckpoint(\n", 333 | "# \"./log/%s.cnn.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 334 | "# monitor=\"val_loss\",\n", 335 | "# save_best_only=True,\n", 336 | "# save_weights_only=True\n", 337 | "# )\n", 338 | "\n", 339 | "# model_res = model.fit(\n", 340 | "# x=[train_word1, train_word2],\n", 341 | "# y=train_y,\n", 342 | "# batch_size=BATCH_SIZE,\n", 343 | "# epochs=NUM_EPOCHES,\n", 344 | "# validation_data=([dev_word1, dev_word2], dev_y),\n", 345 | "# shuffle=True,\n", 346 | "# callbacks=[early_stop, check_point]\n", 347 | "# )\n", 348 | "\n", 349 | "# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 350 | "# pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\"./result/pred_last.csv\", index=False)\n", 351 | "\n", 352 | "# print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 353 | "# model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 354 | "# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 355 | "# pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\"./result/pred_best.csv\", index=False)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "# from sklearn.model_selection import StratifiedKFold\n", 367 | "\n", 368 | "# pred_collect = []\n", 369 | "\n", 370 | "# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):\n", 371 | "# train_word1, train_word2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]\n", 372 | "# dev_word1, dev_word2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]\n", 373 | " \n", 374 | "# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 375 | "# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 376 | " \n", 377 | "# embedding_layer = Embedding(\n", 378 | "# input_dim=word_embedding_data.shape[0],\n", 379 | "# output_dim=word_embedding_data.shape[1],\n", 380 | "# weights=[word_embedding_data],\n", 381 | "# input_length=WORD_SEQ_LEN,\n", 382 | "# trainable=False\n", 383 | "# )\n", 
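"#     # pre-trained word vectors are loaded as fixed weights (trainable=False), so they are not updated during training\n",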
384 | " \n", 385 | "# word_vector1 = embedding_layer(word_input1)\n", 386 | "# word_vector2 = embedding_layer(word_input2)\n", 387 | " \n", 388 | "# conv1 = Conv1D(filters=128, kernel_size=1, padding=\"same\", activation=\"relu\")\n", 389 | "# conv1a = conv1(word_vector1)\n", 390 | "# conv1a = GlobalAveragePooling1D()(conv1a)\n", 391 | "# conv1b = conv1(word_vector2)\n", 392 | "# conv1b = GlobalAveragePooling1D()(conv1b)\n", 393 | " \n", 394 | "# conv2 = Conv1D(filters=128, kernel_size=2, padding=\"same\", activation=\"relu\")\n", 395 | "# conv2a = conv2(word_vector1)\n", 396 | "# conv2a = GlobalAveragePooling1D()(conv2a)\n", 397 | "# conv2b = conv2(word_vector2)\n", 398 | "# conv2b = GlobalAveragePooling1D()(conv2b)\n", 399 | " \n", 400 | "# conv3 = Conv1D(filters=128, kernel_size=3, padding=\"same\", activation=\"relu\")\n", 401 | "# conv3a = conv3(word_vector1)\n", 402 | "# conv3a = GlobalAveragePooling1D()(conv3a)\n", 403 | "# conv3b = conv3(word_vector2)\n", 404 | "# conv3b = GlobalAveragePooling1D()(conv3b)\n", 405 | " \n", 406 | "# conv4 = Conv1D(filters=128, kernel_size=4, padding=\"same\", activation=\"relu\")\n", 407 | "# conv4a = conv4(word_vector1)\n", 408 | "# conv4a = GlobalAveragePooling1D()(conv4a)\n", 409 | "# conv4b = conv4(word_vector2)\n", 410 | "# conv4b = GlobalAveragePooling1D()(conv4b)\n", 411 | " \n", 412 | "# conv5 = Conv1D(filters=128, kernel_size=5, padding=\"same\", activation=\"relu\")\n", 413 | "# conv5a = conv5(word_vector1)\n", 414 | "# conv5a = GlobalAveragePooling1D()(conv5a)\n", 415 | "# conv5b = conv5(word_vector2)\n", 416 | "# conv5b = GlobalAveragePooling1D()(conv5b)\n", 417 | " \n", 418 | "# conv6 = Conv1D(filters=128, kernel_size=6, padding=\"same\", activation=\"relu\")\n", 419 | "# conv6a = conv6(word_vector1)\n", 420 | "# conv6a = GlobalAveragePooling1D()(conv6a)\n", 421 | "# conv6b = conv6(word_vector2)\n", 422 | "# conv6b = GlobalAveragePooling1D()(conv6b)\n", 423 | " \n", 424 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 425 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 426 | " \n", 427 | "# diff = Lambda(lambda x: x[0] - x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])\n", 428 | "# mult = Lambda(lambda x: x[0] * x[1], output_shape=(CONV_LEN,))([merge_a, merge_b])\n", 429 | " \n", 430 | "# merge = concatenate([diff, mult])\n", 431 | " \n", 432 | "# x = Dropout(DROP_RATE)(merge)\n", 433 | "# x = BatchNormalization()(x)\n", 434 | "# x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 435 | " \n", 436 | "# x = Dropout(DROP_RATE)(x)\n", 437 | "# x = BatchNormalization()(x)\n", 438 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 439 | " \n", 440 | "# model = Model(\n", 441 | "# inputs = [word_input1, word_input2],\n", 442 | "# outputs = pred\n", 443 | "# )\n", 444 | "# model.compile(\n", 445 | "# optimizer=\"adam\",\n", 446 | "# loss=\"binary_crossentropy\",\n", 447 | "# metrics=[\"acc\"]\n", 448 | "# )\n", 449 | " \n", 450 | "# early_stop = EarlyStopping(\"val_loss\", patience=10)\n", 451 | "# check_point = ModelCheckpoint(\n", 452 | "# \"./log/cnn_%02d.{epoch:02d}_{val_loss:.3f}.hdf5\" % (i + 1),\n", 453 | "# monitor=\"val_loss\",\n", 454 | "# save_best_only=True,\n", 455 | "# save_weights_only=True\n", 456 | "# )\n", 457 | " \n", 458 | "# model_res = model.fit(\n", 459 | "# x=[train_word1, train_word2],\n", 460 | "# y=train_y,\n", 461 | "# batch_size=BATCH_SIZE,\n", 462 | "# epochs=NUM_EPOCHES,\n", 463 | "# validation_data=([dev_word1, dev_word2], dev_y),\n", 464 | "# 
shuffle=True,\n", 465 | "# callbacks=[early_stop, check_point]\n", 466 | "# )\n", 467 | " \n", 468 | "# print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 469 | "# model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 470 | "\n", 471 | "# test_pred = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 472 | "# pred_collect.append(pd.DataFrame(test_pred, columns=[\"y_pre\"]))\n", 473 | "\n", 474 | "# pd.DataFrame(pd.concat(pred_collect, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\"./result/pred.csv\", index=False)" 475 | ] 476 | } 477 | ], 478 | "metadata": { 479 | "kernelspec": { 480 | "display_name": "Python 3", 481 | "language": "python", 482 | "name": "python3" 483 | }, 484 | "language_info": { 485 | "codemirror_mode": { 486 | "name": "ipython", 487 | "version": 3 488 | }, 489 | "file_extension": ".py", 490 | "mimetype": "text/x-python", 491 | "name": "python", 492 | "nbconvert_exporter": "python", 493 | "pygments_lexer": "ipython3", 494 | "version": "3.6.2" 495 | } 496 | }, 497 | "nbformat": 4, 498 | "nbformat_minor": 2 499 | } 500 | -------------------------------------------------------------------------------- /[Model] Multi LSTM CNN v0 word.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\")\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from glob import glob\n", 17 | "from datetime import datetime" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "DATA_PATH = \"./data/\"\n", 29 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 30 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 31 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 32 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 33 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 34 | "\n", 35 | "train_data = pd.read_csv(TRAIN_PATH)\n", 36 | "test_data = pd.read_csv(TEST_PATH)\n", 37 | "question_data = pd.read_csv(QUEST_PATH)\n", 38 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 39 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 40 | "\n", 41 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 42 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 43 | "\n", 44 | "label = train_data[\"label\"].values" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "from keras.preprocessing.text import Tokenizer\n", 54 | "\n", 55 | "MAX_WORD_NUMS = 10000\n", 56 | "\n", 57 | "word_tokenizer = Tokenizer(MAX_WORD_NUMS)\n", 58 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 59 | "\n", 60 | "word_embedding_data = np.concatenate(\n", 61 | " (\n", 62 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 63 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_WORD_NUMS]].values\n", 64 | " ),\n", 65 | " axis=0\n", 66 | ")\n", 67 | "word_embedding_data.shape" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | 
"outputs": [], 75 | "source": [ 76 | "from keras.preprocessing.sequence import pad_sequences\n", 77 | "\n", 78 | "WORD_SEQ_LEN = 30\n", 79 | "\n", 80 | "def gen_data(data):\n", 81 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 82 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 83 | " return pad_sequences(seq_word1, maxlen=WORD_SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 84 | " pad_sequences(seq_word2, maxlen=WORD_SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 85 | "\n", 86 | "word1, word2 = gen_data(train_data)\n", 87 | "test_word1, test_word2 = gen_data(test_data)\n", 88 | "\n", 89 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from keras.models import Model\n", 101 | "from keras.layers.merge import concatenate\n", 102 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 103 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 104 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n", 105 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "# general\n", 117 | "NUM_EPOCHES = 50\n", 118 | "BATCH_SIZE = 1024\n", 119 | "DENSE_SIZE = 300 # 512\n", 120 | "DROP_RATE = 0.3\n", 121 | "\n", 122 | "# cnn\n", 123 | "CONV_LEN_1 = 128\n", 124 | "CONV_LEN_2 = 128\n", 125 | "CONV_LEN_3 = 128\n", 126 | "CONV_LEN_4 = 128\n", 127 | "CONV_LEN_5 = 128\n", 128 | "CONV_LEN_6 = 128\n", 129 | "CONV_LEN = CONV_LEN_1 + CONV_LEN_2 + CONV_LEN_3 + CONV_LEN_4 + CONV_LEN_5 + CONV_LEN_6\n", 130 | "\n", 131 | "# lstm\n", 132 | "LSTM_SIZE_1 = 256\n", 133 | "LSTM_SIZE_2 = 256\n", 134 | "DROP_RATE_LSTM = 0.3" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "def cnn_layer_1(input1, input2, kernel_size, filters):\n", 146 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 147 | " \n", 148 | " conv_a = conv(input1)\n", 149 | " conv_a = GlobalAveragePooling1D()(conv_a)\n", 150 | " \n", 151 | " conv_b = conv(input2)\n", 152 | " conv_b = GlobalAveragePooling1D()(conv_b)\n", 153 | " return conv_a, conv_b" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# from sklearn.model_selection import train_test_split\n", 165 | "\n", 166 | "# train_word1, dev_word1, train_word2, dev_word2, train_y, dev_y = train_test_split(\n", 167 | "# word1, word2, train_data[\"label\"].values,\n", 168 | "# test_size=0.2\n", 169 | "# )\n", 170 | "\n", 171 | "# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 172 | "# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 173 | "\n", 174 | "# embedding_layer = Embedding(\n", 175 | "# input_dim=word_embedding_data.shape[0],\n", 176 | "# output_dim=word_embedding_data.shape[1],\n", 177 | "# 
weights=[word_embedding_data],\n", 178 | "# input_length=WORD_SEQ_LEN,\n", 179 | "# trainable=False\n", 180 | "# )\n", 181 | "\n", 182 | "# word_vector1 = embedding_layer(word_input1)\n", 183 | "# word_vector2 = embedding_layer(word_input2)\n", 184 | "\n", 185 | "# lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 186 | "# word_first_1 = lstm_layer1(word_vector1)\n", 187 | "# word_first_1 = Dropout(DROP_RATE)(word_first_1)\n", 188 | "# word_first_2 = lstm_layer1(word_vector2)\n", 189 | "# word_first_2 = Dropout(DROP_RATE)(word_first_2)\n", 190 | "\n", 191 | "# lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 192 | "# word_second_1 = lstm_layer2(word_first_1)\n", 193 | "# word_second_2 = lstm_layer2(word_first_2)\n", 194 | "\n", 195 | "# conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)\n", 196 | "# conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)\n", 197 | "# conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)\n", 198 | "# conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)\n", 199 | "# conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)\n", 200 | "# conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)\n", 201 | "\n", 202 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 203 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 204 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 205 | "# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 206 | "# merge = concatenate([diff, mult])\n", 207 | "\n", 208 | "# x = Dropout(DROP_RATE)(merge)\n", 209 | "# x = BatchNormalization()(x)\n", 210 | "\n", 211 | "# x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 212 | "# x = Dropout(DROP_RATE)(x)\n", 213 | "# x = BatchNormalization()(x)\n", 214 | "\n", 215 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 216 | "\n", 217 | "# model = Model(inputs=[word_input1, word_input2], outputs=pred)\n", 218 | "# model.compile(\n", 219 | "# optimizer=\"nadam\",\n", 220 | "# loss=\"binary_crossentropy\",\n", 221 | "# metrics=[\"acc\"]\n", 222 | "# )\n", 223 | "\n", 224 | "# early_stopping = EarlyStopping(\"val_loss\", patience=10)\n", 225 | "# check_point = ModelCheckpoint(\n", 226 | "# \"./log/%s.multi_lstm_cnn.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 227 | "# monitor=\"val_loss\",\n", 228 | "# save_best_only=True,\n", 229 | "# save_weights_only=True\n", 230 | "# )\n", 231 | "\n", 232 | "# train_res = model.fit(\n", 233 | "# x=[train_word1, train_word2],\n", 234 | "# y=train_y,\n", 235 | "# batch_size=BATCH_SIZE,\n", 236 | "# epochs=NUM_EPOCHES,\n", 237 | "# validation_data=([dev_word1, dev_word2], dev_y),\n", 238 | "# shuffle=True,\n", 239 | "# callbacks=[early_stopping, check_point]\n", 240 | "# )\n", 241 | "\n", 242 | "# pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 243 | "# pd.DataFrame(pred_last, columns=[\"y_pre\"]).to_csv(\n", 244 | "# \"./result/%s-multilstm_cnn_pred_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 245 | "# index=False\n", 246 | "# )\n", 247 | "\n", 248 | "# print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 249 | "# 
model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 250 | "# pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 251 | "# pd.DataFrame(pred_best, columns=[\"y_pre\"]).to_csv(\n", 252 | "# \"./result/%s-multilstm_cnn_pred_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 253 | "# index=False\n", 254 | "# )" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "from sklearn.model_selection import StratifiedKFold\n", 264 | "\n", 265 | "best_results = []\n", 266 | "last_results = []\n", 267 | "\n", 268 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)):\n", 269 | " train_word1, train_word2, train_y = word1[train_index, :], word2[train_index, :], label[train_index]\n", 270 | " dev_word1, dev_word2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index]\n", 271 | " \n", 272 | " word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 273 | " word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 274 | "\n", 275 | " embedding_layer = Embedding(\n", 276 | " input_dim=word_embedding_data.shape[0],\n", 277 | " output_dim=word_embedding_data.shape[1],\n", 278 | " weights=[word_embedding_data],\n", 279 | " input_length=WORD_SEQ_LEN,\n", 280 | " trainable=False\n", 281 | " )\n", 282 | "\n", 283 | " word_vector1 = embedding_layer(word_input1)\n", 284 | " word_vector2 = embedding_layer(word_input2)\n", 285 | "\n", 286 | " lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 287 | " word_first_1 = lstm_layer1(word_vector1)\n", 288 | " word_first_1 = Dropout(DROP_RATE)(word_first_1)\n", 289 | " word_first_2 = lstm_layer1(word_vector2)\n", 290 | " word_first_2 = Dropout(DROP_RATE)(word_first_2)\n", 291 | "\n", 292 | " lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 293 | " word_second_1 = lstm_layer2(word_first_1)\n", 294 | " word_second_2 = lstm_layer2(word_first_2)\n", 295 | "\n", 296 | " conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)\n", 297 | " conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)\n", 298 | " conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)\n", 299 | " conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)\n", 300 | " conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)\n", 301 | " conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)\n", 302 | "\n", 303 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 304 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 305 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 306 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 307 | " merge = concatenate([diff, mult])\n", 308 | "\n", 309 | " x = Dropout(DROP_RATE)(merge)\n", 310 | " x = BatchNormalization()(x)\n", 311 | "\n", 312 | " x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 313 | " x = Dropout(DROP_RATE)(x)\n", 314 | " x = BatchNormalization()(x)\n", 315 | "\n", 316 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 317 | "\n", 318 | " model = Model(inputs=[word_input1, word_input2], outputs=pred)\n", 
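"    # sigmoid output trained with binary cross-entropy on the 0/1 question-match label; the embedding layer is frozen, so only the LSTM, convolution and dense weights are updated\n",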
319 | " model.compile(\n", 320 | " optimizer=\"nadam\",\n", 321 | " loss=\"binary_crossentropy\",\n", 322 | " metrics=[\"acc\"]\n", 323 | " )\n", 324 | "\n", 325 | " early_stopping = EarlyStopping(\"val_loss\", patience=10)\n", 326 | " check_point = ModelCheckpoint(\n", 327 | " \"./log/%s.multi_lstm_cnn.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 328 | " monitor=\"val_loss\",\n", 329 | " save_best_only=True,\n", 330 | " save_weights_only=True\n", 331 | " )\n", 332 | "\n", 333 | " train_res = model.fit(\n", 334 | " x=[train_word1, train_word2],\n", 335 | " y=train_y,\n", 336 | " batch_size=BATCH_SIZE,\n", 337 | " epochs=NUM_EPOCHES,\n", 338 | " validation_data=([dev_word1, dev_word2], dev_y),\n", 339 | " shuffle=True,\n", 340 | " callbacks=[early_stopping, check_point]\n", 341 | " )\n", 342 | "\n", 343 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 344 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 345 | " \n", 346 | "\n", 347 | " print(\"load model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 348 | " model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 349 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 350 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 351 | "\n", 352 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 353 | " \"./result/%s-multilstm_cnn_pred_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 354 | " index=False\n", 355 | ")\n", 356 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 357 | " \"./result/%s-multilstm_cnn_pred_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 358 | " index=False\n", 359 | ")" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": { 366 | "collapsed": true 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "# train_encode = []\n", 371 | "# test_encode = []\n", 372 | "\n", 373 | "# for model_name in glob(\"./models/*.hdf5\"):\n", 374 | "# word_input1 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 375 | "# word_input2 = Input(shape=(WORD_SEQ_LEN,), dtype=\"int32\")\n", 376 | "\n", 377 | "# embedding_layer = Embedding(\n", 378 | "# input_dim=word_embedding_data.shape[0],\n", 379 | "# output_dim=word_embedding_data.shape[1],\n", 380 | "# weights=[word_embedding_data],\n", 381 | "# input_length=WORD_SEQ_LEN,\n", 382 | "# trainable=False\n", 383 | "# )\n", 384 | "\n", 385 | "# word_vector1 = embedding_layer(word_input1)\n", 386 | "# word_vector2 = embedding_layer(word_input2)\n", 387 | "\n", 388 | "# lstm_layer1 = LSTM(LSTM_SIZE_1, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 389 | "# word_first_1 = lstm_layer1(word_vector1)\n", 390 | "# word_first_1 = Dropout(DROP_RATE)(word_first_1)\n", 391 | "# word_first_2 = lstm_layer1(word_vector2)\n", 392 | "# word_first_2 = Dropout(DROP_RATE)(word_first_2)\n", 393 | "\n", 394 | "# lstm_layer2 = LSTM(LSTM_SIZE_2, dropout=DROP_RATE, recurrent_dropout=DROP_RATE, return_sequences=True)\n", 395 | "# word_second_1 = lstm_layer2(word_first_1)\n", 396 | "# word_second_2 = lstm_layer2(word_first_2)\n", 397 | "\n", 398 | "# conv1a, conv1b = cnn_layer_1(word_second_1, word_second_2, kernel_size=1, filters=CONV_LEN_1)\n", 399 | "# conv2a, conv2b = cnn_layer_1(word_second_1, word_second_2, kernel_size=2, filters=CONV_LEN_2)\n", 400 | 
"# conv3a, conv3b = cnn_layer_1(word_second_1, word_second_2, kernel_size=3, filters=CONV_LEN_3)\n", 401 | "# conv4a, conv4b = cnn_layer_1(word_second_1, word_second_2, kernel_size=4, filters=CONV_LEN_4)\n", 402 | "# conv5a, conv5b = cnn_layer_1(word_second_1, word_second_2, kernel_size=5, filters=CONV_LEN_5)\n", 403 | "# conv6a, conv6b = cnn_layer_1(word_second_1, word_second_2, kernel_size=6, filters=CONV_LEN_6)\n", 404 | "\n", 405 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 406 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 407 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 408 | "# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 409 | "# merge = concatenate([diff, mult])\n", 410 | "\n", 411 | "# x = Dropout(DROP_RATE)(merge)\n", 412 | "# x = BatchNormalization()(x)\n", 413 | "\n", 414 | "# x = Dense(DENSE_SIZE, activation=\"relu\")(x)\n", 415 | "# x = Dropout(DROP_RATE)(x)\n", 416 | "# x = BatchNormalization()(x)\n", 417 | "\n", 418 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 419 | "\n", 420 | "# model = Model(inputs=[word_input1, word_input2], outputs=pred)\n", 421 | "# model.load_weights(model_name.replace(\"\\\\\", \"/\"))\n", 422 | " \n", 423 | "# encode_model = Model(inputs=[word_input1, word_input2], outputs=model.layers[-4].output)\n", 424 | "# train_feature = encode_model.predict([word1, word2], batch_size=BATCH_SIZE)\n", 425 | "# test_feature = encode_model.predict([test_word1, test_word2], batch_size=BATCH_SIZE)\n", 426 | "# train_encode.append(train_feature)\n", 427 | "# test_encode.append(test_feature)\n", 428 | "\n", 429 | "# train_dense = train_encode[0].copy()\n", 430 | "# for t in train_encode[1:]:\n", 431 | "# train_dense += t\n", 432 | "# train_dense = train_dense / 10\n", 433 | "# pd.DataFrame(train_dense).to_csv(\"train_input.csv\", index=False)\n", 434 | "\n", 435 | "# test_dense = test_encode[0].copy()\n", 436 | "# for t in test_encode[1:]:\n", 437 | "# test_dense += t\n", 438 | "# test_dense = test_dense / 10\n", 439 | "# pd.DataFrame(test_dense).to_csv(\"test_input.csv\", index=False)\n", 440 | "\n", 441 | "# import xgboost as xgb\n", 442 | "# from sklearn.model_selection import train_test_split\n", 443 | "\n", 444 | "# xgb_train_x, xgb_dev_x, xgb_train_y, xgb_dev_y = train_test_split(mean_dense, label, test_size=0.2, stratify=label)\n", 445 | "\n", 446 | "# train_data = xgb.DMatrix(xgb_train_x, xgb_train_y)\n", 447 | "# dev_data = xgb.DMatrix(xgb_dev_x, xgb_dev_y)\n", 448 | "\n", 449 | "# params = {\n", 450 | "# \"objective\": \"binary:logistic\",\n", 451 | "# \"eval_metric\": \"logloss\",\n", 452 | "# \"eta\": 0.01,\n", 453 | "# \"max_depth\": 5,\n", 454 | "# \"subsample\": 0.8,\n", 455 | "# \"colsample_bytree\": 0.8,\n", 456 | "# \"lambda\": 1,\n", 457 | "# }\n", 458 | "\n", 459 | "\n", 460 | "# boost = xgb.train(\n", 461 | "# params=params,\n", 462 | "# dtrain=train_data,\n", 463 | "# num_boost_round=200,\n", 464 | "# evals=[(dev_data, \"dev\")],\n", 465 | "# early_stopping_rounds=10,\n", 466 | "# )\n", 467 | "\n", 468 | "# test_pred = boost.predict(xgb.DMatrix(test_dense))\n", 469 | "# pd.DataFrame(test_pred, columns=[\"y_pre\"]).to_csv(\n", 470 | "# \"./result/%s-xgb_multilstm_cnn_pred_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 471 | "# index=False\n", 472 | "# )" 473 | ] 474 | } 475 | ], 476 | "metadata": { 477 | "kernelspec": { 478 | "display_name": "Python 3", 479 | "language": "python", 480 | "name": "python3" 481 | 
}, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | "pygments_lexer": "ipython3", 492 | "version": "3.6.2" 493 | } 494 | }, 495 | "nbformat": 4, 496 | "nbformat_minor": 2 497 | } 498 | -------------------------------------------------------------------------------- /[Model] TextCNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings(\"ignore\")\n", 13 | "\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from glob import glob\n", 17 | "from datetime import datetime\n", 18 | "\n", 19 | "DATA_PATH = \"./data/\"\n", 20 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 21 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 22 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 23 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 24 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 25 | "\n", 26 | "train_data = pd.read_csv(TRAIN_PATH)\n", 27 | "test_data = pd.read_csv(TEST_PATH)\n", 28 | "question_data = pd.read_csv(QUEST_PATH)\n", 29 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 30 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 31 | "\n", 32 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 33 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 34 | "\n", 35 | "label = train_data[\"label\"].values" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | "Using TensorFlow backend.\n" 48 | ] 49 | }, 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "((10001, 300), (3049, 300))" 54 | ] 55 | }, 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "from keras.preprocessing.text import Tokenizer\n", 63 | "\n", 64 | "MAX_COUNT = 10000\n", 65 | "\n", 66 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 67 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 68 | "\n", 69 | "word_embedding_data = np.concatenate(\n", 70 | " (\n", 71 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 72 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 73 | " ),\n", 74 | " axis=0\n", 75 | ")\n", 76 | "\n", 77 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 78 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 79 | "\n", 80 | "char_embedding_data = np.concatenate(\n", 81 | " (\n", 82 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 83 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 84 | " ),\n", 85 | " axis=0\n", 86 | ")\n", 87 | "\n", 88 | "word_embedding_data.shape, char_embedding_data.shape" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 3, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "((254386, 30),\n", 100 | " (254386, 30),\n", 101 | " (172956, 30),\n", 102 | " (172956, 30),\n", 103 | " (254386, 
30),\n", 104 | " (254386, 30),\n", 105 | " (172956, 30),\n", 106 | " (172956, 30))" 107 | ] 108 | }, 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "from keras.preprocessing.sequence import pad_sequences\n", 116 | "\n", 117 | "SEQ_LEN = 30\n", 118 | "\n", 119 | "def gen_word_data(data):\n", 120 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 121 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 122 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 123 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 124 | " \n", 125 | "def gen_char_data(data):\n", 126 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 127 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 128 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 129 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 130 | "\n", 131 | "word1, word2 = gen_word_data(train_data)\n", 132 | "char1, char2 = gen_char_data(train_data)\n", 133 | "test_word1, test_word2 = gen_word_data(test_data)\n", 134 | "test_char1, test_char2 = gen_char_data(test_data)\n", 135 | "\n", 136 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "from keras.models import Model\n", 148 | "from keras.layers.merge import concatenate\n", 149 | "from keras.callbacks import EarlyStopping, ModelCheckpoint\n", 150 | "from keras.optimizers import Adam, Nadam, SGD\n", 151 | "from keras.layers import LSTM, Bidirectional, TimeDistributed, CuDNNLSTM\n", 152 | "from keras.layers import Conv1D, GlobalMaxPool1D, GlobalAveragePooling1D\n", 153 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "BATCH_SIZE = 1024\n", 165 | "NUM_EPOCHES = 50\n", 166 | "DROP_RATE = 0.3\n", 167 | "\n", 168 | "CONV_FILTER_LAYER1 = 128\n", 169 | "CONV_FILTER_LAYER2 = 128\n", 170 | "\n", 171 | "DENSE_SIZE1 = 512\n", 172 | "DENSE_SIZE2 = 256" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "def textcnn_layer(input_tensor, kernel_size):\n", 184 | " conv_1 = Conv1D(filters=CONV_FILTER_LAYER1, kernel_size=kernel_size, padding=\"same\")(input_tensor)\n", 185 | " conv_1 = BatchNormalization()(conv_1)\n", 186 | " conv_1 = Activation(activation=\"relu\")(conv_1)\n", 187 | " conv_2 = Conv1D(filters=CONV_FILTER_LAYER2, kernel_size=kernel_size, padding=\"same\")(conv_1)\n", 188 | " conv_2 = BatchNormalization()(conv_2)\n", 189 | " conv_2 = Activation(activation=\"relu\")(conv_2)\n", 190 | " conv_2_max = 
GlobalMaxPool1D()(conv_2)\n", 191 | " conv_2_avg = GlobalAveragePooling1D()(conv_2)\n", 192 | " conv_2_merge = concatenate([conv_2_max, conv_2_avg])\n", 193 | " return conv_2_max" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 7, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n", 206 | "Instructions for updating:\n", 207 | "`NHWC` for data_format is deprecated, use `NWC` instead\n", 208 | "Train on 228946 samples, validate on 25440 samples\n", 209 | "Epoch 1/50\n", 210 | "228946/228946 [==============================] - 99s 433us/step - loss: 0.4382 - acc: 0.7953 - val_loss: 0.3387 - val_acc: 0.8498\n", 211 | "Epoch 2/50\n", 212 | "228946/228946 [==============================] - 90s 392us/step - loss: 0.3082 - acc: 0.8647 - val_loss: 0.2845 - val_acc: 0.8755\n", 213 | "Epoch 3/50\n", 214 | "228946/228946 [==============================] - 93s 405us/step - loss: 0.2626 - acc: 0.8868 - val_loss: 0.2673 - val_acc: 0.8857\n", 215 | "Epoch 4/50\n", 216 | "228946/228946 [==============================] - 92s 402us/step - loss: 0.2304 - acc: 0.9020 - val_loss: 0.2649 - val_acc: 0.8886\n", 217 | "Epoch 5/50\n", 218 | "228946/228946 [==============================] - 89s 391us/step - loss: 0.2048 - acc: 0.9137 - val_loss: 0.2551 - val_acc: 0.8915\n", 219 | "Epoch 6/50\n", 220 | "228946/228946 [==============================] - 92s 400us/step - loss: 0.1845 - acc: 0.9231 - val_loss: 0.2518 - val_acc: 0.8948\n", 221 | "Epoch 7/50\n", 222 | "228946/228946 [==============================] - 93s 408us/step - loss: 0.1643 - acc: 0.9329 - val_loss: 0.2583 - val_acc: 0.8960\n", 223 | "Epoch 8/50\n", 224 | " 28672/228946 [==>...........................] 
- ETA: 1:18 - loss: 0.1283 - acc: 0.9493" 225 | ] 226 | }, 227 | { 228 | "ename": "KeyboardInterrupt", 229 | "evalue": "", 230 | "output_type": "error", 231 | "traceback": [ 232 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 233 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 234 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdev_x1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_x2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_y\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 68\u001b[1;33m \u001b[0mcallbacks\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mearly_stopping\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_point\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 69\u001b[0m )\n\u001b[0;32m 70\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 235 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[0;32m 1040\u001b[0m \u001b[0minitial_epoch\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m validation_steps=validation_steps)\n\u001b[0m\u001b[0;32m 1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1044\u001b[0m def evaluate(self, x=None, y=None,\n", 236 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\engine\\training_arrays.py\u001b[0m in \u001b[0;36mfit_loop\u001b[1;34m(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)\u001b[0m\n\u001b[0;32m 197\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 198\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 199\u001b[1;33m \u001b[0mouts\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mins_batch\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 200\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mouts\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 201\u001b[0m \u001b[0mouts\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mouts\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 237 | 
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\backend\\tensorflow_backend.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, inputs)\u001b[0m\n\u001b[0;32m 2665\u001b[0m \u001b[1;34m'In order to feed symbolic tensors to a Keras model '\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2666\u001b[0m 'in TensorFlow, you need tensorflow 1.8 or higher.')\n\u001b[1;32m-> 2667\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_legacy_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minputs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2668\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2669\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 238 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\backend\\tensorflow_backend.py\u001b[0m in \u001b[0;36m_legacy_call\u001b[1;34m(self, inputs)\u001b[0m\n\u001b[0;32m 2647\u001b[0m \u001b[0msession\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mget_session\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2648\u001b[0m updated = session.run(fetches=fetches, feed_dict=feed_dict,\n\u001b[1;32m-> 2649\u001b[1;33m **self.session_kwargs)\n\u001b[0m\u001b[0;32m 2650\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mupdated\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moutputs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2651\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", 239 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 903\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 904\u001b[0m result = self._run(None, fetches, feed_dict, options_ptr,\n\u001b[1;32m--> 905\u001b[1;33m run_metadata_ptr)\n\u001b[0m\u001b[0;32m 906\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mrun_metadata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 907\u001b[0m \u001b[0mproto_data\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtf_session\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTF_GetBuffer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrun_metadata_ptr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 240 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36m_run\u001b[1;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 1135\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mfinal_fetches\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mfinal_targets\u001b[0m \u001b[1;32mor\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mhandle\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfeed_dict_tensor\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1136\u001b[0m results = self._do_run(handle, final_targets, final_fetches,\n\u001b[1;32m-> 1137\u001b[1;33m feed_dict_tensor, options, run_metadata)\n\u001b[0m\u001b[0;32m 1138\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1139\u001b[0m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 241 | 
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36m_do_run\u001b[1;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m 1353\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mhandle\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1354\u001b[0m return self._do_call(_run_fn, self._session, feeds, fetches, targets,\n\u001b[1;32m-> 1355\u001b[1;33m options, run_metadata)\n\u001b[0m\u001b[0;32m 1356\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1357\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_do_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_prun_fn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_session\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeeds\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetches\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 242 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36m_do_call\u001b[1;34m(self, fn, *args)\u001b[0m\n\u001b[0;32m 1359\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_do_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1360\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1361\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1362\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0merrors\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mOpError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1363\u001b[0m \u001b[0mmessage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mas_text\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmessage\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 243 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\client\\session.py\u001b[0m in \u001b[0;36m_run_fn\u001b[1;34m(session, feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[0;32m 1338\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1339\u001b[0m return tf_session.TF_Run(session, options, feed_dict, fetch_list,\n\u001b[1;32m-> 1340\u001b[1;33m target_list, status, run_metadata)\n\u001b[0m\u001b[0;32m 1341\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1342\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 244 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "from sklearn.model_selection import StratifiedKFold\n", 250 | "\n", 251 | 
"best_results = []\n", 252 | "last_results = []\n", 253 | "\n", 254 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10, shuffle=True).split(X=char1, y=label)):\n", 255 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 256 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 257 | " \n", 258 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 259 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 260 | "\n", 261 | " embedding_layer = Embedding(\n", 262 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 263 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 264 | " weights=[word_embedding_data], # word/char switch\n", 265 | " input_length=SEQ_LEN,\n", 266 | " trainable=False\n", 267 | " )\n", 268 | " \n", 269 | " vector1 = embedding_layer(input1)\n", 270 | " vector2 = embedding_layer(input2)\n", 271 | " \n", 272 | " conv1a, conv1b = textcnn_layer(vector1, kernel_size=1), textcnn_layer(vector2, kernel_size=1)\n", 273 | " conv2a, conv2b = textcnn_layer(vector1, kernel_size=2), textcnn_layer(vector2, kernel_size=2)\n", 274 | " conv3a, conv3b = textcnn_layer(vector1, kernel_size=3), textcnn_layer(vector2, kernel_size=3)\n", 275 | " conv4a, conv4b = textcnn_layer(vector1, kernel_size=4), textcnn_layer(vector2, kernel_size=4)\n", 276 | " conv5a, conv5b = textcnn_layer(vector1, kernel_size=5), textcnn_layer(vector2, kernel_size=5)\n", 277 | " conv6a, conv6b = textcnn_layer(vector1, kernel_size=6), textcnn_layer(vector2, kernel_size=6)\n", 278 | " \n", 279 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 280 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 281 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 282 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 283 | " merge = concatenate([diff, mult])\n", 284 | " \n", 285 | " x = Dropout(DROP_RATE)(merge)\n", 286 | " x = BatchNormalization()(x)\n", 287 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 288 | " x = Dropout(DROP_RATE)(x)\n", 289 | " x = BatchNormalization()(x)\n", 290 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 291 | " x = Dropout(DROP_RATE)(x)\n", 292 | " x = BatchNormalization()(x)\n", 293 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 294 | " \n", 295 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 296 | " model.compile(\n", 297 | " optimizer=\"nadam\",\n", 298 | " loss=\"binary_crossentropy\",\n", 299 | " metrics=[\"acc\"]\n", 300 | " )\n", 301 | " \n", 302 | " early_stopping = EarlyStopping(\"val_loss\", patience=10)\n", 303 | " check_point = ModelCheckpoint(\n", 304 | " \"./log/%s.TextCNN.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 305 | " monitor=\"val_loss\",\n", 306 | " save_best_only=True,\n", 307 | " )\n", 308 | " \n", 309 | " fit_res = model.fit(\n", 310 | " x=[train_x1, train_x2],\n", 311 | " y=train_y,\n", 312 | " batch_size=BATCH_SIZE,\n", 313 | " epochs=NUM_EPOCHES,\n", 314 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 315 | " shuffle=True,\n", 316 | " callbacks=[early_stopping, check_point]\n", 317 | " )\n", 318 | " \n", 319 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 320 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 321 | " \n", 322 | " print(\"load 
model %s\" % (glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"),))\n", 323 | " model.load_weights(glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\"))\n", 324 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 325 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 326 | "\n", 327 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 328 | " \"./result/%s-TextCNN_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 329 | " index=False\n", 330 | ")\n", 331 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 332 | " \"./result/%s-TextCNN_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 333 | " index=False\n", 334 | ")" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "collapsed": true 342 | }, 343 | "outputs": [], 344 | "source": [] 345 | } 346 | ], 347 | "metadata": { 348 | "kernelspec": { 349 | "display_name": "Python 3", 350 | "language": "python", 351 | "name": "python3" 352 | }, 353 | "language_info": { 354 | "codemirror_mode": { 355 | "name": "ipython", 356 | "version": 3 357 | }, 358 | "file_extension": ".py", 359 | "mimetype": "text/x-python", 360 | "name": "python", 361 | "nbconvert_exporter": "python", 362 | "pygments_lexer": "ipython3", 363 | "version": "3.6.2" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /[Model] Multi LSTM CNN v4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 模型结构\n", 8 | "\n", 9 | "- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)\n", 10 | "- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中\n", 11 | "- 卷积层为双层, 最后的池化层有Average和Max两种\n", 12 | "- 对于每个问题, 将所有卷积核结果并起来\n", 13 | "- 将两个问题并起来的结果, 分别[相减并取绝对值], [x]" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import warnings\n", 25 | "warnings.filterwarnings(\"ignore\")\n", 26 | "\n", 27 | "import os\n", 28 | "import shutil\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "from glob import glob\n", 32 | "from datetime import datetime" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "try:\n", 44 | " os.mkdir(\"./log/\")\n", 45 | " os.mkdir(\"./result/\")\n", 46 | "except FileExistsError:\n", 47 | " pass" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "DATA_PATH = \"./data/\"\n", 57 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 58 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 59 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 60 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 61 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 62 | "\n", 63 | "train_data = pd.read_csv(TRAIN_PATH)\n", 64 | "test_data = pd.read_csv(TEST_PATH)\n", 65 | "question_data = pd.read_csv(QUEST_PATH)\n", 66 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 67 | "char_embedding_data = 
pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 68 | "\n", 69 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 70 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 71 | "\n", 72 | "label = train_data[\"label\"].values\n", 73 | "\n", 74 | "from keras.preprocessing.text import Tokenizer\n", 75 | "\n", 76 | "MAX_COUNT = 10000\n", 77 | "\n", 78 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 79 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 80 | "\n", 81 | "word_embedding_data = np.concatenate(\n", 82 | " (\n", 83 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 84 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 85 | " ),\n", 86 | " axis=0\n", 87 | ")\n", 88 | "\n", 89 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 90 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 91 | "\n", 92 | "char_embedding_data = np.concatenate(\n", 93 | " (\n", 94 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 95 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 96 | " ),\n", 97 | " axis=0\n", 98 | ")\n", 99 | "\n", 100 | "word_embedding_data.shape, char_embedding_data.shape" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "from keras.preprocessing.sequence import pad_sequences\n", 110 | "\n", 111 | "SEQ_LEN = 25\n", 112 | "\n", 113 | "def gen_word_data(data):\n", 114 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 115 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 116 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 117 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 118 | " \n", 119 | "def gen_char_data(data):\n", 120 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 121 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 122 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 123 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 124 | "\n", 125 | "word1, word2 = gen_word_data(train_data)\n", 126 | "char1, char2 = gen_char_data(train_data)\n", 127 | "test_word1, test_word2 = gen_word_data(test_data)\n", 128 | "test_char1, test_char2 = gen_char_data(test_data)\n", 129 | "\n", 130 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from keras.models import Model\n", 142 | "from keras.layers.merge import concatenate\n", 143 | "from keras.optimizers import Adam, SGD, Nadam\n", 144 | "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n", 145 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 146 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D, 
GlobalMaxPooling1D\n", 147 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# general\n", 159 | "NUM_EPOCHES = 50\n", 160 | "EPOCHES1 = 5\n", 161 | "EPOCHES2 = 25 # 5\n", 162 | "EPOCHES3 = 22\n", 163 | "BATCH_SIZE = 1024\n", 164 | "DROP_RATE = 0.3\n", 165 | "\n", 166 | "# cnn\n", 167 | "CONV_LEN1 = 128\n", 168 | "CONV_LEN2 = 128\n", 169 | "CONV_LEN3 = 128\n", 170 | "CONV_LEN4 = 128\n", 171 | "CONV_LEN5 = 128\n", 172 | "CONV_LEN6 = 128\n", 173 | "\n", 174 | "# lstm\n", 175 | "LSTM_SIZE1 = 256\n", 176 | "LSTM_SIZE2 = 256\n", 177 | "LSTM_DROP_RATE = 0.3\n", 178 | "\n", 179 | "# dense\n", 180 | "DENSE_INPUT = 300\n", 181 | "DENSE_SIZE1 = 512\n", 182 | "DENSE_SIZE2 = 256" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "def cnn_layer1(inputa, inputb, filters, kernel_size): # with average pooling\n", 194 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 195 | " conv_outputa = conv(inputa)\n", 196 | " conv_outputa = GlobalAveragePooling1D()(conv_outputa)\n", 197 | " conv_outputb = conv(inputb)\n", 198 | " conv_outputb = GlobalAveragePooling1D()(conv_outputb)\n", 199 | " return conv_outputa, conv_outputb\n", 200 | " \n", 201 | "def cnn_layer2(inputa, inputb, filters, kernel_size): # with max pooling\n", 202 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 203 | " conv_outputa = conv(inputa)\n", 204 | " conv_outputa = MaxPool1D(pool_size=SEQ_LEN)(conv_outputa)\n", 205 | " conv_outputa = Flatten()(conv_outputa)\n", 206 | " conv_outputb = conv(inputb)\n", 207 | " conv_outputb = MaxPool1D(pool_size=SEQ_LEN)(conv_outputb)\n", 208 | " conv_outputb = Flatten()(conv_outputb)\n", 209 | " return conv_outputa, conv_outputb\n", 210 | "\n", 211 | "def cnn_layer3(inputa, inputb, filters, kernel_size): # with both max and average poolings\n", 212 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 213 | " \n", 214 | " conv_outputa = conv(inputa)\n", 215 | " conv_outputa1 = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv_outputa))\n", 216 | " conv_outputa2 = GlobalAveragePooling1D()(conv_outputa)\n", 217 | " conv_outputa = concatenate([conv_outputa1, conv_outputa2])\n", 218 | " \n", 219 | " conv_outputb = conv(inputb)\n", 220 | " conv_outputb1 = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv_outputb))\n", 221 | " conv_outputb2 = GlobalAveragePooling1D()(conv_outputb)\n", 222 | " conv_outputb = concatenate([conv_outputb1, conv_outputb2])\n", 223 | " \n", 224 | " return conv_outputa, conv_outputb\n", 225 | "\n", 226 | "def cnn_double_layer(inputa, inputb, filters, kernel_size):\n", 227 | " conv1 = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\")\n", 228 | " conv2 = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\")\n", 229 | " \n", 230 | " conv1a = conv1(inputa)\n", 231 | " conv1a = BatchNormalization()(conv1a)\n", 232 | " conv1a = Activation(activation=\"relu\")(conv1a)\n", 233 | " conv2a = conv2(conv1a)\n", 234 | " conv2a = BatchNormalization()(conv2a)\n", 235 | " conv2a = Activation(activation=\"relu\")(conv2a)\n", 236 | " output_avg_a = 
GlobalAveragePooling1D()(conv2a)\n", 237 | " output_max_a = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv2a))\n", 238 | " output_min_a = Lambda(lambda x: K.min(x, axis=1))(conv2a)\n", 239 | " output_a = concatenate([output_avg_a, output_max_a, output_min_a])\n", 240 | " \n", 241 | " conv1b = conv1(inputb)\n", 242 | " conv1b = BatchNormalization()(conv1b)\n", 243 | " conv1b = Activation(activation=\"relu\")(conv1b)\n", 244 | " conv2b = conv2(conv1b)\n", 245 | " conv2b = BatchNormalization()(conv2b)\n", 246 | " conv2b = Activation(activation=\"relu\")(conv2b)\n", 247 | " output_avg_b = GlobalAveragePooling1D()(conv2b)\n", 248 | " output_max_b = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv2b))\n", 249 | " output_min_b = Lambda(lambda x: K.min(x, axis=1))(conv2b)\n", 250 | " output_b = concatenate([output_avg_b, output_max_b, output_min_b])\n", 251 | " \n", 252 | " return output_a, output_b\n", 253 | "\n", 254 | "\n", 255 | "def sim_l1(v1, v2):\n", 256 | " return Lambda(lambda x: K.sum(K.abs(x[0] - x[1]), axis=1))([v1, v2])\n", 257 | "\n", 258 | "def sim_l2(v1, v2):\n", 259 | " return Lambda(lambda x: K.sqrt(K.sum(K.square(x[0] - x[1]), axis=1)))([v1, v2])\n", 260 | "\n", 261 | "def sim_cos(v1, v2):\n", 262 | " return Lambda(lambda x: K.sum(x[0] * x[1], axis=1) / (K.sqrt(K.sum(x[0] * x[0], axis=1)) * K.sqrt(K.sum(x[1] * x[1], axis=1))))([v1, v2])\n", 263 | "\n", 264 | "def sim_vec(v1, v2):\n", 265 | " l1 = sim_l1(v1, v2)\n", 266 | " l2 = sim_l2(v1, v2)\n", 267 | " cos = sim_cos(v1, v2)\n", 268 | " vec = concatenate([Lambda(lambda x: K.reshape(x, shape=(-1, 1)))(t) for t in [l1, l2, cos]], axis=1)\n", 269 | " return vec\n", 270 | "\n", 271 | "def similarity_mpcnn(s1, s2):\n", 272 | " fea_h, fea_a = [], []\n", 273 | " out1, out2 = [], [] \n", 274 | " for i in range(len(s1)):\n", 275 | " avg1 = GlobalAveragePooling1D()(s1[i])\n", 276 | " max1 = GlobalMaxPooling1D()(s1[i])\n", 277 | " min1 = Lambda(lambda x: K.min(x, axis=1))(s1[i])\n", 278 | " out1.append([avg1, max1, min1])\n", 279 | " \n", 280 | " avg2 = GlobalAveragePooling1D()(s2[i])\n", 281 | " max2 = GlobalMaxPooling1D()(s2[i])\n", 282 | " min2 = Lambda(lambda x: K.min(x, axis=1))(s2[i])\n", 283 | " out2.append([avg2, max2, min2])\n", 284 | " \n", 285 | " output1, output2 = [], [] # pool nums\n", 286 | " for p in range(3):\n", 287 | " output1.append(concatenate([Lambda(lambda x:K.reshape(x, shape=(-1, 1, CONV_LEN1)))(t[p]) for t in out1], axis=1))\n", 288 | " output2.append(concatenate([Lambda(lambda x:K.reshape(x, shape=(-1, 1, CONV_LEN1)))(t[p]) for t in out2], axis=1))\n", 289 | " \n", 290 | " for p in range(3):\n", 291 | " for f in range(CONV_LEN1):\n", 292 | " fea_h.append(sim_vec(Lambda(lambda x: x[:, :, f])(output1[p]), Lambda(lambda x: x[:, :, f])(output2[p])))\n", 293 | " \n", 294 | " for p in range(3):\n", 295 | " for k1 in range(len(s1)):\n", 296 | " for k2 in range(len(s1)):\n", 297 | " fea_a.append(sim_vec(Lambda(lambda x: x[:, k1, :])(output1[p]), Lambda(lambda x: x[:, k2, :])(output2[p])))\n", 298 | " fea = concatenate(fea_h + fea_a, axis=1)\n", 299 | " return fea" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "# WORDS" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "from sklearn.model_selection import StratifiedKFold\n", 318 | "\n", 319 | "best_results = []\n", 320 | "last_results = []\n", 321 | "best_file_names = []\n", 322 | "\n", 323 | 
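
The `sim_l1` / `sim_l2` / `sim_cos` helpers above each produce one scalar per pair of pooled vectors, `sim_vec` stacks them into a 3-dimensional feature, and `similarity_mpcnn` applies this comparison across pooling types and filter groups. A plain-NumPy sketch of the three scalar similarities, handy for sanity-checking the Lambda layers on a toy batch (the function name is illustrative only):

```python
import numpy as np

def sim_features(v1, v2):
    """Per-row [L1 distance, L2 distance, cosine similarity] for a batch of vector pairs."""
    l1 = np.sum(np.abs(v1 - v2), axis=1)
    l2 = np.sqrt(np.sum((v1 - v2) ** 2, axis=1))
    # Like the Keras sim_cos above, this divides by the product of the norms,
    # so an all-zero vector would yield NaN.
    cos = np.sum(v1 * v2, axis=1) / (
        np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
    )
    return np.stack([l1, l2, cos], axis=1)  # shape: (batch, 3)

rng = np.random.RandomState(0)
a = rng.rand(4, 8)  # toy batch of 4 vectors, dim 8
b = rng.rand(4, 8)
print(sim_features(a, b))
```
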
"for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)): # word/char switch\n", 324 | " print(\"fold {} start\".format(i + 1))\n", 325 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 326 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 327 | " \n", 328 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 329 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 330 | "\n", 331 | " embedding_layer = Embedding(\n", 332 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 333 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 334 | " weights=[word_embedding_data], # word/char switch\n", 335 | " input_length=SEQ_LEN,\n", 336 | " trainable=False\n", 337 | " )\n", 338 | " \n", 339 | " vector1 = embedding_layer(input1)\n", 340 | " vector2 = embedding_layer(input2)\n", 341 | " \n", 342 | " input_layer = TimeDistributed(Dense(DENSE_INPUT))\n", 343 | " vector1 = input_layer(vector1)\n", 344 | " vector1 = BatchNormalization()(vector1)\n", 345 | " vector2 = input_layer(vector2)\n", 346 | " vector2 = BatchNormalization()(vector2)\n", 347 | " \n", 348 | " lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 349 | " layer1a = lstm_layer1(vector1)\n", 350 | " layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 351 | " layer1b = lstm_layer1(vector2)\n", 352 | " layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 353 | " lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 354 | " layer2a = lstm_layer2(layer1a)\n", 355 | " layer2b = lstm_layer2(layer1b)\n", 356 | " \n", 357 | " conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 358 | " conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 359 | " conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 360 | " conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 361 | " conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 362 | " conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 363 | "\n", 364 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 365 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 366 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 367 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 368 | " merge = concatenate([diff, mult])\n", 369 | " \n", 370 | " x = Dropout(DROP_RATE)(merge)\n", 371 | " x = BatchNormalization()(x)\n", 372 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 373 | " x = Dropout(DROP_RATE)(x)\n", 374 | " x = BatchNormalization()(x)\n", 375 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 376 | " x = Dropout(DROP_RATE)(x)\n", 377 | " x = BatchNormalization()(x)\n", 378 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 379 | " \n", 380 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 381 | " model.compile(\n", 382 | " optimizer=\"nadam\",\n", 383 | " loss=\"binary_crossentropy\",\n", 384 | " metrics=[\"acc\"]\n", 385 | " )\n", 386 | " \n", 387 | " early_stopping = EarlyStopping(\"val_loss\", patience=8)\n", 388 | " lr_reducer = ReduceLROnPlateau(factor=0.5, 
patience=3, min_lr=0.001)\n", 389 | " check_point = ModelCheckpoint(\n", 390 | " \"./log/%s.Multi_LSTM_CNN_v4.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 391 | " monitor=\"val_loss\",\n", 392 | " save_best_only=True,\n", 393 | " )\n", 394 | " \n", 395 | " fit_res = model.fit(\n", 396 | " x=[train_x1, train_x2],\n", 397 | " y=train_y,\n", 398 | " batch_size=BATCH_SIZE,\n", 399 | " epochs=NUM_EPOCHES,\n", 400 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 401 | " shuffle=True,\n", 402 | " callbacks=[early_stopping, lr_reducer, check_point]\n", 403 | " )\n", 404 | " \n", 405 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 406 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 407 | " \n", 408 | " best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\")\n", 409 | " best_file_names.append(best_model_file)\n", 410 | " print(\"load model %s\" % (best_model_file,))\n", 411 | " model.load_weights(best_model_file)\n", 412 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 413 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 414 | "\n", 415 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 416 | " \"./result/%s-Multi_LSTM_CNN_v4_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 417 | " index=False\n", 418 | ")\n", 419 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 420 | " \"./result/%s-Multi_LSTM_CNN_v4_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 421 | " index=False\n", 422 | ")\n", 423 | "\n", 424 | "model_path = \"./log/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\") + \"/\"\n", 425 | "os.mkdir(model_path)\n", 426 | "for model_name in best_file_names:\n", 427 | " abs_name = os.path.split(model_name)[1]\n", 428 | " os.rename(model_name, model_path + abs_name)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "source": [ 437 | "# CHARS" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "# from sklearn.model_selection import StratifiedKFold\n", 447 | "\n", 448 | "# best_results = []\n", 449 | "# last_results = []\n", 450 | "# best_file_names = []\n", 451 | "\n", 452 | "# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=char1, y=label)): # word/char switch\n", 453 | "# print(\"fold {} start\".format(i + 1))\n", 454 | "# train_x1, train_x2, train_y = char1[train_index, :], char2[train_index, :], label[train_index] # word/char switch\n", 455 | "# dev_x1, dev_x2, dev_y = char1[dev_index, :], char2[dev_index, :], label[dev_index] # word/char switch\n", 456 | " \n", 457 | "# input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 458 | "# input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 459 | "\n", 460 | "# embedding_layer = Embedding(\n", 461 | "# input_dim=char_embedding_data.shape[0], # word/char switch\n", 462 | "# output_dim=char_embedding_data.shape[1], # word/char switch\n", 463 | "# weights=[char_embedding_data], # word/char switch\n", 464 | "# input_length=SEQ_LEN,\n", 465 | "# trainable=False\n", 466 | "# )\n", 467 | " \n", 468 | "# vector1 = embedding_layer(input1)\n", 469 | "# vector2 = 
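
The WORDS training cell above trains one model per StratifiedKFold split, predicts the test set with both the last-epoch weights and the best checkpoint, and averages the per-fold predictions column-wise before writing the submission. A stripped-down sketch of that averaging pattern; `build_and_fit` is a hypothetical callable standing in for the model-construction and fitting code above:

```python
import pandas as pd
from sklearn.model_selection import StratifiedKFold

def kfold_bagged_predictions(x1, x2, y, test_x1, test_x2, build_and_fit, n_splits=10):
    """Train one model per fold and average its test-set predictions.

    `build_and_fit` receives the fold's training and validation arrays and
    returns a fitted model exposing `.predict`.
    """
    fold_preds = []
    skf = StratifiedKFold(n_splits=n_splits)
    for fold, (train_idx, dev_idx) in enumerate(skf.split(X=x1, y=y)):
        model = build_and_fit(
            (x1[train_idx], x2[train_idx], y[train_idx]),
            (x1[dev_idx], x2[dev_idx], y[dev_idx]),
        )
        pred = model.predict([test_x1, test_x2]).ravel()
        fold_preds.append(pd.Series(pred, name="fold_%d" % fold))
    # Same column-wise mean as pd.concat(...).mean(axis=1) in the cell above.
    return pd.concat(fold_preds, axis=1).mean(axis=1).rename("y_pre")
```
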
embedding_layer(input2)\n", 470 | " \n", 471 | "# input_layer = TimeDistributed(Dense(DENSE_INPUT))\n", 472 | "# vector1 = input_layer(vector1)\n", 473 | "# vector1 = BatchNormalization()(vector1)\n", 474 | "# vector2 = input_layer(vector2)\n", 475 | "# vector2 = BatchNormalization()(vector2)\n", 476 | " \n", 477 | "# lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 478 | "# layer1a = lstm_layer1(vector1)\n", 479 | "# layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 480 | "# layer1b = lstm_layer1(vector2)\n", 481 | "# layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 482 | "# lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 483 | "# layer2a = lstm_layer2(layer1a)\n", 484 | "# layer2b = lstm_layer2(layer1b)\n", 485 | " \n", 486 | "# conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 487 | "# conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 488 | "# conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 489 | "# conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 490 | "# conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 491 | "# conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 492 | "\n", 493 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 494 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 495 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 496 | "# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 497 | "# merge = concatenate([diff, mult])\n", 498 | " \n", 499 | "# x = Dropout(DROP_RATE)(merge)\n", 500 | "# x = BatchNormalization()(x)\n", 501 | "# x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 502 | "# x = Dropout(DROP_RATE)(x)\n", 503 | "# x = BatchNormalization()(x)\n", 504 | "# x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 505 | "# x = Dropout(DROP_RATE)(x)\n", 506 | "# x = BatchNormalization()(x)\n", 507 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 508 | " \n", 509 | "# model = Model(inputs=[input1, input2], outputs=pred)\n", 510 | "# model.compile(\n", 511 | "# optimizer=\"nadam\",\n", 512 | "# loss=\"binary_crossentropy\",\n", 513 | "# metrics=[\"acc\"]\n", 514 | "# )\n", 515 | " \n", 516 | "# early_stopping = EarlyStopping(\"val_loss\", patience=8)\n", 517 | "# lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.001)\n", 518 | "# check_point = ModelCheckpoint(\n", 519 | "# \"./log/%s.Multi_LSTM_CNN_v4.char.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 520 | "# monitor=\"val_loss\",\n", 521 | "# save_best_only=True,\n", 522 | "# )\n", 523 | " \n", 524 | "# fit_res = model.fit(\n", 525 | "# x=[train_x1, train_x2],\n", 526 | "# y=train_y,\n", 527 | "# batch_size=BATCH_SIZE,\n", 528 | "# epochs=NUM_EPOCHES,\n", 529 | "# validation_data=([dev_x1, dev_x2], dev_y),\n", 530 | "# shuffle=True,\n", 531 | "# callbacks=[early_stopping, lr_reducer, check_point]\n", 532 | "# )\n", 533 | " \n", 534 | "# pred_last = model.predict([test_char1, test_char2], batch_size=BATCH_SIZE) # word/char switch\n", 535 | "# last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 536 | " \n", 537 | "# best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", 
\"/\")\n", 538 | "# best_file_names.append(best_model_file)\n", 539 | "# print(\"load model %s\" % (best_model_file,))\n", 540 | "# model.load_weights(best_model_file)\n", 541 | "# pred_best = model.predict([test_char1, test_char2], batch_size=BATCH_SIZE) # word/char switch\n", 542 | "# best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 543 | "\n", 544 | "# pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 545 | "# \"./result/%s-Multi_LSTM_CNN_v4_char_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 546 | "# index=False\n", 547 | "# )\n", 548 | "# pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 549 | "# \"./result/%s-Multi_LSTM_CNN_v4_char_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 550 | "# index=False\n", 551 | "# )\n", 552 | "\n", 553 | "# model_path = \"./log/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\") + \"/\"\n", 554 | "# os.mkdir(model_path)\n", 555 | "# for model_name in best_file_names:\n", 556 | "# abs_name = os.path.split(model_name)[1]\n", 557 | "# os.rename(model_name, model_path + abs_name)" 558 | ] 559 | } 560 | ], 561 | "metadata": { 562 | "kernelspec": { 563 | "display_name": "Python 3", 564 | "language": "python", 565 | "name": "python3" 566 | }, 567 | "language_info": { 568 | "codemirror_mode": { 569 | "name": "ipython", 570 | "version": 3 571 | }, 572 | "file_extension": ".py", 573 | "mimetype": "text/x-python", 574 | "name": "python", 575 | "nbconvert_exporter": "python", 576 | "pygments_lexer": "ipython3", 577 | "version": "3.6.2" 578 | } 579 | }, 580 | "nbformat": 4, 581 | "nbformat_minor": 2 582 | } 583 | -------------------------------------------------------------------------------- /[Model] Multi LSTM CNN v2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 模型结构\n", 8 | "\n", 9 | "- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)\n", 10 | "- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中\n", 11 | "- 卷积层为单层\n", 12 | "- 对于每个问题, 将所有卷积核结果并起来\n", 13 | "- 将两个问题并起来的结果, 分别[相减并取绝对值], [x]\n", 14 | "\n", 15 | "## 训练技巧\n", 16 | "\n", 17 | "- 首先正常训练一定的epoch, 使用Adam方法\n", 18 | "- 待loss降到一定水平后, 开放embedding参数的训练, 继续使用Adam方法训练, 并加入学习率衰减callback # (效果不好, dev loss降不下去)\n", 19 | "- 待loss降到比较低的水平后, 改用SGD方法进行训练, 直至结束 # (貌似不可行, 取消)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")\n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "from glob import glob\n", 36 | "from datetime import datetime" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "Using TensorFlow backend.\n" 49 | ] 50 | }, 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "((10001, 300), (3049, 300))" 55 | ] 56 | }, 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "DATA_PATH = \"./data/\"\n", 64 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 65 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 66 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 67 | "CHAR_EMBED_PATH = DATA_PATH + 
\"char_embed.txt\"\n", 68 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 69 | "\n", 70 | "train_data = pd.read_csv(TRAIN_PATH)\n", 71 | "test_data = pd.read_csv(TEST_PATH)\n", 72 | "question_data = pd.read_csv(QUEST_PATH)\n", 73 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 74 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 75 | "\n", 76 | "question_data[\"words\"] = question_data[\"words\"].str.split(\" \")\n", 77 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 78 | "\n", 79 | "label = train_data[\"label\"].values\n", 80 | "\n", 81 | "from keras.preprocessing.text import Tokenizer\n", 82 | "\n", 83 | "MAX_COUNT = 10000\n", 84 | "\n", 85 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 86 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 87 | "\n", 88 | "word_embedding_data = np.concatenate(\n", 89 | " (\n", 90 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 91 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 92 | " ),\n", 93 | " axis=0\n", 94 | ")\n", 95 | "\n", 96 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 97 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 98 | "\n", 99 | "char_embedding_data = np.concatenate(\n", 100 | " (\n", 101 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 102 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 103 | " ),\n", 104 | " axis=0\n", 105 | ")\n", 106 | "\n", 107 | "word_embedding_data.shape, char_embedding_data.shape" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 3, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "((254386, 30),\n", 119 | " (254386, 30),\n", 120 | " (172956, 30),\n", 121 | " (172956, 30),\n", 122 | " (254386, 30),\n", 123 | " (254386, 30),\n", 124 | " (172956, 30),\n", 125 | " (172956, 30))" 126 | ] 127 | }, 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "from keras.preprocessing.sequence import pad_sequences\n", 135 | "\n", 136 | "SEQ_LEN = 30\n", 137 | "\n", 138 | "def gen_word_data(data):\n", 139 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 140 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 141 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 142 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 143 | " \n", 144 | "def gen_char_data(data):\n", 145 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 146 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 147 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 148 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 149 | "\n", 150 | "word1, word2 = gen_word_data(train_data)\n", 151 | "char1, char2 = gen_char_data(train_data)\n", 152 | "test_word1, test_word2 = gen_word_data(test_data)\n", 153 | "test_char1, test_char2 = 
gen_char_data(test_data)\n", 154 | "\n", 155 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 4, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "from keras.models import Model\n", 167 | "from keras.layers.merge import concatenate\n", 168 | "from keras.optimizers import Adam, SGD, Nadam\n", 169 | "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n", 170 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 171 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n", 172 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 5, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "# general\n", 184 | "NUM_EPOCHES = 30\n", 185 | "EPOCHES1 = 5\n", 186 | "EPOCHES2 = 25 # 5\n", 187 | "EPOCHES3 = 22\n", 188 | "BATCH_SIZE = 1024\n", 189 | "DROP_RATE = 0.3\n", 190 | "\n", 191 | "# cnn\n", 192 | "CONV_LEN1 = 128\n", 193 | "CONV_LEN2 = 128\n", 194 | "CONV_LEN3 = 128\n", 195 | "CONV_LEN4 = 128\n", 196 | "CONV_LEN5 = 128\n", 197 | "CONV_LEN6 = 128\n", 198 | "CONV_LEN = CONV_LEN1 + CONV_LEN2 + CONV_LEN3 + CONV_LEN4 + CONV_LEN5 + CONV_LEN6\n", 199 | "\n", 200 | "# lstm\n", 201 | "LSTM_SIZE1 = 256\n", 202 | "LSTM_SIZE2 = 256\n", 203 | "LSTM_DROP_RATE = 0.3\n", 204 | "\n", 205 | "# dense\n", 206 | "DENSE_SIZE1 = 512\n", 207 | "DENSE_SIZE2 = 256" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 6, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "def cnn_layer1(inputa, inputb, filters, kernel_size):\n", 219 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 220 | " conv_outputa = conv(inputa)\n", 221 | " conv_outputa = GlobalAveragePooling1D()(conv_outputa)\n", 222 | " conv_outputb = conv(inputb)\n", 223 | " conv_outputb = GlobalAveragePooling1D()(conv_outputb)\n", 224 | " return conv_outputa, conv_outputb\n", 225 | " \n", 226 | "def cnn_layer2(inputa, inputb, filters, kernel_size):\n", 227 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 228 | " conv_outputa = conv(inputa)\n", 229 | " conv_outputa = MaxPool1D(pool_size=SEQ_LEN)(conv_outputa)\n", 230 | " conv_outputa = Flatten()(conv_outputa)\n", 231 | " conv_outputb = conv(inputb)\n", 232 | " conv_outputb = MaxPool1D(pool_size=SEQ_LEN)(conv_outputb)\n", 233 | " conv_outputb = Flatten()(conv_outputb)\n", 234 | " return conv_outputa, conv_outputb\n", 235 | "\n", 236 | "def cnn_layer3(inputa, inputb, filters, kernel_size):\n", 237 | " conv = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\", activation=\"relu\")\n", 238 | " \n", 239 | " conv_outputa = conv(inputa)\n", 240 | " conv_outputa1 = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv_outputa))\n", 241 | " conv_outputa2 = GlobalAveragePooling1D()(conv_outputa)\n", 242 | " conv_outputa = concatenate([conv_outputa1, conv_outputa2])\n", 243 | " \n", 244 | " conv_outputb = conv(inputb)\n", 245 | " conv_outputb1 = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv_outputb))\n", 246 | " conv_outputb2 = GlobalAveragePooling1D()(conv_outputb)\n", 247 | " conv_outputb = concatenate([conv_outputb1, 
conv_outputb2])\n", 248 | " \n", 249 | " return conv_outputa, conv_outputb" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# WORDS" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 7, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n", 269 | "Instructions for updating:\n", 270 | "`NHWC` for data_format is deprecated, use `NWC` instead\n", 271 | "Train on 228946 samples, validate on 25440 samples\n", 272 | "Epoch 1/5\n", 273 | "228946/228946 [==============================] - 169s 738us/step - loss: 0.4468 - acc: 0.7899 - val_loss: 0.3382 - val_acc: 0.8574\n", 274 | "Epoch 2/5\n", 275 | "228946/228946 [==============================] - 160s 697us/step - loss: 0.3121 - acc: 0.8629 - val_loss: 0.2594 - val_acc: 0.8870\n", 276 | "Epoch 3/5\n", 277 | "228946/228946 [==============================] - 166s 726us/step - loss: 0.2687 - acc: 0.8839 - val_loss: 0.2528 - val_acc: 0.8978\n", 278 | "Epoch 4/5\n", 279 | "228946/228946 [==============================] - 166s 724us/step - loss: 0.2414 - acc: 0.8968 - val_loss: 0.2279 - val_acc: 0.9031\n", 280 | "Epoch 5/5\n", 281 | "228946/228946 [==============================] - 167s 730us/step - loss: 0.2240 - acc: 0.9047 - val_loss: 0.2136 - val_acc: 0.9123\n", 282 | "Train on 228946 samples, validate on 25440 samples\n", 283 | "Epoch 1/25\n", 284 | "228946/228946 [==============================] - 178s 776us/step - loss: 0.2172 - acc: 0.9079 - val_loss: 0.2190 - val_acc: 0.9129\n", 285 | "Epoch 2/25\n", 286 | "228946/228946 [==============================] - 173s 755us/step - loss: 0.2038 - acc: 0.9146 - val_loss: 0.2058 - val_acc: 0.9172\n", 287 | "Epoch 3/25\n", 288 | "228946/228946 [==============================] - 176s 770us/step - loss: 0.1922 - acc: 0.9196 - val_loss: 0.2084 - val_acc: 0.9175\n", 289 | "Epoch 4/25\n", 290 | "228946/228946 [==============================] - 175s 764us/step - loss: 0.1825 - acc: 0.9239 - val_loss: 0.2094 - val_acc: 0.9196\n", 291 | "Epoch 5/25\n", 292 | "228946/228946 [==============================] - 176s 770us/step - loss: 0.1746 - acc: 0.9277 - val_loss: 0.1939 - val_acc: 0.9237\n", 293 | "Epoch 6/25\n", 294 | "228946/228946 [==============================] - 176s 769us/step - loss: 0.1684 - acc: 0.9303 - val_loss: 0.1912 - val_acc: 0.9258\n", 295 | "Epoch 7/25\n", 296 | "228946/228946 [==============================] - 175s 766us/step - loss: 0.1616 - acc: 0.9342 - val_loss: 0.1897 - val_acc: 0.9267\n", 297 | "Epoch 8/25\n", 298 | "228946/228946 [==============================] - 172s 752us/step - loss: 0.1570 - acc: 0.9352 - val_loss: 0.1979 - val_acc: 0.9271\n", 299 | "Epoch 9/25\n", 300 | "228946/228946 [==============================] - 178s 779us/step - loss: 0.1525 - acc: 0.9376 - val_loss: 0.1896 - val_acc: 0.9278\n", 301 | "Epoch 10/25\n", 302 | "228946/228946 [==============================] - 178s 776us/step - loss: 0.1482 - acc: 0.9391 - val_loss: 0.2019 - val_acc: 0.9228\n", 303 | "Epoch 11/25\n", 304 | "228946/228946 [==============================] - 178s 775us/step - loss: 0.1446 - acc: 0.9411 - val_loss: 0.1884 - val_acc: 0.9289\n", 305 | "Epoch 12/25\n", 306 | "228946/228946 
[==============================] - 176s 771us/step - loss: 0.1411 - acc: 0.9422 - val_loss: 0.1884 - val_acc: 0.9308\n", 307 | "Epoch 13/25\n", 308 | "228946/228946 [==============================] - 176s 768us/step - loss: 0.1387 - acc: 0.9427 - val_loss: 0.1833 - val_acc: 0.9323\n", 309 | "Epoch 14/25\n", 310 | "228946/228946 [==============================] - 176s 771us/step - loss: 0.1354 - acc: 0.9442 - val_loss: 0.1861 - val_acc: 0.9309\n", 311 | "Epoch 15/25\n", 312 | "228946/228946 [==============================] - 176s 771us/step - loss: 0.1327 - acc: 0.9455 - val_loss: 0.1982 - val_acc: 0.9292\n", 313 | "Epoch 16/25\n", 314 | "228946/228946 [==============================] - 177s 774us/step - loss: 0.1305 - acc: 0.9463 - val_loss: 0.2033 - val_acc: 0.9267\n", 315 | "Epoch 17/25\n", 316 | "159744/228946 [===================>..........] - ETA: 52s - loss: 0.1273 - acc: 0.9476" 317 | ] 318 | }, 319 | { 320 | "ename": "KeyboardInterrupt", 321 | "evalue": "", 322 | "output_type": "error", 323 | "traceback": [ 324 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 325 | "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 326 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 97\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdev_x1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_x2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_y\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 98\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 99\u001b[1;33m \u001b[0mcallbacks\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mearly_stopping\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlr_reducer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcheck_point\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 100\u001b[0m )\n\u001b[0;32m 101\u001b[0m \u001b[1;31m# # 第三次训练\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 327 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)\u001b[0m\n\u001b[0;32m 1040\u001b[0m \u001b[0minitial_epoch\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0minitial_epoch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msteps_per_epoch\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m validation_steps=validation_steps)\n\u001b[0m\u001b[0;32m 1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1044\u001b[0m def evaluate(self, x=None, y=None,\n", 328 | "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\keras\\engine\\training_arrays.py\u001b[0m in \u001b[0;36mfit_loop\u001b[1;34m(model, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)\u001b[0m\n\u001b[0;32m 197\u001b[0m \u001b[0mins_batch\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m 
[... ANSI-coded traceback frames condensed: keras/engine/training_arrays.py (fit_loop) -> keras/backend/tensorflow_backend.py (__call__, _legacy_call) -> tensorflow/python/client/session.py (run, _run, _do_run, _do_call, _run_fn), where the blocking TF_Run call received the KeyboardInterrupt ...]
run_metadata)\u001b[0m\n\u001b[0;32m 1338\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1339\u001b[0m return tf_session.TF_Run(session, options, feed_dict, fetch_list,\n\u001b[1;32m-> 1340\u001b[1;33m target_list, status, run_metadata)\n\u001b[0m\u001b[0;32m 1341\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1342\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_prun_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msession\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mhandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfeed_dict\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfetch_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 336 | "\u001b[1;31mKeyboardInterrupt\u001b[0m: " 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "from sklearn.model_selection import StratifiedKFold\n", 342 | "\n", 343 | "best_results = []\n", 344 | "last_results = []\n", 345 | "best_file_names = []\n", 346 | "\n", 347 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)): # word/char switch\n", 348 | " train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 349 | " dev_x1, dev_x2, dev_y = word1[dev_index, :], word2[dev_index, :], label[dev_index] # word/char switch\n", 350 | " \n", 351 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 352 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 353 | "\n", 354 | " embedding_layer = Embedding(\n", 355 | " input_dim=word_embedding_data.shape[0], # word/char switch\n", 356 | " output_dim=word_embedding_data.shape[1], # word/char switch\n", 357 | " weights=[word_embedding_data], # word/char switch\n", 358 | " input_length=SEQ_LEN,\n", 359 | " trainable=False\n", 360 | " )\n", 361 | " \n", 362 | " vector1 = embedding_layer(input1)\n", 363 | " vector2 = embedding_layer(input2)\n", 364 | " \n", 365 | " lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 366 | " layer1a = lstm_layer1(vector1)\n", 367 | " layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 368 | " layer1b = lstm_layer1(vector2)\n", 369 | " layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 370 | "\n", 371 | " lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 372 | " layer2a = lstm_layer2(layer1a)\n", 373 | " layer2b = lstm_layer2(layer1b)\n", 374 | "# # 每个序列片拼接对应的原始embedding向量\n", 375 | "# layer2a = concatenate([vector1, layer2a])\n", 376 | "# layer2b = concatenate([vector2, layer2b])\n", 377 | " \n", 378 | " conv1a, conv1b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 379 | " conv2a, conv2b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 380 | " conv3a, conv3b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 381 | " conv4a, conv4b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 382 | " conv5a, conv5b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 383 | " conv6a, conv6b = cnn_layer3(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 384 | " \n", 385 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 386 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 387 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 388 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 389 | " merge = 
concatenate([diff, mult])\n", 390 | " \n", 391 | " x = Dropout(DROP_RATE)(merge)\n", 392 | " x = BatchNormalization()(x)\n", 393 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 394 | " x = Dropout(DROP_RATE)(x)\n", 395 | " x = BatchNormalization()(x)\n", 396 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 397 | " x = Dropout(DROP_RATE)(x)\n", 398 | " x = BatchNormalization()(x)\n", 399 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 400 | " \n", 401 | " model = Model(inputs=[input1, input2], outputs=pred)\n", 402 | " \n", 403 | " # 第一次训练\n", 404 | " model.compile(\n", 405 | " optimizer=\"nadam\",\n", 406 | " loss=\"binary_crossentropy\",\n", 407 | " metrics=[\"acc\"]\n", 408 | " )\n", 409 | " fit_res1 = model.fit(\n", 410 | " x=[train_x1, train_x2],\n", 411 | " y=train_y,\n", 412 | " batch_size=BATCH_SIZE,\n", 413 | " epochs=EPOCHES1,\n", 414 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 415 | " shuffle=True,\n", 416 | " )\n", 417 | " # 第二次训练\n", 418 | " embedding_layer.trainable = True\n", 419 | " lr_reducer = ReduceLROnPlateau(factor=0.5, patience=4, min_lr=0.0005)\n", 420 | " early_stopping = EarlyStopping(\"val_loss\", patience=8)\n", 421 | " check_point = ModelCheckpoint(\n", 422 | " \"./log/%s.Multi_LSTM_CNN_v2.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 423 | " monitor=\"val_loss\",\n", 424 | " save_best_only=True,\n", 425 | " )\n", 426 | " model.compile(\n", 427 | " optimizer=\"nadam\",\n", 428 | " loss=\"binary_crossentropy\",\n", 429 | " metrics=[\"acc\"]\n", 430 | " )\n", 431 | " fit_res2 = model.fit(\n", 432 | " x=[train_x1, train_x2],\n", 433 | " y=train_y,\n", 434 | " batch_size=BATCH_SIZE,\n", 435 | " epochs=EPOCHES2,\n", 436 | " validation_data=([dev_x1, dev_x2], dev_y),\n", 437 | " shuffle=True,\n", 438 | " callbacks=[early_stopping, lr_reducer, check_point]\n", 439 | " )\n", 440 | " \n", 441 | "# # 第三次训练\n", 442 | "# model.compile(\n", 443 | "# optimizer=SGD(lr=0.001),\n", 444 | "# loss=\"binary_crossentropy\",\n", 445 | "# metrics=[\"acc\"]\n", 446 | "# )\n", 447 | "# fit_res3 = model.fit(\n", 448 | "# x=[train_x1, train_x2],\n", 449 | "# y=train_y,\n", 450 | "# batch_size=BATCH_SIZE,\n", 451 | "# epochs=EPOCHES3,\n", 452 | "# validation_data=([dev_x1, dev_x2], dev_y),\n", 453 | "# shuffle=True,\n", 454 | "# callbacks=[early_stopping, lr_reducer, check_point]\n", 455 | "# )\n", 456 | " \n", 457 | " pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 458 | " last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 459 | " \n", 460 | " best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\")\n", 461 | " best_file_names.append(best_model_file)\n", 462 | " print(\"load model %s\" % (best_model_file,))\n", 463 | " model.load_weights(best_model_file)\n", 464 | " pred_best = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 465 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 466 | "\n", 467 | "pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 468 | " \"./result/%s-Multi_LSTM_CNN_v2_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 469 | " index=False\n", 470 | ")\n", 471 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 472 | " \"./result/%s-Multi_LSTM_CNN_v2_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 473 | " 
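
The v2 training cell above uses a two-stage schedule: it first fits with the pretrained embedding frozen, then sets `embedding_layer.trainable = True` and compiles again before the second `fit` (the markdown notes that unfreezing did not help validation loss here, and that a final SGD stage was abandoned). A minimal sketch of the unfreeze-and-recompile pattern on a toy model, assuming standard Keras behaviour that a change to `trainable` only takes effect after another `compile`:

```python
import numpy as np
from keras.models import Model
from keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense

VOCAB, DIM, SEQ = 50, 8, 10
pretrained = np.random.rand(VOCAB, DIM)  # toy stand-in for the pretrained vectors

inp = Input(shape=(SEQ,), dtype="int32")
emb = Embedding(VOCAB, DIM, weights=[pretrained], input_length=SEQ, trainable=False)
x = GlobalAveragePooling1D()(emb(inp))
out = Dense(1, activation="sigmoid")(x)
model = Model(inp, out)

x_toy = np.random.randint(0, VOCAB, size=(32, SEQ))
y_toy = np.random.randint(0, 2, size=(32,))

# Stage 1: embeddings frozen.
model.compile(optimizer="nadam", loss="binary_crossentropy")
model.fit(x_toy, y_toy, epochs=1, verbose=0)

# Stage 2: unfreeze the embedding; the change is only picked up by a fresh compile.
emb.trainable = True
model.compile(optimizer="nadam", loss="binary_crossentropy")
model.fit(x_toy, y_toy, epochs=1, verbose=0)
```
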
index=False\n", 474 | ")" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": { 481 | "collapsed": true 482 | }, 483 | "outputs": [], 484 | "source": [] 485 | } 486 | ], 487 | "metadata": { 488 | "kernelspec": { 489 | "display_name": "Python 3", 490 | "language": "python", 491 | "name": "python3" 492 | }, 493 | "language_info": { 494 | "codemirror_mode": { 495 | "name": "ipython", 496 | "version": 3 497 | }, 498 | "file_extension": ".py", 499 | "mimetype": "text/x-python", 500 | "name": "python", 501 | "nbconvert_exporter": "python", 502 | "pygments_lexer": "ipython3", 503 | "version": "3.6.2" 504 | } 505 | }, 506 | "nbformat": 4, 507 | "nbformat_minor": 2 508 | } 509 | -------------------------------------------------------------------------------- /[Model] Multi LSTM CNN v5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 模型结构\n", 8 | "\n", 9 | "- 两层单向LSTM, 输出序列结果, 即(batch_size, step_size, feature_size)\n", 10 | "- 分别输入到1, 2, 3, 4, 5, 6共6个不同长度的卷积层中\n", 11 | "- 卷积层为双层, 最后的池化层有Average和Max两种\n", 12 | "- 对于每个问题, 将所有卷积核结果并起来\n", 13 | "- 将两个问题并起来的结果, 分别[相减并取绝对值], [x]\n", 14 | "- 融入特征" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import warnings\n", 26 | "warnings.filterwarnings(\"ignore\")\n", 27 | "\n", 28 | "import os\n", 29 | "import time\n", 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "from glob import glob\n", 33 | "from datetime import datetime" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "try:\n", 45 | " os.mkdir(\"./log/\")\n", 46 | " os.mkdir(\"./result/\")\n", 47 | "except FileExistsError:\n", 48 | " pass" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "DATA_PATH = \"./data/\"\n", 60 | "TRAIN_PATH = DATA_PATH + \"train.csv\"\n", 61 | "TEST_PATH = DATA_PATH + \"test.csv\"\n", 62 | "WORD_EMBED_PATH = DATA_PATH + \"word_embed.txt\"\n", 63 | "CHAR_EMBED_PATH = DATA_PATH + \"char_embed.txt\"\n", 64 | "QUEST_PATH = DATA_PATH + \"question.csv\"\n", 65 | "TRAIN_FEATURE = DATA_PATH + \"train_feature.csv\"\n", 66 | "TEST_FEATURE = DATA_PATH + \"test_feature.csv\"" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stderr", 76 | "output_type": "stream", 77 | "text": [ 78 | "Using TensorFlow backend.\n" 79 | ] 80 | }, 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "((10001, 300), (3049, 300))" 85 | ] 86 | }, 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "train_data = pd.read_csv(TRAIN_PATH)\n", 94 | "test_data = pd.read_csv(TEST_PATH)\n", 95 | "question_data = pd.read_csv(QUEST_PATH)\n", 96 | "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 97 | "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, delimiter=\" \", header=None, index_col=0)\n", 98 | "train_feature = pd.read_csv(TRAIN_FEATURE).values\n", 99 | "test_feature = pd.read_csv(TEST_FEATURE).values\n", 100 | "\n", 101 | "question_data[\"words\"] = 
question_data[\"words\"].str.split(\" \")\n", 102 | "question_data[\"chars\"] = question_data[\"chars\"].str.split(\" \")\n", 103 | "\n", 104 | "label = train_data[\"label\"].values\n", 105 | "\n", 106 | "from keras.preprocessing.text import Tokenizer\n", 107 | "\n", 108 | "MAX_COUNT = 10000\n", 109 | "\n", 110 | "word_tokenizer = Tokenizer(MAX_COUNT)\n", 111 | "word_tokenizer.fit_on_texts(question_data[\"words\"])\n", 112 | "\n", 113 | "word_embedding_data = np.concatenate(\n", 114 | " (\n", 115 | " np.zeros(shape=(1, word_embedding_data.shape[1]), dtype=np.float64),\n", 116 | " word_embedding_data.loc[list(word_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 117 | " ),\n", 118 | " axis=0\n", 119 | ")\n", 120 | "\n", 121 | "char_tokenizer = Tokenizer(MAX_COUNT)\n", 122 | "char_tokenizer.fit_on_texts(question_data[\"chars\"])\n", 123 | "\n", 124 | "char_embedding_data = np.concatenate(\n", 125 | " (\n", 126 | " np.zeros(shape=(1, char_embedding_data.shape[1]), dtype=np.float64),\n", 127 | " char_embedding_data.loc[list(char_tokenizer.word_index.keys())[:MAX_COUNT]].values\n", 128 | " ),\n", 129 | " axis=0\n", 130 | ")\n", 131 | "\n", 132 | "word_embedding_data.shape, char_embedding_data.shape" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 5, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "((254386, 25),\n", 144 | " (254386, 25),\n", 145 | " (172956, 25),\n", 146 | " (172956, 25),\n", 147 | " (254386, 25),\n", 148 | " (254386, 25),\n", 149 | " (172956, 25),\n", 150 | " (172956, 25))" 151 | ] 152 | }, 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "from keras.preprocessing.sequence import pad_sequences\n", 160 | "\n", 161 | "SEQ_LEN = 25\n", 162 | "\n", 163 | "def gen_word_data(data):\n", 164 | " seq_word1 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"words\"])\n", 165 | " seq_word2 = word_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"words\"])\n", 166 | " return pad_sequences(seq_word1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 167 | " pad_sequences(seq_word2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 168 | " \n", 169 | "def gen_char_data(data):\n", 170 | " seq_char1 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")[\"chars\"])\n", 171 | " seq_char2 = char_tokenizer.texts_to_sequences(data.merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")[\"chars\"])\n", 172 | " return pad_sequences(seq_char1, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\"), \\\n", 173 | " pad_sequences(seq_char2, maxlen=SEQ_LEN, padding=\"pre\",truncating=\"pre\")\n", 174 | "\n", 175 | "word1, word2 = gen_word_data(train_data)\n", 176 | "char1, char2 = gen_char_data(train_data)\n", 177 | "test_word1, test_word2 = gen_word_data(test_data)\n", 178 | "test_char1, test_char2 = gen_char_data(test_data)\n", 179 | "\n", 180 | "word1.shape, word2.shape, test_word1.shape, test_word2.shape, char1.shape, char2.shape, test_char1.shape, test_char2.shape" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 6, 186 | "metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "from keras.models import Model\n", 192 | "from keras.layers.merge import concatenate\n", 193 | "from 
keras.optimizers import Adam, SGD, Nadam\n", 194 | "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n", 195 | "from keras.layers import LSTM, Bidirectional, TimeDistributed\n", 196 | "from keras.layers import Conv1D, MaxPool1D, GlobalAveragePooling1D\n", 197 | "from keras.layers import Input, Embedding, Dropout, BatchNormalization, Dense, Flatten, Lambda, K, Activation" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 7, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "# general\n", 209 | "NUM_EPOCHES = 25\n", 210 | "EPOCHES1 = 5\n", 211 | "EPOCHES2 = 25 # 5\n", 212 | "EPOCHES3 = 22\n", 213 | "BATCH_SIZE = 1024\n", 214 | "DROP_RATE = 0.3\n", 215 | "\n", 216 | "# cnn\n", 217 | "CONV_LEN1 = 128\n", 218 | "CONV_LEN2 = 128\n", 219 | "CONV_LEN3 = 128\n", 220 | "CONV_LEN4 = 128\n", 221 | "CONV_LEN5 = 128\n", 222 | "CONV_LEN6 = 128\n", 223 | "\n", 224 | "# lstm\n", 225 | "LSTM_SIZE1 = 256\n", 226 | "LSTM_SIZE2 = 256\n", 227 | "LSTM_DROP_RATE = 0.3\n", 228 | "\n", 229 | "# dense\n", 230 | "DENSE_SIZE1 = 512\n", 231 | "DENSE_SIZE2 = 256\n", 232 | "DENSE_FEATURE = 32" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 8, 238 | "metadata": { 239 | "collapsed": true 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "def cnn_double_layer(inputa, inputb, filters, kernel_size):\n", 244 | " conv1 = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\")\n", 245 | " conv2 = Conv1D(filters=filters, kernel_size=kernel_size, padding=\"same\")\n", 246 | " \n", 247 | " conv1a = conv1(inputa)\n", 248 | " conv1a = BatchNormalization()(conv1a)\n", 249 | " conv1a = Activation(activation=\"relu\")(conv1a)\n", 250 | " conv2a = conv2(conv1a)\n", 251 | " conv2a = BatchNormalization()(conv2a)\n", 252 | " conv2a = Activation(activation=\"relu\")(conv2a)\n", 253 | " output_avg_a = GlobalAveragePooling1D()(conv2a)\n", 254 | " output_max_a = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv2a))\n", 255 | " output_a = concatenate([output_avg_a, output_max_a])\n", 256 | " \n", 257 | " conv1b = conv1(inputb)\n", 258 | " conv1b = BatchNormalization()(conv1b)\n", 259 | " conv1b = Activation(activation=\"relu\")(conv1b)\n", 260 | " conv2b = conv2(conv1b)\n", 261 | " conv2b = BatchNormalization()(conv2b)\n", 262 | " conv2b = Activation(activation=\"relu\")(conv2b)\n", 263 | " output_avg_b = GlobalAveragePooling1D()(conv2b)\n", 264 | " output_max_b = Flatten()(MaxPool1D(pool_size=SEQ_LEN)(conv2b))\n", 265 | " output_b = concatenate([output_avg_b, output_max_b])\n", 266 | " \n", 267 | " return output_a, output_b" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "# WORDS" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 9, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# from sklearn.model_selection import StratifiedKFold\n", 284 | "\n", 285 | "# best_results = []\n", 286 | "# # last_results = []\n", 287 | "# best_file_names = []\n", 288 | "# dev_predictions = []\n", 289 | "\n", 290 | "# for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=word1, y=label)): # word/char switch\n", 291 | "# print(\"-\" * 60)\n", 292 | "# print(\"Fold {} training start...\".format(i))\n", 293 | " \n", 294 | "# train_x1, train_x2, train_y = word1[train_index, :], word2[train_index, :], label[train_index] # word/char switch\n", 295 | "# dev_x1, dev_x2, dev_y = word1[dev_index, :], 
word2[dev_index, :], label[dev_index] # word/char switch\n", 296 | "# train_f, dev_f = train_feature[train_index, :], train_feature[dev_index, :]\n", 297 | " \n", 298 | "# input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 299 | "# input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 300 | "# inputf = Input(shape=(train_f.shape[1],), dtype=\"float32\")\n", 301 | "\n", 302 | "# embedding_layer = Embedding(\n", 303 | "# input_dim=word_embedding_data.shape[0], # word/char switch\n", 304 | "# output_dim=word_embedding_data.shape[1], # word/char switch\n", 305 | "# weights=[word_embedding_data], # word/char switch\n", 306 | "# input_length=SEQ_LEN,\n", 307 | "# trainable=False\n", 308 | "# )\n", 309 | " \n", 310 | "# vector1 = embedding_layer(input1)\n", 311 | "# vector2 = embedding_layer(input2)\n", 312 | " \n", 313 | "# lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 314 | "# layer1a = lstm_layer1(vector1)\n", 315 | "# layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 316 | "# layer1b = lstm_layer1(vector2)\n", 317 | "# layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 318 | "\n", 319 | "# lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 320 | "# layer2a = lstm_layer2(layer1a)\n", 321 | "# layer2b = lstm_layer2(layer1b)\n", 322 | " \n", 323 | "# conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 324 | "# conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 325 | "# conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 326 | "# conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 327 | "# conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 328 | "# conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 329 | " \n", 330 | "# merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 331 | "# merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 332 | "# diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 333 | "# mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 334 | "# merge = concatenate([diff, mult])\n", 335 | " \n", 336 | "# x = Dropout(DROP_RATE)(merge)\n", 337 | "# x = BatchNormalization()(x)\n", 338 | "# x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 339 | " \n", 340 | "# fe = BatchNormalization()(inputf)\n", 341 | "# fe = Dense(DENSE_FEATURE, activation=\"relu\")(fe)\n", 342 | " \n", 343 | "# x = concatenate([x, fe])\n", 344 | " \n", 345 | "# x = Dropout(DROP_RATE)(x)\n", 346 | "# x = BatchNormalization()(x)\n", 347 | "# x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 348 | "# x = Dropout(DROP_RATE)(x)\n", 349 | "# x = BatchNormalization()(x)\n", 350 | "# pred = Dense(1, activation=\"sigmoid\")(x)\n", 351 | " \n", 352 | "# model = Model(inputs=[input1, input2, inputf], outputs=pred)\n", 353 | "# model.compile(\n", 354 | "# optimizer=\"nadam\",\n", 355 | "# loss=\"binary_crossentropy\",\n", 356 | "# metrics=[\"acc\"]\n", 357 | "# )\n", 358 | " \n", 359 | "# early_stopping = EarlyStopping(\"val_loss\", patience=6)\n", 360 | "# lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.0005)\n", 361 | "# check_point = ModelCheckpoint(\n", 362 | "# \"./log/%s.Multi_LSTM_CNN_v3.word.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char 
switch\n", 363 | "# monitor=\"val_loss\",\n", 364 | "# save_best_only=True,\n", 365 | "# )\n", 366 | " \n", 367 | "# fit_res = model.fit(\n", 368 | "# x=[train_x1, train_x2, train_f],\n", 369 | "# y=train_y,\n", 370 | "# batch_size=BATCH_SIZE,\n", 371 | "# epochs=NUM_EPOCHES,\n", 372 | "# validation_data=([dev_x1, dev_x2, dev_f], dev_y),\n", 373 | "# shuffle=True,\n", 374 | "# callbacks=[early_stopping, lr_reducer, check_point]\n", 375 | "# )\n", 376 | " \n", 377 | "# # pred_last = model.predict([test_word1, test_word2], batch_size=BATCH_SIZE) # word/char switch\n", 378 | "# # last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 379 | " \n", 380 | "# best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\")\n", 381 | "# best_file_names.append(best_model_file)\n", 382 | "# print(\"load model %s\" % (best_model_file,))\n", 383 | "# model.load_weights(best_model_file)\n", 384 | "# pred_best = model.predict([test_word1, test_word2, test_feature], batch_size=BATCH_SIZE) # word/char switch\n", 385 | "# best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 386 | " \n", 387 | "# dev_pred = model.predict([dev_x1, dev_x2, dev_f], batch_size=BATCH_SIZE)\n", 388 | "# dev_result = pd.DataFrame({\"pred\": dev_pred.ravel(), \"label\": dev_y})\n", 389 | "# dev_predictions.append(dev_result)\n", 390 | "\n", 391 | "# # pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 392 | "# # \"./result/%s-Multi_LSTM_CNN_v5_word_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 393 | "# # index=False\n", 394 | "# # )\n", 395 | "# pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 396 | "# \"./result/%s-Multi_LSTM_CNN_v5_word_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 397 | "# index=False\n", 398 | "# )\n", 399 | "\n", 400 | "# total_dev = pd.concat(dev_predictions, axis=0)\n", 401 | "# total_dev.to_csv(\n", 402 | "# \"./result/%s-Multi_LSTM_CNN_v5_word_dev_result.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")),\n", 403 | "# index=False\n", 404 | "# )\n", 405 | "\n", 406 | "# model_path = \"./log/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\") + \"/\"\n", 407 | "# os.mkdir(model_path)\n", 408 | "# for model_name in best_file_names:\n", 409 | "# abs_name = os.path.split(model_name)[1]\n", 410 | "# os.rename(model_name, model_path + abs_name)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "# CHARS" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 10, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "------------------------------------------------------------\n", 430 | "Fold 0 training start...\n", 431 | "WARNING:tensorflow:From C:\\ProgramData\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\util\\deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.\n", 432 | "Instructions for updating:\n", 433 | "`NHWC` for data_format is deprecated, use `NWC` instead\n", 434 | "Train on 228946 samples, validate on 25440 samples\n", 435 | "Epoch 1/25\n", 436 | "228946/228946 [==============================] - 198s 866us/step - loss: 0.3947 - acc: 0.8071 - val_loss: 0.3225 - val_acc: 0.8560\n", 437 | "Epoch 2/25\n", 438 | "228946/228946 [==============================] - 182s 
797us/step - loss: 0.2892 - acc: 0.8666 - val_loss: 0.2669 - val_acc: 0.8855\n", 439 | "Epoch 3/25\n", 440 | "228946/228946 [==============================] - 187s 818us/step - loss: 0.2522 - acc: 0.8873 - val_loss: 0.2193 - val_acc: 0.9046\n", 441 | "Epoch 4/25\n", 442 | "228946/228946 [==============================] - 193s 843us/step - loss: 0.2297 - acc: 0.8979 - val_loss: 0.2147 - val_acc: 0.9078\n", 443 | "Epoch 5/25\n", 444 | "228946/228946 [==============================] - 194s 848us/step - loss: 0.2145 - acc: 0.9059 - val_loss: 0.1984 - val_acc: 0.9136\n", 445 | "Epoch 6/25\n", 446 | "228946/228946 [==============================] - 182s 797us/step - loss: 0.2030 - acc: 0.9114 - val_loss: 0.1981 - val_acc: 0.9162\n", 447 | "Epoch 7/25\n", 448 | "228946/228946 [==============================] - 188s 823us/step - loss: 0.1934 - acc: 0.9166 - val_loss: 0.1876 - val_acc: 0.9191\n", 449 | "Epoch 8/25\n", 450 | "228946/228946 [==============================] - 185s 809us/step - loss: 0.1857 - acc: 0.9196 - val_loss: 0.1846 - val_acc: 0.9231\n", 451 | "Epoch 9/25\n", 452 | "228946/228946 [==============================] - 192s 839us/step - loss: 0.1798 - acc: 0.9231 - val_loss: 0.1780 - val_acc: 0.9268\n", 453 | "Epoch 10/25\n", 454 | "228946/228946 [==============================] - 187s 818us/step - loss: 0.1743 - acc: 0.9254 - val_loss: 0.1820 - val_acc: 0.9276\n", 455 | "Epoch 11/25\n", 456 | "228946/228946 [==============================] - 192s 838us/step - loss: 0.1693 - acc: 0.9275 - val_loss: 0.1749 - val_acc: 0.9291\n", 457 | "Epoch 12/25\n", 458 | "228946/228946 [==============================] - 193s 842us/step - loss: 0.1663 - acc: 0.9290 - val_loss: 0.1740 - val_acc: 0.9301\n", 459 | "Epoch 13/25\n", 460 | "228946/228946 [==============================] - 193s 844us/step - loss: 0.1614 - acc: 0.9309 - val_loss: 0.1744 - val_acc: 0.9292\n", 461 | "Epoch 14/25\n", 462 | "228946/228946 [==============================] - 193s 844us/step - loss: 0.1577 - acc: 0.9333 - val_loss: 0.1710 - val_acc: 0.9306\n", 463 | "Epoch 15/25\n", 464 | "228946/228946 [==============================] - 194s 846us/step - loss: 0.1543 - acc: 0.9346 - val_loss: 0.1704 - val_acc: 0.9309\n", 465 | "Epoch 16/25\n", 466 | "228946/228946 [==============================] - 192s 839us/step - loss: 0.1524 - acc: 0.9356 - val_loss: 0.1730 - val_acc: 0.9318\n", 467 | "Epoch 17/25\n", 468 | "228946/228946 [==============================] - 193s 842us/step - loss: 0.1477 - acc: 0.9377 - val_loss: 0.1647 - val_acc: 0.9346\n", 469 | "Epoch 18/25\n", 470 | "228946/228946 [==============================] - 183s 799us/step - loss: 0.1451 - acc: 0.9387 - val_loss: 0.1720 - val_acc: 0.9333\n", 471 | "Epoch 19/25\n", 472 | "228946/228946 [==============================] - 183s 799us/step - loss: 0.1433 - acc: 0.9395 - val_loss: 0.1699 - val_acc: 0.9341\n", 473 | "Epoch 20/25\n", 474 | "228946/228946 [==============================] - 190s 831us/step - loss: 0.1409 - acc: 0.9407 - val_loss: 0.1677 - val_acc: 0.9338\n", 475 | "Epoch 21/25\n", 476 | "228946/228946 [==============================] - 190s 831us/step - loss: 0.1315 - acc: 0.9447 - val_loss: 0.1687 - val_acc: 0.9346\n", 477 | "Epoch 22/25\n", 478 | "228946/228946 [==============================] - 193s 842us/step - loss: 0.1256 - acc: 0.9471 - val_loss: 0.1647 - val_acc: 0.9372\n", 479 | "Epoch 23/25\n", 480 | "228946/228946 [==============================] - 193s 845us/step - loss: 0.1222 - acc: 0.9490 - val_loss: 0.1644 - val_acc: 0.9365\n", 481 | "Epoch 
24/25\n", 482 | "228946/228946 [==============================] - 192s 839us/step - loss: 0.1197 - acc: 0.9498 - val_loss: 0.1695 - val_acc: 0.9358\n", 483 | "Epoch 25/25\n", 484 | "228946/228946 [==============================] - 192s 840us/step - loss: 0.1168 - acc: 0.9513 - val_loss: 0.1724 - val_acc: 0.9350\n", 485 | "load model ./log/20180715-102931.Multi_LSTM_CNN_v5.char.023.hdf5\n", 486 | "------------------------------------------------------------\n", 487 | "Fold 1 training start...\n", 488 | "Train on 228946 samples, validate on 25440 samples\n", 489 | "Epoch 1/25\n", 490 | "228946/228946 [==============================] - 198s 866us/step - loss: 0.3974 - acc: 0.8062 - val_loss: 0.3109 - val_acc: 0.8622\n", 491 | "Epoch 2/25\n", 492 | "228946/228946 [==============================] - 185s 807us/step - loss: 0.2894 - acc: 0.8665 - val_loss: 0.2533 - val_acc: 0.8928\n", 493 | "Epoch 3/25\n", 494 | "228946/228946 [==============================] - 190s 828us/step - loss: 0.2506 - acc: 0.8879 - val_loss: 0.2277 - val_acc: 0.9017\n", 495 | "Epoch 4/25\n", 496 | "228946/228946 [==============================] - 187s 815us/step - loss: 0.2285 - acc: 0.8992 - val_loss: 0.2086 - val_acc: 0.9099\n", 497 | "Epoch 5/25\n", 498 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.2128 - acc: 0.9065 - val_loss: 0.2071 - val_acc: 0.9135\n", 499 | "Epoch 6/25\n", 500 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.2018 - acc: 0.9124 - val_loss: 0.1964 - val_acc: 0.9168\n", 501 | "Epoch 7/25\n", 502 | "228946/228946 [==============================] - 189s 827us/step - loss: 0.1925 - acc: 0.9173 - val_loss: 0.1846 - val_acc: 0.9245\n", 503 | "Epoch 8/25\n", 504 | "228946/228946 [==============================] - 183s 799us/step - loss: 0.1860 - acc: 0.9197 - val_loss: 0.1793 - val_acc: 0.9238\n", 505 | "Epoch 9/25\n", 506 | "228946/228946 [==============================] - 188s 822us/step - loss: 0.1799 - acc: 0.9231 - val_loss: 0.1794 - val_acc: 0.9254\n", 507 | "Epoch 10/25\n", 508 | "228946/228946 [==============================] - 192s 839us/step - loss: 0.1736 - acc: 0.9255 - val_loss: 0.1836 - val_acc: 0.9225\n", 509 | "Epoch 11/25\n", 510 | "228946/228946 [==============================] - 188s 820us/step - loss: 0.1688 - acc: 0.9276 - val_loss: 0.1729 - val_acc: 0.9289\n", 511 | "Epoch 12/25\n", 512 | "228946/228946 [==============================] - 190s 831us/step - loss: 0.1657 - acc: 0.9291 - val_loss: 0.1711 - val_acc: 0.9296\n", 513 | "Epoch 13/25\n", 514 | "228946/228946 [==============================] - 191s 836us/step - loss: 0.1609 - acc: 0.9311 - val_loss: 0.1759 - val_acc: 0.9300\n", 515 | "Epoch 14/25\n", 516 | "228946/228946 [==============================] - 191s 835us/step - loss: 0.1578 - acc: 0.9328 - val_loss: 0.1673 - val_acc: 0.9318\n", 517 | "Epoch 15/25\n", 518 | "228946/228946 [==============================] - 191s 833us/step - loss: 0.1544 - acc: 0.9346 - val_loss: 0.1677 - val_acc: 0.9309\n", 519 | "Epoch 16/25\n", 520 | "228946/228946 [==============================] - 192s 840us/step - loss: 0.1521 - acc: 0.9353 - val_loss: 0.1710 - val_acc: 0.9315\n", 521 | "Epoch 17/25\n", 522 | "228946/228946 [==============================] - 193s 845us/step - loss: 0.1481 - acc: 0.9372 - val_loss: 0.1729 - val_acc: 0.9318\n", 523 | "Epoch 18/25\n", 524 | "228946/228946 [==============================] - 188s 820us/step - loss: 0.1380 - acc: 0.9415 - val_loss: 0.1689 - val_acc: 0.9338\n", 525 | "Epoch 19/25\n", 526 | 
"228946/228946 [==============================] - 191s 836us/step - loss: 0.1325 - acc: 0.9440 - val_loss: 0.1628 - val_acc: 0.9357\n", 527 | "Epoch 20/25\n", 528 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.1287 - acc: 0.9464 - val_loss: 0.1655 - val_acc: 0.9359\n", 529 | "Epoch 21/25\n", 530 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.1250 - acc: 0.9476 - val_loss: 0.1651 - val_acc: 0.9364\n", 531 | "Epoch 22/25\n", 532 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.1227 - acc: 0.9490 - val_loss: 0.1689 - val_acc: 0.9354\n", 533 | "Epoch 23/25\n", 534 | "228946/228946 [==============================] - 183s 798us/step - loss: 0.1152 - acc: 0.9512 - val_loss: 0.1671 - val_acc: 0.9377\n", 535 | "Epoch 24/25\n", 536 | "228946/228946 [==============================] - 183s 799us/step - loss: 0.1117 - acc: 0.9535 - val_loss: 0.1682 - val_acc: 0.9373\n", 537 | "Epoch 25/25\n", 538 | "228946/228946 [==============================] - 189s 828us/step - loss: 0.1096 - acc: 0.9542 - val_loss: 0.1683 - val_acc: 0.9369\n", 539 | "load model ./log/20180715-115006.Multi_LSTM_CNN_v5.char.019.hdf5\n", 540 | "------------------------------------------------------------\n", 541 | "Fold 2 training start...\n", 542 | "Train on 228947 samples, validate on 25439 samples\n", 543 | "Epoch 1/25\n", 544 | "228947/228947 [==============================] - 193s 843us/step - loss: 0.3951 - acc: 0.8062 - val_loss: 0.3309 - val_acc: 0.8579\n", 545 | "Epoch 2/25\n" 546 | ] 547 | }, 548 | { 549 | "name": "stdout", 550 | "output_type": "stream", 551 | "text": [ 552 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.2904 - acc: 0.8656 - val_loss: 0.2692 - val_acc: 0.8873\n", 553 | "Epoch 3/25\n", 554 | "228947/228947 [==============================] - 183s 799us/step - loss: 0.2526 - acc: 0.8868 - val_loss: 0.2245 - val_acc: 0.9035\n", 555 | "Epoch 4/25\n", 556 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.2295 - acc: 0.8988 - val_loss: 0.2094 - val_acc: 0.9101\n", 557 | "Epoch 5/25\n", 558 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.2141 - acc: 0.9066 - val_loss: 0.2024 - val_acc: 0.9129\n", 559 | "Epoch 6/25\n", 560 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.2020 - acc: 0.9126 - val_loss: 0.1879 - val_acc: 0.9185\n", 561 | "Epoch 7/25\n", 562 | "228947/228947 [==============================] - 183s 800us/step - loss: 0.1924 - acc: 0.9168 - val_loss: 0.1918 - val_acc: 0.9192\n", 563 | "Epoch 8/25\n", 564 | "228947/228947 [==============================] - 187s 819us/step - loss: 0.1853 - acc: 0.9209 - val_loss: 0.1880 - val_acc: 0.9228\n", 565 | "Epoch 9/25\n", 566 | "228947/228947 [==============================] - 187s 817us/step - loss: 0.1789 - acc: 0.9233 - val_loss: 0.1787 - val_acc: 0.9243\n", 567 | "Epoch 10/25\n", 568 | "228947/228947 [==============================] - 189s 824us/step - loss: 0.1732 - acc: 0.9263 - val_loss: 0.1790 - val_acc: 0.9259\n", 569 | "Epoch 11/25\n", 570 | " 71680/228947 [========>.....................] 
- ETA: 2:04 - loss: 0.1647 - acc: 0.9303" 571 | ] 572 | }, 573 | { 574 | "ename": "KeyboardInterrupt", 575 | "evalue": "", 576 | "output_type": "error", 577 | "traceback": [ 578 | "KeyboardInterrupt: fold 2 training was stopped manually inside model.fit at epoch 11/25 (ANSI-escaped traceback through Keras and the TensorFlow session omitted)" 591 | ] 592 | } 593 | ], 594 | "source": [ 595 |
"from sklearn.model_selection import StratifiedKFold\n", 596 | "\n", 597 | "best_results = []\n", 598 | "# last_results = []\n", 599 | "best_file_names = []\n", 600 | "dev_predictions = []\n", 601 | "\n", 602 | "for i, (train_index, dev_index) in enumerate(StratifiedKFold(n_splits=10).split(X=char1, y=label)): # word/char switch\n", 603 | " print(\"-\" * 60)\n", 604 | " print(\"Fold {} training start...\".format(i))\n", 605 | " \n", 606 | " train_x1, train_x2, train_y = char1[train_index, :], char2[train_index, :], label[train_index] # word/char switch\n", 607 | " dev_x1, dev_x2, dev_y = char1[dev_index, :], char2[dev_index, :], label[dev_index] # word/char switch\n", 608 | " train_f, dev_f = train_feature[train_index, :], train_feature[dev_index, :]\n", 609 | " \n", 610 | " input1 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 611 | " input2 = Input(shape=(SEQ_LEN,), dtype=\"int32\")\n", 612 | " inputf = Input(shape=(train_f.shape[1],), dtype=\"float32\")\n", 613 | "\n", 614 | " embedding_layer = Embedding(\n", 615 | " input_dim=char_embedding_data.shape[0], # word/char switch\n", 616 | " output_dim=char_embedding_data.shape[1], # word/char switch\n", 617 | " weights=[char_embedding_data], # word/char switch\n", 618 | " input_length=SEQ_LEN,\n", 619 | " trainable=False\n", 620 | " )\n", 621 | " \n", 622 | " vector1 = embedding_layer(input1)\n", 623 | " vector2 = embedding_layer(input2)\n", 624 | " \n", 625 | " lstm_layer1 = LSTM(LSTM_SIZE1, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 626 | " layer1a = lstm_layer1(vector1)\n", 627 | " layer1a = Dropout(LSTM_DROP_RATE)(layer1a)\n", 628 | " layer1b = lstm_layer1(vector2)\n", 629 | " layer1b = Dropout(LSTM_DROP_RATE)(layer1b)\n", 630 | "\n", 631 | " lstm_layer2 = LSTM(LSTM_SIZE2, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE, return_sequences=True)\n", 632 | " layer2a = lstm_layer2(layer1a)\n", 633 | " layer2b = lstm_layer2(layer1b)\n", 634 | " \n", 635 | " conv1a, conv1b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN1, kernel_size=1)\n", 636 | " conv2a, conv2b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN2, kernel_size=2)\n", 637 | " conv3a, conv3b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN3, kernel_size=3)\n", 638 | " conv4a, conv4b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN4, kernel_size=4)\n", 639 | " conv5a, conv5b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN5, kernel_size=5)\n", 640 | " conv6a, conv6b = cnn_double_layer(layer2a, layer2b, filters=CONV_LEN6, kernel_size=6)\n", 641 | " \n", 642 | " merge_a = concatenate([conv1a, conv2a, conv3a, conv4a, conv5a, conv6a])\n", 643 | " merge_b = concatenate([conv1b, conv2b, conv3b, conv4b, conv5b, conv6b])\n", 644 | " diff = Lambda(lambda x: K.abs(x[0] - x[1]))([merge_a, merge_b])\n", 645 | " mult = Lambda(lambda x: x[0] * x[1])([merge_a, merge_b])\n", 646 | " merge = concatenate([diff, mult])\n", 647 | " \n", 648 | " x = Dropout(DROP_RATE)(merge)\n", 649 | " x = BatchNormalization()(x)\n", 650 | " x = Dense(DENSE_SIZE1, activation=\"relu\")(x)\n", 651 | " \n", 652 | " fe = BatchNormalization()(inputf)\n", 653 | " fe = Dense(DENSE_FEATURE, activation=\"relu\")(fe)\n", 654 | " \n", 655 | " x = concatenate([x, fe])\n", 656 | " \n", 657 | " x = Dropout(DROP_RATE)(x)\n", 658 | " x = BatchNormalization()(x)\n", 659 | " x = Dense(DENSE_SIZE2, activation=\"relu\")(x)\n", 660 | " x = Dropout(DROP_RATE)(x)\n", 661 | " x = BatchNormalization()(x)\n", 662 | " pred = Dense(1, activation=\"sigmoid\")(x)\n", 663 
| " \n", 664 | " model = Model(inputs=[input1, input2, inputf], outputs=pred)\n", 665 | " model.compile(\n", 666 | " optimizer=\"nadam\",\n", 667 | " loss=\"binary_crossentropy\",\n", 668 | " metrics=[\"acc\"]\n", 669 | " )\n", 670 | " \n", 671 | " early_stopping = EarlyStopping(\"val_loss\", patience=6)\n", 672 | " lr_reducer = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.0005)\n", 673 | " check_point = ModelCheckpoint(\n", 674 | " \"./log/%s.Multi_LSTM_CNN_v5.char.{epoch:03d}.hdf5\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 675 | " monitor=\"val_loss\",\n", 676 | " save_best_only=True,\n", 677 | " )\n", 678 | " \n", 679 | " fit_res = model.fit(\n", 680 | " x=[train_x1, train_x2, train_f],\n", 681 | " y=train_y,\n", 682 | " batch_size=BATCH_SIZE,\n", 683 | " epochs=NUM_EPOCHES,\n", 684 | " validation_data=([dev_x1, dev_x2, dev_f], dev_y),\n", 685 | " shuffle=True,\n", 686 | " callbacks=[early_stopping, lr_reducer, check_point]\n", 687 | " )\n", 688 | " \n", 689 | "# pred_last = model.predict([test_char1, test_char2, test_feature], batch_size=BATCH_SIZE) # word/char switch\n", 690 | "# last_results.append(pd.DataFrame(pred_last, columns=[\"y_pre\"]))\n", 691 | " \n", 692 | " best_model_file = glob(\"./log/*.hdf5\")[-1].replace(\"\\\\\", \"/\")\n", 693 | " best_file_names.append(best_model_file)\n", 694 | " print(\"load model %s\" % (best_model_file,))\n", 695 | " model.load_weights(best_model_file)\n", 696 | " pred_best = model.predict([test_char1, test_char2, test_feature], batch_size=BATCH_SIZE) # word/char switch\n", 697 | " best_results.append(pd.DataFrame(pred_best, columns=[\"y_pre\"]))\n", 698 | " \n", 699 | " dev_pred = model.predict([dev_x1, dev_x2, dev_f], batch_size=BATCH_SIZE)\n", 700 | " dev_result = pd.DataFrame({\"pred\": dev_pred.ravel(), \"label\": dev_y})\n", 701 | " dev_predictions.append(dev_result)\n", 702 | "\n", 703 | "# pd.DataFrame(pd.concat(last_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 704 | "# \"./result/%s-Multi_LSTM_CNN_v5_char_last.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 705 | "# index=False\n", 706 | "# )\n", 707 | "pd.DataFrame(pd.concat(best_results, axis=1).mean(axis=1), columns=[\"y_pre\"]).to_csv(\n", 708 | " \"./result/%s-Multi_LSTM_CNN_v5_char_best.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 709 | " index=False\n", 710 | ")\n", 711 | "\n", 712 | "total_dev = pd.concat(dev_predictions, axis=0)\n", 713 | "total_dev.to_csv(\n", 714 | " \"./result/%s-Multi_LSTM_CNN_v5_char_dev_result.csv\" % (datetime.now().strftime(\"%Y%m%d-%H%M%S\")), # word/char switch\n", 715 | " index=False\n", 716 | ")\n", 717 | "\n", 718 | "model_path = \"./log/\" + datetime.now().strftime(\"%Y%m%d-%H%M%S\") + \"/\"\n", 719 | "os.mkdir(model_path)\n", 720 | "for model_name in best_file_names:\n", 721 | " abs_name = os.path.split(model_name)[1]\n", 722 | " os.rename(model_name, model_path + abs_name)" 723 | ] 724 | } 725 | ], 726 | "metadata": { 727 | "kernelspec": { 728 | "display_name": "Python 3", 729 | "language": "python", 730 | "name": "python3" 731 | }, 732 | "language_info": { 733 | "codemirror_mode": { 734 | "name": "ipython", 735 | "version": 3 736 | }, 737 | "file_extension": ".py", 738 | "mimetype": "text/x-python", 739 | "name": "python", 740 | "nbconvert_exporter": "python", 741 | "pygments_lexer": "ipython3", 742 | "version": "3.6.2" 743 | } 744 | }, 745 | "nbformat": 4, 746 | "nbformat_minor": 2 747 | } 748 | 
--------------------------------------------------------------------------------
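For readers skimming the raw notebook JSON above, the preprocessing cell packs two ideas into a few dense lines: a Keras `Tokenizer` is fitted on the pre-split word (or char) sequences, and the pretrained embedding table is re-ordered to match the tokenizer's indices, with an all-zero row prepended at index 0 so the padding token maps to a null vector. The sketch below restates that pattern in isolation; `questions` and `embeddings` are tiny made-up stand-ins for the competition files, not the notebook's actual data.

```python
# Minimal sketch of the embedding-matrix alignment used above (illustrative data).
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_COUNT = 10000   # vocabulary cap, as in the notebook
SEQ_LEN = 25        # padded sequence length, as in the notebook

# Hypothetical stand-ins: token lists per question and a pretrained embedding
# table indexed by token (the real notebook loads these from the competition files).
questions = [["w1", "w2", "w3"], ["w2", "w4"]]
embeddings = pd.DataFrame(np.random.rand(4, 300), index=["w1", "w2", "w3", "w4"])

tokenizer = Tokenizer(num_words=MAX_COUNT)
tokenizer.fit_on_texts(questions)  # accepts pre-split token lists

# Row 0 stays all-zero so padding index 0 maps to a null vector; the remaining
# rows follow the tokenizer's frequency-ordered vocabulary, capped at MAX_COUNT.
vocab = list(tokenizer.word_index.keys())[:MAX_COUNT]
embedding_matrix = np.concatenate(
    (np.zeros((1, embeddings.shape[1])), embeddings.loc[vocab].values),
    axis=0
)

# Questions become index sequences, pre-padded/truncated to a fixed length.
padded = pad_sequences(tokenizer.texts_to_sequences(questions),
                       maxlen=SEQ_LEN, padding="pre", truncating="pre")
print(embedding_matrix.shape, padded.shape)
```

Because `Tokenizer` orders `word_index` by frequency, slicing its first `MAX_COUNT` keys lines up with the indices emitted by `texts_to_sequences`, which is why the embedding rows and the padded index sequences stay consistent.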
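The model cells (both the commented-out WORDS block and the active CHARS block) share one architectural idea: a single siamese encoder, a frozen Embedding followed by two stacked LSTMs and six parallel Conv1D branches, is applied to both questions, and the two encodings are compared through an element-wise absolute difference and product before the dense head. The toy sketch below isolates just that comparison pattern with a deliberately small encoder; layer sizes, batch normalization, dropout, and the hand-crafted feature input are omitted, so read it as a shape-level illustration rather than the notebook's full model.

```python
# Toy siamese comparison sketch (small layer sizes; not the notebook's full model).
from keras.models import Model
from keras.layers import (Input, Embedding, LSTM, Conv1D,
                          GlobalAveragePooling1D, Dense, Lambda, concatenate)
from keras import backend as K

SEQ_LEN, VOCAB_SIZE, EMB_DIM = 25, 10001, 300

# Shared layers: instantiating them once and reusing them on both inputs is what
# makes the encoder siamese (identical weights for question 1 and question 2).
embed = Embedding(VOCAB_SIZE, EMB_DIM, input_length=SEQ_LEN)
lstm = LSTM(64, return_sequences=True)
conv = Conv1D(32, kernel_size=3, padding="same", activation="relu")
pool = GlobalAveragePooling1D()

def encode(x):
    return pool(conv(lstm(embed(x))))

q1 = Input(shape=(SEQ_LEN,), dtype="int32")
q2 = Input(shape=(SEQ_LEN,), dtype="int32")
v1, v2 = encode(q1), encode(q2)

# Symmetric comparison features, as in the notebook: |v1 - v2| and v1 * v2.
diff = Lambda(lambda t: K.abs(t[0] - t[1]))([v1, v2])
mult = Lambda(lambda t: t[0] * t[1])([v1, v2])
merged = concatenate([diff, mult])

hidden = Dense(64, activation="relu")(merged)
pred = Dense(1, activation="sigmoid")(hidden)

model = Model(inputs=[q1, q2], outputs=pred)
model.compile(optimizer="nadam", loss="binary_crossentropy", metrics=["acc"])
model.summary()
```

Using |v1 - v2| and v1 * v2 keeps the merged representation symmetric in the two questions, matching the symmetry of the duplicate label (swapping q1 and q2 should not change the prediction).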
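Finally, each training cell runs a 10-fold StratifiedKFold loop: per fold it checkpoints the best model by validation loss, reloads it, predicts on the test set, and stores the held-out (dev) predictions; the fold-level test predictions are then averaged column-wise into the submitted scores. One detail worth flagging is that the best checkpoint is recovered with `glob("./log/*.hdf5")[-1]`, which relies on the timestamped filenames happening to come back last; an explicit `sorted(...)` makes that assumption robust. The sketch below shows only this bookkeeping with made-up prediction frames; the file pattern and fold count mirror the notebook, everything else is illustrative.

```python
# Sketch of the per-fold ensembling bookkeeping (toy predictions; paths illustrative).
from glob import glob
import numpy as np
import pandas as pd

def latest_checkpoint(pattern="./log/*.hdf5"):
    """Pick the newest checkpoint; sorting makes the timestamp assumption explicit."""
    files = sorted(glob(pattern))
    return files[-1].replace("\\", "/") if files else None

print("best checkpoint:", latest_checkpoint())

# Suppose each of the 10 folds produced a one-column frame of test probabilities.
fold_preds = [pd.DataFrame({"y_pre": np.random.rand(5)}) for _ in range(10)]

# Column-wise mean across folds gives the ensembled scores written to ./result/.
submission = pd.DataFrame(pd.concat(fold_preds, axis=1).mean(axis=1),
                          columns=["y_pre"])
print(submission)
```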